{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9149207505920933,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0036436509382401167,
      "grad_norm": 0.6875,
      "learning_rate": 9.987852283770651e-05,
      "loss": 3.4902,
      "step": 10
    },
    {
      "epoch": 0.007287301876480233,
      "grad_norm": 0.66796875,
      "learning_rate": 9.975704567541302e-05,
      "loss": 3.3432,
      "step": 20
    },
    {
      "epoch": 0.01093095281472035,
      "grad_norm": 0.5546875,
      "learning_rate": 9.963556851311953e-05,
      "loss": 3.2381,
      "step": 30
    },
    {
      "epoch": 0.014574603752960467,
      "grad_norm": 0.65234375,
      "learning_rate": 9.951409135082604e-05,
      "loss": 3.2931,
      "step": 40
    },
    {
      "epoch": 0.018218254691200583,
      "grad_norm": 0.6328125,
      "learning_rate": 9.939261418853257e-05,
      "loss": 3.3235,
      "step": 50
    },
    {
      "epoch": 0.0218619056294407,
      "grad_norm": 0.64453125,
      "learning_rate": 9.927113702623908e-05,
      "loss": 3.2988,
      "step": 60
    },
    {
      "epoch": 0.025505556567680818,
      "grad_norm": 0.59765625,
      "learning_rate": 9.914965986394558e-05,
      "loss": 3.2927,
      "step": 70
    },
    {
      "epoch": 0.029149207505920934,
      "grad_norm": 0.57421875,
      "learning_rate": 9.90281827016521e-05,
      "loss": 3.275,
      "step": 80
    },
    {
      "epoch": 0.03279285844416105,
      "grad_norm": 0.640625,
      "learning_rate": 9.89067055393586e-05,
      "loss": 3.316,
      "step": 90
    },
    {
      "epoch": 0.036436509382401165,
      "grad_norm": 0.57421875,
      "learning_rate": 9.878522837706513e-05,
      "loss": 3.2611,
      "step": 100
    },
    {
      "epoch": 0.04008016032064128,
      "grad_norm": 0.51171875,
      "learning_rate": 9.866375121477162e-05,
      "loss": 3.268,
      "step": 110
    },
    {
      "epoch": 0.0437238112588814,
      "grad_norm": 0.703125,
      "learning_rate": 9.854227405247813e-05,
      "loss": 3.3032,
      "step": 120
    },
    {
      "epoch": 0.04736746219712151,
      "grad_norm": 0.5546875,
      "learning_rate": 9.842079689018465e-05,
      "loss": 3.3334,
      "step": 130
    },
    {
      "epoch": 0.051011113135361635,
      "grad_norm": 0.671875,
      "learning_rate": 9.829931972789116e-05,
      "loss": 3.1943,
      "step": 140
    },
    {
      "epoch": 0.05465476407360175,
      "grad_norm": 0.6171875,
      "learning_rate": 9.817784256559767e-05,
      "loss": 3.2574,
      "step": 150
    },
    {
      "epoch": 0.05829841501184187,
      "grad_norm": 0.66015625,
      "learning_rate": 9.805636540330418e-05,
      "loss": 3.3747,
      "step": 160
    },
    {
      "epoch": 0.06194206595008198,
      "grad_norm": 0.52734375,
      "learning_rate": 9.793488824101069e-05,
      "loss": 3.2992,
      "step": 170
    },
    {
      "epoch": 0.0655857168883221,
      "grad_norm": 0.50390625,
      "learning_rate": 9.781341107871722e-05,
      "loss": 3.2342,
      "step": 180
    },
    {
      "epoch": 0.06922936782656222,
      "grad_norm": 0.65234375,
      "learning_rate": 9.769193391642371e-05,
      "loss": 3.356,
      "step": 190
    },
    {
      "epoch": 0.07287301876480233,
      "grad_norm": 0.57421875,
      "learning_rate": 9.757045675413022e-05,
      "loss": 3.3618,
      "step": 200
    },
    {
      "epoch": 0.07651666970304245,
      "grad_norm": 0.58984375,
      "learning_rate": 9.744897959183674e-05,
      "loss": 3.2931,
      "step": 210
    },
    {
      "epoch": 0.08016032064128256,
      "grad_norm": 0.77734375,
      "learning_rate": 9.732750242954325e-05,
      "loss": 3.3246,
      "step": 220
    },
    {
      "epoch": 0.08380397157952268,
      "grad_norm": 0.5859375,
      "learning_rate": 9.720602526724975e-05,
      "loss": 3.3181,
      "step": 230
    },
    {
      "epoch": 0.0874476225177628,
      "grad_norm": 0.640625,
      "learning_rate": 9.708454810495627e-05,
      "loss": 3.2757,
      "step": 240
    },
    {
      "epoch": 0.09109127345600292,
      "grad_norm": 0.55859375,
      "learning_rate": 9.696307094266278e-05,
      "loss": 3.2753,
      "step": 250
    },
    {
      "epoch": 0.09473492439424303,
      "grad_norm": 0.58203125,
      "learning_rate": 9.68415937803693e-05,
      "loss": 3.3207,
      "step": 260
    },
    {
      "epoch": 0.09837857533248315,
      "grad_norm": 0.63671875,
      "learning_rate": 9.67201166180758e-05,
      "loss": 3.3035,
      "step": 270
    },
    {
      "epoch": 0.10202222627072327,
      "grad_norm": 0.578125,
      "learning_rate": 9.659863945578231e-05,
      "loss": 3.3025,
      "step": 280
    },
    {
      "epoch": 0.10566587720896338,
      "grad_norm": 0.5859375,
      "learning_rate": 9.647716229348883e-05,
      "loss": 3.2066,
      "step": 290
    },
    {
      "epoch": 0.1093095281472035,
      "grad_norm": 0.7109375,
      "learning_rate": 9.635568513119534e-05,
      "loss": 3.2757,
      "step": 300
    },
    {
      "epoch": 0.11295317908544361,
      "grad_norm": 0.609375,
      "learning_rate": 9.623420796890185e-05,
      "loss": 3.1904,
      "step": 310
    },
    {
      "epoch": 0.11659683002368373,
      "grad_norm": 0.60546875,
      "learning_rate": 9.611273080660836e-05,
      "loss": 3.1947,
      "step": 320
    },
    {
      "epoch": 0.12024048096192384,
      "grad_norm": 0.6171875,
      "learning_rate": 9.599125364431487e-05,
      "loss": 3.2016,
      "step": 330
    },
    {
      "epoch": 0.12388413190016397,
      "grad_norm": 0.640625,
      "learning_rate": 9.58697764820214e-05,
      "loss": 3.329,
      "step": 340
    },
    {
      "epoch": 0.12752778283840407,
      "grad_norm": 0.66796875,
      "learning_rate": 9.574829931972789e-05,
      "loss": 3.2483,
      "step": 350
    },
    {
      "epoch": 0.1311714337766442,
      "grad_norm": 0.57421875,
      "learning_rate": 9.56268221574344e-05,
      "loss": 3.2388,
      "step": 360
    },
    {
      "epoch": 0.13481508471488432,
      "grad_norm": 0.58984375,
      "learning_rate": 9.550534499514092e-05,
      "loss": 3.2722,
      "step": 370
    },
    {
      "epoch": 0.13845873565312444,
      "grad_norm": 0.58203125,
      "learning_rate": 9.538386783284743e-05,
      "loss": 3.2672,
      "step": 380
    },
    {
      "epoch": 0.14210238659136454,
      "grad_norm": 0.5234375,
      "learning_rate": 9.526239067055394e-05,
      "loss": 3.3378,
      "step": 390
    },
    {
      "epoch": 0.14574603752960466,
      "grad_norm": 0.55859375,
      "learning_rate": 9.514091350826045e-05,
      "loss": 3.2637,
      "step": 400
    },
    {
      "epoch": 0.14938968846784478,
      "grad_norm": 0.70703125,
      "learning_rate": 9.501943634596696e-05,
      "loss": 3.2879,
      "step": 410
    },
    {
      "epoch": 0.1530333394060849,
      "grad_norm": 0.6640625,
      "learning_rate": 9.489795918367348e-05,
      "loss": 3.2614,
      "step": 420
    },
    {
      "epoch": 0.156676990344325,
      "grad_norm": 0.625,
      "learning_rate": 9.477648202137999e-05,
      "loss": 3.2469,
      "step": 430
    },
    {
      "epoch": 0.16032064128256512,
      "grad_norm": 0.5703125,
      "learning_rate": 9.465500485908649e-05,
      "loss": 3.1614,
      "step": 440
    },
    {
      "epoch": 0.16396429222080525,
      "grad_norm": 0.59765625,
      "learning_rate": 9.453352769679301e-05,
      "loss": 3.2658,
      "step": 450
    },
    {
      "epoch": 0.16760794315904537,
      "grad_norm": 0.6953125,
      "learning_rate": 9.441205053449952e-05,
      "loss": 3.3253,
      "step": 460
    },
    {
      "epoch": 0.1712515940972855,
      "grad_norm": 0.67578125,
      "learning_rate": 9.429057337220603e-05,
      "loss": 3.2311,
      "step": 470
    },
    {
      "epoch": 0.1748952450355256,
      "grad_norm": 0.625,
      "learning_rate": 9.416909620991254e-05,
      "loss": 3.3117,
      "step": 480
    },
    {
      "epoch": 0.1785388959737657,
      "grad_norm": 0.6640625,
      "learning_rate": 9.404761904761905e-05,
      "loss": 3.3513,
      "step": 490
    },
    {
      "epoch": 0.18218254691200583,
      "grad_norm": 0.5703125,
      "learning_rate": 9.392614188532556e-05,
      "loss": 3.3071,
      "step": 500
    },
    {
      "epoch": 0.18582619785024596,
      "grad_norm": 0.5703125,
      "learning_rate": 9.380466472303208e-05,
      "loss": 3.3047,
      "step": 510
    },
    {
      "epoch": 0.18946984878848605,
      "grad_norm": 0.58984375,
      "learning_rate": 9.368318756073858e-05,
      "loss": 3.1964,
      "step": 520
    },
    {
      "epoch": 0.19311349972672617,
      "grad_norm": 0.57421875,
      "learning_rate": 9.35617103984451e-05,
      "loss": 3.2459,
      "step": 530
    },
    {
      "epoch": 0.1967571506649663,
      "grad_norm": 0.62109375,
      "learning_rate": 9.344023323615161e-05,
      "loss": 3.205,
      "step": 540
    },
    {
      "epoch": 0.20040080160320642,
      "grad_norm": 0.66015625,
      "learning_rate": 9.331875607385812e-05,
      "loss": 3.2856,
      "step": 550
    },
    {
      "epoch": 0.20404445254144654,
      "grad_norm": 0.52734375,
      "learning_rate": 9.319727891156463e-05,
      "loss": 3.185,
      "step": 560
    },
    {
      "epoch": 0.20768810347968664,
      "grad_norm": 0.5546875,
      "learning_rate": 9.307580174927114e-05,
      "loss": 3.3071,
      "step": 570
    },
    {
      "epoch": 0.21133175441792676,
      "grad_norm": 0.63671875,
      "learning_rate": 9.295432458697765e-05,
      "loss": 3.2363,
      "step": 580
    },
    {
      "epoch": 0.21497540535616688,
      "grad_norm": 0.5625,
      "learning_rate": 9.283284742468417e-05,
      "loss": 3.2697,
      "step": 590
    },
    {
      "epoch": 0.218619056294407,
      "grad_norm": 0.56640625,
      "learning_rate": 9.271137026239067e-05,
      "loss": 3.3037,
      "step": 600
    },
    {
      "epoch": 0.2222627072326471,
      "grad_norm": 0.53125,
      "learning_rate": 9.258989310009719e-05,
      "loss": 3.2371,
      "step": 610
    },
    {
      "epoch": 0.22590635817088722,
      "grad_norm": 0.61328125,
      "learning_rate": 9.24684159378037e-05,
      "loss": 3.3367,
      "step": 620
    },
    {
      "epoch": 0.22955000910912735,
      "grad_norm": 0.5703125,
      "learning_rate": 9.234693877551021e-05,
      "loss": 3.2109,
      "step": 630
    },
    {
      "epoch": 0.23319366004736747,
      "grad_norm": 0.59375,
      "learning_rate": 9.222546161321672e-05,
      "loss": 3.2374,
      "step": 640
    },
    {
      "epoch": 0.2368373109856076,
      "grad_norm": 0.6875,
      "learning_rate": 9.210398445092323e-05,
      "loss": 3.3066,
      "step": 650
    },
    {
      "epoch": 0.24048096192384769,
      "grad_norm": 0.6484375,
      "learning_rate": 9.198250728862974e-05,
      "loss": 3.2635,
      "step": 660
    },
    {
      "epoch": 0.2441246128620878,
      "grad_norm": 0.60546875,
      "learning_rate": 9.186103012633626e-05,
      "loss": 3.26,
      "step": 670
    },
    {
      "epoch": 0.24776826380032793,
      "grad_norm": 0.65234375,
      "learning_rate": 9.173955296404276e-05,
      "loss": 3.2641,
      "step": 680
    },
    {
      "epoch": 0.25141191473856805,
      "grad_norm": 0.6015625,
      "learning_rate": 9.161807580174927e-05,
      "loss": 3.2907,
      "step": 690
    },
    {
      "epoch": 0.25505556567680815,
      "grad_norm": 0.54296875,
      "learning_rate": 9.149659863945579e-05,
      "loss": 3.2567,
      "step": 700
    },
    {
      "epoch": 0.2586992166150483,
      "grad_norm": 0.62890625,
      "learning_rate": 9.13751214771623e-05,
      "loss": 3.2838,
      "step": 710
    },
    {
      "epoch": 0.2623428675532884,
      "grad_norm": 0.546875,
      "learning_rate": 9.125364431486881e-05,
      "loss": 3.2969,
      "step": 720
    },
    {
      "epoch": 0.2659865184915285,
      "grad_norm": 0.6328125,
      "learning_rate": 9.113216715257532e-05,
      "loss": 3.2212,
      "step": 730
    },
    {
      "epoch": 0.26963016942976864,
      "grad_norm": 0.6328125,
      "learning_rate": 9.101068999028183e-05,
      "loss": 3.212,
      "step": 740
    },
    {
      "epoch": 0.27327382036800874,
      "grad_norm": 0.5859375,
      "learning_rate": 9.088921282798835e-05,
      "loss": 3.3488,
      "step": 750
    },
    {
      "epoch": 0.2769174713062489,
      "grad_norm": 0.546875,
      "learning_rate": 9.076773566569486e-05,
      "loss": 3.2143,
      "step": 760
    },
    {
      "epoch": 0.280561122244489,
      "grad_norm": 0.56640625,
      "learning_rate": 9.064625850340136e-05,
      "loss": 3.2518,
      "step": 770
    },
    {
      "epoch": 0.2842047731827291,
      "grad_norm": 0.578125,
      "learning_rate": 9.052478134110788e-05,
      "loss": 3.2638,
      "step": 780
    },
    {
      "epoch": 0.2878484241209692,
      "grad_norm": 0.58203125,
      "learning_rate": 9.040330417881439e-05,
      "loss": 3.2584,
      "step": 790
    },
    {
      "epoch": 0.2914920750592093,
      "grad_norm": 0.62890625,
      "learning_rate": 9.02818270165209e-05,
      "loss": 3.2841,
      "step": 800
    },
    {
      "epoch": 0.29513572599744947,
      "grad_norm": 0.55078125,
      "learning_rate": 9.01603498542274e-05,
      "loss": 3.261,
      "step": 810
    },
    {
      "epoch": 0.29877937693568957,
      "grad_norm": 0.6171875,
      "learning_rate": 9.003887269193392e-05,
      "loss": 3.2954,
      "step": 820
    },
    {
      "epoch": 0.30242302787392966,
      "grad_norm": 0.54296875,
      "learning_rate": 8.991739552964044e-05,
      "loss": 3.2337,
      "step": 830
    },
    {
      "epoch": 0.3060666788121698,
      "grad_norm": 0.6171875,
      "learning_rate": 8.979591836734695e-05,
      "loss": 3.2881,
      "step": 840
    },
    {
      "epoch": 0.3097103297504099,
      "grad_norm": 0.5546875,
      "learning_rate": 8.967444120505344e-05,
      "loss": 3.3519,
      "step": 850
    },
    {
      "epoch": 0.31335398068865,
      "grad_norm": 0.5859375,
      "learning_rate": 8.955296404275997e-05,
      "loss": 3.3147,
      "step": 860
    },
    {
      "epoch": 0.31699763162689015,
      "grad_norm": 0.62890625,
      "learning_rate": 8.943148688046648e-05,
      "loss": 3.2304,
      "step": 870
    },
    {
      "epoch": 0.32064128256513025,
      "grad_norm": 0.60546875,
      "learning_rate": 8.931000971817299e-05,
      "loss": 3.2526,
      "step": 880
    },
    {
      "epoch": 0.3242849335033704,
      "grad_norm": 0.6640625,
      "learning_rate": 8.91885325558795e-05,
      "loss": 3.309,
      "step": 890
    },
    {
      "epoch": 0.3279285844416105,
      "grad_norm": 0.6484375,
      "learning_rate": 8.9067055393586e-05,
      "loss": 3.2513,
      "step": 900
    },
    {
      "epoch": 0.3315722353798506,
      "grad_norm": 0.5703125,
      "learning_rate": 8.894557823129253e-05,
      "loss": 3.2135,
      "step": 910
    },
    {
      "epoch": 0.33521588631809074,
      "grad_norm": 0.64453125,
      "learning_rate": 8.882410106899904e-05,
      "loss": 3.3048,
      "step": 920
    },
    {
      "epoch": 0.33885953725633083,
      "grad_norm": 0.6015625,
      "learning_rate": 8.870262390670553e-05,
      "loss": 3.3047,
      "step": 930
    },
    {
      "epoch": 0.342503188194571,
      "grad_norm": 0.6015625,
      "learning_rate": 8.858114674441206e-05,
      "loss": 3.2616,
      "step": 940
    },
    {
      "epoch": 0.3461468391328111,
      "grad_norm": 0.5859375,
      "learning_rate": 8.845966958211857e-05,
      "loss": 3.2697,
      "step": 950
    },
    {
      "epoch": 0.3497904900710512,
      "grad_norm": 0.72265625,
      "learning_rate": 8.833819241982508e-05,
      "loss": 3.2395,
      "step": 960
    },
    {
      "epoch": 0.3534341410092913,
      "grad_norm": 0.61328125,
      "learning_rate": 8.821671525753159e-05,
      "loss": 3.2137,
      "step": 970
    },
    {
      "epoch": 0.3570777919475314,
      "grad_norm": 0.625,
      "learning_rate": 8.80952380952381e-05,
      "loss": 3.2872,
      "step": 980
    },
    {
      "epoch": 0.36072144288577157,
      "grad_norm": 0.5859375,
      "learning_rate": 8.797376093294462e-05,
      "loss": 3.2682,
      "step": 990
    },
    {
      "epoch": 0.36436509382401167,
      "grad_norm": 0.5390625,
      "learning_rate": 8.785228377065113e-05,
      "loss": 3.204,
      "step": 1000
    },
    {
      "epoch": 0.36800874476225176,
      "grad_norm": 0.71875,
      "learning_rate": 8.773080660835762e-05,
      "loss": 3.2472,
      "step": 1010
    },
    {
      "epoch": 0.3716523957004919,
      "grad_norm": 0.609375,
      "learning_rate": 8.760932944606415e-05,
      "loss": 3.2638,
      "step": 1020
    },
    {
      "epoch": 0.375296046638732,
      "grad_norm": 0.60546875,
      "learning_rate": 8.748785228377066e-05,
      "loss": 3.2803,
      "step": 1030
    },
    {
      "epoch": 0.3789396975769721,
      "grad_norm": 0.66796875,
      "learning_rate": 8.736637512147716e-05,
      "loss": 3.273,
      "step": 1040
    },
    {
      "epoch": 0.38258334851521225,
      "grad_norm": 0.65625,
      "learning_rate": 8.724489795918367e-05,
      "loss": 3.2854,
      "step": 1050
    },
    {
      "epoch": 0.38622699945345235,
      "grad_norm": 0.640625,
      "learning_rate": 8.712342079689018e-05,
      "loss": 3.2373,
      "step": 1060
    },
    {
      "epoch": 0.3898706503916925,
      "grad_norm": 0.55859375,
      "learning_rate": 8.700194363459671e-05,
      "loss": 3.2259,
      "step": 1070
    },
    {
      "epoch": 0.3935143013299326,
      "grad_norm": 0.5078125,
      "learning_rate": 8.688046647230322e-05,
      "loss": 3.2402,
      "step": 1080
    },
    {
      "epoch": 0.3971579522681727,
      "grad_norm": 0.61328125,
      "learning_rate": 8.675898931000973e-05,
      "loss": 3.2379,
      "step": 1090
    },
    {
      "epoch": 0.40080160320641284,
      "grad_norm": 0.59375,
      "learning_rate": 8.663751214771624e-05,
      "loss": 3.2564,
      "step": 1100
    },
    {
      "epoch": 0.40444525414465293,
      "grad_norm": 0.69921875,
      "learning_rate": 8.651603498542274e-05,
      "loss": 3.2342,
      "step": 1110
    },
    {
      "epoch": 0.4080889050828931,
      "grad_norm": 0.53125,
      "learning_rate": 8.639455782312925e-05,
      "loss": 3.3336,
      "step": 1120
    },
    {
      "epoch": 0.4117325560211332,
      "grad_norm": 0.63671875,
      "learning_rate": 8.627308066083576e-05,
      "loss": 3.2684,
      "step": 1130
    },
    {
      "epoch": 0.4153762069593733,
      "grad_norm": 0.61328125,
      "learning_rate": 8.615160349854227e-05,
      "loss": 3.2581,
      "step": 1140
    },
    {
      "epoch": 0.4190198578976134,
      "grad_norm": 0.50390625,
      "learning_rate": 8.603012633624878e-05,
      "loss": 3.3428,
      "step": 1150
    },
    {
      "epoch": 0.4226635088358535,
      "grad_norm": 0.58203125,
      "learning_rate": 8.59086491739553e-05,
      "loss": 3.2331,
      "step": 1160
    },
    {
      "epoch": 0.42630715977409367,
      "grad_norm": 0.63671875,
      "learning_rate": 8.578717201166182e-05,
      "loss": 3.2203,
      "step": 1170
    },
    {
      "epoch": 0.42995081071233376,
      "grad_norm": 0.57421875,
      "learning_rate": 8.566569484936832e-05,
      "loss": 3.248,
      "step": 1180
    },
    {
      "epoch": 0.43359446165057386,
      "grad_norm": 0.6015625,
      "learning_rate": 8.554421768707483e-05,
      "loss": 3.3052,
      "step": 1190
    },
    {
      "epoch": 0.437238112588814,
      "grad_norm": 0.5546875,
      "learning_rate": 8.542274052478134e-05,
      "loss": 3.2036,
      "step": 1200
    },
    {
      "epoch": 0.4408817635270541,
      "grad_norm": 0.64453125,
      "learning_rate": 8.530126336248787e-05,
      "loss": 3.2199,
      "step": 1210
    },
    {
      "epoch": 0.4445254144652942,
      "grad_norm": 0.68359375,
      "learning_rate": 8.517978620019436e-05,
      "loss": 3.2594,
      "step": 1220
    },
    {
      "epoch": 0.44816906540353435,
      "grad_norm": 0.6953125,
      "learning_rate": 8.505830903790087e-05,
      "loss": 3.26,
      "step": 1230
    },
    {
      "epoch": 0.45181271634177445,
      "grad_norm": 0.66015625,
      "learning_rate": 8.49368318756074e-05,
      "loss": 3.3623,
      "step": 1240
    },
    {
      "epoch": 0.4554563672800146,
      "grad_norm": 0.7421875,
      "learning_rate": 8.48153547133139e-05,
      "loss": 3.2625,
      "step": 1250
    },
    {
      "epoch": 0.4591000182182547,
      "grad_norm": 0.6875,
      "learning_rate": 8.469387755102041e-05,
      "loss": 3.2738,
      "step": 1260
    },
    {
      "epoch": 0.4627436691564948,
      "grad_norm": 0.61328125,
      "learning_rate": 8.457240038872692e-05,
      "loss": 3.2688,
      "step": 1270
    },
    {
      "epoch": 0.46638732009473494,
      "grad_norm": 0.609375,
      "learning_rate": 8.445092322643343e-05,
      "loss": 3.2392,
      "step": 1280
    },
    {
      "epoch": 0.47003097103297503,
      "grad_norm": 0.56640625,
      "learning_rate": 8.432944606413996e-05,
      "loss": 3.2414,
      "step": 1290
    },
    {
      "epoch": 0.4736746219712152,
      "grad_norm": 0.640625,
      "learning_rate": 8.420796890184645e-05,
      "loss": 3.2461,
      "step": 1300
    },
    {
      "epoch": 0.4773182729094553,
      "grad_norm": 0.578125,
      "learning_rate": 8.408649173955296e-05,
      "loss": 3.3459,
      "step": 1310
    },
    {
      "epoch": 0.48096192384769537,
      "grad_norm": 0.6953125,
      "learning_rate": 8.396501457725948e-05,
      "loss": 3.2631,
      "step": 1320
    },
    {
      "epoch": 0.4846055747859355,
      "grad_norm": 0.59765625,
      "learning_rate": 8.3843537414966e-05,
      "loss": 3.2883,
      "step": 1330
    },
    {
      "epoch": 0.4882492257241756,
      "grad_norm": 0.625,
      "learning_rate": 8.372206025267249e-05,
      "loss": 3.2085,
      "step": 1340
    },
    {
      "epoch": 0.49189287666241577,
      "grad_norm": 0.6640625,
      "learning_rate": 8.360058309037901e-05,
      "loss": 3.3132,
      "step": 1350
    },
    {
      "epoch": 0.49553652760065586,
      "grad_norm": 0.61328125,
      "learning_rate": 8.347910592808552e-05,
      "loss": 3.3076,
      "step": 1360
    },
    {
      "epoch": 0.49918017853889596,
      "grad_norm": 0.7265625,
      "learning_rate": 8.335762876579204e-05,
      "loss": 3.3183,
      "step": 1370
    },
    {
      "epoch": 0.5028238294771361,
      "grad_norm": 0.55859375,
      "learning_rate": 8.323615160349854e-05,
      "loss": 3.1761,
      "step": 1380
    },
    {
      "epoch": 0.5064674804153763,
      "grad_norm": 0.60546875,
      "learning_rate": 8.311467444120505e-05,
      "loss": 3.2079,
      "step": 1390
    },
    {
      "epoch": 0.5101111313536163,
      "grad_norm": 0.703125,
      "learning_rate": 8.299319727891157e-05,
      "loss": 3.2844,
      "step": 1400
    },
    {
      "epoch": 0.5137547822918564,
      "grad_norm": 0.578125,
      "learning_rate": 8.287172011661808e-05,
      "loss": 3.2492,
      "step": 1410
    },
    {
      "epoch": 0.5173984332300966,
      "grad_norm": 0.6328125,
      "learning_rate": 8.275024295432459e-05,
      "loss": 3.2525,
      "step": 1420
    },
    {
      "epoch": 0.5210420841683366,
      "grad_norm": 0.5703125,
      "learning_rate": 8.26287657920311e-05,
      "loss": 3.2449,
      "step": 1430
    },
    {
      "epoch": 0.5246857351065768,
      "grad_norm": 0.54296875,
      "learning_rate": 8.250728862973761e-05,
      "loss": 3.2279,
      "step": 1440
    },
    {
      "epoch": 0.5283293860448169,
      "grad_norm": 0.5859375,
      "learning_rate": 8.238581146744413e-05,
      "loss": 3.2751,
      "step": 1450
    },
    {
      "epoch": 0.531973036983057,
      "grad_norm": 0.57421875,
      "learning_rate": 8.226433430515063e-05,
      "loss": 3.2404,
      "step": 1460
    },
    {
      "epoch": 0.5356166879212971,
      "grad_norm": 0.67578125,
      "learning_rate": 8.214285714285714e-05,
      "loss": 3.2911,
      "step": 1470
    },
    {
      "epoch": 0.5392603388595373,
      "grad_norm": 0.6796875,
      "learning_rate": 8.202137998056366e-05,
      "loss": 3.2637,
      "step": 1480
    },
    {
      "epoch": 0.5429039897977773,
      "grad_norm": 0.61328125,
      "learning_rate": 8.189990281827017e-05,
      "loss": 3.2004,
      "step": 1490
    },
    {
      "epoch": 0.5465476407360175,
      "grad_norm": 0.6875,
      "learning_rate": 8.177842565597668e-05,
      "loss": 3.2958,
      "step": 1500
    },
    {
      "epoch": 0.5501912916742576,
      "grad_norm": 0.609375,
      "learning_rate": 8.165694849368319e-05,
      "loss": 3.2371,
      "step": 1510
    },
    {
      "epoch": 0.5538349426124978,
      "grad_norm": 0.6171875,
      "learning_rate": 8.15354713313897e-05,
      "loss": 3.2798,
      "step": 1520
    },
    {
      "epoch": 0.5574785935507378,
      "grad_norm": 0.6953125,
      "learning_rate": 8.141399416909622e-05,
      "loss": 3.2608,
      "step": 1530
    },
    {
      "epoch": 0.561122244488978,
      "grad_norm": 0.62109375,
      "learning_rate": 8.129251700680273e-05,
      "loss": 3.2374,
      "step": 1540
    },
    {
      "epoch": 0.5647658954272181,
      "grad_norm": 0.625,
      "learning_rate": 8.117103984450923e-05,
      "loss": 3.189,
      "step": 1550
    },
    {
      "epoch": 0.5684095463654582,
      "grad_norm": 0.57421875,
      "learning_rate": 8.104956268221575e-05,
      "loss": 3.2008,
      "step": 1560
    },
    {
      "epoch": 0.5720531973036983,
      "grad_norm": 0.58984375,
      "learning_rate": 8.092808551992226e-05,
      "loss": 3.219,
      "step": 1570
    },
    {
      "epoch": 0.5756968482419385,
      "grad_norm": 0.58203125,
      "learning_rate": 8.080660835762877e-05,
      "loss": 3.2417,
      "step": 1580
    },
    {
      "epoch": 0.5793404991801785,
      "grad_norm": 0.63671875,
      "learning_rate": 8.068513119533528e-05,
      "loss": 3.236,
      "step": 1590
    },
    {
      "epoch": 0.5829841501184186,
      "grad_norm": 0.703125,
      "learning_rate": 8.056365403304179e-05,
      "loss": 3.3037,
      "step": 1600
    },
    {
      "epoch": 0.5866278010566588,
      "grad_norm": 0.703125,
      "learning_rate": 8.04421768707483e-05,
      "loss": 3.2412,
      "step": 1610
    },
    {
      "epoch": 0.5902714519948989,
      "grad_norm": 0.66796875,
      "learning_rate": 8.032069970845482e-05,
      "loss": 3.2293,
      "step": 1620
    },
    {
      "epoch": 0.593915102933139,
      "grad_norm": 0.6640625,
      "learning_rate": 8.019922254616132e-05,
      "loss": 3.2208,
      "step": 1630
    },
    {
      "epoch": 0.5975587538713791,
      "grad_norm": 0.671875,
      "learning_rate": 8.007774538386784e-05,
      "loss": 3.2251,
      "step": 1640
    },
    {
      "epoch": 0.6012024048096193,
      "grad_norm": 0.63671875,
      "learning_rate": 7.995626822157435e-05,
      "loss": 3.284,
      "step": 1650
    },
    {
      "epoch": 0.6048460557478593,
      "grad_norm": 0.6484375,
      "learning_rate": 7.983479105928086e-05,
      "loss": 3.2404,
      "step": 1660
    },
    {
      "epoch": 0.6084897066860995,
      "grad_norm": 0.69140625,
      "learning_rate": 7.971331389698737e-05,
      "loss": 3.3335,
      "step": 1670
    },
    {
      "epoch": 0.6121333576243396,
      "grad_norm": 0.59765625,
      "learning_rate": 7.959183673469388e-05,
      "loss": 3.276,
      "step": 1680
    },
    {
      "epoch": 0.6157770085625797,
      "grad_norm": 0.63671875,
      "learning_rate": 7.947035957240039e-05,
      "loss": 3.2263,
      "step": 1690
    },
    {
      "epoch": 0.6194206595008198,
      "grad_norm": 0.546875,
      "learning_rate": 7.934888241010691e-05,
      "loss": 3.1878,
      "step": 1700
    },
    {
      "epoch": 0.62306431043906,
      "grad_norm": 0.625,
      "learning_rate": 7.922740524781341e-05,
      "loss": 3.294,
      "step": 1710
    },
    {
      "epoch": 0.6267079613773,
      "grad_norm": 0.578125,
      "learning_rate": 7.910592808551993e-05,
      "loss": 3.2183,
      "step": 1720
    },
    {
      "epoch": 0.6303516123155402,
      "grad_norm": 0.69140625,
      "learning_rate": 7.898445092322644e-05,
      "loss": 3.1985,
      "step": 1730
    },
    {
      "epoch": 0.6339952632537803,
      "grad_norm": 0.74609375,
      "learning_rate": 7.886297376093295e-05,
      "loss": 3.1563,
      "step": 1740
    },
    {
      "epoch": 0.6376389141920205,
      "grad_norm": 0.6484375,
      "learning_rate": 7.874149659863946e-05,
      "loss": 3.2806,
      "step": 1750
    },
    {
      "epoch": 0.6412825651302605,
      "grad_norm": 0.6328125,
      "learning_rate": 7.862001943634597e-05,
      "loss": 3.2288,
      "step": 1760
    },
    {
      "epoch": 0.6449262160685006,
      "grad_norm": 0.5859375,
      "learning_rate": 7.849854227405248e-05,
      "loss": 3.2785,
      "step": 1770
    },
    {
      "epoch": 0.6485698670067408,
      "grad_norm": 0.6875,
      "learning_rate": 7.8377065111759e-05,
      "loss": 3.2952,
      "step": 1780
    },
    {
      "epoch": 0.6522135179449808,
      "grad_norm": 0.6796875,
      "learning_rate": 7.82555879494655e-05,
      "loss": 3.1665,
      "step": 1790
    },
    {
      "epoch": 0.655857168883221,
      "grad_norm": 0.6796875,
      "learning_rate": 7.8134110787172e-05,
      "loss": 3.1984,
      "step": 1800
    },
    {
      "epoch": 0.6595008198214611,
      "grad_norm": 0.625,
      "learning_rate": 7.801263362487853e-05,
      "loss": 3.2051,
      "step": 1810
    },
    {
      "epoch": 0.6631444707597012,
      "grad_norm": 0.6640625,
      "learning_rate": 7.789115646258504e-05,
      "loss": 3.2141,
      "step": 1820
    },
    {
      "epoch": 0.6667881216979413,
      "grad_norm": 0.59375,
      "learning_rate": 7.776967930029155e-05,
      "loss": 3.312,
      "step": 1830
    },
    {
      "epoch": 0.6704317726361815,
      "grad_norm": 0.65234375,
      "learning_rate": 7.764820213799806e-05,
      "loss": 3.2473,
      "step": 1840
    },
    {
      "epoch": 0.6740754235744215,
      "grad_norm": 0.61328125,
      "learning_rate": 7.752672497570457e-05,
      "loss": 3.2924,
      "step": 1850
    },
    {
      "epoch": 0.6777190745126617,
      "grad_norm": 0.71484375,
      "learning_rate": 7.740524781341109e-05,
      "loss": 3.2799,
      "step": 1860
    },
    {
      "epoch": 0.6813627254509018,
      "grad_norm": 0.55078125,
      "learning_rate": 7.72837706511176e-05,
      "loss": 3.2251,
      "step": 1870
    },
    {
      "epoch": 0.685006376389142,
      "grad_norm": 0.70703125,
      "learning_rate": 7.71622934888241e-05,
      "loss": 3.209,
      "step": 1880
    },
    {
      "epoch": 0.688650027327382,
      "grad_norm": 0.63671875,
      "learning_rate": 7.704081632653062e-05,
      "loss": 3.2312,
      "step": 1890
    },
    {
      "epoch": 0.6922936782656222,
      "grad_norm": 0.6328125,
      "learning_rate": 7.691933916423713e-05,
      "loss": 3.2487,
      "step": 1900
    },
    {
      "epoch": 0.6959373292038623,
      "grad_norm": 0.5703125,
      "learning_rate": 7.679786200194364e-05,
      "loss": 3.3157,
      "step": 1910
    },
    {
      "epoch": 0.6995809801421023,
      "grad_norm": 0.63671875,
      "learning_rate": 7.667638483965015e-05,
      "loss": 3.299,
      "step": 1920
    },
    {
      "epoch": 0.7032246310803425,
      "grad_norm": 0.69140625,
      "learning_rate": 7.655490767735666e-05,
      "loss": 3.2755,
      "step": 1930
    },
    {
      "epoch": 0.7068682820185826,
      "grad_norm": 0.625,
      "learning_rate": 7.643343051506318e-05,
      "loss": 3.317,
      "step": 1940
    },
    {
      "epoch": 0.7105119329568227,
      "grad_norm": 0.55078125,
      "learning_rate": 7.631195335276969e-05,
      "loss": 3.1871,
      "step": 1950
    },
    {
      "epoch": 0.7141555838950628,
      "grad_norm": 0.74609375,
      "learning_rate": 7.619047619047618e-05,
      "loss": 3.2405,
      "step": 1960
    },
    {
      "epoch": 0.717799234833303,
      "grad_norm": 0.69921875,
      "learning_rate": 7.606899902818271e-05,
      "loss": 3.3068,
      "step": 1970
    },
    {
      "epoch": 0.7214428857715431,
      "grad_norm": 0.578125,
      "learning_rate": 7.594752186588922e-05,
      "loss": 3.335,
      "step": 1980
    },
    {
      "epoch": 0.7250865367097832,
      "grad_norm": 0.6484375,
      "learning_rate": 7.582604470359573e-05,
      "loss": 3.2617,
      "step": 1990
    },
    {
      "epoch": 0.7287301876480233,
      "grad_norm": 0.5234375,
      "learning_rate": 7.570456754130224e-05,
      "loss": 3.2335,
      "step": 2000
    },
    {
      "epoch": 0.7323738385862635,
      "grad_norm": 0.640625,
      "learning_rate": 7.558309037900875e-05,
      "loss": 3.2604,
      "step": 2010
    },
    {
      "epoch": 0.7360174895245035,
      "grad_norm": 0.57421875,
      "learning_rate": 7.546161321671527e-05,
      "loss": 3.2632,
      "step": 2020
    },
    {
      "epoch": 0.7396611404627437,
      "grad_norm": 0.61328125,
      "learning_rate": 7.534013605442178e-05,
      "loss": 3.2184,
      "step": 2030
    },
    {
      "epoch": 0.7433047914009838,
      "grad_norm": 0.6171875,
      "learning_rate": 7.521865889212827e-05,
      "loss": 3.2848,
      "step": 2040
    },
    {
      "epoch": 0.7469484423392239,
      "grad_norm": 0.6484375,
      "learning_rate": 7.50971817298348e-05,
      "loss": 3.2473,
      "step": 2050
    },
    {
      "epoch": 0.750592093277464,
      "grad_norm": 0.6953125,
      "learning_rate": 7.49757045675413e-05,
      "loss": 3.195,
      "step": 2060
    },
    {
      "epoch": 0.7542357442157042,
      "grad_norm": 0.73046875,
      "learning_rate": 7.485422740524782e-05,
      "loss": 3.2248,
      "step": 2070
    },
    {
      "epoch": 0.7578793951539442,
      "grad_norm": 0.5390625,
      "learning_rate": 7.473275024295433e-05,
      "loss": 3.1511,
      "step": 2080
    },
    {
      "epoch": 0.7615230460921844,
      "grad_norm": 0.66796875,
      "learning_rate": 7.461127308066083e-05,
      "loss": 3.2719,
      "step": 2090
    },
    {
      "epoch": 0.7651666970304245,
      "grad_norm": 0.57421875,
      "learning_rate": 7.448979591836736e-05,
      "loss": 3.2339,
      "step": 2100
    },
    {
      "epoch": 0.7688103479686647,
      "grad_norm": 0.61328125,
      "learning_rate": 7.436831875607387e-05,
      "loss": 3.2863,
      "step": 2110
    },
    {
      "epoch": 0.7724539989069047,
      "grad_norm": 0.55859375,
      "learning_rate": 7.424684159378036e-05,
      "loss": 3.2057,
      "step": 2120
    },
    {
      "epoch": 0.7760976498451448,
      "grad_norm": 0.73046875,
      "learning_rate": 7.412536443148689e-05,
      "loss": 3.2397,
      "step": 2130
    },
    {
      "epoch": 0.779741300783385,
      "grad_norm": 0.59375,
      "learning_rate": 7.40038872691934e-05,
      "loss": 3.2323,
      "step": 2140
    },
    {
      "epoch": 0.783384951721625,
      "grad_norm": 0.63671875,
      "learning_rate": 7.38824101068999e-05,
      "loss": 3.2764,
      "step": 2150
    },
    {
      "epoch": 0.7870286026598652,
      "grad_norm": 0.60546875,
      "learning_rate": 7.376093294460641e-05,
      "loss": 3.2668,
      "step": 2160
    },
    {
      "epoch": 0.7906722535981053,
      "grad_norm": 0.63671875,
      "learning_rate": 7.363945578231292e-05,
      "loss": 3.2953,
      "step": 2170
    },
    {
      "epoch": 0.7943159045363454,
      "grad_norm": 0.5625,
      "learning_rate": 7.351797862001945e-05,
      "loss": 3.1915,
      "step": 2180
    },
    {
      "epoch": 0.7979595554745855,
      "grad_norm": 0.66015625,
      "learning_rate": 7.339650145772596e-05,
      "loss": 3.2622,
      "step": 2190
    },
    {
      "epoch": 0.8016032064128257,
      "grad_norm": 0.6171875,
      "learning_rate": 7.327502429543247e-05,
      "loss": 3.2522,
      "step": 2200
    },
    {
      "epoch": 0.8052468573510657,
      "grad_norm": 0.64453125,
      "learning_rate": 7.315354713313898e-05,
      "loss": 3.1673,
      "step": 2210
    },
    {
      "epoch": 0.8088905082893059,
      "grad_norm": 0.625,
      "learning_rate": 7.303206997084548e-05,
      "loss": 3.2722,
      "step": 2220
    },
    {
      "epoch": 0.812534159227546,
      "grad_norm": 0.6640625,
      "learning_rate": 7.2910592808552e-05,
      "loss": 3.2377,
      "step": 2230
    },
    {
      "epoch": 0.8161778101657862,
      "grad_norm": 0.6171875,
      "learning_rate": 7.27891156462585e-05,
      "loss": 3.179,
      "step": 2240
    },
    {
      "epoch": 0.8198214611040262,
      "grad_norm": 0.57421875,
      "learning_rate": 7.266763848396501e-05,
      "loss": 3.2588,
      "step": 2250
    },
    {
      "epoch": 0.8234651120422664,
      "grad_norm": 0.578125,
      "learning_rate": 7.254616132167152e-05,
      "loss": 3.2664,
      "step": 2260
    },
    {
      "epoch": 0.8271087629805065,
      "grad_norm": 0.73046875,
      "learning_rate": 7.242468415937805e-05,
      "loss": 3.2515,
      "step": 2270
    },
    {
      "epoch": 0.8307524139187465,
      "grad_norm": 0.6328125,
      "learning_rate": 7.230320699708455e-05,
      "loss": 3.2102,
      "step": 2280
    },
    {
      "epoch": 0.8343960648569867,
      "grad_norm": 0.6484375,
      "learning_rate": 7.218172983479106e-05,
      "loss": 3.246,
      "step": 2290
    },
    {
      "epoch": 0.8380397157952268,
      "grad_norm": 0.58203125,
      "learning_rate": 7.206025267249757e-05,
      "loss": 3.3321,
      "step": 2300
    },
    {
      "epoch": 0.8416833667334669,
      "grad_norm": 0.59765625,
      "learning_rate": 7.193877551020408e-05,
      "loss": 3.0889,
      "step": 2310
    },
    {
      "epoch": 0.845327017671707,
      "grad_norm": 0.66015625,
      "learning_rate": 7.18172983479106e-05,
      "loss": 3.2811,
      "step": 2320
    },
    {
      "epoch": 0.8489706686099472,
      "grad_norm": 0.65234375,
      "learning_rate": 7.16958211856171e-05,
      "loss": 3.1688,
      "step": 2330
    },
    {
      "epoch": 0.8526143195481873,
      "grad_norm": 0.76171875,
      "learning_rate": 7.157434402332361e-05,
      "loss": 3.2495,
      "step": 2340
    },
    {
      "epoch": 0.8562579704864274,
      "grad_norm": 0.6484375,
      "learning_rate": 7.145286686103013e-05,
      "loss": 3.1742,
      "step": 2350
    },
    {
      "epoch": 0.8599016214246675,
      "grad_norm": 0.5859375,
      "learning_rate": 7.133138969873664e-05,
      "loss": 3.2293,
      "step": 2360
    },
    {
      "epoch": 0.8635452723629077,
      "grad_norm": 0.640625,
      "learning_rate": 7.120991253644315e-05,
      "loss": 3.2574,
      "step": 2370
    },
    {
      "epoch": 0.8671889233011477,
      "grad_norm": 0.55078125,
      "learning_rate": 7.108843537414966e-05,
      "loss": 3.2496,
      "step": 2380
    },
    {
      "epoch": 0.8708325742393879,
      "grad_norm": 0.7109375,
      "learning_rate": 7.096695821185617e-05,
      "loss": 3.2527,
      "step": 2390
    },
    {
      "epoch": 0.874476225177628,
      "grad_norm": 0.6640625,
      "learning_rate": 7.08454810495627e-05,
      "loss": 3.1984,
      "step": 2400
    },
    {
      "epoch": 0.8781198761158681,
      "grad_norm": 0.58984375,
      "learning_rate": 7.072400388726919e-05,
      "loss": 3.2517,
      "step": 2410
    },
    {
      "epoch": 0.8817635270541082,
      "grad_norm": 0.6171875,
      "learning_rate": 7.06025267249757e-05,
      "loss": 3.2105,
      "step": 2420
    },
    {
      "epoch": 0.8854071779923484,
      "grad_norm": 0.62890625,
      "learning_rate": 7.048104956268222e-05,
      "loss": 3.2125,
      "step": 2430
    },
    {
      "epoch": 0.8890508289305884,
      "grad_norm": 0.72265625,
      "learning_rate": 7.035957240038873e-05,
      "loss": 3.255,
      "step": 2440
    },
    {
      "epoch": 0.8926944798688285,
      "grad_norm": 0.671875,
      "learning_rate": 7.023809523809524e-05,
      "loss": 3.3331,
      "step": 2450
    },
    {
      "epoch": 0.8963381308070687,
      "grad_norm": 0.65234375,
      "learning_rate": 7.011661807580175e-05,
      "loss": 3.3545,
      "step": 2460
    },
    {
      "epoch": 0.8999817817453089,
      "grad_norm": 0.62890625,
      "learning_rate": 6.999514091350826e-05,
      "loss": 3.2776,
      "step": 2470
    },
    {
      "epoch": 0.9036254326835489,
      "grad_norm": 0.76953125,
      "learning_rate": 6.987366375121478e-05,
      "loss": 3.2331,
      "step": 2480
    },
    {
      "epoch": 0.907269083621789,
      "grad_norm": 0.78515625,
      "learning_rate": 6.975218658892128e-05,
      "loss": 3.2803,
      "step": 2490
    },
    {
      "epoch": 0.9109127345600292,
      "grad_norm": 0.671875,
      "learning_rate": 6.963070942662779e-05,
      "loss": 3.256,
      "step": 2500
    },
    {
      "epoch": 0.9145563854982692,
      "grad_norm": 0.59765625,
      "learning_rate": 6.950923226433431e-05,
      "loss": 3.2896,
      "step": 2510
    },
    {
      "epoch": 0.9182000364365094,
      "grad_norm": 0.62890625,
      "learning_rate": 6.938775510204082e-05,
      "loss": 3.2555,
      "step": 2520
    },
    {
      "epoch": 0.9218436873747495,
      "grad_norm": 0.7421875,
      "learning_rate": 6.926627793974733e-05,
      "loss": 3.2682,
      "step": 2530
    },
    {
      "epoch": 0.9254873383129896,
      "grad_norm": 0.671875,
      "learning_rate": 6.914480077745384e-05,
      "loss": 3.1564,
      "step": 2540
    },
    {
      "epoch": 0.9291309892512297,
      "grad_norm": 0.6484375,
      "learning_rate": 6.902332361516035e-05,
      "loss": 3.1445,
      "step": 2550
    },
    {
      "epoch": 0.9327746401894699,
      "grad_norm": 0.51953125,
      "learning_rate": 6.890184645286687e-05,
      "loss": 3.2515,
      "step": 2560
    },
    {
      "epoch": 0.9364182911277099,
      "grad_norm": 0.65625,
      "learning_rate": 6.878036929057337e-05,
      "loss": 3.1962,
      "step": 2570
    },
    {
      "epoch": 0.9400619420659501,
      "grad_norm": 0.59375,
      "learning_rate": 6.865889212827988e-05,
      "loss": 3.3199,
      "step": 2580
    },
    {
      "epoch": 0.9437055930041902,
      "grad_norm": 0.65234375,
      "learning_rate": 6.85374149659864e-05,
      "loss": 3.264,
      "step": 2590
    },
    {
      "epoch": 0.9473492439424304,
      "grad_norm": 0.63671875,
      "learning_rate": 6.841593780369291e-05,
      "loss": 3.1853,
      "step": 2600
    },
    {
      "epoch": 0.9509928948806704,
      "grad_norm": 0.72265625,
      "learning_rate": 6.829446064139942e-05,
      "loss": 3.3017,
      "step": 2610
    },
    {
      "epoch": 0.9546365458189106,
      "grad_norm": 0.6953125,
      "learning_rate": 6.817298347910593e-05,
      "loss": 3.2358,
      "step": 2620
    },
    {
      "epoch": 0.9582801967571507,
      "grad_norm": 0.6328125,
      "learning_rate": 6.805150631681244e-05,
      "loss": 3.2854,
      "step": 2630
    },
    {
      "epoch": 0.9619238476953907,
      "grad_norm": 0.5859375,
      "learning_rate": 6.793002915451895e-05,
      "loss": 3.1873,
      "step": 2640
    },
    {
      "epoch": 0.9655674986336309,
      "grad_norm": 0.59375,
      "learning_rate": 6.780855199222547e-05,
      "loss": 3.2274,
      "step": 2650
    },
    {
      "epoch": 0.969211149571871,
      "grad_norm": 0.63671875,
      "learning_rate": 6.768707482993197e-05,
      "loss": 3.2037,
      "step": 2660
    },
    {
      "epoch": 0.9728548005101111,
      "grad_norm": 0.5703125,
      "learning_rate": 6.756559766763849e-05,
      "loss": 3.3132,
      "step": 2670
    },
    {
      "epoch": 0.9764984514483512,
      "grad_norm": 0.72265625,
      "learning_rate": 6.7444120505345e-05,
      "loss": 3.2734,
      "step": 2680
    },
    {
      "epoch": 0.9801421023865914,
      "grad_norm": 0.70703125,
      "learning_rate": 6.732264334305151e-05,
      "loss": 3.1784,
      "step": 2690
    },
    {
      "epoch": 0.9837857533248315,
      "grad_norm": 0.57421875,
      "learning_rate": 6.720116618075802e-05,
      "loss": 3.2181,
      "step": 2700
    },
    {
      "epoch": 0.9874294042630716,
      "grad_norm": 0.6953125,
      "learning_rate": 6.707968901846453e-05,
      "loss": 3.2676,
      "step": 2710
    },
    {
      "epoch": 0.9910730552013117,
      "grad_norm": 0.6875,
      "learning_rate": 6.695821185617104e-05,
      "loss": 3.1952,
      "step": 2720
    },
    {
      "epoch": 0.9947167061395519,
      "grad_norm": 0.609375,
      "learning_rate": 6.683673469387756e-05,
      "loss": 3.3135,
      "step": 2730
    },
    {
      "epoch": 0.9983603570777919,
      "grad_norm": 0.6484375,
      "learning_rate": 6.671525753158406e-05,
      "loss": 3.2643,
      "step": 2740
    },
    {
      "epoch": 1.002004008016032,
      "grad_norm": 0.6015625,
      "learning_rate": 6.659378036929058e-05,
      "loss": 3.1996,
      "step": 2750
    },
    {
      "epoch": 1.0056476589542722,
      "grad_norm": 0.75,
      "learning_rate": 6.647230320699709e-05,
      "loss": 3.0862,
      "step": 2760
    },
    {
      "epoch": 1.0092913098925123,
      "grad_norm": 0.671875,
      "learning_rate": 6.63508260447036e-05,
      "loss": 3.1886,
      "step": 2770
    },
    {
      "epoch": 1.0129349608307525,
      "grad_norm": 0.65625,
      "learning_rate": 6.622934888241011e-05,
      "loss": 3.1478,
      "step": 2780
    },
    {
      "epoch": 1.0165786117689926,
      "grad_norm": 0.69921875,
      "learning_rate": 6.610787172011662e-05,
      "loss": 3.1577,
      "step": 2790
    },
    {
      "epoch": 1.0202222627072326,
      "grad_norm": 0.77734375,
      "learning_rate": 6.598639455782313e-05,
      "loss": 3.148,
      "step": 2800
    },
    {
      "epoch": 1.0238659136454729,
      "grad_norm": 0.640625,
      "learning_rate": 6.586491739552965e-05,
      "loss": 3.1971,
      "step": 2810
    },
    {
      "epoch": 1.027509564583713,
      "grad_norm": 0.58984375,
      "learning_rate": 6.574344023323615e-05,
      "loss": 3.1351,
      "step": 2820
    },
    {
      "epoch": 1.031153215521953,
      "grad_norm": 0.734375,
      "learning_rate": 6.562196307094267e-05,
      "loss": 3.2304,
      "step": 2830
    },
    {
      "epoch": 1.0347968664601932,
      "grad_norm": 0.71484375,
      "learning_rate": 6.550048590864918e-05,
      "loss": 3.1582,
      "step": 2840
    },
    {
      "epoch": 1.0384405173984332,
      "grad_norm": 0.71875,
      "learning_rate": 6.537900874635569e-05,
      "loss": 3.1183,
      "step": 2850
    },
    {
      "epoch": 1.0420841683366733,
      "grad_norm": 0.8046875,
      "learning_rate": 6.52575315840622e-05,
      "loss": 3.2056,
      "step": 2860
    },
    {
      "epoch": 1.0457278192749135,
      "grad_norm": 0.765625,
      "learning_rate": 6.513605442176871e-05,
      "loss": 3.1694,
      "step": 2870
    },
    {
      "epoch": 1.0493714702131536,
      "grad_norm": 0.890625,
      "learning_rate": 6.501457725947522e-05,
      "loss": 3.1428,
      "step": 2880
    },
    {
      "epoch": 1.0530151211513936,
      "grad_norm": 0.65625,
      "learning_rate": 6.489310009718174e-05,
      "loss": 3.1052,
      "step": 2890
    },
    {
      "epoch": 1.0566587720896339,
      "grad_norm": 0.83203125,
      "learning_rate": 6.477162293488824e-05,
      "loss": 3.1195,
      "step": 2900
    },
    {
      "epoch": 1.060302423027874,
      "grad_norm": 0.7421875,
      "learning_rate": 6.465014577259475e-05,
      "loss": 3.2278,
      "step": 2910
    },
    {
      "epoch": 1.063946073966114,
      "grad_norm": 0.71875,
      "learning_rate": 6.452866861030127e-05,
      "loss": 3.1563,
      "step": 2920
    },
    {
      "epoch": 1.0675897249043542,
      "grad_norm": 0.69140625,
      "learning_rate": 6.440719144800778e-05,
      "loss": 3.1505,
      "step": 2930
    },
    {
      "epoch": 1.0712333758425943,
      "grad_norm": 0.8515625,
      "learning_rate": 6.428571428571429e-05,
      "loss": 3.1681,
      "step": 2940
    },
    {
      "epoch": 1.0748770267808343,
      "grad_norm": 0.71484375,
      "learning_rate": 6.41642371234208e-05,
      "loss": 3.17,
      "step": 2950
    },
    {
      "epoch": 1.0785206777190746,
      "grad_norm": 0.90625,
      "learning_rate": 6.40427599611273e-05,
      "loss": 3.1775,
      "step": 2960
    },
    {
      "epoch": 1.0821643286573146,
      "grad_norm": 0.73828125,
      "learning_rate": 6.392128279883383e-05,
      "loss": 3.0921,
      "step": 2970
    },
    {
      "epoch": 1.0858079795955549,
      "grad_norm": 0.75390625,
      "learning_rate": 6.379980563654034e-05,
      "loss": 3.1666,
      "step": 2980
    },
    {
      "epoch": 1.089451630533795,
      "grad_norm": 0.80859375,
      "learning_rate": 6.367832847424684e-05,
      "loss": 3.1935,
      "step": 2990
    },
    {
      "epoch": 1.093095281472035,
      "grad_norm": 0.67578125,
      "learning_rate": 6.355685131195336e-05,
      "loss": 3.0588,
      "step": 3000
    },
    {
      "epoch": 1.096738932410275,
      "grad_norm": 0.74609375,
      "learning_rate": 6.343537414965987e-05,
      "loss": 3.1867,
      "step": 3010
    },
    {
      "epoch": 1.1003825833485152,
      "grad_norm": 0.8828125,
      "learning_rate": 6.331389698736638e-05,
      "loss": 3.162,
      "step": 3020
    },
    {
      "epoch": 1.1040262342867553,
      "grad_norm": 0.78515625,
      "learning_rate": 6.319241982507289e-05,
      "loss": 3.1737,
      "step": 3030
    },
    {
      "epoch": 1.1076698852249955,
      "grad_norm": 0.76171875,
      "learning_rate": 6.30709426627794e-05,
      "loss": 3.1974,
      "step": 3040
    },
    {
      "epoch": 1.1113135361632356,
      "grad_norm": 0.7734375,
      "learning_rate": 6.294946550048592e-05,
      "loss": 3.1584,
      "step": 3050
    },
    {
      "epoch": 1.1149571871014756,
      "grad_norm": 0.74609375,
      "learning_rate": 6.282798833819243e-05,
      "loss": 3.1856,
      "step": 3060
    },
    {
      "epoch": 1.1186008380397159,
      "grad_norm": 0.7109375,
      "learning_rate": 6.270651117589892e-05,
      "loss": 3.177,
      "step": 3070
    },
    {
      "epoch": 1.122244488977956,
      "grad_norm": 0.85546875,
      "learning_rate": 6.258503401360545e-05,
      "loss": 3.2028,
      "step": 3080
    },
    {
      "epoch": 1.125888139916196,
      "grad_norm": 0.93359375,
      "learning_rate": 6.246355685131196e-05,
      "loss": 3.2031,
      "step": 3090
    },
    {
      "epoch": 1.1295317908544362,
      "grad_norm": 0.82421875,
      "learning_rate": 6.234207968901847e-05,
      "loss": 3.0629,
      "step": 3100
    },
    {
      "epoch": 1.1331754417926763,
      "grad_norm": 0.6875,
      "learning_rate": 6.222060252672498e-05,
      "loss": 3.0927,
      "step": 3110
    },
    {
      "epoch": 1.1368190927309163,
      "grad_norm": 0.765625,
      "learning_rate": 6.209912536443149e-05,
      "loss": 3.2134,
      "step": 3120
    },
    {
      "epoch": 1.1404627436691566,
      "grad_norm": 0.84765625,
      "learning_rate": 6.197764820213801e-05,
      "loss": 3.2027,
      "step": 3130
    },
    {
      "epoch": 1.1441063946073966,
      "grad_norm": 0.70703125,
      "learning_rate": 6.185617103984452e-05,
      "loss": 3.1448,
      "step": 3140
    },
    {
      "epoch": 1.1477500455456366,
      "grad_norm": 0.70703125,
      "learning_rate": 6.173469387755101e-05,
      "loss": 3.1713,
      "step": 3150
    },
    {
      "epoch": 1.151393696483877,
      "grad_norm": 0.77734375,
      "learning_rate": 6.161321671525754e-05,
      "loss": 3.1612,
      "step": 3160
    },
    {
      "epoch": 1.155037347422117,
      "grad_norm": 0.79296875,
      "learning_rate": 6.149173955296405e-05,
      "loss": 3.1934,
      "step": 3170
    },
    {
      "epoch": 1.158680998360357,
      "grad_norm": 0.89453125,
      "learning_rate": 6.137026239067056e-05,
      "loss": 3.1231,
      "step": 3180
    },
    {
      "epoch": 1.1623246492985972,
      "grad_norm": 0.75390625,
      "learning_rate": 6.124878522837707e-05,
      "loss": 3.1606,
      "step": 3190
    },
    {
      "epoch": 1.1659683002368373,
      "grad_norm": 0.75,
      "learning_rate": 6.112730806608357e-05,
      "loss": 3.135,
      "step": 3200
    },
    {
      "epoch": 1.1696119511750775,
      "grad_norm": 0.78125,
      "learning_rate": 6.10058309037901e-05,
      "loss": 3.1592,
      "step": 3210
    },
    {
      "epoch": 1.1732556021133176,
      "grad_norm": 0.84375,
      "learning_rate": 6.08843537414966e-05,
      "loss": 3.2429,
      "step": 3220
    },
    {
      "epoch": 1.1768992530515576,
      "grad_norm": 0.921875,
      "learning_rate": 6.076287657920311e-05,
      "loss": 3.1182,
      "step": 3230
    },
    {
      "epoch": 1.1805429039897977,
      "grad_norm": 0.83203125,
      "learning_rate": 6.0641399416909626e-05,
      "loss": 3.2273,
      "step": 3240
    },
    {
      "epoch": 1.184186554928038,
      "grad_norm": 0.734375,
      "learning_rate": 6.0519922254616135e-05,
      "loss": 3.2101,
      "step": 3250
    },
    {
      "epoch": 1.187830205866278,
      "grad_norm": 0.76953125,
      "learning_rate": 6.0398445092322645e-05,
      "loss": 3.1181,
      "step": 3260
    },
    {
      "epoch": 1.1914738568045182,
      "grad_norm": 0.7265625,
      "learning_rate": 6.027696793002916e-05,
      "loss": 3.1349,
      "step": 3270
    },
    {
      "epoch": 1.1951175077427583,
      "grad_norm": 0.90234375,
      "learning_rate": 6.015549076773567e-05,
      "loss": 3.152,
      "step": 3280
    },
    {
      "epoch": 1.1987611586809983,
      "grad_norm": 0.75390625,
      "learning_rate": 6.003401360544217e-05,
      "loss": 3.1806,
      "step": 3290
    },
    {
      "epoch": 1.2024048096192386,
      "grad_norm": 0.85546875,
      "learning_rate": 5.991253644314869e-05,
      "loss": 3.1708,
      "step": 3300
    },
    {
      "epoch": 1.2060484605574786,
      "grad_norm": 0.78125,
      "learning_rate": 5.97910592808552e-05,
      "loss": 3.114,
      "step": 3310
    },
    {
      "epoch": 1.2096921114957186,
      "grad_norm": 0.90625,
      "learning_rate": 5.9669582118561715e-05,
      "loss": 3.1852,
      "step": 3320
    },
    {
      "epoch": 1.213335762433959,
      "grad_norm": 0.7578125,
      "learning_rate": 5.9548104956268225e-05,
      "loss": 3.2373,
      "step": 3330
    },
    {
      "epoch": 1.216979413372199,
      "grad_norm": 0.8046875,
      "learning_rate": 5.9426627793974734e-05,
      "loss": 3.2133,
      "step": 3340
    },
    {
      "epoch": 1.220623064310439,
      "grad_norm": 0.7890625,
      "learning_rate": 5.930515063168125e-05,
      "loss": 3.2556,
      "step": 3350
    },
    {
      "epoch": 1.2242667152486792,
      "grad_norm": 0.71875,
      "learning_rate": 5.918367346938776e-05,
      "loss": 3.193,
      "step": 3360
    },
    {
      "epoch": 1.2279103661869193,
      "grad_norm": 0.71484375,
      "learning_rate": 5.906219630709426e-05,
      "loss": 3.1619,
      "step": 3370
    },
    {
      "epoch": 1.2315540171251593,
      "grad_norm": 0.94140625,
      "learning_rate": 5.8940719144800785e-05,
      "loss": 3.1265,
      "step": 3380
    },
    {
      "epoch": 1.2351976680633996,
      "grad_norm": 0.80859375,
      "learning_rate": 5.881924198250729e-05,
      "loss": 3.2705,
      "step": 3390
    },
    {
      "epoch": 1.2388413190016396,
      "grad_norm": 0.77734375,
      "learning_rate": 5.8697764820213804e-05,
      "loss": 3.1545,
      "step": 3400
    },
    {
      "epoch": 1.2424849699398797,
      "grad_norm": 1.015625,
      "learning_rate": 5.8576287657920314e-05,
      "loss": 3.1632,
      "step": 3410
    },
    {
      "epoch": 1.24612862087812,
      "grad_norm": 0.75390625,
      "learning_rate": 5.845481049562682e-05,
      "loss": 3.1776,
      "step": 3420
    },
    {
      "epoch": 1.24977227181636,
      "grad_norm": 0.90625,
      "learning_rate": 5.833333333333334e-05,
      "loss": 3.1733,
      "step": 3430
    },
    {
      "epoch": 1.2534159227546002,
      "grad_norm": 0.890625,
      "learning_rate": 5.821185617103985e-05,
      "loss": 3.0226,
      "step": 3440
    },
    {
      "epoch": 1.2570595736928403,
      "grad_norm": 0.8046875,
      "learning_rate": 5.809037900874635e-05,
      "loss": 3.156,
      "step": 3450
    },
    {
      "epoch": 1.2607032246310803,
      "grad_norm": 0.85546875,
      "learning_rate": 5.7968901846452875e-05,
      "loss": 3.0929,
      "step": 3460
    },
    {
      "epoch": 1.2643468755693203,
      "grad_norm": 0.70703125,
      "learning_rate": 5.784742468415938e-05,
      "loss": 3.1027,
      "step": 3470
    },
    {
      "epoch": 1.2679905265075606,
      "grad_norm": 0.76171875,
      "learning_rate": 5.77259475218659e-05,
      "loss": 3.2188,
      "step": 3480
    },
    {
      "epoch": 1.2716341774458007,
      "grad_norm": 0.8671875,
      "learning_rate": 5.76044703595724e-05,
      "loss": 3.0835,
      "step": 3490
    },
    {
      "epoch": 1.275277828384041,
      "grad_norm": 0.82421875,
      "learning_rate": 5.748299319727891e-05,
      "loss": 3.0709,
      "step": 3500
    },
    {
      "epoch": 1.278921479322281,
      "grad_norm": 0.79296875,
      "learning_rate": 5.736151603498543e-05,
      "loss": 3.1397,
      "step": 3510
    },
    {
      "epoch": 1.282565130260521,
      "grad_norm": 0.83203125,
      "learning_rate": 5.724003887269194e-05,
      "loss": 3.1717,
      "step": 3520
    },
    {
      "epoch": 1.286208781198761,
      "grad_norm": 0.875,
      "learning_rate": 5.711856171039844e-05,
      "loss": 3.1881,
      "step": 3530
    },
    {
      "epoch": 1.2898524321370013,
      "grad_norm": 0.859375,
      "learning_rate": 5.6997084548104964e-05,
      "loss": 3.1279,
      "step": 3540
    },
    {
      "epoch": 1.2934960830752413,
      "grad_norm": 0.82421875,
      "learning_rate": 5.6875607385811467e-05,
      "loss": 3.1212,
      "step": 3550
    },
    {
      "epoch": 1.2971397340134816,
      "grad_norm": 0.95703125,
      "learning_rate": 5.6754130223517976e-05,
      "loss": 3.1591,
      "step": 3560
    },
    {
      "epoch": 1.3007833849517216,
      "grad_norm": 0.8125,
      "learning_rate": 5.663265306122449e-05,
      "loss": 3.1113,
      "step": 3570
    },
    {
      "epoch": 1.3044270358899617,
      "grad_norm": 0.90234375,
      "learning_rate": 5.6511175898931e-05,
      "loss": 3.2222,
      "step": 3580
    },
    {
      "epoch": 1.308070686828202,
      "grad_norm": 0.734375,
      "learning_rate": 5.638969873663752e-05,
      "loss": 3.1596,
      "step": 3590
    },
    {
      "epoch": 1.311714337766442,
      "grad_norm": 0.76171875,
      "learning_rate": 5.626822157434403e-05,
      "loss": 3.1784,
      "step": 3600
    },
    {
      "epoch": 1.315357988704682,
      "grad_norm": 0.7734375,
      "learning_rate": 5.614674441205054e-05,
      "loss": 3.1464,
      "step": 3610
    },
    {
      "epoch": 1.3190016396429223,
      "grad_norm": 0.7265625,
      "learning_rate": 5.602526724975705e-05,
      "loss": 3.1616,
      "step": 3620
    },
    {
      "epoch": 1.3226452905811623,
      "grad_norm": 0.81640625,
      "learning_rate": 5.5903790087463556e-05,
      "loss": 3.1747,
      "step": 3630
    },
    {
      "epoch": 1.3262889415194024,
      "grad_norm": 0.88671875,
      "learning_rate": 5.5782312925170065e-05,
      "loss": 3.137,
      "step": 3640
    },
    {
      "epoch": 1.3299325924576426,
      "grad_norm": 0.75390625,
      "learning_rate": 5.566083576287658e-05,
      "loss": 3.1302,
      "step": 3650
    },
    {
      "epoch": 1.3335762433958827,
      "grad_norm": 0.79296875,
      "learning_rate": 5.553935860058309e-05,
      "loss": 3.2009,
      "step": 3660
    },
    {
      "epoch": 1.337219894334123,
      "grad_norm": 0.8203125,
      "learning_rate": 5.541788143828961e-05,
      "loss": 3.1738,
      "step": 3670
    },
    {
      "epoch": 1.340863545272363,
      "grad_norm": 0.83203125,
      "learning_rate": 5.529640427599612e-05,
      "loss": 3.0996,
      "step": 3680
    },
    {
      "epoch": 1.344507196210603,
      "grad_norm": 1.1796875,
      "learning_rate": 5.5174927113702626e-05,
      "loss": 3.2209,
      "step": 3690
    },
    {
      "epoch": 1.348150847148843,
      "grad_norm": 0.84765625,
      "learning_rate": 5.505344995140914e-05,
      "loss": 3.1315,
      "step": 3700
    },
    {
      "epoch": 1.3517944980870833,
      "grad_norm": 0.78515625,
      "learning_rate": 5.493197278911565e-05,
      "loss": 3.1241,
      "step": 3710
    },
    {
      "epoch": 1.3554381490253233,
      "grad_norm": 0.73046875,
      "learning_rate": 5.4810495626822155e-05,
      "loss": 3.2209,
      "step": 3720
    },
    {
      "epoch": 1.3590817999635636,
      "grad_norm": 0.796875,
      "learning_rate": 5.468901846452867e-05,
      "loss": 3.1234,
      "step": 3730
    },
    {
      "epoch": 1.3627254509018036,
      "grad_norm": 0.78515625,
      "learning_rate": 5.456754130223518e-05,
      "loss": 3.0762,
      "step": 3740
    },
    {
      "epoch": 1.3663691018400437,
      "grad_norm": 0.8828125,
      "learning_rate": 5.444606413994169e-05,
      "loss": 3.1506,
      "step": 3750
    },
    {
      "epoch": 1.3700127527782837,
      "grad_norm": 0.796875,
      "learning_rate": 5.4324586977648206e-05,
      "loss": 3.1047,
      "step": 3760
    },
    {
      "epoch": 1.373656403716524,
      "grad_norm": 0.90234375,
      "learning_rate": 5.4203109815354715e-05,
      "loss": 3.1776,
      "step": 3770
    },
    {
      "epoch": 1.377300054654764,
      "grad_norm": 0.859375,
      "learning_rate": 5.408163265306123e-05,
      "loss": 3.1998,
      "step": 3780
    },
    {
      "epoch": 1.3809437055930043,
      "grad_norm": 0.87890625,
      "learning_rate": 5.396015549076774e-05,
      "loss": 3.3064,
      "step": 3790
    },
    {
      "epoch": 1.3845873565312443,
      "grad_norm": 0.8671875,
      "learning_rate": 5.3838678328474244e-05,
      "loss": 3.1491,
      "step": 3800
    },
    {
      "epoch": 1.3882310074694844,
      "grad_norm": 0.8828125,
      "learning_rate": 5.371720116618077e-05,
      "loss": 3.2158,
      "step": 3810
    },
    {
      "epoch": 1.3918746584077246,
      "grad_norm": 0.84765625,
      "learning_rate": 5.359572400388727e-05,
      "loss": 3.0956,
      "step": 3820
    },
    {
      "epoch": 1.3955183093459647,
      "grad_norm": 0.72265625,
      "learning_rate": 5.347424684159378e-05,
      "loss": 3.159,
      "step": 3830
    },
    {
      "epoch": 1.3991619602842047,
      "grad_norm": 0.9296875,
      "learning_rate": 5.3352769679300295e-05,
      "loss": 3.2044,
      "step": 3840
    },
    {
      "epoch": 1.402805611222445,
      "grad_norm": 0.76953125,
      "learning_rate": 5.3231292517006805e-05,
      "loss": 3.1354,
      "step": 3850
    },
    {
      "epoch": 1.406449262160685,
      "grad_norm": 0.79296875,
      "learning_rate": 5.310981535471332e-05,
      "loss": 3.2342,
      "step": 3860
    },
    {
      "epoch": 1.410092913098925,
      "grad_norm": 0.81640625,
      "learning_rate": 5.298833819241983e-05,
      "loss": 3.1566,
      "step": 3870
    },
    {
      "epoch": 1.4137365640371653,
      "grad_norm": 0.8671875,
      "learning_rate": 5.286686103012633e-05,
      "loss": 3.1535,
      "step": 3880
    },
    {
      "epoch": 1.4173802149754053,
      "grad_norm": 0.8046875,
      "learning_rate": 5.2745383867832856e-05,
      "loss": 3.1968,
      "step": 3890
    },
    {
      "epoch": 1.4210238659136456,
|
"grad_norm": 1.0390625, |
|
"learning_rate": 5.262390670553936e-05, |
|
"loss": 3.2237, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.4246675168518856, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 5.250242954324587e-05, |
|
"loss": 3.154, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.4283111677901257, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 5.2380952380952384e-05, |
|
"loss": 3.2096, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.4319548187283657, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 5.2259475218658894e-05, |
|
"loss": 3.1827, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.435598469666606, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 5.213799805636541e-05, |
|
"loss": 3.1439, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.439242120604846, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 5.201652089407192e-05, |
|
"loss": 3.1562, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.4428857715430863, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 5.189504373177842e-05, |
|
"loss": 3.1539, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.4465294224813263, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 5.1773566569484945e-05, |
|
"loss": 3.1449, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.4501730734195664, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 5.165208940719145e-05, |
|
"loss": 3.1867, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.4538167243578064, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.153061224489796e-05, |
|
"loss": 3.1182, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.4574603752960467, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 5.1409135082604474e-05, |
|
"loss": 3.1719, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.4611040262342867, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 5.128765792031098e-05, |
|
"loss": 3.2043, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.464747677172527, |
|
"grad_norm": 0.875, |
|
"learning_rate": 5.116618075801749e-05, |
|
"loss": 3.1937, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.468391328110767, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 5.104470359572401e-05, |
|
"loss": 3.1864, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.472034979049007, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 5.092322643343052e-05, |
|
"loss": 3.1312, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.4756786299872473, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 5.0801749271137035e-05, |
|
"loss": 3.0836, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.4793222809254873, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 5.068027210884354e-05, |
|
"loss": 3.1631, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.4829659318637274, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 5.055879494655005e-05, |
|
"loss": 3.1043, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.4866095828019676, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 5.043731778425656e-05, |
|
"loss": 3.281, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.4902532337402077, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 5.031584062196307e-05, |
|
"loss": 3.1505, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.4938968846784477, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 5.019436345966958e-05, |
|
"loss": 3.1435, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.497540535616688, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 5.00728862973761e-05, |
|
"loss": 3.1984, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.501184186554928, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.995140913508261e-05, |
|
"loss": 3.1526, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.5048278374931683, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.982993197278912e-05, |
|
"loss": 3.1732, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.5084714884314083, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.970845481049563e-05, |
|
"loss": 3.2319, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.5121151393696484, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 4.958697764820214e-05, |
|
"loss": 3.1467, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.5157587903078884, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 4.946550048590865e-05, |
|
"loss": 3.0744, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.5194024412461287, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.934402332361516e-05, |
|
"loss": 3.1253, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.5230460921843687, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.922254616132168e-05, |
|
"loss": 3.1691, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.526689743122609, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 4.910106899902818e-05, |
|
"loss": 3.1127, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.530333394060849, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.89795918367347e-05, |
|
"loss": 3.199, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.533977044999089, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.8858114674441206e-05, |
|
"loss": 3.1458, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.537620695937329, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.873663751214772e-05, |
|
"loss": 3.144, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.5412643468755693, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.8615160349854225e-05, |
|
"loss": 3.1611, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.5449079978138094, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.849368318756074e-05, |
|
"loss": 3.2377, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.5485516487520496, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.837220602526725e-05, |
|
"loss": 3.0911, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.5521952996902897, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.825072886297377e-05, |
|
"loss": 3.1286, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.5558389506285297, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.812925170068027e-05, |
|
"loss": 3.1614, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.5594826015667698, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.8007774538386786e-05, |
|
"loss": 3.1076, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.56312625250501, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.7886297376093295e-05, |
|
"loss": 3.1806, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.5667699034432503, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 4.776482021379981e-05, |
|
"loss": 3.1957, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.5704135543814903, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.7643343051506314e-05, |
|
"loss": 3.1933, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.5740572053197304, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.752186588921283e-05, |
|
"loss": 3.212, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.5777008562579704, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.740038872691934e-05, |
|
"loss": 3.1293, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.5813445071962104, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.7278911564625856e-05, |
|
"loss": 3.2165, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.5849881581344507, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.715743440233236e-05, |
|
"loss": 3.1911, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.588631809072691, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.7035957240038875e-05, |
|
"loss": 3.1359, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.592275460010931, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.6914480077745385e-05, |
|
"loss": 3.2345, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 1.595919110949171, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 4.6793002915451894e-05, |
|
"loss": 3.1874, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.599562761887411, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 4.667152575315841e-05, |
|
"loss": 3.2192, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 1.6032064128256514, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.655004859086492e-05, |
|
"loss": 3.1632, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.6068500637638914, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 3.1723, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 1.6104937147021317, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 4.630709426627794e-05, |
|
"loss": 3.1858, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.6141373656403717, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.6185617103984455e-05, |
|
"loss": 3.1226, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 1.6177810165786117, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 4.6064139941690965e-05, |
|
"loss": 3.2065, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.6214246675168518, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.5942662779397474e-05, |
|
"loss": 3.105, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.625068318455092, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 4.5821185617103983e-05, |
|
"loss": 3.1379, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.628711969393332, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.56997084548105e-05, |
|
"loss": 3.1684, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 1.6323556203315723, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.557823129251701e-05, |
|
"loss": 3.1278, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.6359992712698124, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.5456754130223525e-05, |
|
"loss": 3.1971, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 1.6396429222080524, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 4.533527696793003e-05, |
|
"loss": 3.1004, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.6432865731462925, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 4.5213799805636544e-05, |
|
"loss": 3.1026, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 1.6469302240845327, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 4.5092322643343054e-05, |
|
"loss": 3.1681, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.650573875022773, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 4.497084548104957e-05, |
|
"loss": 3.185, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 1.654217525961013, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.484936831875607e-05, |
|
"loss": 3.1992, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.657861176899253, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.472789115646259e-05, |
|
"loss": 3.1486, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.661504827837493, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.46064139941691e-05, |
|
"loss": 3.1765, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.6651484787757331, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 4.4484936831875615e-05, |
|
"loss": 3.1672, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 1.6687921297139734, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.436345966958212e-05, |
|
"loss": 3.1509, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.6724357806522137, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.4241982507288634e-05, |
|
"loss": 3.1479, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 1.6760794315904537, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 4.412050534499514e-05, |
|
"loss": 3.1274, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.6797230825286937, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.399902818270165e-05, |
|
"loss": 3.1988, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 1.6833667334669338, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 4.387755102040816e-05, |
|
"loss": 3.1433, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.687010384405174, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.375607385811468e-05, |
|
"loss": 3.1616, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 1.690654035343414, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.363459669582119e-05, |
|
"loss": 3.2244, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.6942976862816543, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.35131195335277e-05, |
|
"loss": 3.2014, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.6979413372198944, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.3391642371234207e-05, |
|
"loss": 3.1558, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.7015849881581344, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.327016520894072e-05, |
|
"loss": 3.1166, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 1.7052286390963745, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.314868804664723e-05, |
|
"loss": 3.1352, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.7088722900346147, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.302721088435374e-05, |
|
"loss": 3.1346, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 1.7125159409728548, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.290573372206025e-05, |
|
"loss": 3.1268, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.716159591911095, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.278425655976677e-05, |
|
"loss": 3.2035, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 1.719803242849335, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 4.266277939747328e-05, |
|
"loss": 3.153, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.723446893787575, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 4.2541302235179786e-05, |
|
"loss": 3.1211, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 1.7270905447258151, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.2419825072886296e-05, |
|
"loss": 3.1218, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.7307341956640554, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.229834791059281e-05, |
|
"loss": 3.1847, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.7343778466022957, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.217687074829932e-05, |
|
"loss": 3.1372, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.7380214975405357, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.205539358600583e-05, |
|
"loss": 3.1543, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 1.7416651484787757, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.193391642371235e-05, |
|
"loss": 3.2583, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.7453087994170158, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.181243926141886e-05, |
|
"loss": 3.2017, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 1.7489524503552558, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.1690962099125366e-05, |
|
"loss": 3.1221, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.752596101293496, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.1569484936831876e-05, |
|
"loss": 3.017, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 1.7562397522317363, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.144800777453839e-05, |
|
"loss": 3.0838, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.7598834031699764, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.13265306122449e-05, |
|
"loss": 3.1651, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 1.7635270541082164, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.120505344995141e-05, |
|
"loss": 3.2177, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.7671707050464565, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.108357628765792e-05, |
|
"loss": 3.2003, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.7708143559846967, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 4.0962099125364436e-05, |
|
"loss": 3.2039, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.7744580069229368, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 4.0840621963070946e-05, |
|
"loss": 3.1705, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 1.778101657861177, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.0719144800777455e-05, |
|
"loss": 3.1413, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 1.781745308799417, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 4.0597667638483965e-05, |
|
"loss": 3.177, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 1.785388959737657, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 4.047619047619048e-05, |
|
"loss": 3.19, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.7890326106758971, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.035471331389699e-05, |
|
"loss": 3.1219, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 1.7926762616141374, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.02332361516035e-05, |
|
"loss": 3.2115, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 1.7963199125523774, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.011175898931001e-05, |
|
"loss": 3.2519, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 1.7999635634906177, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.9990281827016526e-05, |
|
"loss": 3.1165, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 1.8036072144288577, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.9868804664723035e-05, |
|
"loss": 3.1574, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 1.8072508653670978, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.9747327502429545e-05, |
|
"loss": 3.1719, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 1.8108945163053378, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.9625850340136054e-05, |
|
"loss": 3.204, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 1.814538167243578, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.950437317784257e-05, |
|
"loss": 3.1539, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.938289601554908e-05, |
|
"loss": 3.2391, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 1.8218254691200584, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.926141885325559e-05, |
|
"loss": 3.2341, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.8254691200582984, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 3.91399416909621e-05, |
|
"loss": 3.1369, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 1.8291127709965385, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.9018464528668615e-05, |
|
"loss": 3.2285, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 1.8327564219347785, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.8896987366375124e-05, |
|
"loss": 3.1143, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 1.8364000728730188, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.8775510204081634e-05, |
|
"loss": 3.1757, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 1.840043723811259, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.865403304178814e-05, |
|
"loss": 3.0718, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 1.843687374749499, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.853255587949466e-05, |
|
"loss": 3.0605, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 1.847331025687739, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 3.841107871720116e-05, |
|
"loss": 3.1806, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 1.8509746766259791, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.828960155490768e-05, |
|
"loss": 3.1507, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 1.8546183275642192, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 3.816812439261419e-05, |
|
"loss": 3.1032, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 1.8582619785024594, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.8046647230320704e-05, |
|
"loss": 3.1767, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.8619056294406997, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 3.7925170068027214e-05, |
|
"loss": 3.1871, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 1.8655492803789397, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.780369290573372e-05, |
|
"loss": 3.2559, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 1.8691929313171798, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.768221574344023e-05, |
|
"loss": 3.1719, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 1.8728365822554198, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.756073858114675e-05, |
|
"loss": 3.1794, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 1.87648023319366, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.743926141885326e-05, |
|
"loss": 3.1099, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 1.8801238841319001, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.731778425655977e-05, |
|
"loss": 3.1963, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 1.8837675350701404, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.7196307094266284e-05, |
|
"loss": 3.1068, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 1.8874111860083804, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.707482993197279e-05, |
|
"loss": 3.1131, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 1.8910548369466205, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.69533527696793e-05, |
|
"loss": 3.2152, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 1.8946984878848605, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.683187560738581e-05, |
|
"loss": 3.2625, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.8983421388231008, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 3.671039844509233e-05, |
|
"loss": 3.1989, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 1.901985789761341, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.658892128279884e-05, |
|
"loss": 3.2053, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 1.905629440699581, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.646744412050535e-05, |
|
"loss": 3.2008, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 1.909273091637821, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.634596695821186e-05, |
|
"loss": 3.1759, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 1.9129167425760611, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.622448979591837e-05, |
|
"loss": 3.1949, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 1.9165603935143012, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.6103012633624876e-05, |
|
"loss": 3.1669, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 1.9202040444525414, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 3.598153547133139e-05, |
|
"loss": 3.1745, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 1.9238476953907817, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.58600583090379e-05, |
|
"loss": 3.1438, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 1.9274913463290217, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.573858114674442e-05, |
|
"loss": 3.1506, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 1.9311349972672618, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.561710398445092e-05, |
|
"loss": 3.1507, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.9347786482055018, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.549562682215744e-05, |
|
"loss": 3.1188, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 1.9384222991437419, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.5374149659863946e-05, |
|
"loss": 3.1295, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 1.9420659500819821, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.525267249757046e-05, |
|
"loss": 3.2526, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 1.9457096010202224, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.5131195335276965e-05, |
|
"loss": 3.1166, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 1.9493532519584624, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.500971817298348e-05, |
|
"loss": 3.1793, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 1.9529969028967025, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 3.488824101068999e-05, |
|
"loss": 3.078, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 1.9566405538349425, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.476676384839651e-05, |
|
"loss": 3.1149, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 1.9602842047731828, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.464528668610301e-05, |
|
"loss": 3.1697, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 1.9639278557114228, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.4523809523809526e-05, |
|
"loss": 3.1781, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 1.967571506649663, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.4402332361516035e-05, |
|
"loss": 3.11, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.971215157587903, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.428085519922255e-05, |
|
"loss": 3.1757, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 1.9748588085261432, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.4159378036929054e-05, |
|
"loss": 3.2046, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 1.9785024594643832, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 3.403790087463557e-05, |
|
"loss": 3.2087, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 1.9821461104026235, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.391642371234208e-05, |
|
"loss": 3.1347, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 1.9857897613408635, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.3794946550048596e-05, |
|
"loss": 3.14, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 1.9894334122791038, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.36734693877551e-05, |
|
"loss": 3.1691, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 1.9930770632173438, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 3.3551992225461615e-05, |
|
"loss": 3.1885, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 1.9967207141555838, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.3430515063168125e-05, |
|
"loss": 3.2177, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 2.000364365093824, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.3309037900874634e-05, |
|
"loss": 3.2009, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 2.004008016032064, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.318756073858115e-05, |
|
"loss": 3.0367, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.0076516669703044, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.306608357628766e-05, |
|
"loss": 3.0228, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 2.0112953179085444, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 3.294460641399417e-05, |
|
"loss": 3.0782, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 2.0149389688467845, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.282312925170068e-05, |
|
"loss": 3.1693, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 2.0185826197850245, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.2701652089407195e-05, |
|
"loss": 3.0759, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 2.0222262707232646, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 3.2580174927113704e-05, |
|
"loss": 3.135, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 2.025869921661505, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.245869776482022e-05, |
|
"loss": 3.1345, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 2.029513572599745, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.233722060252672e-05, |
|
"loss": 3.0659, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 2.033157223537985, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.221574344023324e-05, |
|
"loss": 3.1813, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 2.036800874476225, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 3.209426627793975e-05, |
|
"loss": 3.0976, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 2.040444525414465, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.1972789115646265e-05, |
|
"loss": 3.1173, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.0440881763527052, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.185131195335277e-05, |
|
"loss": 3.1076, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 2.0477318272909457, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.1729834791059284e-05, |
|
"loss": 3.112, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 2.0513754782291858, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 3.1608357628765794e-05, |
|
"loss": 3.0959, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 2.055019129167426, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.148688046647231e-05, |
|
"loss": 3.0968, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 2.058662780105666, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.136540330417881e-05, |
|
"loss": 3.0484, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.062306431043906, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.124392614188533e-05, |
|
"loss": 3.1581, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 2.065950081982146, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.112244897959184e-05, |
|
"loss": 3.0967, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 2.0695937329203864, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.1000971817298355e-05, |
|
"loss": 3.0299, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 2.0732373838586264, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.087949465500486e-05, |
|
"loss": 3.1771, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 2.0768810347968665, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 3.0758017492711373e-05, |
|
"loss": 3.1248, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.0805246857351065, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 3.063654033041788e-05, |
|
"loss": 3.1227, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 2.0841683366733466, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.0515063168124392e-05, |
|
"loss": 3.1019, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 2.0878119876115866, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.0393586005830905e-05, |
|
"loss": 3.151, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 2.091455638549827, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.0272108843537418e-05, |
|
"loss": 3.0596, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 2.095099289488067, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.015063168124393e-05, |
|
"loss": 3.1311, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.098742940426307, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.0029154518950437e-05, |
|
"loss": 3.1926, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 2.102386591364547, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.990767735665695e-05, |
|
"loss": 3.1265, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 2.1060302423027872, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 2.9786200194363463e-05, |
|
"loss": 3.1637, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 2.1096738932410277, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 2.9664723032069976e-05, |
|
"loss": 3.135, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 2.1133175441792678, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.954324586977648e-05, |
|
"loss": 2.9778, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.116961195117508, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 2.9421768707482994e-05, |
|
"loss": 3.1818, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 2.120604846055748, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.9300291545189507e-05, |
|
"loss": 3.0939, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 2.124248496993988, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 2.917881438289602e-05, |
|
"loss": 3.1183, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 2.127892147932228, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 2.9057337220602526e-05, |
|
"loss": 3.1304, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 2.1315357988704684, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 2.893586005830904e-05, |
|
"loss": 3.0758, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.1351794498087084, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 2.8814382896015552e-05, |
|
"loss": 3.0331, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 2.1388231007469485, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.8692905733722065e-05, |
|
"loss": 3.1495, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.1424667516851885, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 3.0309, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.1461104026234286, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 2.8449951409135084e-05, |
|
"loss": 3.1425, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.1497540535616686, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 2.8328474246841597e-05, |
|
"loss": 3.0591, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.153397704499909, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.820699708454811e-05, |
|
"loss": 3.1644, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.157041355438149, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 2.8085519922254615e-05, |
|
"loss": 3.0664, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.160685006376389, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 2.796404275996113e-05, |
|
"loss": 3.1222, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.164328657314629, |
|
"grad_norm": 0.875, |
|
"learning_rate": 2.784256559766764e-05, |
|
"loss": 3.1721, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.1679723082528692, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.7721088435374147e-05, |
|
"loss": 3.2084, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.1716159591911097, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 2.759961127308066e-05, |
|
"loss": 3.1617, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.1752596101293498, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 2.7478134110787173e-05, |
|
"loss": 3.1179, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.17890326106759, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 2.7356656948493686e-05, |
|
"loss": 3.09, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.18254691200583, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 2.7235179786200192e-05, |
|
"loss": 3.0906, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.18619056294407, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 2.7113702623906705e-05, |
|
"loss": 3.118, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.18983421388231, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.6992225461613218e-05, |
|
"loss": 3.0609, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.19347786482055, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 2.687074829931973e-05, |
|
"loss": 3.1233, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.1971215157587904, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.674927113702624e-05, |
|
"loss": 3.1228, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.2007651666970305, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.662779397473275e-05, |
|
"loss": 3.0195, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.2044088176352705, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 2.6506316812439262e-05, |
|
"loss": 3.1056, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.2080524685735106, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 2.6384839650145775e-05, |
|
"loss": 3.0823, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.2116961195117506, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 2.6263362487852285e-05, |
|
"loss": 3.0095, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.215339770449991, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.6141885325558797e-05, |
|
"loss": 3.1279, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.218983421388231, |
|
"grad_norm": 0.875, |
|
"learning_rate": 2.6020408163265307e-05, |
|
"loss": 3.0758, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.222627072326471, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.589893100097182e-05, |
|
"loss": 3.0668, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.226270723264711, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 2.577745383867833e-05, |
|
"loss": 3.0601, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.2299143742029512, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 2.5655976676384842e-05, |
|
"loss": 3.0568, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.2335580251411913, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.5534499514091355e-05, |
|
"loss": 3.1399, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.2372016760794318, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 2.541302235179786e-05, |
|
"loss": 3.1351, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.240845327017672, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.5291545189504374e-05, |
|
"loss": 3.0368, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.244488977955912, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.5170068027210887e-05, |
|
"loss": 3.1425, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.248132628894152, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 2.50485908649174e-05, |
|
"loss": 3.1455, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.251776279832392, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 2.492711370262391e-05, |
|
"loss": 3.1195, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.255419930770632, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.480563654033042e-05, |
|
"loss": 3.1929, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.2590635817088724, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.468415937803693e-05, |
|
"loss": 3.0456, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.2627072326471125, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.456268221574344e-05, |
|
"loss": 3.1606, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.2663508835853525, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.4441205053449954e-05, |
|
"loss": 3.054, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.2699945345235926, |
|
"grad_norm": 1.0, |
|
"learning_rate": 2.4319727891156463e-05, |
|
"loss": 3.17, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.2736381854618326, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.4198250728862976e-05, |
|
"loss": 3.0642, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.277281836400073, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.4076773566569485e-05, |
|
"loss": 2.9784, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.280925487338313, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 2.3955296404275998e-05, |
|
"loss": 3.0481, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 2.284569138276553, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 2.3833819241982508e-05, |
|
"loss": 3.1128, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 2.288212789214793, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 2.371234207968902e-05, |
|
"loss": 3.0554, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 2.2918564401530332, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.359086491739553e-05, |
|
"loss": 3.1442, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 2.2955000910912733, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 2.3469387755102043e-05, |
|
"loss": 3.1732, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.2991437420295133, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 2.3347910592808552e-05, |
|
"loss": 3.1065, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 2.302787392967754, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.3226433430515065e-05, |
|
"loss": 3.1013, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 2.306431043905994, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.3104956268221575e-05, |
|
"loss": 3.1159, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 2.310074694844234, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.2983479105928087e-05, |
|
"loss": 3.0996, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 2.313718345782474, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 2.2862001943634597e-05, |
|
"loss": 3.1101, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.317361996720714, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 2.2740524781341106e-05, |
|
"loss": 3.1715, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 2.3210056476589545, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 2.261904761904762e-05, |
|
"loss": 3.0314, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 2.3246492985971945, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.249757045675413e-05, |
|
"loss": 3.1542, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 2.3282929495354345, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 2.237609329446064e-05, |
|
"loss": 3.1521, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 2.3319366004736746, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 2.225461613216715e-05, |
|
"loss": 3.1132, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.3355802514119146, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 2.2133138969873664e-05, |
|
"loss": 3.0016, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 2.339223902350155, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.2011661807580177e-05, |
|
"loss": 3.1012, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 2.342867553288395, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 2.1890184645286686e-05, |
|
"loss": 3.0911, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 2.346511204226635, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 2.17687074829932e-05, |
|
"loss": 3.0734, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 2.350154855164875, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.1647230320699712e-05, |
|
"loss": 3.1034, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 2.3537985061031153, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.152575315840622e-05, |
|
"loss": 3.1082, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 2.3574421570413553, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.1404275996112734e-05, |
|
"loss": 3.078, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 2.3610858079795953, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.1282798833819244e-05, |
|
"loss": 3.077, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 2.364729458917836, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 2.1161321671525756e-05, |
|
"loss": 3.0717, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 2.368373109856076, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 2.1039844509232266e-05, |
|
"loss": 3.0983, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.372016760794316, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 2.091836734693878e-05, |
|
"loss": 3.0621, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 2.375660411732556, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.0796890184645288e-05, |
|
"loss": 3.0199, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 2.379304062670796, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 2.06754130223518e-05, |
|
"loss": 3.1456, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 2.3829477136090365, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.055393586005831e-05, |
|
"loss": 3.0592, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 2.3865913645472765, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 2.0432458697764823e-05, |
|
"loss": 3.0988, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 2.3902350154855165, |
|
"grad_norm": 0.875, |
|
"learning_rate": 2.0310981535471333e-05, |
|
"loss": 3.1426, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 2.3938786664237566, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.0189504373177842e-05, |
|
"loss": 3.0916, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 2.3975223173619966, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 2.0068027210884355e-05, |
|
"loss": 3.1088, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 2.4011659683002367, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.9946550048590865e-05, |
|
"loss": 3.0977, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 2.404809619238477, |
|
"grad_norm": 0.875, |
|
"learning_rate": 1.9825072886297377e-05, |
|
"loss": 3.1589, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.408453270176717, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.9703595724003887e-05, |
|
"loss": 3.0655, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 2.412096921114957, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 1.95821185617104e-05, |
|
"loss": 3.085, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 2.4157405720531973, |
|
"grad_norm": 1.0, |
|
"learning_rate": 1.946064139941691e-05, |
|
"loss": 3.0235, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 2.4193842229914373, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 1.9339164237123422e-05, |
|
"loss": 3.1214, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 2.4230278739296773, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 1.921768707482993e-05, |
|
"loss": 3.1466, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 2.426671524867918, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.9096209912536444e-05, |
|
"loss": 3.0947, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 2.430315175806158, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 1.8974732750242954e-05, |
|
"loss": 3.1565, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 2.433958826744398, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 1.8853255587949467e-05, |
|
"loss": 3.1209, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 2.437602477682638, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.8731778425655976e-05, |
|
"loss": 3.0784, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 2.441246128620878, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.861030126336249e-05, |
|
"loss": 3.1211, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.4448897795591185, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.8488824101069e-05, |
|
"loss": 3.1112, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 2.4485334304973585, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.836734693877551e-05, |
|
"loss": 3.1023, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 2.4521770814355985, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 1.824586977648202e-05, |
|
"loss": 3.1156, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 2.4558207323738386, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 1.8124392614188534e-05, |
|
"loss": 3.104, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 2.4594643833120786, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 1.8002915451895043e-05, |
|
"loss": 3.1426, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 2.4631080342503187, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.7881438289601556e-05, |
|
"loss": 3.126, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 2.4667516851885587, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 1.7759961127308065e-05, |
|
"loss": 3.104, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 2.470395336126799, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 1.7638483965014578e-05, |
|
"loss": 3.1772, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 2.474038987065039, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 1.7517006802721088e-05, |
|
"loss": 3.0617, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 2.4776826380032793, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.73955296404276e-05, |
|
"loss": 3.0941, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.4813262889415193, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.7274052478134113e-05, |
|
"loss": 3.1459, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 2.4849699398797593, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 1.7152575315840623e-05, |
|
"loss": 3.1183, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 2.488613590818, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.7031098153547136e-05, |
|
"loss": 3.1467, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 2.49225724175624, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.6909620991253645e-05, |
|
"loss": 3.1645, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 2.49590089269448, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.6788143828960158e-05, |
|
"loss": 3.1622, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 2.49954454363272, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 3.1339, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 2.50318819457096, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 1.654518950437318e-05, |
|
"loss": 3.0641, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 2.5068318455092005, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 1.642371234207969e-05, |
|
"loss": 3.1318, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 2.51047549644744, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 1.6302235179786203e-05, |
|
"loss": 3.1287, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 2.5141191473856805, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 1.6180758017492712e-05, |
|
"loss": 3.0884, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.5177627983239206, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 1.6059280855199225e-05, |
|
"loss": 3.1332, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 2.5214064492621606, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.5937803692905734e-05, |
|
"loss": 3.129, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 2.5250501002004007, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.5816326530612247e-05, |
|
"loss": 3.134, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 2.5286937511386407, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 1.5694849368318757e-05, |
|
"loss": 3.0906, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 2.532337402076881, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.557337220602527e-05, |
|
"loss": 3.1158, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 2.5359810530151212, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.545189504373178e-05, |
|
"loss": 3.0513, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 2.5396247039533613, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 1.5330417881438292e-05, |
|
"loss": 3.0564, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 2.5432683548916013, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 1.5208940719144801e-05, |
|
"loss": 3.1163, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 2.5469120058298413, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 1.5087463556851314e-05, |
|
"loss": 3.0554, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 2.550555656768082, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 1.4965986394557824e-05, |
|
"loss": 3.1519, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.554199307706322, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 1.4844509232264333e-05, |
|
"loss": 3.0809, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 2.557842958644562, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 1.4723032069970846e-05, |
|
"loss": 3.0894, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 2.561486609582802, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.4601554907677355e-05, |
|
"loss": 3.2001, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 2.565130260521042, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.4480077745383868e-05, |
|
"loss": 3.0819, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 2.5687739114592825, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.435860058309038e-05, |
|
"loss": 3.084, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 2.572417562397522, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 1.423712342079689e-05, |
|
"loss": 3.0735, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 2.5760612133357625, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.4115646258503402e-05, |
|
"loss": 3.1273, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 2.5797048642740026, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 1.3994169096209913e-05, |
|
"loss": 3.1316, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 2.5833485152122426, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 1.3872691933916424e-05, |
|
"loss": 3.1375, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 2.5869921661504827, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 1.3751214771622937e-05, |
|
"loss": 3.1154, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.5906358170887227, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.3629737609329446e-05, |
|
"loss": 3.1512, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 2.594279468026963, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 1.350826044703596e-05, |
|
"loss": 3.087, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 2.5979231189652032, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.3386783284742469e-05, |
|
"loss": 3.1213, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 2.6015667699034433, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.3265306122448982e-05, |
|
"loss": 3.1229, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 2.6052104208416833, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.3143828960155491e-05, |
|
"loss": 3.1252, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 2.6088540717799233, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.3022351797862004e-05, |
|
"loss": 3.0816, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 2.612497722718164, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 1.2900874635568513e-05, |
|
"loss": 3.1299, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 2.616141373656404, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.2779397473275026e-05, |
|
"loss": 3.0499, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 2.619785024594644, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.2657920310981536e-05, |
|
"loss": 3.1844, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 2.623428675532884, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 1.2536443148688048e-05, |
|
"loss": 3.148, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.627072326471124, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 1.2414965986394558e-05, |
|
"loss": 3.0752, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 2.630715977409364, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 1.2293488824101069e-05, |
|
"loss": 3.1968, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 2.634359628347604, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.217201166180758e-05, |
|
"loss": 3.2027, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 2.6380032792858445, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.2050534499514091e-05, |
|
"loss": 3.1299, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 2.6416469302240846, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.1929057337220603e-05, |
|
"loss": 3.1109, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 2.6452905811623246, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 1.1807580174927114e-05, |
|
"loss": 3.1591, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 2.6489342321005647, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.1686103012633627e-05, |
|
"loss": 3.0862, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 2.6525778830388047, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 1.1564625850340138e-05, |
|
"loss": 3.1458, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 2.656221533977045, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.1443148688046649e-05, |
|
"loss": 3.1476, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 2.6598651849152852, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 1.132167152575316e-05, |
|
"loss": 3.136, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.6635088358535253, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 1.1200194363459671e-05, |
|
"loss": 3.0971, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 2.6671524867917653, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 1.1078717201166182e-05, |
|
"loss": 3.1187, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 2.6707961377300053, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 1.0957240038872693e-05, |
|
"loss": 3.1171, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 2.674439788668246, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 1.0835762876579203e-05, |
|
"loss": 3.1523, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 2.6780834396064854, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 3.1301, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 2.681727090544726, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 1.0592808551992225e-05, |
|
"loss": 3.1293, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 2.685370741482966, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.0471331389698736e-05, |
|
"loss": 3.1171, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 2.689014392421206, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.0349854227405248e-05, |
|
"loss": 3.0375, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 2.692658043359446, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 1.0228377065111759e-05, |
|
"loss": 3.0265, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 2.696301694297686, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 1.010689990281827e-05, |
|
"loss": 3.097, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.6999453452359266, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 9.985422740524781e-06, |
|
"loss": 3.0494, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 2.7035889961741666, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 9.863945578231292e-06, |
|
"loss": 3.0811, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 2.7072326471124066, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.742468415937803e-06, |
|
"loss": 3.1214, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 2.7108762980506467, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 9.620991253644314e-06, |
|
"loss": 3.1006, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 2.7145199489888867, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 9.499514091350827e-06, |
|
"loss": 3.1645, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 2.718163599927127, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 9.378036929057338e-06, |
|
"loss": 3.07, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 2.7218072508653672, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 9.25655976676385e-06, |
|
"loss": 3.1343, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 2.7254509018036073, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.13508260447036e-06, |
|
"loss": 3.1049, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 2.7290945527418473, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 9.013605442176872e-06, |
|
"loss": 3.1182, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 2.7327382036800874, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 8.892128279883383e-06, |
|
"loss": 3.063, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.736381854618328, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 8.770651117589894e-06, |
|
"loss": 3.0678, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 2.7400255055565674, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 8.649173955296405e-06, |
|
"loss": 3.1132, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 2.743669156494808, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 8.527696793002917e-06, |
|
"loss": 3.0649, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 2.747312807433048, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.406219630709428e-06, |
|
"loss": 3.0386, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 2.750956458371288, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 8.284742468415939e-06, |
|
"loss": 3.0972, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 2.754600109309528, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 8.163265306122448e-06, |
|
"loss": 3.1145, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 2.758243760247768, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.04178814382896e-06, |
|
"loss": 3.1053, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 2.7618874111860086, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 7.92031098153547e-06, |
|
"loss": 3.1086, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 2.7655310621242486, |
|
"grad_norm": 0.875, |
|
"learning_rate": 7.798833819241982e-06, |
|
"loss": 3.0831, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 2.7691747130624886, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.677356656948493e-06, |
|
"loss": 3.1135, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.7728183640007287, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.555879494655005e-06, |
|
"loss": 3.0605, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 2.7764620149389687, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 7.434402332361516e-06, |
|
"loss": 2.9854, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 2.780105665877209, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.312925170068027e-06, |
|
"loss": 3.15, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 2.7837493168154492, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.191448007774538e-06, |
|
"loss": 3.1166, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 2.7873929677536893, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.06997084548105e-06, |
|
"loss": 3.0783, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 2.7910366186919293, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 6.948493683187561e-06, |
|
"loss": 3.0845, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 2.7946802696301694, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 6.827016520894072e-06, |
|
"loss": 3.0717, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 2.7983239205684094, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 6.705539358600584e-06, |
|
"loss": 3.0456, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 2.8019675715066494, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 6.584062196307095e-06, |
|
"loss": 3.071, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 2.80561122244489, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 6.462585034013606e-06, |
|
"loss": 3.063, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 2.80925487338313, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 6.341107871720117e-06, |
|
"loss": 3.1031, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 2.81289852432137, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 6.219630709426628e-06, |
|
"loss": 3.0297, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 2.81654217525961, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 6.098153547133139e-06, |
|
"loss": 3.119, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 2.82018582619785, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 5.97667638483965e-06, |
|
"loss": 3.0535, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 2.8238294771360906, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.855199222546161e-06, |
|
"loss": 3.1086, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 2.8274731280743306, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 5.733722060252672e-06, |
|
"loss": 3.133, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 2.8311167790125706, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 5.612244897959184e-06, |
|
"loss": 3.1374, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 2.8347604299508107, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 5.4907677356656954e-06, |
|
"loss": 3.1706, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 2.8384040808890507, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 5.369290573372207e-06, |
|
"loss": 3.0924, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 2.842047731827291, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 5.247813411078718e-06, |
|
"loss": 3.0695, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 2.845691382765531, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 5.126336248785229e-06, |
|
"loss": 3.0492, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 2.8493350337037713, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 5.00485908649174e-06, |
|
"loss": 3.0992, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 2.8529786846420113, |
|
"grad_norm": 0.875, |
|
"learning_rate": 4.88338192419825e-06, |
|
"loss": 3.0975, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 2.8566223355802514, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.7619047619047615e-06, |
|
"loss": 3.1571, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 2.8602659865184914, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 4.640427599611273e-06, |
|
"loss": 3.1478, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 2.8639096374567314, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.518950437317785e-06, |
|
"loss": 3.117, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 2.867553288394972, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 4.397473275024296e-06, |
|
"loss": 3.0144, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 2.871196939333212, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.275996112730807e-06, |
|
"loss": 3.1565, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 2.874840590271452, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.154518950437318e-06, |
|
"loss": 3.2086, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 2.878484241209692, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 4.033041788143829e-06, |
|
"loss": 3.124, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 2.882127892147932, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 3.9115646258503405e-06, |
|
"loss": 3.046, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 2.8857715430861726, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.7900874635568516e-06, |
|
"loss": 3.1214, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 2.8894151940244126, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 3.6686103012633628e-06, |
|
"loss": 3.0823, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 2.8930588449626526, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.5471331389698735e-06, |
|
"loss": 3.0588, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 2.8967024959008927, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.4256559766763847e-06, |
|
"loss": 3.1368, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 2.9003461468391327, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.304178814382896e-06, |
|
"loss": 3.0578, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 2.903989797777373, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.1827016520894074e-06, |
|
"loss": 3.1724, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 2.907633448715613, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.0612244897959185e-06, |
|
"loss": 3.1477, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 2.9112770996538533, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 2.9397473275024297e-06, |
|
"loss": 3.1196, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 2.9149207505920933, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 2.818270165208941e-06, |
|
"loss": 3.1087, |
|
"step": 8000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 8232, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7504106874736026e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|