|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.986282578875171, |
|
"eval_steps": 500, |
|
"global_step": 3640, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.027434842249657063, |
|
"grad_norm": 9.328125, |
|
"learning_rate": 0.00019999627553166294, |
|
"loss": 2.6306, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05486968449931413, |
|
"grad_norm": 5.1328125, |
|
"learning_rate": 0.00019998510240408496, |
|
"loss": 2.4194, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0823045267489712, |
|
"grad_norm": 5.58203125, |
|
"learning_rate": 0.0001999664814495453, |
|
"loss": 2.336, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10973936899862825, |
|
"grad_norm": 3.78515625, |
|
"learning_rate": 0.00019994041405510705, |
|
"loss": 2.4327, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13717421124828533, |
|
"grad_norm": 3.041015625, |
|
"learning_rate": 0.00019990690216251396, |
|
"loss": 2.3063, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1646090534979424, |
|
"grad_norm": 3.302734375, |
|
"learning_rate": 0.0001998659482680456, |
|
"loss": 2.3151, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.19204389574759945, |
|
"grad_norm": 3.869140625, |
|
"learning_rate": 0.00019981755542233177, |
|
"loss": 2.3566, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2194787379972565, |
|
"grad_norm": 5.70703125, |
|
"learning_rate": 0.0001997617272301248, |
|
"loss": 2.3368, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24691358024691357, |
|
"grad_norm": 4.81640625, |
|
"learning_rate": 0.00019969846785003134, |
|
"loss": 2.3303, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.27434842249657065, |
|
"grad_norm": 4.6015625, |
|
"learning_rate": 0.00019962778199420265, |
|
"loss": 2.4144, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3017832647462277, |
|
"grad_norm": 4.52734375, |
|
"learning_rate": 0.00019954967492798333, |
|
"loss": 2.4014, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3292181069958848, |
|
"grad_norm": 4.68359375, |
|
"learning_rate": 0.0001994641524695193, |
|
"loss": 2.4312, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.35665294924554186, |
|
"grad_norm": 3.36328125, |
|
"learning_rate": 0.00019937122098932428, |
|
"loss": 2.3563, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3840877914951989, |
|
"grad_norm": 3.3984375, |
|
"learning_rate": 0.0001992708874098054, |
|
"loss": 2.351, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.411522633744856, |
|
"grad_norm": 3.103515625, |
|
"learning_rate": 0.0001991631592047475, |
|
"loss": 2.2869, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.438957475994513, |
|
"grad_norm": 2.904296875, |
|
"learning_rate": 0.00019904804439875633, |
|
"loss": 2.364, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4663923182441701, |
|
"grad_norm": 4.171875, |
|
"learning_rate": 0.00019892555156666089, |
|
"loss": 2.362, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.49382716049382713, |
|
"grad_norm": 3.611328125, |
|
"learning_rate": 0.00019879568983287467, |
|
"loss": 2.2994, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5212620027434842, |
|
"grad_norm": 2.470703125, |
|
"learning_rate": 0.00019865846887071596, |
|
"loss": 2.2469, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5486968449931413, |
|
"grad_norm": 2.314453125, |
|
"learning_rate": 0.0001985138989016874, |
|
"loss": 2.2429, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5761316872427984, |
|
"grad_norm": 2.720703125, |
|
"learning_rate": 0.00019836199069471437, |
|
"loss": 2.3157, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6035665294924554, |
|
"grad_norm": 2.998046875, |
|
"learning_rate": 0.00019820275556534304, |
|
"loss": 2.2214, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6310013717421125, |
|
"grad_norm": 3.96484375, |
|
"learning_rate": 0.00019803620537489736, |
|
"loss": 2.3018, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6584362139917695, |
|
"grad_norm": 2.982421875, |
|
"learning_rate": 0.00019786235252959553, |
|
"loss": 2.2603, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6858710562414266, |
|
"grad_norm": 2.01953125, |
|
"learning_rate": 0.00019768120997962592, |
|
"loss": 2.3007, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7133058984910837, |
|
"grad_norm": 2.552734375, |
|
"learning_rate": 0.00019749279121818235, |
|
"loss": 2.281, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 2.98046875, |
|
"learning_rate": 0.00019729711028045909, |
|
"loss": 2.2611, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7681755829903978, |
|
"grad_norm": 2.716796875, |
|
"learning_rate": 0.0001970941817426052, |
|
"loss": 2.1961, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7956104252400549, |
|
"grad_norm": 2.318359375, |
|
"learning_rate": 0.00019688402072063903, |
|
"loss": 2.2652, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.823045267489712, |
|
"grad_norm": 2.53515625, |
|
"learning_rate": 0.00019666664286932198, |
|
"loss": 2.1895, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.850480109739369, |
|
"grad_norm": 2.451171875, |
|
"learning_rate": 0.0001964420643809925, |
|
"loss": 2.252, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.877914951989026, |
|
"grad_norm": 1.8544921875, |
|
"learning_rate": 0.00019621030198436006, |
|
"loss": 2.1616, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.9053497942386831, |
|
"grad_norm": 2.611328125, |
|
"learning_rate": 0.00019597137294325877, |
|
"loss": 2.1698, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.9327846364883402, |
|
"grad_norm": 2.349609375, |
|
"learning_rate": 0.0001957252950553616, |
|
"loss": 2.2043, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9602194787379973, |
|
"grad_norm": 2.169921875, |
|
"learning_rate": 0.00019547208665085457, |
|
"loss": 2.1506, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 1.8349609375, |
|
"learning_rate": 0.00019521176659107142, |
|
"loss": 2.1987, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0150891632373114, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 0.00019494435426708855, |
|
"loss": 2.1909, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0425240054869684, |
|
"grad_norm": 1.65234375, |
|
"learning_rate": 0.0001946698695982806, |
|
"loss": 2.1928, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0699588477366255, |
|
"grad_norm": 1.677734375, |
|
"learning_rate": 0.00019438833303083678, |
|
"loss": 2.1761, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0973936899862826, |
|
"grad_norm": 1.94140625, |
|
"learning_rate": 0.00019409976553623766, |
|
"loss": 2.1634, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1248285322359397, |
|
"grad_norm": 1.8623046875, |
|
"learning_rate": 0.00019380418860969322, |
|
"loss": 2.2044, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1522633744855968, |
|
"grad_norm": 1.7763671875, |
|
"learning_rate": 0.0001935016242685415, |
|
"loss": 2.1264, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1796982167352539, |
|
"grad_norm": 2.4609375, |
|
"learning_rate": 0.0001931920950506087, |
|
"loss": 2.1819, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.2071330589849107, |
|
"grad_norm": 2.990234375, |
|
"learning_rate": 0.00019287562401253022, |
|
"loss": 2.1799, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2345679012345678, |
|
"grad_norm": 4.86328125, |
|
"learning_rate": 0.00019255223472803334, |
|
"loss": 2.1497, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.262002743484225, |
|
"grad_norm": 2.998046875, |
|
"learning_rate": 0.00019222195128618106, |
|
"loss": 2.0783, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.289437585733882, |
|
"grad_norm": 2.033203125, |
|
"learning_rate": 0.00019188479828957772, |
|
"loss": 2.1195, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.316872427983539, |
|
"grad_norm": 1.9560546875, |
|
"learning_rate": 0.00019154080085253666, |
|
"loss": 2.0549, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.3443072702331962, |
|
"grad_norm": 2.150390625, |
|
"learning_rate": 0.00019118998459920902, |
|
"loss": 2.2041, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3717421124828533, |
|
"grad_norm": 1.9931640625, |
|
"learning_rate": 0.0001908323756616754, |
|
"loss": 2.1735, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3991769547325104, |
|
"grad_norm": 1.732421875, |
|
"learning_rate": 0.0001904680006779991, |
|
"loss": 2.1329, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.4266117969821672, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 2.075, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4540466392318243, |
|
"grad_norm": 2.724609375, |
|
"learning_rate": 0.00018971906164244232, |
|
"loss": 2.1452, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 0.00018933455337855632, |
|
"loss": 2.081, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.5089163237311385, |
|
"grad_norm": 2.15234375, |
|
"learning_rate": 0.000188943390640361, |
|
"loss": 2.1311, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5363511659807956, |
|
"grad_norm": 2.5078125, |
|
"learning_rate": 0.000188545602565321, |
|
"loss": 2.1465, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5637860082304527, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.00018814121878441814, |
|
"loss": 2.242, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5912208504801097, |
|
"grad_norm": 2.095703125, |
|
"learning_rate": 0.0001877302694199442, |
|
"loss": 2.1339, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.6186556927297668, |
|
"grad_norm": 1.5791015625, |
|
"learning_rate": 0.00018731278508325708, |
|
"loss": 2.1318, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.646090534979424, |
|
"grad_norm": 1.8525390625, |
|
"learning_rate": 0.00018688879687250067, |
|
"loss": 2.108, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.673525377229081, |
|
"grad_norm": 1.310546875, |
|
"learning_rate": 0.00018645833637028825, |
|
"loss": 2.2039, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.700960219478738, |
|
"grad_norm": 1.50390625, |
|
"learning_rate": 0.0001860214356413501, |
|
"loss": 2.1096, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.7283950617283952, |
|
"grad_norm": 1.6025390625, |
|
"learning_rate": 0.00018557812723014476, |
|
"loss": 2.0806, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7558299039780523, |
|
"grad_norm": 2.197265625, |
|
"learning_rate": 0.00018512844415843514, |
|
"loss": 2.0545, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7832647462277091, |
|
"grad_norm": 1.8056640625, |
|
"learning_rate": 0.00018467241992282843, |
|
"loss": 2.0552, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.8106995884773662, |
|
"grad_norm": 1.40234375, |
|
"learning_rate": 0.00018421008849228118, |
|
"loss": 2.0906, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8381344307270233, |
|
"grad_norm": 1.7080078125, |
|
"learning_rate": 0.0001837414843055689, |
|
"loss": 2.0512, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.8655692729766804, |
|
"grad_norm": 1.6162109375, |
|
"learning_rate": 0.00018326664226872065, |
|
"loss": 2.0945, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8930041152263375, |
|
"grad_norm": 1.603515625, |
|
"learning_rate": 0.0001827855977524191, |
|
"loss": 2.0553, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.9204389574759944, |
|
"grad_norm": 1.5771484375, |
|
"learning_rate": 0.00018229838658936564, |
|
"loss": 2.0876, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9478737997256514, |
|
"grad_norm": 1.400390625, |
|
"learning_rate": 0.0001818050450716113, |
|
"loss": 2.0565, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.9753086419753085, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 0.00018130560994785325, |
|
"loss": 2.0883, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.0027434842249656, |
|
"grad_norm": 3.23046875, |
|
"learning_rate": 0.00018080011842069765, |
|
"loss": 2.0904, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.0301783264746227, |
|
"grad_norm": 1.611328125, |
|
"learning_rate": 0.00018028860814388827, |
|
"loss": 1.9926, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.05761316872428, |
|
"grad_norm": 1.638671875, |
|
"learning_rate": 0.00017977111721950164, |
|
"loss": 1.9994, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.085048010973937, |
|
"grad_norm": 1.7724609375, |
|
"learning_rate": 0.00017924768419510904, |
|
"loss": 2.0709, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.112482853223594, |
|
"grad_norm": 1.9208984375, |
|
"learning_rate": 0.00017871834806090501, |
|
"loss": 1.9618, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.139917695473251, |
|
"grad_norm": 2.603515625, |
|
"learning_rate": 0.000178183148246803, |
|
"loss": 1.9531, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.167352537722908, |
|
"grad_norm": 2.236328125, |
|
"learning_rate": 0.0001776421246194982, |
|
"loss": 2.0776, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.1947873799725652, |
|
"grad_norm": 2.79296875, |
|
"learning_rate": 0.00017709531747949796, |
|
"loss": 2.0563, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 2.55078125, |
|
"learning_rate": 0.00017654276755811997, |
|
"loss": 2.052, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.2496570644718794, |
|
"grad_norm": 2.2421875, |
|
"learning_rate": 0.0001759845160144579, |
|
"loss": 2.0051, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2770919067215365, |
|
"grad_norm": 2.35546875, |
|
"learning_rate": 0.00017542060443231572, |
|
"loss": 2.0448, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.3045267489711936, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.00017485107481711012, |
|
"loss": 2.068, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.3319615912208507, |
|
"grad_norm": 1.9912109375, |
|
"learning_rate": 0.00017427596959274143, |
|
"loss": 2.0173, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.3593964334705078, |
|
"grad_norm": 1.841796875, |
|
"learning_rate": 0.00017369533159843369, |
|
"loss": 1.9539, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.386831275720165, |
|
"grad_norm": 1.9970703125, |
|
"learning_rate": 0.00017310920408554332, |
|
"loss": 1.9894, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.4142661179698215, |
|
"grad_norm": 1.80078125, |
|
"learning_rate": 0.00017251763071433765, |
|
"loss": 2.0438, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.4417009602194786, |
|
"grad_norm": 1.8701171875, |
|
"learning_rate": 0.00017192065555074245, |
|
"loss": 2.0079, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.4691358024691357, |
|
"grad_norm": 1.9775390625, |
|
"learning_rate": 0.00017131832306305965, |
|
"loss": 2.0738, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.4965706447187928, |
|
"grad_norm": 2.087890625, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 2.0805, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.52400548696845, |
|
"grad_norm": 2.212890625, |
|
"learning_rate": 0.00017009776598061495, |
|
"loss": 2.0563, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.551440329218107, |
|
"grad_norm": 2.291015625, |
|
"learning_rate": 0.00016947963230437725, |
|
"loss": 2.0289, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.578875171467764, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.0001688563231343277, |
|
"loss": 2.0648, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.606310013717421, |
|
"grad_norm": 2.197265625, |
|
"learning_rate": 0.00016822788490037177, |
|
"loss": 1.9541, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.633744855967078, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.00016759436441447545, |
|
"loss": 2.0415, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.6611796982167353, |
|
"grad_norm": 1.7353515625, |
|
"learning_rate": 0.00016695580886717858, |
|
"loss": 2.0242, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.6886145404663924, |
|
"grad_norm": 2.115234375, |
|
"learning_rate": 0.00016631226582407952, |
|
"loss": 2.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.7160493827160495, |
|
"grad_norm": 1.78515625, |
|
"learning_rate": 0.00016566378322229204, |
|
"loss": 2.0559, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.7434842249657065, |
|
"grad_norm": 1.6708984375, |
|
"learning_rate": 0.00016501040936687443, |
|
"loss": 2.0658, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.7709190672153636, |
|
"grad_norm": 2.06640625, |
|
"learning_rate": 0.00016435219292723147, |
|
"loss": 2.0381, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.7983539094650207, |
|
"grad_norm": 1.99609375, |
|
"learning_rate": 0.00016368918293348892, |
|
"loss": 1.9942, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.825788751714678, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.00016302142877284138, |
|
"loss": 2.0459, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.8532235939643344, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 1.9964, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.8806584362139915, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.00016167188726285434, |
|
"loss": 1.9702, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.9080932784636486, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00016099020044000727, |
|
"loss": 1.971, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.9355281207133057, |
|
"grad_norm": 1.662109375, |
|
"learning_rate": 0.00016030397049575203, |
|
"loss": 2.0445, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 2.14453125, |
|
"learning_rate": 0.00015961324854692254, |
|
"loss": 1.9905, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.99039780521262, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.00015891808604495938, |
|
"loss": 2.0048, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.017832647462277, |
|
"grad_norm": 1.599609375, |
|
"learning_rate": 0.00015821853477207708, |
|
"loss": 2.0107, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.045267489711934, |
|
"grad_norm": 2.4765625, |
|
"learning_rate": 0.00015751464683740697, |
|
"loss": 1.9425, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.072702331961591, |
|
"grad_norm": 3.009765625, |
|
"learning_rate": 0.00015680647467311557, |
|
"loss": 1.9891, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.1001371742112482, |
|
"grad_norm": 2.1328125, |
|
"learning_rate": 0.00015609407103049896, |
|
"loss": 1.9283, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.1275720164609053, |
|
"grad_norm": 3.130859375, |
|
"learning_rate": 0.0001553774889760533, |
|
"loss": 1.9353, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.1550068587105624, |
|
"grad_norm": 2.798828125, |
|
"learning_rate": 0.0001546567818875221, |
|
"loss": 1.9945, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.1824417009602195, |
|
"grad_norm": 2.287109375, |
|
"learning_rate": 0.00015393200344991995, |
|
"loss": 1.9199, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.2098765432098766, |
|
"grad_norm": 2.748046875, |
|
"learning_rate": 0.00015320320765153367, |
|
"loss": 1.9415, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.2373113854595337, |
|
"grad_norm": 2.10546875, |
|
"learning_rate": 0.0001524704487799008, |
|
"loss": 1.9417, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.2647462277091908, |
|
"grad_norm": 2.1015625, |
|
"learning_rate": 0.00015173378141776568, |
|
"loss": 1.9477, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 3.292181069958848, |
|
"grad_norm": 1.8896484375, |
|
"learning_rate": 0.0001509932604390136, |
|
"loss": 1.8957, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.319615912208505, |
|
"grad_norm": 2.36328125, |
|
"learning_rate": 0.0001502489410045833, |
|
"loss": 1.9313, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 3.347050754458162, |
|
"grad_norm": 2.107421875, |
|
"learning_rate": 0.00014950087855835815, |
|
"loss": 1.902, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 3.374485596707819, |
|
"grad_norm": 1.919921875, |
|
"learning_rate": 0.000148749128823036, |
|
"loss": 1.8492, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 3.401920438957476, |
|
"grad_norm": 2.07421875, |
|
"learning_rate": 0.00014799374779597867, |
|
"loss": 1.9576, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 3.4293552812071333, |
|
"grad_norm": 2.115234375, |
|
"learning_rate": 0.00014723479174504037, |
|
"loss": 1.9472, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.45679012345679, |
|
"grad_norm": 2.748046875, |
|
"learning_rate": 0.00014647231720437686, |
|
"loss": 1.969, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 3.484224965706447, |
|
"grad_norm": 2.20703125, |
|
"learning_rate": 0.0001457063809702338, |
|
"loss": 1.9607, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 3.511659807956104, |
|
"grad_norm": 1.8447265625, |
|
"learning_rate": 0.00014493704009671613, |
|
"loss": 1.9347, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 3.539094650205761, |
|
"grad_norm": 1.857421875, |
|
"learning_rate": 0.00014416435189153846, |
|
"loss": 1.9848, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 3.5665294924554183, |
|
"grad_norm": 2.009765625, |
|
"learning_rate": 0.00014338837391175582, |
|
"loss": 1.9784, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.5939643347050754, |
|
"grad_norm": 2.373046875, |
|
"learning_rate": 0.00014260916395947656, |
|
"loss": 1.9356, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.6213991769547325, |
|
"grad_norm": 2.474609375, |
|
"learning_rate": 0.0001418267800775565, |
|
"loss": 1.9703, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.6488340192043895, |
|
"grad_norm": 1.7783203125, |
|
"learning_rate": 0.0001410412805452757, |
|
"loss": 1.9149, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.6762688614540466, |
|
"grad_norm": 2.7421875, |
|
"learning_rate": 0.00014025272387399674, |
|
"loss": 1.948, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.7037037037037037, |
|
"grad_norm": 2.64453125, |
|
"learning_rate": 0.00013946116880280681, |
|
"loss": 1.9427, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.731138545953361, |
|
"grad_norm": 3.169921875, |
|
"learning_rate": 0.0001386666742941419, |
|
"loss": 1.8966, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.758573388203018, |
|
"grad_norm": 2.00390625, |
|
"learning_rate": 0.00013786929952939477, |
|
"loss": 1.9682, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.786008230452675, |
|
"grad_norm": 1.7138671875, |
|
"learning_rate": 0.00013706910390450677, |
|
"loss": 1.9255, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.813443072702332, |
|
"grad_norm": 1.9404296875, |
|
"learning_rate": 0.0001362661470255432, |
|
"loss": 1.8883, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.840877914951989, |
|
"grad_norm": 2.662109375, |
|
"learning_rate": 0.00013546048870425356, |
|
"loss": 1.9409, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.8683127572016462, |
|
"grad_norm": 1.763671875, |
|
"learning_rate": 0.000134652188953616, |
|
"loss": 1.9766, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.895747599451303, |
|
"grad_norm": 1.7626953125, |
|
"learning_rate": 0.00013384130798336705, |
|
"loss": 1.9428, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.92318244170096, |
|
"grad_norm": 1.8583984375, |
|
"learning_rate": 0.00013302790619551674, |
|
"loss": 1.9664, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.950617283950617, |
|
"grad_norm": 1.7919921875, |
|
"learning_rate": 0.00013221204417984908, |
|
"loss": 1.9387, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.978052126200274, |
|
"grad_norm": 2.16015625, |
|
"learning_rate": 0.000131393782709409, |
|
"loss": 1.8872, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.005486968449931, |
|
"grad_norm": 1.833984375, |
|
"learning_rate": 0.0001305731827359753, |
|
"loss": 1.9299, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.032921810699588, |
|
"grad_norm": 2.3359375, |
|
"learning_rate": 0.00012975030538552032, |
|
"loss": 1.8399, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 4.060356652949245, |
|
"grad_norm": 1.4580078125, |
|
"learning_rate": 0.00012892521195365678, |
|
"loss": 1.9137, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 4.0877914951989025, |
|
"grad_norm": 1.7353515625, |
|
"learning_rate": 0.00012809796390107195, |
|
"loss": 1.8806, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 4.11522633744856, |
|
"grad_norm": 2.189453125, |
|
"learning_rate": 0.00012726862284894938, |
|
"loss": 1.9019, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.142661179698217, |
|
"grad_norm": 1.490234375, |
|
"learning_rate": 0.0001264372505743789, |
|
"loss": 1.8454, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 4.170096021947874, |
|
"grad_norm": 1.6123046875, |
|
"learning_rate": 0.0001256039090057547, |
|
"loss": 1.9057, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 4.197530864197531, |
|
"grad_norm": 1.3447265625, |
|
"learning_rate": 0.0001247686602181626, |
|
"loss": 1.8994, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 4.224965706447188, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 0.0001239315664287558, |
|
"loss": 1.8779, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 4.252400548696845, |
|
"grad_norm": 2.07421875, |
|
"learning_rate": 0.0001230926899921206, |
|
"loss": 1.8386, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.279835390946502, |
|
"grad_norm": 1.7724609375, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 1.8646, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 4.307270233196159, |
|
"grad_norm": 1.8017578125, |
|
"learning_rate": 0.00012140983925479662, |
|
"loss": 1.8488, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 4.334705075445816, |
|
"grad_norm": 2.1171875, |
|
"learning_rate": 0.00012056599030859366, |
|
"loss": 1.8531, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 4.362139917695473, |
|
"grad_norm": 1.9462890625, |
|
"learning_rate": 0.00011972060941479621, |
|
"loss": 1.8437, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 4.3895747599451305, |
|
"grad_norm": 1.5908203125, |
|
"learning_rate": 0.00011887375954529168, |
|
"loss": 1.8201, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.4170096021947876, |
|
"grad_norm": 5.125, |
|
"learning_rate": 0.0001180255037813906, |
|
"loss": 1.865, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 4.444444444444445, |
|
"grad_norm": 1.919921875, |
|
"learning_rate": 0.00011717590530912763, |
|
"loss": 1.8605, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 4.471879286694102, |
|
"grad_norm": 1.666015625, |
|
"learning_rate": 0.00011632502741455496, |
|
"loss": 1.8294, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 4.499314128943759, |
|
"grad_norm": 1.6572265625, |
|
"learning_rate": 0.00011547293347902812, |
|
"loss": 1.8254, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 4.526748971193416, |
|
"grad_norm": 1.4775390625, |
|
"learning_rate": 0.00011461968697448485, |
|
"loss": 1.8534, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.554183813443073, |
|
"grad_norm": 1.791015625, |
|
"learning_rate": 0.00011376535145871684, |
|
"loss": 1.8151, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 4.58161865569273, |
|
"grad_norm": 1.9013671875, |
|
"learning_rate": 0.00011290999057063569, |
|
"loss": 1.875, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 4.609053497942387, |
|
"grad_norm": 2.322265625, |
|
"learning_rate": 0.0001120536680255323, |
|
"loss": 1.9154, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 4.636488340192044, |
|
"grad_norm": 1.814453125, |
|
"learning_rate": 0.00011119644761033078, |
|
"loss": 1.898, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 4.663923182441701, |
|
"grad_norm": 2.25390625, |
|
"learning_rate": 0.00011033839317883701, |
|
"loss": 1.852, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.6913580246913575, |
|
"grad_norm": 2.240234375, |
|
"learning_rate": 0.00010947956864698223, |
|
"loss": 1.8394, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 4.7187928669410155, |
|
"grad_norm": 2.544921875, |
|
"learning_rate": 0.00010862003798806196, |
|
"loss": 1.84, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 4.746227709190672, |
|
"grad_norm": 1.7255859375, |
|
"learning_rate": 0.00010775986522797063, |
|
"loss": 1.8682, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.77366255144033, |
|
"grad_norm": 2.705078125, |
|
"learning_rate": 0.00010689911444043248, |
|
"loss": 1.8197, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 4.801097393689986, |
|
"grad_norm": 1.7275390625, |
|
"learning_rate": 0.00010603784974222861, |
|
"loss": 1.868, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.828532235939643, |
|
"grad_norm": 1.7900390625, |
|
"learning_rate": 0.00010517613528842097, |
|
"loss": 1.8828, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 4.8559670781893, |
|
"grad_norm": 1.603515625, |
|
"learning_rate": 0.00010431403526757347, |
|
"loss": 1.8683, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 4.883401920438957, |
|
"grad_norm": 1.958984375, |
|
"learning_rate": 0.00010345161389697082, |
|
"loss": 1.8725, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 4.910836762688614, |
|
"grad_norm": 1.857421875, |
|
"learning_rate": 0.00010258893541783476, |
|
"loss": 1.8893, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 4.938271604938271, |
|
"grad_norm": 1.755859375, |
|
"learning_rate": 0.00010172606409053886, |
|
"loss": 1.892, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.965706447187928, |
|
"grad_norm": 1.76171875, |
|
"learning_rate": 0.0001008630641898219, |
|
"loss": 1.8952, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.9931412894375855, |
|
"grad_norm": 1.767578125, |
|
"learning_rate": 0.0001, |
|
"loss": 1.8623, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 5.020576131687243, |
|
"grad_norm": 2.0234375, |
|
"learning_rate": 9.913693581017812e-05, |
|
"loss": 1.7838, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 5.0480109739369, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 9.827393590946116e-05, |
|
"loss": 1.7935, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 5.075445816186557, |
|
"grad_norm": 2.12109375, |
|
"learning_rate": 9.741106458216528e-05, |
|
"loss": 1.838, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 5.102880658436214, |
|
"grad_norm": 1.63671875, |
|
"learning_rate": 9.654838610302923e-05, |
|
"loss": 1.8097, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 5.130315500685871, |
|
"grad_norm": 2.62109375, |
|
"learning_rate": 9.568596473242654e-05, |
|
"loss": 1.7773, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 5.157750342935528, |
|
"grad_norm": 2.1015625, |
|
"learning_rate": 9.482386471157904e-05, |
|
"loss": 1.8083, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 5.185185185185185, |
|
"grad_norm": 1.8603515625, |
|
"learning_rate": 9.396215025777139e-05, |
|
"loss": 1.7376, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 5.212620027434842, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 9.31008855595675e-05, |
|
"loss": 1.8674, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.240054869684499, |
|
"grad_norm": 2.1796875, |
|
"learning_rate": 9.224013477202939e-05, |
|
"loss": 1.8239, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 5.267489711934156, |
|
"grad_norm": 2.208984375, |
|
"learning_rate": 9.137996201193805e-05, |
|
"loss": 1.811, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 5.2949245541838135, |
|
"grad_norm": 2.001953125, |
|
"learning_rate": 9.052043135301779e-05, |
|
"loss": 1.7938, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 5.322359396433471, |
|
"grad_norm": 2.083984375, |
|
"learning_rate": 8.9661606821163e-05, |
|
"loss": 1.8577, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 5.349794238683128, |
|
"grad_norm": 1.939453125, |
|
"learning_rate": 8.880355238966923e-05, |
|
"loss": 1.8207, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 5.377229080932785, |
|
"grad_norm": 1.873046875, |
|
"learning_rate": 8.79463319744677e-05, |
|
"loss": 1.7565, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 5.404663923182442, |
|
"grad_norm": 1.646484375, |
|
"learning_rate": 8.709000942936433e-05, |
|
"loss": 1.8572, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 5.432098765432099, |
|
"grad_norm": 1.5400390625, |
|
"learning_rate": 8.62346485412832e-05, |
|
"loss": 1.8169, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 5.459533607681756, |
|
"grad_norm": 1.8388671875, |
|
"learning_rate": 8.538031302551522e-05, |
|
"loss": 1.8642, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 5.486968449931413, |
|
"grad_norm": 2.240234375, |
|
"learning_rate": 8.452706652097186e-05, |
|
"loss": 1.7803, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.51440329218107, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 8.367497258544507e-05, |
|
"loss": 1.7859, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 5.541838134430727, |
|
"grad_norm": 1.880859375, |
|
"learning_rate": 8.282409469087239e-05, |
|
"loss": 1.8381, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 5.569272976680384, |
|
"grad_norm": 1.865234375, |
|
"learning_rate": 8.197449621860943e-05, |
|
"loss": 1.7921, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 5.596707818930041, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 8.112624045470835e-05, |
|
"loss": 1.781, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 5.6241426611796985, |
|
"grad_norm": 1.7802734375, |
|
"learning_rate": 8.027939058520381e-05, |
|
"loss": 1.7988, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 5.651577503429356, |
|
"grad_norm": 1.72265625, |
|
"learning_rate": 7.943400969140635e-05, |
|
"loss": 1.7888, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 5.679012345679013, |
|
"grad_norm": 1.8701171875, |
|
"learning_rate": 7.85901607452034e-05, |
|
"loss": 1.7995, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 5.70644718792867, |
|
"grad_norm": 1.9873046875, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 1.7753, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 5.733882030178327, |
|
"grad_norm": 2.19921875, |
|
"learning_rate": 7.690731000787948e-05, |
|
"loss": 1.7387, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 5.761316872427983, |
|
"grad_norm": 1.85546875, |
|
"learning_rate": 7.606843357124426e-05, |
|
"loss": 1.8478, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.788751714677641, |
|
"grad_norm": 1.8662109375, |
|
"learning_rate": 7.52313397818374e-05, |
|
"loss": 1.8373, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 5.816186556927297, |
|
"grad_norm": 1.8505859375, |
|
"learning_rate": 7.43960909942453e-05, |
|
"loss": 1.8703, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 5.843621399176955, |
|
"grad_norm": 1.591796875, |
|
"learning_rate": 7.356274942562111e-05, |
|
"loss": 1.7647, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 5.871056241426611, |
|
"grad_norm": 1.8681640625, |
|
"learning_rate": 7.273137715105063e-05, |
|
"loss": 1.7957, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 5.8984910836762685, |
|
"grad_norm": 1.7666015625, |
|
"learning_rate": 7.190203609892808e-05, |
|
"loss": 1.7988, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 5.925925925925926, |
|
"grad_norm": 1.6123046875, |
|
"learning_rate": 7.107478804634325e-05, |
|
"loss": 1.7388, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 5.953360768175583, |
|
"grad_norm": 1.7646484375, |
|
"learning_rate": 7.024969461447972e-05, |
|
"loss": 1.7994, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 5.98079561042524, |
|
"grad_norm": 1.96484375, |
|
"learning_rate": 6.942681726402473e-05, |
|
"loss": 1.7937, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 6.008230452674897, |
|
"grad_norm": 2.099609375, |
|
"learning_rate": 6.8606217290591e-05, |
|
"loss": 1.7915, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 6.035665294924554, |
|
"grad_norm": 1.8466796875, |
|
"learning_rate": 6.778795582015097e-05, |
|
"loss": 1.8106, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.063100137174211, |
|
"grad_norm": 1.9228515625, |
|
"learning_rate": 6.697209380448333e-05, |
|
"loss": 1.7836, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 6.090534979423868, |
|
"grad_norm": 2.74609375, |
|
"learning_rate": 6.615869201663296e-05, |
|
"loss": 1.7202, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 6.117969821673525, |
|
"grad_norm": 1.708984375, |
|
"learning_rate": 6.534781104638399e-05, |
|
"loss": 1.7432, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 6.145404663923182, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 6.453951129574644e-05, |
|
"loss": 1.719, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 6.172839506172839, |
|
"grad_norm": 1.7333984375, |
|
"learning_rate": 6.37338529744568e-05, |
|
"loss": 1.778, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 6.2002743484224965, |
|
"grad_norm": 1.7802734375, |
|
"learning_rate": 6.293089609549325e-05, |
|
"loss": 1.7294, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 6.227709190672154, |
|
"grad_norm": 2.220703125, |
|
"learning_rate": 6.213070047060524e-05, |
|
"loss": 1.6875, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 6.255144032921811, |
|
"grad_norm": 1.8193359375, |
|
"learning_rate": 6.133332570585812e-05, |
|
"loss": 1.8336, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 6.282578875171468, |
|
"grad_norm": 1.9384765625, |
|
"learning_rate": 6.05388311971932e-05, |
|
"loss": 1.7279, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 6.310013717421125, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.9747276126003257e-05, |
|
"loss": 1.753, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.337448559670782, |
|
"grad_norm": 1.7490234375, |
|
"learning_rate": 5.8958719454724346e-05, |
|
"loss": 1.7593, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 6.364883401920439, |
|
"grad_norm": 1.9990234375, |
|
"learning_rate": 5.817321992244351e-05, |
|
"loss": 1.7361, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 6.392318244170096, |
|
"grad_norm": 2.55078125, |
|
"learning_rate": 5.739083604052351e-05, |
|
"loss": 1.7527, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 6.419753086419753, |
|
"grad_norm": 1.8408203125, |
|
"learning_rate": 5.6611626088244194e-05, |
|
"loss": 1.7893, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 6.44718792866941, |
|
"grad_norm": 1.8388671875, |
|
"learning_rate": 5.583564810846157e-05, |
|
"loss": 1.744, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 6.474622770919067, |
|
"grad_norm": 1.947265625, |
|
"learning_rate": 5.506295990328385e-05, |
|
"loss": 1.7609, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 6.502057613168724, |
|
"grad_norm": 1.947265625, |
|
"learning_rate": 5.429361902976624e-05, |
|
"loss": 1.7273, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 6.5294924554183815, |
|
"grad_norm": 1.9033203125, |
|
"learning_rate": 5.3527682795623146e-05, |
|
"loss": 1.7782, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 6.556927297668039, |
|
"grad_norm": 1.75390625, |
|
"learning_rate": 5.276520825495963e-05, |
|
"loss": 1.7612, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 6.584362139917696, |
|
"grad_norm": 1.8037109375, |
|
"learning_rate": 5.200625220402139e-05, |
|
"loss": 1.7672, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.611796982167353, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 5.1250871176964036e-05, |
|
"loss": 1.7832, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 6.63923182441701, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 5.0499121441641864e-05, |
|
"loss": 1.7438, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 1.865234375, |
|
"learning_rate": 4.975105899541671e-05, |
|
"loss": 1.7172, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 6.694101508916324, |
|
"grad_norm": 1.8056640625, |
|
"learning_rate": 4.900673956098644e-05, |
|
"loss": 1.7476, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 6.721536351165981, |
|
"grad_norm": 1.75390625, |
|
"learning_rate": 4.826621858223431e-05, |
|
"loss": 1.7547, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 6.748971193415638, |
|
"grad_norm": 1.841796875, |
|
"learning_rate": 4.75295512200992e-05, |
|
"loss": 1.7363, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 6.776406035665294, |
|
"grad_norm": 1.8349609375, |
|
"learning_rate": 4.6796792348466356e-05, |
|
"loss": 1.7725, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 6.803840877914952, |
|
"grad_norm": 1.7373046875, |
|
"learning_rate": 4.606799655008009e-05, |
|
"loss": 1.6962, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 6.831275720164609, |
|
"grad_norm": 2.0234375, |
|
"learning_rate": 4.5343218112477904e-05, |
|
"loss": 1.6918, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 6.858710562414267, |
|
"grad_norm": 1.73828125, |
|
"learning_rate": 4.462251102394669e-05, |
|
"loss": 1.7336, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.886145404663923, |
|
"grad_norm": 1.974609375, |
|
"learning_rate": 4.3905928969501056e-05, |
|
"loss": 1.7502, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 6.91358024691358, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 4.3193525326884435e-05, |
|
"loss": 1.8341, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 6.941015089163237, |
|
"grad_norm": 1.7939453125, |
|
"learning_rate": 4.248535316259305e-05, |
|
"loss": 1.7671, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 6.968449931412894, |
|
"grad_norm": 1.5888671875, |
|
"learning_rate": 4.1781465227922957e-05, |
|
"loss": 1.7457, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 6.995884773662551, |
|
"grad_norm": 1.7177734375, |
|
"learning_rate": 4.108191395504064e-05, |
|
"loss": 1.7736, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 7.023319615912208, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.038675145307747e-05, |
|
"loss": 1.6925, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 7.050754458161865, |
|
"grad_norm": 2.091796875, |
|
"learning_rate": 3.9696029504247956e-05, |
|
"loss": 1.7176, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 7.078189300411522, |
|
"grad_norm": 1.919921875, |
|
"learning_rate": 3.900979955999271e-05, |
|
"loss": 1.7825, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 7.1056241426611795, |
|
"grad_norm": 1.943359375, |
|
"learning_rate": 3.832811273714569e-05, |
|
"loss": 1.7257, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 7.133058984910837, |
|
"grad_norm": 2.052734375, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 1.7665, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 7.160493827160494, |
|
"grad_norm": 1.6552734375, |
|
"learning_rate": 3.697857122715865e-05, |
|
"loss": 1.7373, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 7.187928669410151, |
|
"grad_norm": 1.8623046875, |
|
"learning_rate": 3.6310817066511105e-05, |
|
"loss": 1.6965, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 7.215363511659808, |
|
"grad_norm": 1.48828125, |
|
"learning_rate": 3.5647807072768526e-05, |
|
"loss": 1.7009, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 7.242798353909465, |
|
"grad_norm": 1.5126953125, |
|
"learning_rate": 3.498959063312558e-05, |
|
"loss": 1.7086, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 7.270233196159122, |
|
"grad_norm": 1.6748046875, |
|
"learning_rate": 3.4336216777708e-05, |
|
"loss": 1.6847, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 7.297668038408779, |
|
"grad_norm": 1.4794921875, |
|
"learning_rate": 3.36877341759205e-05, |
|
"loss": 1.756, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 7.325102880658436, |
|
"grad_norm": 1.6240234375, |
|
"learning_rate": 3.3044191132821454e-05, |
|
"loss": 1.6819, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 7.352537722908093, |
|
"grad_norm": 1.623046875, |
|
"learning_rate": 3.2405635585524565e-05, |
|
"loss": 1.7719, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 7.37997256515775, |
|
"grad_norm": 1.775390625, |
|
"learning_rate": 3.177211509962826e-05, |
|
"loss": 1.7437, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 7.407407407407407, |
|
"grad_norm": 1.880859375, |
|
"learning_rate": 3.114367686567228e-05, |
|
"loss": 1.702, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 7.4348422496570645, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.052036769562276e-05, |
|
"loss": 1.6124, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 7.462277091906722, |
|
"grad_norm": 1.8525390625, |
|
"learning_rate": 2.9902234019385057e-05, |
|
"loss": 1.6915, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 7.489711934156379, |
|
"grad_norm": 1.998046875, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 1.7637, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 7.517146776406036, |
|
"grad_norm": 1.744140625, |
|
"learning_rate": 2.8681676936940393e-05, |
|
"loss": 1.7212, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 7.544581618655693, |
|
"grad_norm": 1.486328125, |
|
"learning_rate": 2.8079344449257572e-05, |
|
"loss": 1.7415, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 7.57201646090535, |
|
"grad_norm": 2.044921875, |
|
"learning_rate": 2.7482369285662378e-05, |
|
"loss": 1.7262, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 7.599451303155007, |
|
"grad_norm": 1.6982421875, |
|
"learning_rate": 2.6890795914456678e-05, |
|
"loss": 1.6766, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 7.626886145404664, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 2.6304668401566335e-05, |
|
"loss": 1.7522, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 7.654320987654321, |
|
"grad_norm": 1.8017578125, |
|
"learning_rate": 2.572403040725855e-05, |
|
"loss": 1.7095, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 7.681755829903978, |
|
"grad_norm": 2.005859375, |
|
"learning_rate": 2.514892518288988e-05, |
|
"loss": 1.7196, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.709190672153635, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 2.4579395567684283e-05, |
|
"loss": 1.7174, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 7.7366255144032925, |
|
"grad_norm": 2.1015625, |
|
"learning_rate": 2.401548398554213e-05, |
|
"loss": 1.745, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 7.76406035665295, |
|
"grad_norm": 1.7236328125, |
|
"learning_rate": 2.345723244188006e-05, |
|
"loss": 1.7127, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 7.791495198902607, |
|
"grad_norm": 2.4296875, |
|
"learning_rate": 2.290468252050204e-05, |
|
"loss": 1.6945, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 7.818930041152264, |
|
"grad_norm": 1.67578125, |
|
"learning_rate": 2.2357875380501836e-05, |
|
"loss": 1.7206, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 7.84636488340192, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 2.181685175319702e-05, |
|
"loss": 1.683, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 7.873799725651578, |
|
"grad_norm": 1.7236328125, |
|
"learning_rate": 2.1281651939094992e-05, |
|
"loss": 1.7218, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 7.901234567901234, |
|
"grad_norm": 1.82421875, |
|
"learning_rate": 2.0752315804890977e-05, |
|
"loss": 1.7274, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 7.928669410150892, |
|
"grad_norm": 3.5078125, |
|
"learning_rate": 2.0228882780498404e-05, |
|
"loss": 1.6874, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 7.956104252400548, |
|
"grad_norm": 2.08984375, |
|
"learning_rate": 1.971139185611176e-05, |
|
"loss": 1.7064, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 7.983539094650205, |
|
"grad_norm": 2.240234375, |
|
"learning_rate": 1.919988157930236e-05, |
|
"loss": 1.6922, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 8.010973936899862, |
|
"grad_norm": 2.25, |
|
"learning_rate": 1.8694390052146737e-05, |
|
"loss": 1.73, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 8.03840877914952, |
|
"grad_norm": 1.62890625, |
|
"learning_rate": 1.819495492838872e-05, |
|
"loss": 1.6912, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 8.065843621399177, |
|
"grad_norm": 1.755859375, |
|
"learning_rate": 1.7701613410634365e-05, |
|
"loss": 1.6858, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 8.093278463648835, |
|
"grad_norm": 1.5791015625, |
|
"learning_rate": 1.7214402247580918e-05, |
|
"loss": 1.6634, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 8.12071330589849, |
|
"grad_norm": 1.63671875, |
|
"learning_rate": 1.6733357731279377e-05, |
|
"loss": 1.6908, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 8.148148148148149, |
|
"grad_norm": 1.8701171875, |
|
"learning_rate": 1.6258515694431144e-05, |
|
"loss": 1.7138, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 8.175582990397805, |
|
"grad_norm": 1.6357421875, |
|
"learning_rate": 1.5789911507718826e-05, |
|
"loss": 1.7258, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 8.203017832647463, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.5327580077171587e-05, |
|
"loss": 1.7178, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 8.23045267489712, |
|
"grad_norm": 1.787109375, |
|
"learning_rate": 1.4871555841564887e-05, |
|
"loss": 1.6809, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 8.257887517146777, |
|
"grad_norm": 1.7177734375, |
|
"learning_rate": 1.442187276985526e-05, |
|
"loss": 1.6501, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 8.285322359396433, |
|
"grad_norm": 1.822265625, |
|
"learning_rate": 1.3978564358649927e-05, |
|
"loss": 1.7259, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 8.312757201646091, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.3541663629711766e-05, |
|
"loss": 1.752, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 8.340192043895748, |
|
"grad_norm": 1.7744140625, |
|
"learning_rate": 1.311120312749935e-05, |
|
"loss": 1.6563, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 8.367626886145406, |
|
"grad_norm": 1.8427734375, |
|
"learning_rate": 1.2687214916742918e-05, |
|
"loss": 1.7103, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 8.395061728395062, |
|
"grad_norm": 2.611328125, |
|
"learning_rate": 1.2269730580055805e-05, |
|
"loss": 1.6951, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 8.422496570644718, |
|
"grad_norm": 1.763671875, |
|
"learning_rate": 1.185878121558186e-05, |
|
"loss": 1.6747, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 8.449931412894376, |
|
"grad_norm": 1.7568359375, |
|
"learning_rate": 1.1454397434679021e-05, |
|
"loss": 1.7284, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 8.477366255144032, |
|
"grad_norm": 1.8837890625, |
|
"learning_rate": 1.1056609359639025e-05, |
|
"loss": 1.6907, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 8.50480109739369, |
|
"grad_norm": 1.677734375, |
|
"learning_rate": 1.0665446621443708e-05, |
|
"loss": 1.652, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.532235939643346, |
|
"grad_norm": 1.7236328125, |
|
"learning_rate": 1.028093835755769e-05, |
|
"loss": 1.6751, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 8.559670781893004, |
|
"grad_norm": 1.716796875, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 1.6465, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 8.58710562414266, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 9.531999322000885e-06, |
|
"loss": 1.7407, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 8.614540466392318, |
|
"grad_norm": 1.681640625, |
|
"learning_rate": 9.1676243383246e-06, |
|
"loss": 1.7061, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 8.641975308641975, |
|
"grad_norm": 2.072265625, |
|
"learning_rate": 8.810015400790994e-06, |
|
"loss": 1.6604, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 8.669410150891633, |
|
"grad_norm": 1.7685546875, |
|
"learning_rate": 8.45919914746337e-06, |
|
"loss": 1.7301, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 8.696844993141289, |
|
"grad_norm": 2.193359375, |
|
"learning_rate": 8.115201710422282e-06, |
|
"loss": 1.6973, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 8.724279835390947, |
|
"grad_norm": 1.677734375, |
|
"learning_rate": 7.778048713818975e-06, |
|
"loss": 1.7165, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 8.751714677640603, |
|
"grad_norm": 2.1171875, |
|
"learning_rate": 7.447765271966656e-06, |
|
"loss": 1.6841, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 8.779149519890261, |
|
"grad_norm": 1.6279296875, |
|
"learning_rate": 7.124375987469767e-06, |
|
"loss": 1.7142, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.806584362139917, |
|
"grad_norm": 1.93359375, |
|
"learning_rate": 6.80790494939132e-06, |
|
"loss": 1.7619, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 8.834019204389575, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.498375731458528e-06, |
|
"loss": 1.7146, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 8.861454046639231, |
|
"grad_norm": 1.59765625, |
|
"learning_rate": 6.195811390306816e-06, |
|
"loss": 1.6753, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 8.88888888888889, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 5.900234463762366e-06, |
|
"loss": 1.662, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 8.916323731138545, |
|
"grad_norm": 1.70703125, |
|
"learning_rate": 5.611666969163243e-06, |
|
"loss": 1.6781, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 8.943758573388203, |
|
"grad_norm": 1.8642578125, |
|
"learning_rate": 5.3301304017194135e-06, |
|
"loss": 1.6446, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 8.97119341563786, |
|
"grad_norm": 1.96484375, |
|
"learning_rate": 5.055645732911462e-06, |
|
"loss": 1.6632, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 8.998628257887518, |
|
"grad_norm": 2.181640625, |
|
"learning_rate": 4.788233408928589e-06, |
|
"loss": 1.707, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 9.026063100137174, |
|
"grad_norm": 1.75390625, |
|
"learning_rate": 4.527913349145441e-06, |
|
"loss": 1.7361, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 9.053497942386832, |
|
"grad_norm": 2.705078125, |
|
"learning_rate": 4.27470494463843e-06, |
|
"loss": 1.7412, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 9.080932784636488, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.028627056741252e-06, |
|
"loss": 1.6508, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 9.108367626886146, |
|
"grad_norm": 1.7490234375, |
|
"learning_rate": 3.789698015639953e-06, |
|
"loss": 1.7095, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 9.135802469135802, |
|
"grad_norm": 1.7919921875, |
|
"learning_rate": 3.5579356190074907e-06, |
|
"loss": 1.6629, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 9.16323731138546, |
|
"grad_norm": 1.662109375, |
|
"learning_rate": 3.3333571306780497e-06, |
|
"loss": 1.6755, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 9.190672153635116, |
|
"grad_norm": 5.0703125, |
|
"learning_rate": 3.115979279360992e-06, |
|
"loss": 1.6963, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 9.218106995884774, |
|
"grad_norm": 1.662109375, |
|
"learning_rate": 2.905818257394799e-06, |
|
"loss": 1.694, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 9.24554183813443, |
|
"grad_norm": 1.55078125, |
|
"learning_rate": 2.702889719540924e-06, |
|
"loss": 1.6488, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 9.272976680384089, |
|
"grad_norm": 2.107421875, |
|
"learning_rate": 2.5072087818176382e-06, |
|
"loss": 1.6729, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 9.300411522633745, |
|
"grad_norm": 1.5791015625, |
|
"learning_rate": 2.3187900203740844e-06, |
|
"loss": 1.6518, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 9.327846364883403, |
|
"grad_norm": 2.0234375, |
|
"learning_rate": 2.137647470404469e-06, |
|
"loss": 1.7342, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 9.355281207133059, |
|
"grad_norm": 1.9423828125, |
|
"learning_rate": 1.963794625102655e-06, |
|
"loss": 1.7223, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 9.382716049382717, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.797244434656975e-06, |
|
"loss": 1.7224, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 9.410150891632373, |
|
"grad_norm": 1.6806640625, |
|
"learning_rate": 1.6380093052856483e-06, |
|
"loss": 1.6956, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 9.437585733882031, |
|
"grad_norm": 1.693359375, |
|
"learning_rate": 1.48610109831262e-06, |
|
"loss": 1.6918, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 9.465020576131687, |
|
"grad_norm": 1.7080078125, |
|
"learning_rate": 1.341531129284046e-06, |
|
"loss": 1.6376, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 9.492455418381343, |
|
"grad_norm": 2.220703125, |
|
"learning_rate": 1.2043101671253554e-06, |
|
"loss": 1.6831, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 9.519890260631001, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.0744484333391368e-06, |
|
"loss": 1.6559, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 9.547325102880658, |
|
"grad_norm": 1.982421875, |
|
"learning_rate": 9.519556012436815e-07, |
|
"loss": 1.6864, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 9.574759945130316, |
|
"grad_norm": 2.025390625, |
|
"learning_rate": 8.368407952525026e-07, |
|
"loss": 1.6973, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 9.602194787379972, |
|
"grad_norm": 1.6865234375, |
|
"learning_rate": 7.291125901946027e-07, |
|
"loss": 1.7048, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.62962962962963, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 6.287790106757396e-07, |
|
"loss": 1.6939, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 9.657064471879286, |
|
"grad_norm": 1.5322265625, |
|
"learning_rate": 5.358475304807375e-07, |
|
"loss": 1.7541, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 9.684499314128944, |
|
"grad_norm": 2.29296875, |
|
"learning_rate": 4.503250720166774e-07, |
|
"loss": 1.6679, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 9.7119341563786, |
|
"grad_norm": 1.5634765625, |
|
"learning_rate": 3.7221800579735346e-07, |
|
"loss": 1.685, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 9.739368998628258, |
|
"grad_norm": 1.6884765625, |
|
"learning_rate": 3.0153214996866406e-07, |
|
"loss": 1.6628, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 9.766803840877914, |
|
"grad_norm": 1.8408203125, |
|
"learning_rate": 2.382727698752474e-07, |
|
"loss": 1.6632, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 9.794238683127572, |
|
"grad_norm": 1.9072265625, |
|
"learning_rate": 1.824445776682504e-07, |
|
"loss": 1.7231, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 9.821673525377228, |
|
"grad_norm": 1.791015625, |
|
"learning_rate": 1.340517319543877e-07, |
|
"loss": 1.6758, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 9.849108367626886, |
|
"grad_norm": 1.7236328125, |
|
"learning_rate": 9.309783748606693e-08, |
|
"loss": 1.6677, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 9.876543209876543, |
|
"grad_norm": 1.5498046875, |
|
"learning_rate": 5.958594489295921e-08, |
|
"loss": 1.6355, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 9.9039780521262, |
|
"grad_norm": 1.802734375, |
|
"learning_rate": 3.351855045471419e-08, |
|
"loss": 1.6854, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 9.931412894375857, |
|
"grad_norm": 1.9306640625, |
|
"learning_rate": 1.4897595915053242e-08, |
|
"loss": 1.6643, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 9.958847736625515, |
|
"grad_norm": 1.8544921875, |
|
"learning_rate": 3.724468337085174e-09, |
|
"loss": 1.7076, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 9.986282578875171, |
|
"grad_norm": 2.060546875, |
|
"learning_rate": 0.0, |
|
"loss": 1.6734, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 9.986282578875171, |
|
"step": 3640, |
|
"total_flos": 2.39046462799872e+17, |
|
"train_loss": 1.891263527398581, |
|
"train_runtime": 3639.5664, |
|
"train_samples_per_second": 4.006, |
|
"train_steps_per_second": 1.0 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3640, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 2.39046462799872e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|