{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9994747347410442,
  "eval_steps": 500,
  "global_step": 2379,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008404244143292362,
      "grad_norm": 0.7383340001106262,
      "learning_rate": 2e-05,
      "loss": 2.4066,
      "step": 10
    },
    {
      "epoch": 0.016808488286584725,
      "grad_norm": 0.46394309401512146,
      "learning_rate": 4e-05,
      "loss": 2.0375,
      "step": 20
    },
    {
      "epoch": 0.025212732429877087,
      "grad_norm": 0.4739478528499603,
      "learning_rate": 6e-05,
      "loss": 1.5044,
      "step": 30
    },
    {
      "epoch": 0.03361697657316945,
      "grad_norm": 0.20930196344852448,
      "learning_rate": 8e-05,
      "loss": 0.8704,
      "step": 40
    },
    {
      "epoch": 0.04202122071646181,
      "grad_norm": 0.15288038551807404,
      "learning_rate": 0.0001,
      "loss": 0.6533,
      "step": 50
    },
    {
      "epoch": 0.050425464859754174,
      "grad_norm": 0.13073962926864624,
      "learning_rate": 0.00012,
      "loss": 0.586,
      "step": 60
    },
    {
      "epoch": 0.058829709003046536,
      "grad_norm": 0.14555367827415466,
      "learning_rate": 0.00014,
      "loss": 0.5793,
      "step": 70
    },
    {
      "epoch": 0.0672339531463389,
      "grad_norm": 0.12397414445877075,
      "learning_rate": 0.00016,
      "loss": 0.581,
      "step": 80
    },
    {
      "epoch": 0.07563819728963127,
      "grad_norm": 0.13021130859851837,
      "learning_rate": 0.00018,
      "loss": 0.5512,
      "step": 90
    },
    {
      "epoch": 0.08404244143292362,
      "grad_norm": 0.13012883067131042,
      "learning_rate": 0.0002,
      "loss": 0.5403,
      "step": 100
    },
    {
      "epoch": 0.09244668557621599,
      "grad_norm": 0.11942347884178162,
      "learning_rate": 0.00019942313239111625,
      "loss": 0.5247,
      "step": 110
    },
    {
      "epoch": 0.10085092971950835,
      "grad_norm": 0.11690942198038101,
      "learning_rate": 0.0001988462647822325,
      "loss": 0.5417,
      "step": 120
    },
    {
      "epoch": 0.10925517386280072,
      "grad_norm": 0.1355101615190506,
      "learning_rate": 0.00019826939717334873,
      "loss": 0.5273,
      "step": 130
    },
    {
      "epoch": 0.11765941800609307,
      "grad_norm": 0.1345665603876114,
      "learning_rate": 0.00019769252956446497,
      "loss": 0.5243,
      "step": 140
    },
    {
      "epoch": 0.12606366214938544,
      "grad_norm": 0.12515193223953247,
      "learning_rate": 0.0001971156619555812,
      "loss": 0.5344,
      "step": 150
    },
    {
      "epoch": 0.1344679062926778,
      "grad_norm": 0.15686553716659546,
      "learning_rate": 0.00019653879434669745,
      "loss": 0.5118,
      "step": 160
    },
    {
      "epoch": 0.14287215043597015,
      "grad_norm": 0.12068944424390793,
      "learning_rate": 0.0001959619267378137,
      "loss": 0.4979,
      "step": 170
    },
    {
      "epoch": 0.15127639457926254,
      "grad_norm": 0.13319459557533264,
      "learning_rate": 0.00019538505912892993,
      "loss": 0.503,
      "step": 180
    },
    {
      "epoch": 0.1596806387225549,
      "grad_norm": 0.11806949228048325,
      "learning_rate": 0.00019480819152004617,
      "loss": 0.49,
      "step": 190
    },
    {
      "epoch": 0.16808488286584725,
      "grad_norm": 0.12932075560092926,
      "learning_rate": 0.00019423132391116238,
      "loss": 0.514,
      "step": 200
    },
    {
      "epoch": 0.17648912700913963,
      "grad_norm": 0.11743929982185364,
      "learning_rate": 0.00019365445630227862,
      "loss": 0.4788,
      "step": 210
    },
    {
      "epoch": 0.18489337115243198,
      "grad_norm": 0.11788313835859299,
      "learning_rate": 0.00019307758869339486,
      "loss": 0.4891,
      "step": 220
    },
    {
      "epoch": 0.19329761529572434,
      "grad_norm": 0.11414741724729538,
      "learning_rate": 0.0001925007210845111,
      "loss": 0.5033,
      "step": 230
    },
    {
      "epoch": 0.2017018594390167,
      "grad_norm": 0.11419043689966202,
      "learning_rate": 0.00019192385347562737,
      "loss": 0.4844,
      "step": 240
    },
    {
      "epoch": 0.21010610358230908,
      "grad_norm": 0.12788020074367523,
      "learning_rate": 0.0001913469858667436,
      "loss": 0.4697,
      "step": 250
    },
    {
      "epoch": 0.21851034772560143,
      "grad_norm": 0.13661302626132965,
      "learning_rate": 0.00019077011825785982,
      "loss": 0.4627,
      "step": 260
    },
    {
      "epoch": 0.2269145918688938,
      "grad_norm": 0.12041325867176056,
      "learning_rate": 0.00019019325064897606,
      "loss": 0.4964,
      "step": 270
    },
    {
      "epoch": 0.23531883601218614,
      "grad_norm": 0.133742094039917,
      "learning_rate": 0.0001896163830400923,
      "loss": 0.4658,
      "step": 280
    },
    {
      "epoch": 0.24372308015547853,
      "grad_norm": 0.1261977106332779,
      "learning_rate": 0.00018903951543120854,
      "loss": 0.4781,
      "step": 290
    },
    {
      "epoch": 0.2521273242987709,
      "grad_norm": 0.130150705575943,
      "learning_rate": 0.00018846264782232478,
      "loss": 0.4922,
      "step": 300
    },
    {
      "epoch": 0.26053156844206327,
      "grad_norm": 0.13174410164356232,
      "learning_rate": 0.00018788578021344102,
      "loss": 0.4559,
      "step": 310
    },
    {
      "epoch": 0.2689358125853556,
      "grad_norm": 0.1186077669262886,
      "learning_rate": 0.00018730891260455726,
      "loss": 0.4722,
      "step": 320
    },
    {
      "epoch": 0.277340056728648,
      "grad_norm": 0.116569384932518,
      "learning_rate": 0.0001867320449956735,
      "loss": 0.4457,
      "step": 330
    },
    {
      "epoch": 0.2857443008719403,
      "grad_norm": 0.12219471484422684,
      "learning_rate": 0.00018615517738678974,
      "loss": 0.4849,
      "step": 340
    },
    {
      "epoch": 0.2941485450152327,
      "grad_norm": 0.12746909260749817,
      "learning_rate": 0.00018557830977790598,
      "loss": 0.4821,
      "step": 350
    },
    {
      "epoch": 0.30255278915852507,
      "grad_norm": 0.14125944674015045,
      "learning_rate": 0.00018500144216902222,
      "loss": 0.4605,
      "step": 360
    },
    {
      "epoch": 0.3109570333018174,
      "grad_norm": 0.19157269597053528,
      "learning_rate": 0.00018442457456013846,
      "loss": 0.4541,
      "step": 370
    },
    {
      "epoch": 0.3193612774451098,
      "grad_norm": 0.12603330612182617,
      "learning_rate": 0.0001838477069512547,
      "loss": 0.4536,
      "step": 380
    },
    {
      "epoch": 0.32776552158840216,
      "grad_norm": 0.12653909623622894,
      "learning_rate": 0.00018327083934237091,
      "loss": 0.4468,
      "step": 390
    },
    {
      "epoch": 0.3361697657316945,
      "grad_norm": 0.15930472314357758,
      "learning_rate": 0.00018269397173348718,
      "loss": 0.4542,
      "step": 400
    },
    {
      "epoch": 0.3445740098749869,
      "grad_norm": 0.13266988098621368,
      "learning_rate": 0.00018211710412460342,
      "loss": 0.4335,
      "step": 410
    },
    {
      "epoch": 0.35297825401827926,
      "grad_norm": 0.12103667855262756,
      "learning_rate": 0.00018154023651571966,
      "loss": 0.4575,
      "step": 420
    },
    {
      "epoch": 0.3613824981615716,
      "grad_norm": 0.14439740777015686,
      "learning_rate": 0.0001809633689068359,
      "loss": 0.4377,
      "step": 430
    },
    {
      "epoch": 0.36978674230486397,
      "grad_norm": 0.12652407586574554,
      "learning_rate": 0.00018038650129795214,
      "loss": 0.4363,
      "step": 440
    },
    {
      "epoch": 0.3781909864481563,
      "grad_norm": 0.14594405889511108,
      "learning_rate": 0.00017980963368906835,
      "loss": 0.4306,
      "step": 450
    },
    {
      "epoch": 0.3865952305914487,
      "grad_norm": 0.12562687695026398,
      "learning_rate": 0.0001792327660801846,
      "loss": 0.4501,
      "step": 460
    },
    {
      "epoch": 0.39499947473474106,
      "grad_norm": 0.14584492146968842,
      "learning_rate": 0.00017865589847130083,
      "loss": 0.4509,
      "step": 470
    },
    {
      "epoch": 0.4034037188780334,
      "grad_norm": 0.13192500174045563,
      "learning_rate": 0.00017807903086241707,
      "loss": 0.4505,
      "step": 480
    },
    {
      "epoch": 0.4118079630213258,
      "grad_norm": 0.14266645908355713,
      "learning_rate": 0.00017750216325353331,
      "loss": 0.4585,
      "step": 490
    },
    {
      "epoch": 0.42021220716461816,
      "grad_norm": 0.1400412619113922,
      "learning_rate": 0.00017692529564464958,
      "loss": 0.4365,
      "step": 500
    },
    {
      "epoch": 0.4286164513079105,
      "grad_norm": 0.14728468656539917,
      "learning_rate": 0.0001763484280357658,
      "loss": 0.4303,
      "step": 510
    },
    {
      "epoch": 0.43702069545120287,
      "grad_norm": 0.15791365504264832,
      "learning_rate": 0.00017577156042688203,
      "loss": 0.4407,
      "step": 520
    },
    {
      "epoch": 0.4454249395944952,
      "grad_norm": 0.15447258949279785,
      "learning_rate": 0.00017519469281799827,
      "loss": 0.4365,
      "step": 530
    },
    {
      "epoch": 0.4538291837377876,
      "grad_norm": 0.1518252044916153,
      "learning_rate": 0.00017461782520911451,
      "loss": 0.4305,
      "step": 540
    },
    {
      "epoch": 0.46223342788107996,
      "grad_norm": 0.1154065877199173,
      "learning_rate": 0.00017404095760023075,
      "loss": 0.4212,
      "step": 550
    },
    {
      "epoch": 0.4706376720243723,
      "grad_norm": 0.12900012731552124,
      "learning_rate": 0.000173464089991347,
      "loss": 0.4277,
      "step": 560
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 0.1349458247423172,
      "learning_rate": 0.00017288722238246323,
      "loss": 0.4051,
      "step": 570
    },
    {
      "epoch": 0.48744616031095706,
      "grad_norm": 0.16337165236473083,
      "learning_rate": 0.00017231035477357947,
      "loss": 0.407,
      "step": 580
    },
    {
      "epoch": 0.4958504044542494,
      "grad_norm": 0.13420593738555908,
      "learning_rate": 0.0001717334871646957,
      "loss": 0.4138,
      "step": 590
    },
    {
      "epoch": 0.5042546485975418,
      "grad_norm": 0.13840581476688385,
      "learning_rate": 0.00017115661955581195,
      "loss": 0.4099,
      "step": 600
    },
    {
      "epoch": 0.5126588927408341,
      "grad_norm": 0.1378021389245987,
      "learning_rate": 0.0001705797519469282,
      "loss": 0.4254,
      "step": 610
    },
    {
      "epoch": 0.5210631368841265,
      "grad_norm": 0.1607150137424469,
      "learning_rate": 0.00017000288433804443,
      "loss": 0.4353,
      "step": 620
    },
    {
      "epoch": 0.5294673810274189,
      "grad_norm": 0.13462169468402863,
      "learning_rate": 0.00016942601672916067,
      "loss": 0.4267,
      "step": 630
    },
    {
      "epoch": 0.5378716251707112,
      "grad_norm": 0.14311543107032776,
      "learning_rate": 0.00016884914912027689,
      "loss": 0.4301,
      "step": 640
    },
    {
      "epoch": 0.5462758693140036,
      "grad_norm": 0.15559442341327667,
      "learning_rate": 0.00016827228151139313,
      "loss": 0.4102,
      "step": 650
    },
    {
      "epoch": 0.554680113457296,
      "grad_norm": 0.15557149052619934,
      "learning_rate": 0.00016769541390250937,
      "loss": 0.4136,
      "step": 660
    },
    {
      "epoch": 0.5630843576005883,
      "grad_norm": 0.135511115193367,
      "learning_rate": 0.00016711854629362563,
      "loss": 0.4153,
      "step": 670
    },
    {
      "epoch": 0.5714886017438806,
      "grad_norm": 0.13760776817798615,
      "learning_rate": 0.00016654167868474187,
      "loss": 0.4145,
      "step": 680
    },
    {
      "epoch": 0.579892845887173,
      "grad_norm": 0.14971590042114258,
      "learning_rate": 0.0001659648110758581,
      "loss": 0.3875,
      "step": 690
    },
    {
      "epoch": 0.5882970900304654,
      "grad_norm": 0.16005663573741913,
      "learning_rate": 0.00016538794346697433,
      "loss": 0.3938,
      "step": 700
    },
    {
      "epoch": 0.5967013341737577,
      "grad_norm": 0.1625218689441681,
      "learning_rate": 0.00016481107585809057,
      "loss": 0.3871,
      "step": 710
    },
    {
      "epoch": 0.6051055783170501,
      "grad_norm": 0.17047689855098724,
      "learning_rate": 0.0001642342082492068,
      "loss": 0.412,
      "step": 720
    },
    {
      "epoch": 0.6135098224603425,
      "grad_norm": 0.13825903832912445,
      "learning_rate": 0.00016365734064032305,
      "loss": 0.3948,
      "step": 730
    },
    {
      "epoch": 0.6219140666036348,
      "grad_norm": 0.14830929040908813,
      "learning_rate": 0.00016308047303143929,
      "loss": 0.3927,
      "step": 740
    },
    {
      "epoch": 0.6303183107469272,
      "grad_norm": 0.13950933516025543,
      "learning_rate": 0.00016250360542255553,
      "loss": 0.4051,
      "step": 750
    },
    {
      "epoch": 0.6387225548902196,
      "grad_norm": 0.15511371195316315,
      "learning_rate": 0.0001619267378136718,
      "loss": 0.4041,
      "step": 760
    },
    {
      "epoch": 0.6471267990335119,
      "grad_norm": 0.14828190207481384,
      "learning_rate": 0.000161349870204788,
      "loss": 0.3824,
      "step": 770
    },
    {
      "epoch": 0.6555310431768043,
      "grad_norm": 0.144051194190979,
      "learning_rate": 0.00016077300259590425,
      "loss": 0.3829,
      "step": 780
    },
    {
      "epoch": 0.6639352873200967,
      "grad_norm": 0.14780694246292114,
      "learning_rate": 0.00016019613498702049,
      "loss": 0.3814,
      "step": 790
    },
    {
      "epoch": 0.672339531463389,
      "grad_norm": 0.15042325854301453,
      "learning_rate": 0.00015961926737813673,
      "loss": 0.3962,
      "step": 800
    },
    {
      "epoch": 0.6807437756066814,
      "grad_norm": 0.16325107216835022,
      "learning_rate": 0.00015904239976925297,
      "loss": 0.3801,
      "step": 810
    },
    {
      "epoch": 0.6891480197499738,
      "grad_norm": 0.14843328297138214,
      "learning_rate": 0.0001584655321603692,
      "loss": 0.4082,
      "step": 820
    },
    {
      "epoch": 0.6975522638932661,
      "grad_norm": 0.16731064021587372,
      "learning_rate": 0.00015788866455148545,
      "loss": 0.4192,
      "step": 830
    },
    {
      "epoch": 0.7059565080365585,
      "grad_norm": 0.18703435361385345,
      "learning_rate": 0.00015731179694260169,
      "loss": 0.4009,
      "step": 840
    },
    {
      "epoch": 0.7143607521798508,
      "grad_norm": 0.13935630023479462,
      "learning_rate": 0.00015673492933371793,
      "loss": 0.3618,
      "step": 850
    },
    {
      "epoch": 0.7227649963231432,
      "grad_norm": 0.13263636827468872,
      "learning_rate": 0.00015615806172483417,
      "loss": 0.3963,
      "step": 860
    },
    {
      "epoch": 0.7311692404664355,
      "grad_norm": 0.14940643310546875,
      "learning_rate": 0.0001555811941159504,
      "loss": 0.3585,
      "step": 870
    },
    {
      "epoch": 0.7395734846097279,
      "grad_norm": 0.14807912707328796,
      "learning_rate": 0.00015500432650706665,
      "loss": 0.3748,
      "step": 880
    },
    {
      "epoch": 0.7479777287530203,
      "grad_norm": 0.15254080295562744,
      "learning_rate": 0.00015442745889818286,
      "loss": 0.3718,
      "step": 890
    },
    {
      "epoch": 0.7563819728963126,
      "grad_norm": 0.16590768098831177,
      "learning_rate": 0.0001538505912892991,
      "loss": 0.386,
      "step": 900
    },
    {
      "epoch": 0.764786217039605,
      "grad_norm": 0.15733902156352997,
      "learning_rate": 0.00015327372368041534,
      "loss": 0.3756,
      "step": 910
    },
    {
      "epoch": 0.7731904611828974,
      "grad_norm": 0.13757385313510895,
      "learning_rate": 0.00015269685607153158,
      "loss": 0.3843,
      "step": 920
    },
    {
      "epoch": 0.7815947053261897,
      "grad_norm": 0.14952607452869415,
      "learning_rate": 0.00015211998846264784,
      "loss": 0.3634,
      "step": 930
    },
    {
      "epoch": 0.7899989494694821,
      "grad_norm": 0.1516282558441162,
      "learning_rate": 0.00015154312085376408,
      "loss": 0.3798,
      "step": 940
    },
    {
      "epoch": 0.7984031936127745,
      "grad_norm": 0.17785628139972687,
      "learning_rate": 0.00015096625324488032,
      "loss": 0.3681,
      "step": 950
    },
    {
      "epoch": 0.8068074377560668,
      "grad_norm": 0.171351820230484,
      "learning_rate": 0.00015038938563599654,
      "loss": 0.3686,
      "step": 960
    },
    {
      "epoch": 0.8152116818993592,
      "grad_norm": 0.1742231398820877,
      "learning_rate": 0.00014981251802711278,
      "loss": 0.3792,
      "step": 970
    },
    {
      "epoch": 0.8236159260426515,
      "grad_norm": 0.16650599241256714,
      "learning_rate": 0.00014923565041822902,
      "loss": 0.3577,
      "step": 980
    },
    {
      "epoch": 0.8320201701859439,
      "grad_norm": 0.1497887670993805,
      "learning_rate": 0.00014865878280934526,
      "loss": 0.3553,
      "step": 990
    },
    {
      "epoch": 0.8404244143292363,
      "grad_norm": 0.14781557023525238,
      "learning_rate": 0.0001480819152004615,
      "loss": 0.3538,
      "step": 1000
    },
    {
      "epoch": 0.8488286584725286,
      "grad_norm": 0.15724751353263855,
      "learning_rate": 0.00014750504759157774,
      "loss": 0.3597,
      "step": 1010
    },
    {
      "epoch": 0.857232902615821,
      "grad_norm": 0.18635571002960205,
      "learning_rate": 0.00014692817998269398,
      "loss": 0.3615,
      "step": 1020
    },
    {
      "epoch": 0.8656371467591134,
      "grad_norm": 0.17742526531219482,
      "learning_rate": 0.00014635131237381022,
      "loss": 0.348,
      "step": 1030
    },
    {
      "epoch": 0.8740413909024057,
      "grad_norm": 0.20535768568515778,
      "learning_rate": 0.00014577444476492646,
      "loss": 0.3343,
      "step": 1040
    },
    {
      "epoch": 0.8824456350456981,
      "grad_norm": 0.18968522548675537,
      "learning_rate": 0.0001451975771560427,
      "loss": 0.3615,
      "step": 1050
    },
    {
      "epoch": 0.8908498791889904,
      "grad_norm": 0.1528492122888565,
      "learning_rate": 0.00014462070954715894,
      "loss": 0.3786,
      "step": 1060
    },
    {
      "epoch": 0.8992541233322828,
      "grad_norm": 0.15841075778007507,
      "learning_rate": 0.00014404384193827518,
      "loss": 0.3761,
      "step": 1070
    },
    {
      "epoch": 0.9076583674755752,
      "grad_norm": 0.15167982876300812,
      "learning_rate": 0.0001434669743293914,
      "loss": 0.3528,
      "step": 1080
    },
    {
      "epoch": 0.9160626116188675,
      "grad_norm": 0.14096671342849731,
      "learning_rate": 0.00014289010672050766,
      "loss": 0.371,
      "step": 1090
    },
    {
      "epoch": 0.9244668557621599,
      "grad_norm": 0.1579194813966751,
      "learning_rate": 0.0001423132391116239,
      "loss": 0.3491,
      "step": 1100
    },
    {
      "epoch": 0.9328710999054523,
      "grad_norm": 0.16789057850837708,
      "learning_rate": 0.00014173637150274014,
      "loss": 0.3536,
      "step": 1110
    },
    {
      "epoch": 0.9412753440487446,
      "grad_norm": 0.13980717957019806,
      "learning_rate": 0.00014115950389385638,
      "loss": 0.3423,
      "step": 1120
    },
    {
      "epoch": 0.949679588192037,
      "grad_norm": 0.19879643619060516,
      "learning_rate": 0.00014058263628497262,
      "loss": 0.3285,
      "step": 1130
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 0.16574440896511078,
      "learning_rate": 0.00014000576867608886,
      "loss": 0.3568,
      "step": 1140
    },
    {
      "epoch": 0.9664880764786217,
      "grad_norm": 0.15376180410385132,
      "learning_rate": 0.00013942890106720507,
      "loss": 0.3558,
      "step": 1150
    },
    {
      "epoch": 0.9748923206219141,
      "grad_norm": 0.17232170701026917,
      "learning_rate": 0.0001388520334583213,
      "loss": 0.342,
      "step": 1160
    },
    {
      "epoch": 0.9832965647652064,
      "grad_norm": 0.1959993690252304,
      "learning_rate": 0.00013827516584943755,
      "loss": 0.3458,
      "step": 1170
    },
    {
      "epoch": 0.9917008089084988,
      "grad_norm": 0.14029347896575928,
      "learning_rate": 0.0001376982982405538,
      "loss": 0.3297,
      "step": 1180
    },
    {
      "epoch": 1.0002101061035824,
      "grad_norm": 0.20758652687072754,
      "learning_rate": 0.00013712143063167006,
      "loss": 0.3642,
      "step": 1190
    },
    {
      "epoch": 1.0086143502468747,
      "grad_norm": 0.15599438548088074,
      "learning_rate": 0.0001365445630227863,
      "loss": 0.3004,
      "step": 1200
    },
    {
      "epoch": 1.017018594390167,
      "grad_norm": 0.16680683195590973,
      "learning_rate": 0.0001359676954139025,
      "loss": 0.2915,
      "step": 1210
    },
    {
      "epoch": 1.0254228385334594,
      "grad_norm": 0.1668105274438858,
      "learning_rate": 0.00013539082780501875,
      "loss": 0.2963,
      "step": 1220
    },
    {
      "epoch": 1.0338270826767517,
      "grad_norm": 0.16461539268493652,
      "learning_rate": 0.000134813960196135,
      "loss": 0.3041,
      "step": 1230
    },
    {
      "epoch": 1.042231326820044,
      "grad_norm": 0.18869394063949585,
      "learning_rate": 0.00013423709258725123,
      "loss": 0.3046,
      "step": 1240
    },
    {
      "epoch": 1.0506355709633364,
      "grad_norm": 0.16899700462818146,
      "learning_rate": 0.00013366022497836747,
      "loss": 0.2921,
      "step": 1250
    },
    {
      "epoch": 1.059039815106629,
      "grad_norm": 0.1905297338962555,
      "learning_rate": 0.0001330833573694837,
      "loss": 0.2879,
      "step": 1260
    },
    {
      "epoch": 1.0674440592499213,
      "grad_norm": 0.17273731529712677,
      "learning_rate": 0.00013250648976059995,
      "loss": 0.3038,
      "step": 1270
    },
    {
      "epoch": 1.0758483033932136,
      "grad_norm": 0.1947745531797409,
      "learning_rate": 0.0001319296221517162,
      "loss": 0.3029,
      "step": 1280
    },
    {
      "epoch": 1.084252547536506,
      "grad_norm": 0.1741725355386734,
      "learning_rate": 0.00013135275454283243,
      "loss": 0.3073,
      "step": 1290
    },
    {
      "epoch": 1.0926567916797982,
      "grad_norm": 0.18244194984436035,
      "learning_rate": 0.00013077588693394867,
      "loss": 0.287,
      "step": 1300
    },
    {
      "epoch": 1.1010610358230906,
      "grad_norm": 0.18360966444015503,
      "learning_rate": 0.0001301990193250649,
      "loss": 0.307,
      "step": 1310
    },
    {
      "epoch": 1.1094652799663831,
      "grad_norm": 0.16066686809062958,
      "learning_rate": 0.00012962215171618115,
      "loss": 0.2712,
      "step": 1320
    },
    {
      "epoch": 1.1178695241096754,
      "grad_norm": 0.16239213943481445,
      "learning_rate": 0.00012904528410729736,
      "loss": 0.2857,
      "step": 1330
    },
    {
      "epoch": 1.1262737682529678,
      "grad_norm": 0.16966617107391357,
      "learning_rate": 0.0001284684164984136,
      "loss": 0.3087,
      "step": 1340
    },
    {
      "epoch": 1.13467801239626,
      "grad_norm": 0.16753819584846497,
      "learning_rate": 0.00012789154888952984,
      "loss": 0.2852,
      "step": 1350
    },
    {
      "epoch": 1.1430822565395524,
      "grad_norm": 0.19184084236621857,
      "learning_rate": 0.0001273146812806461,
      "loss": 0.3138,
      "step": 1360
    },
    {
      "epoch": 1.1514865006828447,
      "grad_norm": 0.15949766337871552,
      "learning_rate": 0.00012673781367176235,
      "loss": 0.2812,
      "step": 1370
    },
    {
      "epoch": 1.159890744826137,
      "grad_norm": 0.16187496483325958,
      "learning_rate": 0.0001261609460628786,
      "loss": 0.2841,
      "step": 1380
    },
    {
      "epoch": 1.1682949889694296,
      "grad_norm": 0.1778268665075302,
      "learning_rate": 0.00012558407845399483,
      "loss": 0.3181,
      "step": 1390
    },
    {
      "epoch": 1.176699233112722,
      "grad_norm": 0.17179737985134125,
      "learning_rate": 0.00012500721084511104,
      "loss": 0.2904,
      "step": 1400
    },
    {
      "epoch": 1.1851034772560143,
      "grad_norm": 0.16989010572433472,
      "learning_rate": 0.00012443034323622728,
      "loss": 0.2856,
      "step": 1410
    },
    {
      "epoch": 1.1935077213993066,
      "grad_norm": 0.21040703356266022,
      "learning_rate": 0.00012385347562734352,
      "loss": 0.2743,
      "step": 1420
    },
    {
      "epoch": 1.201911965542599,
      "grad_norm": 0.19255656003952026,
      "learning_rate": 0.00012327660801845976,
      "loss": 0.316,
      "step": 1430
    },
    {
      "epoch": 1.2103162096858915,
      "grad_norm": 0.16303245723247528,
      "learning_rate": 0.000122699740409576,
      "loss": 0.2671,
      "step": 1440
    },
    {
      "epoch": 1.2187204538291838,
      "grad_norm": 0.21385671198368073,
      "learning_rate": 0.00012212287280069227,
      "loss": 0.2865,
      "step": 1450
    },
    {
      "epoch": 1.2271246979724761,
      "grad_norm": 0.18770861625671387,
      "learning_rate": 0.00012154600519180848,
      "loss": 0.2795,
      "step": 1460
    },
    {
      "epoch": 1.2355289421157685,
      "grad_norm": 0.20827870070934296,
      "learning_rate": 0.00012096913758292472,
      "loss": 0.2769,
      "step": 1470
    },
    {
      "epoch": 1.2439331862590608,
      "grad_norm": 0.1704486757516861,
      "learning_rate": 0.00012039226997404096,
      "loss": 0.2993,
      "step": 1480
    },
    {
      "epoch": 1.2523374304023531,
      "grad_norm": 0.21233461797237396,
      "learning_rate": 0.0001198154023651572,
      "loss": 0.2912,
      "step": 1490
    },
    {
      "epoch": 1.2607416745456455,
      "grad_norm": 0.1879620999097824,
      "learning_rate": 0.00011923853475627344,
      "loss": 0.2885,
      "step": 1500
    },
    {
      "epoch": 1.2691459186889378,
      "grad_norm": 0.14288674294948578,
      "learning_rate": 0.00011866166714738968,
      "loss": 0.2794,
      "step": 1510
    },
    {
      "epoch": 1.2775501628322303,
      "grad_norm": 0.1654644012451172,
      "learning_rate": 0.00011808479953850591,
      "loss": 0.2762,
      "step": 1520
    },
    {
      "epoch": 1.2859544069755227,
      "grad_norm": 0.15648572146892548,
      "learning_rate": 0.00011750793192962215,
      "loss": 0.2853,
      "step": 1530
    },
    {
      "epoch": 1.294358651118815,
      "grad_norm": 0.14321617782115936,
      "learning_rate": 0.00011693106432073839,
      "loss": 0.2949,
      "step": 1540
    },
    {
      "epoch": 1.3027628952621073,
      "grad_norm": 0.18823479115962982,
      "learning_rate": 0.00011635419671185464,
      "loss": 0.2734,
      "step": 1550
    },
    {
      "epoch": 1.3111671394053999,
      "grad_norm": 0.1524640917778015,
      "learning_rate": 0.00011577732910297088,
      "loss": 0.2668,
      "step": 1560
    },
    {
      "epoch": 1.3195713835486922,
      "grad_norm": 0.1731933057308197,
      "learning_rate": 0.00011520046149408712,
      "loss": 0.2815,
      "step": 1570
    },
    {
      "epoch": 1.3279756276919845,
      "grad_norm": 0.19858598709106445,
      "learning_rate": 0.00011462359388520336,
      "loss": 0.2863,
      "step": 1580
    },
    {
      "epoch": 1.3363798718352768,
      "grad_norm": 0.20350554585456848,
      "learning_rate": 0.00011404672627631959,
      "loss": 0.2974,
      "step": 1590
    },
    {
      "epoch": 1.3447841159785692,
      "grad_norm": 0.16735605895519257,
      "learning_rate": 0.00011346985866743583,
      "loss": 0.2742,
      "step": 1600
    },
    {
      "epoch": 1.3531883601218615,
      "grad_norm": 0.18708328902721405,
      "learning_rate": 0.00011289299105855207,
      "loss": 0.2877,
      "step": 1610
    },
    {
      "epoch": 1.3615926042651538,
      "grad_norm": 0.19334456324577332,
      "learning_rate": 0.00011231612344966831,
      "loss": 0.2735,
      "step": 1620
    },
    {
      "epoch": 1.3699968484084462,
      "grad_norm": 0.20367129147052765,
      "learning_rate": 0.00011173925584078455,
      "loss": 0.2801,
      "step": 1630
    },
    {
      "epoch": 1.3784010925517387,
      "grad_norm": 0.18539854884147644,
      "learning_rate": 0.00011116238823190079,
      "loss": 0.2842,
      "step": 1640
    },
    {
      "epoch": 1.386805336695031,
      "grad_norm": 0.2150140106678009,
      "learning_rate": 0.00011058552062301701,
      "loss": 0.2611,
      "step": 1650
    },
    {
      "epoch": 1.3952095808383234,
      "grad_norm": 0.162113755941391,
      "learning_rate": 0.00011000865301413325,
      "loss": 0.289,
      "step": 1660
    },
    {
      "epoch": 1.4036138249816157,
      "grad_norm": 0.18180853128433228,
      "learning_rate": 0.0001094317854052495,
      "loss": 0.2808,
      "step": 1670
    },
    {
      "epoch": 1.412018069124908,
      "grad_norm": 0.17916476726531982,
      "learning_rate": 0.00010885491779636575,
      "loss": 0.2912,
      "step": 1680
    },
    {
      "epoch": 1.4204223132682006,
      "grad_norm": 0.22721944749355316,
      "learning_rate": 0.00010827805018748199,
      "loss": 0.2611,
      "step": 1690
    },
    {
      "epoch": 1.428826557411493,
      "grad_norm": 0.16184848546981812,
      "learning_rate": 0.00010770118257859823,
      "loss": 0.2722,
      "step": 1700
    },
    {
      "epoch": 1.4372308015547852,
      "grad_norm": 0.19588448107242584,
      "learning_rate": 0.00010712431496971444,
      "loss": 0.2817,
      "step": 1710
    },
    {
      "epoch": 1.4456350456980775,
      "grad_norm": 0.1870766133069992,
      "learning_rate": 0.0001065474473608307,
      "loss": 0.2835,
      "step": 1720
    },
    {
      "epoch": 1.4540392898413699,
      "grad_norm": 0.1768248826265335,
      "learning_rate": 0.00010597057975194693,
      "loss": 0.2643,
      "step": 1730
    },
    {
      "epoch": 1.4624435339846622,
      "grad_norm": 0.1726955771446228,
      "learning_rate": 0.00010539371214306317,
      "loss": 0.2674,
      "step": 1740
    },
    {
      "epoch": 1.4708477781279545,
      "grad_norm": 0.1709883064031601,
      "learning_rate": 0.00010481684453417941,
      "loss": 0.262,
      "step": 1750
    },
    {
      "epoch": 1.4792520222712469,
      "grad_norm": 0.2008083164691925,
      "learning_rate": 0.00010423997692529565,
      "loss": 0.2634,
      "step": 1760
    },
    {
      "epoch": 1.4876562664145394,
      "grad_norm": 0.17773209512233734,
      "learning_rate": 0.0001036631093164119,
      "loss": 0.2805,
      "step": 1770
    },
    {
      "epoch": 1.4960605105578317,
      "grad_norm": 0.18000538647174835,
      "learning_rate": 0.00010308624170752812,
      "loss": 0.2443,
      "step": 1780
    },
    {
      "epoch": 1.504464754701124,
      "grad_norm": 0.2176659256219864,
      "learning_rate": 0.00010250937409864436,
      "loss": 0.2594,
      "step": 1790
    },
    {
      "epoch": 1.5128689988444164,
      "grad_norm": 0.15863171219825745,
      "learning_rate": 0.0001019325064897606,
      "loss": 0.2751,
      "step": 1800
    },
    {
      "epoch": 1.521273242987709,
      "grad_norm": 0.19906319677829742,
      "learning_rate": 0.00010135563888087685,
      "loss": 0.2865,
      "step": 1810
    },
    {
      "epoch": 1.5296774871310013,
      "grad_norm": 0.21247649192810059,
      "learning_rate": 0.00010077877127199309,
      "loss": 0.2892,
      "step": 1820
    },
    {
      "epoch": 1.5380817312742936,
      "grad_norm": 0.21099700033664703,
      "learning_rate": 0.00010020190366310933,
      "loss": 0.3008,
      "step": 1830
    },
    {
      "epoch": 1.546485975417586,
      "grad_norm": 0.15469135344028473,
      "learning_rate": 9.962503605422556e-05,
      "loss": 0.2672,
      "step": 1840
    },
    {
      "epoch": 1.5548902195608783,
      "grad_norm": 0.16477440297603607,
      "learning_rate": 9.90481684453418e-05,
      "loss": 0.2799,
      "step": 1850
    },
    {
      "epoch": 1.5632944637041706,
      "grad_norm": 0.17361459136009216,
      "learning_rate": 9.847130083645804e-05,
      "loss": 0.2756,
      "step": 1860
    },
    {
      "epoch": 1.571698707847463,
      "grad_norm": 0.15138483047485352,
      "learning_rate": 9.789443322757428e-05,
      "loss": 0.2785,
      "step": 1870
    },
    {
      "epoch": 1.5801029519907552,
      "grad_norm": 0.16653598845005035,
      "learning_rate": 9.731756561869052e-05,
      "loss": 0.2814,
      "step": 1880
    },
    {
      "epoch": 1.5885071961340476,
      "grad_norm": 0.16785801947116852,
      "learning_rate": 9.674069800980675e-05,
      "loss": 0.2752,
      "step": 1890
    },
    {
      "epoch": 1.59691144027734,
      "grad_norm": 0.21643054485321045,
      "learning_rate": 9.6163830400923e-05,
      "loss": 0.2623,
      "step": 1900
    },
    {
      "epoch": 1.6053156844206324,
      "grad_norm": 0.15368995070457458,
      "learning_rate": 9.558696279203924e-05,
      "loss": 0.2722,
      "step": 1910
    },
    {
      "epoch": 1.6137199285639248,
      "grad_norm": 0.21962004899978638,
      "learning_rate": 9.501009518315547e-05,
      "loss": 0.2563,
      "step": 1920
    },
    {
      "epoch": 1.622124172707217,
      "grad_norm": 0.14919191598892212,
      "learning_rate": 9.44332275742717e-05,
      "loss": 0.2502,
      "step": 1930
    },
    {
      "epoch": 1.6305284168505096,
      "grad_norm": 0.2036961317062378,
      "learning_rate": 9.385635996538795e-05,
      "loss": 0.2539,
      "step": 1940
    },
    {
      "epoch": 1.638932660993802,
      "grad_norm": 0.19002236425876617,
      "learning_rate": 9.327949235650419e-05,
      "loss": 0.2464,
      "step": 1950
    },
    {
      "epoch": 1.6473369051370943,
      "grad_norm": 0.16677500307559967,
      "learning_rate": 9.270262474762043e-05,
      "loss": 0.2684,
      "step": 1960
    },
    {
      "epoch": 1.6557411492803866,
      "grad_norm": 0.15206314623355865,
      "learning_rate": 9.212575713873667e-05,
      "loss": 0.242,
      "step": 1970
    },
    {
      "epoch": 1.664145393423679,
      "grad_norm": 0.17641034722328186,
      "learning_rate": 9.15488895298529e-05,
      "loss": 0.2604,
      "step": 1980
    },
    {
      "epoch": 1.6725496375669713,
      "grad_norm": 0.17574937641620636,
      "learning_rate": 9.097202192096915e-05,
      "loss": 0.2547,
      "step": 1990
    },
    {
      "epoch": 1.6809538817102636,
      "grad_norm": 0.16344806551933289,
      "learning_rate": 9.039515431208539e-05,
      "loss": 0.2681,
      "step": 2000
    },
    {
      "epoch": 1.689358125853556,
      "grad_norm": 0.18498322367668152,
      "learning_rate": 8.981828670320161e-05,
      "loss": 0.2713,
      "step": 2010
    },
    {
      "epoch": 1.6977623699968483,
      "grad_norm": 0.14767137169837952,
      "learning_rate": 8.924141909431785e-05,
      "loss": 0.2604,
      "step": 2020
    },
    {
      "epoch": 1.7061666141401408,
      "grad_norm": 0.1902410387992859,
      "learning_rate": 8.86645514854341e-05,
      "loss": 0.2516,
      "step": 2030
    },
    {
      "epoch": 1.7145708582834331,
      "grad_norm": 0.1728687733411789,
      "learning_rate": 8.808768387655033e-05,
      "loss": 0.2711,
      "step": 2040
    },
    {
      "epoch": 1.7229751024267255,
      "grad_norm": 0.1836615651845932,
      "learning_rate": 8.751081626766657e-05,
      "loss": 0.2717,
      "step": 2050
    },
    {
      "epoch": 1.731379346570018,
      "grad_norm": 0.1553170531988144,
      "learning_rate": 8.693394865878281e-05,
      "loss": 0.2303,
      "step": 2060
    },
    {
      "epoch": 1.7397835907133103,
      "grad_norm": 0.1942613571882248,
      "learning_rate": 8.635708104989905e-05,
      "loss": 0.2581,
      "step": 2070
    },
    {
      "epoch": 1.7481878348566027,
      "grad_norm": 0.1734922230243683,
      "learning_rate": 8.578021344101529e-05,
      "loss": 0.259,
      "step": 2080
    },
    {
      "epoch": 1.756592078999895,
      "grad_norm": 0.1309240758419037,
      "learning_rate": 8.520334583213153e-05,
      "loss": 0.2381,
      "step": 2090
    },
    {
      "epoch": 1.7649963231431873,
      "grad_norm": 0.17716042697429657,
      "learning_rate": 8.462647822324777e-05,
      "loss": 0.2413,
      "step": 2100
    },
    {
      "epoch": 1.7734005672864797,
      "grad_norm": 0.16437722742557526,
      "learning_rate": 8.404961061436401e-05,
      "loss": 0.2699,
      "step": 2110
    },
    {
      "epoch": 1.781804811429772,
      "grad_norm": 0.15865294635295868,
      "learning_rate": 8.347274300548025e-05,
      "loss": 0.2515,
      "step": 2120
    },
    {
      "epoch": 1.7902090555730643,
      "grad_norm": 0.16365793347358704,
      "learning_rate": 8.289587539659649e-05,
      "loss": 0.2507,
      "step": 2130
    },
    {
      "epoch": 1.7986132997163566,
      "grad_norm": 0.19089579582214355,
      "learning_rate": 8.231900778771272e-05,
      "loss": 0.2572,
      "step": 2140
    },
    {
      "epoch": 1.807017543859649,
      "grad_norm": 0.1750141978263855,
      "learning_rate": 8.174214017882896e-05,
      "loss": 0.2692,
      "step": 2150
    },
    {
      "epoch": 1.8154217880029415,
      "grad_norm": 0.14101552963256836,
      "learning_rate": 8.116527256994521e-05,
      "loss": 0.2658,
      "step": 2160
    },
    {
      "epoch": 1.8238260321462338,
      "grad_norm": 0.14396284520626068,
      "learning_rate": 8.058840496106144e-05,
      "loss": 0.2556,
      "step": 2170
    },
    {
      "epoch": 1.8322302762895262,
      "grad_norm": 0.15593650937080383,
      "learning_rate": 8.001153735217768e-05,
      "loss": 0.2442,
      "step": 2180
    },
    {
      "epoch": 1.8406345204328187,
      "grad_norm": 0.18202078342437744,
      "learning_rate": 7.943466974329392e-05,
      "loss": 0.2509,
      "step": 2190
    },
    {
      "epoch": 1.849038764576111,
      "grad_norm": 0.17855936288833618,
      "learning_rate": 7.885780213441016e-05,
      "loss": 0.2595,
      "step": 2200
    },
    {
      "epoch": 1.8574430087194034,
      "grad_norm": 0.16823212802410126,
      "learning_rate": 7.82809345255264e-05,
      "loss": 0.2469,
      "step": 2210
    },
    {
      "epoch": 1.8658472528626957,
      "grad_norm": 0.15248893201351166,
      "learning_rate": 7.770406691664264e-05,
      "loss": 0.2603,
      "step": 2220
    },
    {
      "epoch": 1.874251497005988,
      "grad_norm": 0.16229604184627533,
      "learning_rate": 7.712719930775886e-05,
      "loss": 0.2434,
      "step": 2230
    },
    {
      "epoch": 1.8826557411492804,
      "grad_norm": 0.18594375252723694,
      "learning_rate": 7.655033169887512e-05,
      "loss": 0.266,
      "step": 2240
    },
    {
      "epoch": 1.8910599852925727,
      "grad_norm": 0.18467053771018982,
      "learning_rate": 7.597346408999136e-05,
      "loss": 0.2535,
      "step": 2250
    },
    {
      "epoch": 1.899464229435865,
      "grad_norm": 0.18451227247714996,
      "learning_rate": 7.539659648110758e-05,
      "loss": 0.2579,
      "step": 2260
    },
    {
      "epoch": 1.9078684735791573,
      "grad_norm": 0.15458305180072784,
      "learning_rate": 7.481972887222382e-05,
      "loss": 0.2506,
      "step": 2270
    },
    {
      "epoch": 1.91627271772245,
      "grad_norm": 0.17949137091636658,
      "learning_rate": 7.424286126334006e-05,
      "loss": 0.2659,
      "step": 2280
    },
    {
      "epoch": 1.9246769618657422,
      "grad_norm": 0.1898379623889923,
      "learning_rate": 7.366599365445632e-05,
      "loss": 0.2882,
      "step": 2290
    },
    {
      "epoch": 1.9330812060090345,
      "grad_norm": 0.14720788598060608,
      "learning_rate": 7.308912604557254e-05,
      "loss": 0.2367,
      "step": 2300
    },
    {
      "epoch": 1.9414854501523269,
      "grad_norm": 0.15253467857837677,
      "learning_rate": 7.251225843668878e-05,
      "loss": 0.256,
      "step": 2310
    },
    {
      "epoch": 1.9498896942956194,
      "grad_norm": 0.1564057618379593,
      "learning_rate": 7.193539082780502e-05,
      "loss": 0.2536,
      "step": 2320
    },
    {
      "epoch": 1.9582939384389118,
      "grad_norm": 0.15893864631652832,
      "learning_rate": 7.135852321892126e-05,
      "loss": 0.2347,
      "step": 2330
    },
    {
      "epoch": 1.966698182582204,
      "grad_norm": 0.20592626929283142,
      "learning_rate": 7.07816556100375e-05,
      "loss": 0.2419,
      "step": 2340
    },
    {
      "epoch": 1.9751024267254964,
      "grad_norm": 0.20137999951839447,
      "learning_rate": 7.020478800115374e-05,
      "loss": 0.2415,
      "step": 2350
    },
    {
      "epoch": 1.9835066708687887,
      "grad_norm": 0.19287312030792236,
      "learning_rate": 6.962792039226997e-05,
      "loss": 0.2484,
      "step": 2360
    },
    {
      "epoch": 1.991910915012081,
      "grad_norm": 0.1620776355266571,
      "learning_rate": 6.905105278338622e-05,
      "loss": 0.2599,
      "step": 2370
    }
  ],
  "logging_steps": 10,
  "max_steps": 3567,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.613824550319268e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}