{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995971532160601,
  "eval_steps": 500,
  "global_step": 1861,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005371290452531221,
      "grad_norm": 495.1745300292969,
      "learning_rate": 1.7857142857142858e-10,
      "loss": 40.2102,
      "step": 10
    },
    {
      "epoch": 0.010742580905062441,
      "grad_norm": 477.615966796875,
      "learning_rate": 3.5714285714285715e-10,
      "loss": 40.7707,
      "step": 20
    },
    {
      "epoch": 0.01611387135759366,
      "grad_norm": 492.8292236328125,
      "learning_rate": 5.357142857142857e-10,
      "loss": 40.2476,
      "step": 30
    },
    {
      "epoch": 0.021485161810124883,
      "grad_norm": 529.812255859375,
      "learning_rate": 7.142857142857143e-10,
      "loss": 40.2061,
      "step": 40
    },
    {
      "epoch": 0.026856452262656105,
      "grad_norm": 534.6681518554688,
      "learning_rate": 8.92857142857143e-10,
      "loss": 40.3632,
      "step": 50
    },
    {
      "epoch": 0.03222774271518732,
      "grad_norm": 493.5475769042969,
      "learning_rate": 9.999878827638992e-10,
      "loss": 40.3255,
      "step": 60
    },
    {
      "epoch": 0.037599033167718544,
      "grad_norm": 514.5845336914062,
      "learning_rate": 9.998515706025587e-10,
      "loss": 40.1727,
      "step": 70
    },
    {
      "epoch": 0.042970323620249766,
      "grad_norm": 520.8786010742188,
      "learning_rate": 9.99563841164693e-10,
      "loss": 40.3397,
      "step": 80
    },
    {
      "epoch": 0.04834161407278099,
      "grad_norm": 506.258056640625,
      "learning_rate": 9.991247816105924e-10,
      "loss": 40.5221,
      "step": 90
    },
    {
      "epoch": 0.05371290452531221,
      "grad_norm": 518.2510375976562,
      "learning_rate": 9.985345249421433e-10,
      "loss": 40.3247,
      "step": 100
    },
    {
      "epoch": 0.059084194977843424,
      "grad_norm": 509.5160827636719,
      "learning_rate": 9.977932499625396e-10,
      "loss": 40.4914,
      "step": 110
    },
    {
      "epoch": 0.06445548543037465,
      "grad_norm": 521.5568237304688,
      "learning_rate": 9.969011812221178e-10,
      "loss": 40.4141,
      "step": 120
    },
    {
      "epoch": 0.06982677588290587,
      "grad_norm": 429.8058776855469,
      "learning_rate": 9.958585889503364e-10,
      "loss": 40.7874,
      "step": 130
    },
    {
      "epoch": 0.07519806633543709,
      "grad_norm": 523.8095092773438,
      "learning_rate": 9.946657889739163e-10,
      "loss": 40.9356,
      "step": 140
    },
    {
      "epoch": 0.08056935678796831,
      "grad_norm": 492.9051818847656,
      "learning_rate": 9.933231426211678e-10,
      "loss": 40.3382,
      "step": 150
    },
    {
      "epoch": 0.08594064724049953,
      "grad_norm": 455.4582824707031,
      "learning_rate": 9.918310566125387e-10,
      "loss": 39.8185,
      "step": 160
    },
    {
      "epoch": 0.09131193769303075,
      "grad_norm": 479.072265625,
      "learning_rate": 9.901899829374048e-10,
      "loss": 39.3643,
      "step": 170
    },
    {
      "epoch": 0.09668322814556198,
      "grad_norm": 472.71112060546875,
      "learning_rate": 9.884004187171542e-10,
      "loss": 39.7452,
      "step": 180
    },
    {
      "epoch": 0.1020545185980932,
      "grad_norm": 508.54547119140625,
      "learning_rate": 9.864629060545955e-10,
      "loss": 41.1781,
      "step": 190
    },
    {
      "epoch": 0.10742580905062442,
      "grad_norm": 468.7608642578125,
      "learning_rate": 9.843780318697425e-10,
      "loss": 40.0115,
      "step": 200
    },
    {
      "epoch": 0.11279709950315564,
      "grad_norm": 512.1522216796875,
      "learning_rate": 9.821464277220207e-10,
      "loss": 40.1177,
      "step": 210
    },
    {
      "epoch": 0.11816838995568685,
      "grad_norm": 483.4913635253906,
      "learning_rate": 9.79768769618954e-10,
      "loss": 40.2036,
      "step": 220
    },
    {
      "epoch": 0.12353968040821807,
      "grad_norm": 509.5931091308594,
      "learning_rate": 9.77245777811384e-10,
      "loss": 39.6307,
      "step": 230
    },
    {
      "epoch": 0.1289109708607493,
      "grad_norm": 483.1646728515625,
      "learning_rate": 9.745782165752891e-10,
      "loss": 40.7153,
      "step": 240
    },
    {
      "epoch": 0.13428226131328053,
      "grad_norm": 436.9029235839844,
      "learning_rate": 9.717668939802664e-10,
      "loss": 40.1682,
      "step": 250
    },
    {
      "epoch": 0.13965355176581173,
      "grad_norm": 467.0509338378906,
      "learning_rate": 9.68812661644748e-10,
      "loss": 40.397,
      "step": 260
    },
    {
      "epoch": 0.14502484221834297,
      "grad_norm": 496.4578857421875,
      "learning_rate": 9.657164144780247e-10,
      "loss": 40.5665,
      "step": 270
    },
    {
      "epoch": 0.15039613267087418,
      "grad_norm": 542.7703857421875,
      "learning_rate": 9.624790904091554e-10,
      "loss": 40.8245,
      "step": 280
    },
    {
      "epoch": 0.15576742312340539,
      "grad_norm": 467.4183349609375,
      "learning_rate": 9.59101670102847e-10,
      "loss": 40.5495,
      "step": 290
    },
    {
      "epoch": 0.16113871357593662,
      "grad_norm": 514.7463989257812,
      "learning_rate": 9.555851766623854e-10,
      "loss": 40.3572,
      "step": 300
    },
    {
      "epoch": 0.16651000402846783,
      "grad_norm": 479.075439453125,
      "learning_rate": 9.519306753197134e-10,
      "loss": 40.9737,
      "step": 310
    },
    {
      "epoch": 0.17188129448099906,
      "grad_norm": 480.1302185058594,
      "learning_rate": 9.481392731127458e-10,
      "loss": 40.1158,
      "step": 320
    },
    {
      "epoch": 0.17725258493353027,
      "grad_norm": 459.5805969238281,
      "learning_rate": 9.442121185500201e-10,
      "loss": 39.8788,
      "step": 330
    },
    {
      "epoch": 0.1826238753860615,
      "grad_norm": 460.0196533203125,
      "learning_rate": 9.401504012627857e-10,
      "loss": 39.5867,
      "step": 340
    },
    {
      "epoch": 0.18799516583859271,
      "grad_norm": 477.8091735839844,
      "learning_rate": 9.35955351644635e-10,
      "loss": 41.1957,
      "step": 350
    },
    {
      "epoch": 0.19336645629112395,
      "grad_norm": 475.1239929199219,
      "learning_rate": 9.31628240478787e-10,
      "loss": 40.1875,
      "step": 360
    },
    {
      "epoch": 0.19873774674365516,
      "grad_norm": 522.9048461914062,
      "learning_rate": 9.27170378553137e-10,
      "loss": 40.7773,
      "step": 370
    },
    {
      "epoch": 0.2041090371961864,
      "grad_norm": 489.0563049316406,
      "learning_rate": 9.225831162631853e-10,
      "loss": 40.9744,
      "step": 380
    },
    {
      "epoch": 0.2094803276487176,
      "grad_norm": 426.8688659667969,
      "learning_rate": 9.178678432029706e-10,
      "loss": 39.4605,
      "step": 390
    },
    {
      "epoch": 0.21485161810124884,
      "grad_norm": 479.16900634765625,
      "learning_rate": 9.130259877441272e-10,
      "loss": 39.4938,
      "step": 400
    },
    {
      "epoch": 0.22022290855378004,
      "grad_norm": 523.247802734375,
      "learning_rate": 9.080590166031966e-10,
      "loss": 40.7306,
      "step": 410
    },
    {
      "epoch": 0.22559419900631128,
      "grad_norm": 505.7490234375,
      "learning_rate": 9.02968434397323e-10,
      "loss": 40.1471,
      "step": 420
    },
    {
      "epoch": 0.2309654894588425,
      "grad_norm": 474.9671630859375,
      "learning_rate": 8.977557831884684e-10,
      "loss": 41.0007,
      "step": 430
    },
    {
      "epoch": 0.2363367799113737,
      "grad_norm": 490.84332275390625,
      "learning_rate": 8.924226420162834e-10,
      "loss": 39.3389,
      "step": 440
    },
    {
      "epoch": 0.24170807036390493,
      "grad_norm": 542.4966430664062,
      "learning_rate": 8.869706264197784e-10,
      "loss": 40.3484,
      "step": 450
    },
    {
      "epoch": 0.24707936081643614,
      "grad_norm": 480.568603515625,
      "learning_rate": 8.814013879479366e-10,
      "loss": 40.1192,
      "step": 460
    },
    {
      "epoch": 0.2524506512689674,
      "grad_norm": 496.9430236816406,
      "learning_rate": 8.757166136594194e-10,
      "loss": 40.247,
      "step": 470
    },
    {
      "epoch": 0.2578219417214986,
      "grad_norm": 519.3447265625,
      "learning_rate": 8.699180256115157e-10,
      "loss": 39.9668,
      "step": 480
    },
    {
      "epoch": 0.2631932321740298,
      "grad_norm": 462.6995849609375,
      "learning_rate": 8.640073803384881e-10,
      "loss": 40.2213,
      "step": 490
    },
    {
      "epoch": 0.26856452262656105,
      "grad_norm": 498.30548095703125,
      "learning_rate": 8.579864683194752e-10,
      "loss": 39.849,
      "step": 500
    },
    {
      "epoch": 0.27393581307909226,
      "grad_norm": 491.5065002441406,
      "learning_rate": 8.518571134361129e-10,
      "loss": 39.8567,
      "step": 510
    },
    {
      "epoch": 0.27930710353162347,
      "grad_norm": 401.1820068359375,
      "learning_rate": 8.456211724200347e-10,
      "loss": 40.0964,
      "step": 520
    },
    {
      "epoch": 0.2846783939841547,
      "grad_norm": 472.61627197265625,
      "learning_rate": 8.392805342904231e-10,
      "loss": 39.9992,
      "step": 530
    },
    {
      "epoch": 0.29004968443668594,
      "grad_norm": 504.82861328125,
      "learning_rate": 8.328371197817788e-10,
      "loss": 40.4024,
      "step": 540
    },
    {
      "epoch": 0.29542097488921715,
      "grad_norm": 460.82733154296875,
      "learning_rate": 8.262928807620843e-10,
      "loss": 41.2341,
      "step": 550
    },
    {
      "epoch": 0.30079226534174835,
      "grad_norm": 515.23583984375,
      "learning_rate": 8.196497996415337e-10,
      "loss": 40.4191,
      "step": 560
    },
    {
      "epoch": 0.30616355579427956,
      "grad_norm": 534.2943725585938,
      "learning_rate": 8.129098887720137e-10,
      "loss": 40.0219,
      "step": 570
    },
    {
      "epoch": 0.31153484624681077,
      "grad_norm": 506.2889099121094,
      "learning_rate": 8.060751898375115e-10,
      "loss": 40.2062,
      "step": 580
    },
    {
      "epoch": 0.31690613669934203,
      "grad_norm": 451.0182800292969,
      "learning_rate": 7.991477732356403e-10,
      "loss": 40.1886,
      "step": 590
    },
    {
      "epoch": 0.32227742715187324,
      "grad_norm": 497.3751525878906,
      "learning_rate": 7.921297374504637e-10,
      "loss": 40.7882,
      "step": 600
    },
    {
      "epoch": 0.32764871760440445,
      "grad_norm": 548.7998657226562,
      "learning_rate": 7.850232084168145e-10,
      "loss": 40.9427,
      "step": 610
    },
    {
      "epoch": 0.33302000805693566,
      "grad_norm": 486.71063232421875,
      "learning_rate": 7.778303388762966e-10,
      "loss": 39.4863,
      "step": 620
    },
    {
      "epoch": 0.3383912985094669,
      "grad_norm": 448.2780456542969,
      "learning_rate": 7.705533077251672e-10,
      "loss": 39.9087,
      "step": 630
    },
    {
      "epoch": 0.34376258896199813,
      "grad_norm": 526.2222900390625,
      "learning_rate": 7.63194319354295e-10,
      "loss": 39.7048,
      "step": 640
    },
    {
      "epoch": 0.34913387941452934,
      "grad_norm": 492.9909973144531,
      "learning_rate": 7.557556029813974e-10,
      "loss": 39.5465,
      "step": 650
    },
    {
      "epoch": 0.35450516986706054,
      "grad_norm": 483.2941589355469,
      "learning_rate": 7.482394119757546e-10,
      "loss": 40.6158,
      "step": 660
    },
    {
      "epoch": 0.3598764603195918,
      "grad_norm": 475.9729309082031,
      "learning_rate": 7.406480231756098e-10,
      "loss": 39.8862,
      "step": 670
    },
    {
      "epoch": 0.365247750772123,
      "grad_norm": 477.7049255371094,
      "learning_rate": 7.329837361984598e-10,
      "loss": 40.462,
      "step": 680
    },
    {
      "epoch": 0.3706190412246542,
      "grad_norm": 448.4286804199219,
      "learning_rate": 7.252488727444418e-10,
      "loss": 40.037,
      "step": 690
    },
    {
      "epoch": 0.37599033167718543,
      "grad_norm": 480.7619934082031,
      "learning_rate": 7.174457758930374e-10,
      "loss": 41.1926,
      "step": 700
    },
    {
      "epoch": 0.38136162212971664,
      "grad_norm": 452.7475280761719,
      "learning_rate": 7.095768093932932e-10,
      "loss": 39.8431,
      "step": 710
    },
    {
      "epoch": 0.3867329125822479,
      "grad_norm": 419.9246826171875,
      "learning_rate": 7.016443569477854e-10,
      "loss": 39.7369,
      "step": 720
    },
    {
      "epoch": 0.3921042030347791,
      "grad_norm": 516.64306640625,
      "learning_rate": 6.936508214905369e-10,
      "loss": 39.727,
      "step": 730
    },
    {
      "epoch": 0.3974754934873103,
      "grad_norm": 532.8106079101562,
      "learning_rate": 6.855986244591104e-10,
      "loss": 39.0725,
      "step": 740
    },
    {
      "epoch": 0.4028467839398415,
      "grad_norm": 510.8319396972656,
      "learning_rate": 6.774902050610951e-10,
      "loss": 40.6862,
      "step": 750
    },
    {
      "epoch": 0.4082180743923728,
      "grad_norm": 525.424560546875,
      "learning_rate": 6.693280195352114e-10,
      "loss": 40.5439,
      "step": 760
    },
    {
      "epoch": 0.413589364844904,
      "grad_norm": 476.6329040527344,
      "learning_rate": 6.61114540407256e-10,
      "loss": 40.4504,
      "step": 770
    },
    {
      "epoch": 0.4189606552974352,
      "grad_norm": 495.2875061035156,
      "learning_rate": 6.528522557411133e-10,
      "loss": 40.9673,
      "step": 780
    },
    {
      "epoch": 0.4243319457499664,
      "grad_norm": 468.4483642578125,
      "learning_rate": 6.445436683850597e-10,
      "loss": 40.2403,
      "step": 790
    },
    {
      "epoch": 0.4297032362024977,
      "grad_norm": 516.677490234375,
      "learning_rate": 6.361912952135903e-10,
      "loss": 40.4345,
      "step": 800
    },
    {
      "epoch": 0.4350745266550289,
      "grad_norm": 509.36138916015625,
      "learning_rate": 6.277976663649947e-10,
      "loss": 39.9229,
      "step": 810
    },
    {
      "epoch": 0.4404458171075601,
      "grad_norm": 502.6529541015625,
      "learning_rate": 6.193653244749179e-10,
      "loss": 40.928,
      "step": 820
    },
    {
      "epoch": 0.4458171075600913,
      "grad_norm": 477.757568359375,
      "learning_rate": 6.108968239061324e-10,
      "loss": 40.2371,
      "step": 830
    },
    {
      "epoch": 0.45118839801262256,
      "grad_norm": 483.0531005859375,
      "learning_rate": 6.023947299747592e-10,
      "loss": 40.7409,
      "step": 840
    },
    {
      "epoch": 0.45655968846515377,
      "grad_norm": 495.1935119628906,
      "learning_rate": 5.93861618173172e-10,
      "loss": 40.123,
      "step": 850
    },
    {
      "epoch": 0.461930978917685,
      "grad_norm": 420.5578918457031,
      "learning_rate": 5.853000733898161e-10,
      "loss": 39.6038,
      "step": 860
    },
    {
      "epoch": 0.4673022693702162,
      "grad_norm": 512.252197265625,
      "learning_rate": 5.767126891261828e-10,
      "loss": 40.0436,
      "step": 870
    },
    {
      "epoch": 0.4726735598227474,
      "grad_norm": 499.7673645019531,
      "learning_rate": 5.681020667111754e-10,
      "loss": 39.6081,
      "step": 880
    },
    {
      "epoch": 0.47804485027527865,
      "grad_norm": 454.9427185058594,
      "learning_rate": 5.594708145131012e-10,
      "loss": 39.5993,
      "step": 890
    },
    {
      "epoch": 0.48341614072780986,
      "grad_norm": 437.3612060546875,
      "learning_rate": 5.508215471495337e-10,
      "loss": 39.8825,
      "step": 900
    },
    {
      "epoch": 0.48878743118034107,
      "grad_norm": 520.9217529296875,
      "learning_rate": 5.421568846952822e-10,
      "loss": 41.5034,
      "step": 910
    },
    {
      "epoch": 0.4941587216328723,
      "grad_norm": 488.6954650878906,
      "learning_rate": 5.334794518887044e-10,
      "loss": 39.5379,
      "step": 920
    },
    {
      "epoch": 0.49953001208540354,
      "grad_norm": 519.3870849609375,
      "learning_rate": 5.247918773366112e-10,
      "loss": 39.54,
      "step": 930
    },
    {
      "epoch": 0.5049013025379347,
      "grad_norm": 497.21746826171875,
      "learning_rate": 5.160967927179963e-10,
      "loss": 40.0503,
      "step": 940
    },
    {
      "epoch": 0.510272592990466,
      "grad_norm": 476.4524841308594,
      "learning_rate": 5.073968319868361e-10,
      "loss": 39.7168,
      "step": 950
    },
    {
      "epoch": 0.5156438834429972,
      "grad_norm": 484.02777099609375,
      "learning_rate": 4.986946305742012e-10,
      "loss": 39.6419,
      "step": 960
    },
    {
      "epoch": 0.5210151738955284,
      "grad_norm": 451.2009582519531,
      "learning_rate": 4.899928245899194e-10,
      "loss": 40.7173,
      "step": 970
    },
    {
      "epoch": 0.5263864643480596,
      "grad_norm": 499.43408203125,
      "learning_rate": 4.812940500240333e-10,
      "loss": 40.2658,
      "step": 980
    },
    {
      "epoch": 0.5317577548005908,
      "grad_norm": 449.65496826171875,
      "learning_rate": 4.72600941948295e-10,
      "loss": 40.1894,
      "step": 990
    },
    {
      "epoch": 0.5371290452531221,
      "grad_norm": 492.1304016113281,
      "learning_rate": 4.6391613371793786e-10,
      "loss": 40.3246,
      "step": 1000
    },
    {
      "epoch": 0.5425003357056533,
      "grad_norm": 484.3329772949219,
      "learning_rate": 4.5524225617396904e-10,
      "loss": 40.9067,
      "step": 1010
    },
    {
      "epoch": 0.5478716261581845,
      "grad_norm": 447.1062927246094,
      "learning_rate": 4.4658193684622293e-10,
      "loss": 40.208,
      "step": 1020
    },
    {
      "epoch": 0.5532429166107157,
      "grad_norm": 466.41522216796875,
      "learning_rate": 4.3793779915741885e-10,
      "loss": 40.008,
      "step": 1030
    },
    {
      "epoch": 0.5586142070632469,
      "grad_norm": 472.39013671875,
      "learning_rate": 4.293124616284608e-10,
      "loss": 40.3332,
      "step": 1040
    },
    {
      "epoch": 0.5639854975157782,
      "grad_norm": 463.3416748046875,
      "learning_rate": 4.2070853708522495e-10,
      "loss": 40.8243,
      "step": 1050
    },
    {
      "epoch": 0.5693567879683094,
      "grad_norm": 516.2078247070312,
      "learning_rate": 4.1212863186706943e-10,
      "loss": 40.5519,
      "step": 1060
    },
    {
      "epoch": 0.5747280784208406,
      "grad_norm": 487.50628662109375,
      "learning_rate": 4.035753450373111e-10,
      "loss": 40.4969,
      "step": 1070
    },
    {
      "epoch": 0.5800993688733719,
      "grad_norm": 520.0319213867188,
      "learning_rate": 3.950512675959052e-10,
      "loss": 39.9747,
      "step": 1080
    },
    {
      "epoch": 0.585470659325903,
      "grad_norm": 543.7863159179688,
      "learning_rate": 3.865589816945685e-10,
      "loss": 40.0276,
      "step": 1090
    },
    {
      "epoch": 0.5908419497784343,
      "grad_norm": 486.9960021972656,
      "learning_rate": 3.7810105985458137e-10,
      "loss": 40.1272,
      "step": 1100
    },
    {
      "epoch": 0.5962132402309654,
      "grad_norm": 502.4769287109375,
      "learning_rate": 3.6968006418751e-10,
      "loss": 40.3276,
      "step": 1110
    },
    {
      "epoch": 0.6015845306834967,
      "grad_norm": 472.21533203125,
      "learning_rate": 3.6129854561907786e-10,
      "loss": 40.4212,
      "step": 1120
    },
    {
      "epoch": 0.606955821136028,
      "grad_norm": 434.5205078125,
      "learning_rate": 3.5295904311642897e-10,
      "loss": 39.5327,
      "step": 1130
    },
    {
      "epoch": 0.6123271115885591,
      "grad_norm": 511.1942138671875,
      "learning_rate": 3.446640829190133e-10,
      "loss": 40.5099,
      "step": 1140
    },
    {
      "epoch": 0.6176984020410904,
      "grad_norm": 479.92901611328125,
      "learning_rate": 3.3641617777332523e-10,
      "loss": 39.1485,
      "step": 1150
    },
    {
      "epoch": 0.6230696924936215,
      "grad_norm": 512.5575561523438,
      "learning_rate": 3.2821782617173294e-10,
      "loss": 41.33,
      "step": 1160
    },
    {
      "epoch": 0.6284409829461528,
      "grad_norm": 517.29833984375,
      "learning_rate": 3.2007151159562237e-10,
      "loss": 39.8799,
      "step": 1170
    },
    {
      "epoch": 0.6338122733986841,
      "grad_norm": 452.1294860839844,
      "learning_rate": 3.119797017630914e-10,
      "loss": 40.0134,
      "step": 1180
    },
    {
      "epoch": 0.6391835638512152,
      "grad_norm": 499.8146057128906,
      "learning_rate": 3.0394484788141616e-10,
      "loss": 40.4734,
      "step": 1190
    },
    {
      "epoch": 0.6445548543037465,
      "grad_norm": 551.718994140625,
      "learning_rate": 2.9596938390452166e-10,
      "loss": 39.968,
      "step": 1200
    },
    {
      "epoch": 0.6499261447562777,
      "grad_norm": 476.2742614746094,
      "learning_rate": 2.880557257956763e-10,
      "loss": 40.1534,
      "step": 1210
    },
    {
      "epoch": 0.6552974352088089,
      "grad_norm": 493.28167724609375,
      "learning_rate": 2.8020627079563876e-10,
      "loss": 39.8795,
      "step": 1220
    },
    {
      "epoch": 0.6606687256613402,
      "grad_norm": 462.6866149902344,
      "learning_rate": 2.7242339669647403e-10,
      "loss": 40.3111,
      "step": 1230
    },
    {
      "epoch": 0.6660400161138713,
      "grad_norm": 509.9576416015625,
      "learning_rate": 2.647094611212626e-10,
      "loss": 39.3712,
      "step": 1240
    },
    {
      "epoch": 0.6714113065664026,
      "grad_norm": 483.03619384765625,
      "learning_rate": 2.570668008099183e-10,
      "loss": 39.4756,
      "step": 1250
    },
    {
      "epoch": 0.6767825970189338,
      "grad_norm": 505.42071533203125,
      "learning_rate": 2.494977309113331e-10,
      "loss": 40.5326,
      "step": 1260
    },
    {
      "epoch": 0.682153887471465,
      "grad_norm": 461.374755859375,
      "learning_rate": 2.42004544282061e-10,
      "loss": 39.9911,
      "step": 1270
    },
    {
      "epoch": 0.6875251779239963,
      "grad_norm": 432.3858947753906,
      "learning_rate": 2.3458951079175717e-10,
      "loss": 40.3153,
      "step": 1280
    },
    {
      "epoch": 0.6928964683765274,
      "grad_norm": 515.9682006835938,
      "learning_rate": 2.2725487663557688e-10,
      "loss": 40.6573,
      "step": 1290
    },
    {
      "epoch": 0.6982677588290587,
      "grad_norm": 476.286865234375,
      "learning_rate": 2.2000286365374955e-10,
      "loss": 39.9867,
      "step": 1300
    },
    {
      "epoch": 0.7036390492815899,
      "grad_norm": 472.92083740234375,
      "learning_rate": 2.1283566865852822e-10,
      "loss": 40.5379,
      "step": 1310
    },
    {
      "epoch": 0.7090103397341211,
      "grad_norm": 552.19287109375,
      "learning_rate": 2.0575546276872166e-10,
      "loss": 41.3682,
      "step": 1320
    },
    {
      "epoch": 0.7143816301866523,
      "grad_norm": 462.6091003417969,
      "learning_rate": 1.9876439075200893e-10,
      "loss": 41.0671,
      "step": 1330
    },
    {
      "epoch": 0.7197529206391836,
      "grad_norm": 522.0980224609375,
      "learning_rate": 1.9186457037523765e-10,
      "loss": 40.3256,
      "step": 1340
    },
    {
      "epoch": 0.7251242110917148,
      "grad_norm": 495.66510009765625,
      "learning_rate": 1.8505809176289958e-10,
      "loss": 40.3366,
      "step": 1350
    },
    {
      "epoch": 0.730495501544246,
      "grad_norm": 536.3059692382812,
      "learning_rate": 1.7834701676398057e-10,
      "loss": 40.3298,
      "step": 1360
    },
    {
      "epoch": 0.7358667919967772,
      "grad_norm": 527.6504516601562,
      "learning_rate": 1.7173337832737773e-10,
      "loss": 39.7742,
      "step": 1370
    },
    {
      "epoch": 0.7412380824493084,
      "grad_norm": 508.9981689453125,
      "learning_rate": 1.6521917988606762e-10,
      "loss": 40.0357,
      "step": 1380
    },
    {
      "epoch": 0.7466093729018397,
      "grad_norm": 546.9842529296875,
      "learning_rate": 1.588063947502181e-10,
      "loss": 39.8671,
      "step": 1390
    },
    {
      "epoch": 0.7519806633543709,
      "grad_norm": 514.7166748046875,
      "learning_rate": 1.524969655094242e-10,
      "loss": 40.3517,
      "step": 1400
    },
    {
      "epoch": 0.7573519538069021,
      "grad_norm": 458.2544250488281,
      "learning_rate": 1.4629280344425106e-10,
      "loss": 39.5965,
      "step": 1410
    },
    {
      "epoch": 0.7627232442594333,
      "grad_norm": 509.53546142578125,
      "learning_rate": 1.401957879472583e-10,
      "loss": 40.175,
      "step": 1420
    },
    {
      "epoch": 0.7680945347119645,
      "grad_norm": 522.7682495117188,
      "learning_rate": 1.3420776595368834e-10,
      "loss": 39.9108,
      "step": 1430
    },
    {
      "epoch": 0.7734658251644958,
      "grad_norm": 528.9177856445312,
      "learning_rate": 1.283305513819827e-10,
      "loss": 39.4946,
      "step": 1440
    },
    {
      "epoch": 0.778837115617027,
      "grad_norm": 460.80340576171875,
      "learning_rate": 1.225659245843026e-10,
      "loss": 39.8654,
      "step": 1450
    },
    {
      "epoch": 0.7842084060695582,
      "grad_norm": 555.3992309570312,
      "learning_rate": 1.169156318072163e-10,
      "loss": 41.5166,
      "step": 1460
    },
    {
      "epoch": 0.7895796965220895,
      "grad_norm": 549.1808471679688,
      "learning_rate": 1.1138138466271913e-10,
      "loss": 39.4821,
      "step": 1470
    },
    {
      "epoch": 0.7949509869746206,
      "grad_norm": 529.6665649414062,
      "learning_rate": 1.0596485960974251e-10,
      "loss": 40.0072,
      "step": 1480
    },
    {
      "epoch": 0.8003222774271519,
      "grad_norm": 526.4564819335938,
      "learning_rate": 1.0066769744631571e-10,
      "loss": 39.7705,
      "step": 1490
    },
    {
      "epoch": 0.805693567879683,
      "grad_norm": 483.0169372558594,
      "learning_rate": 9.549150281252633e-11,
      "loss": 39.6957,
      "step": 1500
    },
    {
      "epoch": 0.8110648583322143,
      "grad_norm": 508.51190185546875,
      "learning_rate": 9.043784370443615e-11,
      "loss": 39.9077,
      "step": 1510
    },
    {
      "epoch": 0.8164361487847456,
      "grad_norm": 519.5391845703125,
      "learning_rate": 8.550825099909671e-11,
      "loss": 38.918,
      "step": 1520
    },
    {
      "epoch": 0.8218074392372767,
      "grad_norm": 528.9270629882812,
      "learning_rate": 8.070421799080951e-11,
      "loss": 40.3429,
      "step": 1530
    },
    {
      "epoch": 0.827178729689808,
      "grad_norm": 435.5723876953125,
      "learning_rate": 7.602719993876945e-11,
      "loss": 39.9604,
      "step": 1540
    },
    {
      "epoch": 0.8325500201423391,
      "grad_norm": 500.90625,
      "learning_rate": 7.147861362623287e-11,
      "loss": 40.3838,
      "step": 1550
    },
    {
      "epoch": 0.8379213105948704,
      "grad_norm": 495.9076232910156,
      "learning_rate": 6.705983693133794e-11,
      "loss": 40.1013,
      "step": 1560
    },
    {
      "epoch": 0.8432926010474017,
      "grad_norm": 527.7721557617188,
      "learning_rate": 6.277220840971198e-11,
      "loss": 40.5773,
      "step": 1570
    },
    {
      "epoch": 0.8486638914999328,
      "grad_norm": 552.565185546875,
      "learning_rate": 5.861702688899046e-11,
      "loss": 39.9742,
      "step": 1580
    },
    {
      "epoch": 0.8540351819524641,
      "grad_norm": 478.8362731933594,
      "learning_rate": 5.459555107537001e-11,
      "loss": 40.7994,
      "step": 1590
    },
    {
      "epoch": 0.8594064724049953,
      "grad_norm": 476.84759521484375,
      "learning_rate": 5.0708999172315696e-11,
      "loss": 40.5355,
      "step": 1600
    },
    {
      "epoch": 0.8647777628575265,
      "grad_norm": 463.87127685546875,
      "learning_rate": 4.695854851153714e-11,
      "loss": 40.8749,
      "step": 1610
    },
    {
      "epoch": 0.8701490533100578,
      "grad_norm": 455.86065673828125,
      "learning_rate": 4.334533519634643e-11,
      "loss": 40.2378,
      "step": 1620
    },
    {
      "epoch": 0.8755203437625889,
      "grad_norm": 479.13995361328125,
      "learning_rate": 3.9870453757503865e-11,
      "loss": 40.0686,
      "step": 1630
    },
    {
      "epoch": 0.8808916342151202,
      "grad_norm": 534.9734497070312,
      "learning_rate": 3.653495682165842e-11,
      "loss": 40.7489,
      "step": 1640
    },
    {
      "epoch": 0.8862629246676514,
      "grad_norm": 463.552490234375,
      "learning_rate": 3.333985479248103e-11,
      "loss": 40.446,
      "step": 1650
    },
    {
      "epoch": 0.8916342151201826,
      "grad_norm": 531.924072265625,
      "learning_rate": 3.0286115544588767e-11,
      "loss": 39.3065,
      "step": 1660
    },
    {
      "epoch": 0.8970055055727139,
      "grad_norm": 483.08502197265625,
      "learning_rate": 2.737466413035178e-11,
      "loss": 40.1395,
      "step": 1670
    },
    {
      "epoch": 0.9023767960252451,
      "grad_norm": 525.9647827148438,
      "learning_rate": 2.460638249967251e-11,
      "loss": 40.0024,
      "step": 1680
    },
    {
      "epoch": 0.9077480864777763,
      "grad_norm": 535.5175170898438,
      "learning_rate": 2.198210923282118e-11,
      "loss": 39.3654,
      "step": 1690
    },
    {
      "epoch": 0.9131193769303075,
      "grad_norm": 443.3262634277344,
      "learning_rate": 1.9502639286409496e-11,
      "loss": 40.2637,
      "step": 1700
    },
    {
      "epoch": 0.9184906673828387,
      "grad_norm": 461.6935729980469,
      "learning_rate": 1.7168723752578776e-11,
      "loss": 40.2201,
      "step": 1710
    },
    {
      "epoch": 0.92386195783537,
      "grad_norm": 450.4540100097656,
      "learning_rate": 1.498106963147583e-11,
      "loss": 40.5813,
      "step": 1720
    },
    {
      "epoch": 0.9292332482879012,
      "grad_norm": 414.77166748046875,
      "learning_rate": 1.294033961708513e-11,
      "loss": 39.9295,
      "step": 1730
    },
    {
      "epoch": 0.9346045387404324,
      "grad_norm": 539.1185302734375,
      "learning_rate": 1.1047151896482754e-11,
      "loss": 41.3669,
      "step": 1740
    },
    {
      "epoch": 0.9399758291929636,
      "grad_norm": 455.1410217285156,
      "learning_rate": 9.302079962572375e-12,
      "loss": 41.2396,
      "step": 1750
    },
    {
      "epoch": 0.9453471196454948,
      "grad_norm": 484.1607971191406,
      "learning_rate": 7.705652440360033e-12,
      "loss": 39.3638,
      "step": 1760
    },
    {
      "epoch": 0.950718410098026,
      "grad_norm": 456.8821105957031,
      "learning_rate": 6.258352926821032e-12,
      "loss": 39.8145,
      "step": 1770
    },
    {
      "epoch": 0.9560897005505573,
      "grad_norm": 504.9481506347656,
      "learning_rate": 4.960619844406156e-12,
      "loss": 40.1504,
      "step": 1780
    },
    {
      "epoch": 0.9614609910030885,
      "grad_norm": 500.57025146484375,
      "learning_rate": 3.812846308233031e-12,
      "loss": 40.1062,
      "step": 1790
    },
    {
      "epoch": 0.9668322814556197,
      "grad_norm": 494.5524597167969,
      "learning_rate": 2.8153800070020444e-12,
      "loss": 39.8419,
      "step": 1800
    },
    {
      "epoch": 0.972203571908151,
      "grad_norm": 500.6388244628906,
      "learning_rate": 1.9685230976726477e-12,
      "loss": 40.3947,
      "step": 1810
    },
    {
      "epoch": 0.9775748623606821,
      "grad_norm": 467.13702392578125,
      "learning_rate": 1.2725321139326896e-12,
      "loss": 40.5521,
      "step": 1820
    },
    {
      "epoch": 0.9829461528132134,
      "grad_norm": 460.44854736328125,
      "learning_rate": 7.276178884882412e-13,
      "loss": 40.5297,
      "step": 1830
    },
    {
      "epoch": 0.9883174432657446,
      "grad_norm": 521.1849365234375,
      "learning_rate": 3.3394548919707394e-13,
      "loss": 40.9712,
      "step": 1840
    },
    {
      "epoch": 0.9936887337182758,
      "grad_norm": 527.3604125976562,
      "learning_rate": 9.163416906554645e-14,
      "loss": 40.1704,
      "step": 1850
    },
    {
      "epoch": 0.9990600241708071,
      "grad_norm": 505.638427734375,
      "learning_rate": 7.573301240570985e-16,
      "loss": 41.4237,
      "step": 1860
    },
    {
      "epoch": 0.9995971532160601,
      "step": 1861,
      "total_flos": 0.0,
      "train_loss": 40.22760858182174,
      "train_runtime": 17617.7876,
      "train_samples_per_second": 3.381,
      "train_steps_per_second": 0.106
    }
  ],
  "logging_steps": 10,
  "max_steps": 1861,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}