{
"best_metric": 1.0082145929336548,
"best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/alpaca_no_sys/checkpoint-2000",
"epoch": 1.158161418747738,
"eval_steps": 200,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.2189224660396576,
"learning_rate": 5e-05,
"loss": 1.4369,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.4817655384540558,
"learning_rate": 0.0001,
"loss": 1.3624,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.35323551297187805,
"learning_rate": 9.999996763266864e-05,
"loss": 1.1589,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 0.2697048485279083,
"learning_rate": 9.999987053071647e-05,
"loss": 1.1103,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 0.34059372544288635,
"learning_rate": 9.99997086942692e-05,
"loss": 1.0601,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 0.2907443344593048,
"learning_rate": 9.999948212353635e-05,
"loss": 1.0302,
"step": 60
},
{
"epoch": 0.03,
"grad_norm": 0.4002208113670349,
"learning_rate": 9.999919081881129e-05,
"loss": 1.114,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 0.4364459216594696,
"learning_rate": 9.999883478047113e-05,
"loss": 1.0913,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 0.322396844625473,
"learning_rate": 9.999841400897687e-05,
"loss": 1.0778,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 0.5678238868713379,
"learning_rate": 9.999792850487325e-05,
"loss": 1.0493,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 0.2919568717479706,
"learning_rate": 9.999737826878886e-05,
"loss": 1.0249,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 0.3787660300731659,
"learning_rate": 9.99967633014361e-05,
"loss": 1.0594,
"step": 120
},
{
"epoch": 0.05,
"grad_norm": 0.33062055706977844,
"learning_rate": 9.999608360361113e-05,
"loss": 1.0527,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": 0.3306855857372284,
"learning_rate": 9.999533917619399e-05,
"loss": 1.0051,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 0.41762664914131165,
"learning_rate": 9.999453002014846e-05,
"loss": 0.9906,
"step": 150
},
{
"epoch": 0.06,
"grad_norm": 0.291189044713974,
"learning_rate": 9.999365613652217e-05,
"loss": 1.0197,
"step": 160
},
{
"epoch": 0.06,
"grad_norm": 0.30276551842689514,
"learning_rate": 9.999271752644649e-05,
"loss": 1.0356,
"step": 170
},
{
"epoch": 0.07,
"grad_norm": 0.25866344571113586,
"learning_rate": 9.999171419113666e-05,
"loss": 1.0332,
"step": 180
},
{
"epoch": 0.07,
"grad_norm": 0.1927756369113922,
"learning_rate": 9.999064613189171e-05,
"loss": 1.0126,
"step": 190
},
{
"epoch": 0.07,
"grad_norm": 0.2776283621788025,
"learning_rate": 9.998951335009442e-05,
"loss": 1.0429,
"step": 200
},
{
"epoch": 0.07,
"eval_loss": 1.029819130897522,
"eval_runtime": 124.6792,
"eval_samples_per_second": 62.569,
"eval_steps_per_second": 3.914,
"step": 200
},
{
"epoch": 0.08,
"grad_norm": 0.320551335811615,
"learning_rate": 9.998831584721141e-05,
"loss": 1.0431,
"step": 210
},
{
"epoch": 0.08,
"grad_norm": 0.46670058369636536,
"learning_rate": 9.998705362479307e-05,
"loss": 1.0374,
"step": 220
},
{
"epoch": 0.08,
"grad_norm": 0.30959388613700867,
"learning_rate": 9.99857266844736e-05,
"loss": 1.1065,
"step": 230
},
{
"epoch": 0.09,
"grad_norm": 0.3016811013221741,
"learning_rate": 9.998433502797095e-05,
"loss": 1.1105,
"step": 240
},
{
"epoch": 0.09,
"grad_norm": 0.356992244720459,
"learning_rate": 9.998287865708694e-05,
"loss": 0.9839,
"step": 250
},
{
"epoch": 0.09,
"grad_norm": 0.29836413264274597,
"learning_rate": 9.998135757370708e-05,
"loss": 1.0401,
"step": 260
},
{
"epoch": 0.1,
"grad_norm": 0.4305395483970642,
"learning_rate": 9.997977177980074e-05,
"loss": 1.0461,
"step": 270
},
{
"epoch": 0.1,
"grad_norm": 0.2959505021572113,
"learning_rate": 9.9978121277421e-05,
"loss": 1.0662,
"step": 280
},
{
"epoch": 0.1,
"grad_norm": 0.2577110826969147,
"learning_rate": 9.99764060687048e-05,
"loss": 1.0736,
"step": 290
},
{
"epoch": 0.11,
"grad_norm": 0.2583490014076233,
"learning_rate": 9.997462615587276e-05,
"loss": 0.9963,
"step": 300
},
{
"epoch": 0.11,
"grad_norm": 0.29901596903800964,
"learning_rate": 9.997278154122935e-05,
"loss": 1.044,
"step": 310
},
{
"epoch": 0.12,
"grad_norm": 0.24256502091884613,
"learning_rate": 9.997087222716278e-05,
"loss": 1.0713,
"step": 320
},
{
"epoch": 0.12,
"grad_norm": 0.267166405916214,
"learning_rate": 9.996889821614502e-05,
"loss": 1.0721,
"step": 330
},
{
"epoch": 0.12,
"grad_norm": 0.21612702310085297,
"learning_rate": 9.996685951073182e-05,
"loss": 1.0414,
"step": 340
},
{
"epoch": 0.13,
"grad_norm": 0.3107874095439911,
"learning_rate": 9.996475611356264e-05,
"loss": 0.9856,
"step": 350
},
{
"epoch": 0.13,
"grad_norm": 0.27626070380210876,
"learning_rate": 9.996258802736079e-05,
"loss": 1.0121,
"step": 360
},
{
"epoch": 0.13,
"grad_norm": 0.2957281172275543,
"learning_rate": 9.996035525493322e-05,
"loss": 1.0785,
"step": 370
},
{
"epoch": 0.14,
"grad_norm": 0.3168753981590271,
"learning_rate": 9.995805779917073e-05,
"loss": 0.996,
"step": 380
},
{
"epoch": 0.14,
"grad_norm": 0.24823521077632904,
"learning_rate": 9.99556956630478e-05,
"loss": 1.0557,
"step": 390
},
{
"epoch": 0.14,
"grad_norm": 0.3291969895362854,
"learning_rate": 9.995326884962268e-05,
"loss": 1.0505,
"step": 400
},
{
"epoch": 0.14,
"eval_loss": 1.023820400238037,
"eval_runtime": 124.7265,
"eval_samples_per_second": 62.545,
"eval_steps_per_second": 3.913,
"step": 400
},
{
"epoch": 0.15,
"grad_norm": 0.3567464351654053,
"learning_rate": 9.995077736203733e-05,
"loss": 0.9919,
"step": 410
},
{
"epoch": 0.15,
"grad_norm": 0.2938403785228729,
"learning_rate": 9.99482212035175e-05,
"loss": 1.0736,
"step": 420
},
{
"epoch": 0.16,
"grad_norm": 0.27481499314308167,
"learning_rate": 9.994560037737259e-05,
"loss": 1.0633,
"step": 430
},
{
"epoch": 0.16,
"grad_norm": 0.34652218222618103,
"learning_rate": 9.994291488699579e-05,
"loss": 1.049,
"step": 440
},
{
"epoch": 0.16,
"grad_norm": 0.23733928799629211,
"learning_rate": 9.994016473586398e-05,
"loss": 1.0022,
"step": 450
},
{
"epoch": 0.17,
"grad_norm": 0.2666071653366089,
"learning_rate": 9.993734992753777e-05,
"loss": 1.0076,
"step": 460
},
{
"epoch": 0.17,
"grad_norm": 0.22843866050243378,
"learning_rate": 9.993447046566146e-05,
"loss": 1.0298,
"step": 470
},
{
"epoch": 0.17,
"grad_norm": 0.4334356486797333,
"learning_rate": 9.993152635396308e-05,
"loss": 1.0635,
"step": 480
},
{
"epoch": 0.18,
"grad_norm": 0.25845977663993835,
"learning_rate": 9.992851759625433e-05,
"loss": 1.0183,
"step": 490
},
{
"epoch": 0.18,
"grad_norm": 0.26029086112976074,
"learning_rate": 9.992544419643066e-05,
"loss": 0.963,
"step": 500
},
{
"epoch": 0.18,
"grad_norm": 0.23090577125549316,
"learning_rate": 9.992230615847116e-05,
"loss": 0.9691,
"step": 510
},
{
"epoch": 0.19,
"grad_norm": 0.2835213243961334,
"learning_rate": 9.991910348643865e-05,
"loss": 1.0309,
"step": 520
},
{
"epoch": 0.19,
"grad_norm": 0.2612157166004181,
"learning_rate": 9.991583618447958e-05,
"loss": 1.0232,
"step": 530
},
{
"epoch": 0.2,
"grad_norm": 0.43860122561454773,
"learning_rate": 9.99125042568241e-05,
"loss": 1.0308,
"step": 540
},
{
"epoch": 0.2,
"grad_norm": 0.2504933476448059,
"learning_rate": 9.990910770778606e-05,
"loss": 1.0581,
"step": 550
},
{
"epoch": 0.2,
"grad_norm": 0.2778143286705017,
"learning_rate": 9.990564654176293e-05,
"loss": 0.958,
"step": 560
},
{
"epoch": 0.21,
"grad_norm": 0.29035818576812744,
"learning_rate": 9.990212076323586e-05,
"loss": 1.0258,
"step": 570
},
{
"epoch": 0.21,
"grad_norm": 0.307841032743454,
"learning_rate": 9.989853037676965e-05,
"loss": 1.0724,
"step": 580
},
{
"epoch": 0.21,
"grad_norm": 0.3011914789676666,
"learning_rate": 9.989487538701279e-05,
"loss": 0.9847,
"step": 590
},
{
"epoch": 0.22,
"grad_norm": 0.27195674180984497,
"learning_rate": 9.989115579869732e-05,
"loss": 1.044,
"step": 600
},
{
"epoch": 0.22,
"eval_loss": 1.0194298028945923,
"eval_runtime": 124.7334,
"eval_samples_per_second": 62.541,
"eval_steps_per_second": 3.912,
"step": 600
},
{
"epoch": 0.22,
"grad_norm": 0.2725551724433899,
"learning_rate": 9.988737161663898e-05,
"loss": 1.0244,
"step": 610
},
{
"epoch": 0.22,
"grad_norm": 0.2821577787399292,
"learning_rate": 9.988352284573713e-05,
"loss": 1.0254,
"step": 620
},
{
"epoch": 0.23,
"grad_norm": 0.3664613664150238,
"learning_rate": 9.987960949097475e-05,
"loss": 1.1093,
"step": 630
},
{
"epoch": 0.23,
"grad_norm": 0.3072526156902313,
"learning_rate": 9.987563155741842e-05,
"loss": 1.0196,
"step": 640
},
{
"epoch": 0.24,
"grad_norm": 0.24550805985927582,
"learning_rate": 9.987158905021836e-05,
"loss": 1.012,
"step": 650
},
{
"epoch": 0.24,
"grad_norm": 0.2521149814128876,
"learning_rate": 9.986748197460837e-05,
"loss": 1.0219,
"step": 660
},
{
"epoch": 0.24,
"grad_norm": 0.34175044298171997,
"learning_rate": 9.986331033590586e-05,
"loss": 1.015,
"step": 670
},
{
"epoch": 0.25,
"grad_norm": 0.30103522539138794,
"learning_rate": 9.98590741395118e-05,
"loss": 1.1113,
"step": 680
},
{
"epoch": 0.25,
"grad_norm": 0.2344699651002884,
"learning_rate": 9.985477339091078e-05,
"loss": 1.0456,
"step": 690
},
{
"epoch": 0.25,
"grad_norm": 0.26754796504974365,
"learning_rate": 9.985040809567097e-05,
"loss": 1.0102,
"step": 700
},
{
"epoch": 0.26,
"grad_norm": 0.31665658950805664,
"learning_rate": 9.984597825944405e-05,
"loss": 1.0057,
"step": 710
},
{
"epoch": 0.26,
"grad_norm": 0.2716057300567627,
"learning_rate": 9.984148388796532e-05,
"loss": 0.9937,
"step": 720
},
{
"epoch": 0.26,
"grad_norm": 0.2589300274848938,
"learning_rate": 9.983692498705361e-05,
"loss": 0.9937,
"step": 730
},
{
"epoch": 0.27,
"grad_norm": 0.2215312272310257,
"learning_rate": 9.983230156261132e-05,
"loss": 1.0205,
"step": 740
},
{
"epoch": 0.27,
"grad_norm": 0.26202231645584106,
"learning_rate": 9.982761362062432e-05,
"loss": 1.0486,
"step": 750
},
{
"epoch": 0.28,
"grad_norm": 0.21432209014892578,
"learning_rate": 9.982286116716208e-05,
"loss": 1.0679,
"step": 760
},
{
"epoch": 0.28,
"grad_norm": 0.4230276048183441,
"learning_rate": 9.98180442083776e-05,
"loss": 1.0051,
"step": 770
},
{
"epoch": 0.28,
"grad_norm": 0.26559358835220337,
"learning_rate": 9.981316275050731e-05,
"loss": 1.0398,
"step": 780
},
{
"epoch": 0.29,
"grad_norm": 0.2559758722782135,
"learning_rate": 9.980821679987125e-05,
"loss": 1.0365,
"step": 790
},
{
"epoch": 0.29,
"grad_norm": 0.34101855754852295,
"learning_rate": 9.980320636287285e-05,
"loss": 1.0169,
"step": 800
},
{
"epoch": 0.29,
"eval_loss": 1.0172123908996582,
"eval_runtime": 124.7169,
"eval_samples_per_second": 62.55,
"eval_steps_per_second": 3.913,
"step": 800
},
{
"epoch": 0.29,
"grad_norm": 0.3401408791542053,
"learning_rate": 9.979813144599915e-05,
"loss": 1.0165,
"step": 810
},
{
"epoch": 0.3,
"grad_norm": 0.34302470088005066,
"learning_rate": 9.979299205582057e-05,
"loss": 1.0314,
"step": 820
},
{
"epoch": 0.3,
"grad_norm": 0.2908473610877991,
"learning_rate": 9.978778819899109e-05,
"loss": 0.9779,
"step": 830
},
{
"epoch": 0.3,
"grad_norm": 0.229986771941185,
"learning_rate": 9.978251988224804e-05,
"loss": 0.9564,
"step": 840
},
{
"epoch": 0.31,
"grad_norm": 0.441243052482605,
"learning_rate": 9.977718711241233e-05,
"loss": 1.0275,
"step": 850
},
{
"epoch": 0.31,
"grad_norm": 0.2620699107646942,
"learning_rate": 9.977178989638822e-05,
"loss": 1.0293,
"step": 860
},
{
"epoch": 0.31,
"grad_norm": 0.27257561683654785,
"learning_rate": 9.97663282411635e-05,
"loss": 1.0508,
"step": 870
},
{
"epoch": 0.32,
"grad_norm": 0.306587278842926,
"learning_rate": 9.97608021538093e-05,
"loss": 0.9949,
"step": 880
},
{
"epoch": 0.32,
"grad_norm": 0.30046141147613525,
"learning_rate": 9.97552116414802e-05,
"loss": 1.0752,
"step": 890
},
{
"epoch": 0.33,
"grad_norm": 0.2749102711677551,
"learning_rate": 9.974955671141424e-05,
"loss": 0.9947,
"step": 900
},
{
"epoch": 0.33,
"grad_norm": 0.38608163595199585,
"learning_rate": 9.974383737093279e-05,
"loss": 1.0362,
"step": 910
},
{
"epoch": 0.33,
"grad_norm": 0.24529774487018585,
"learning_rate": 9.973805362744064e-05,
"loss": 1.0469,
"step": 920
},
{
"epoch": 0.34,
"grad_norm": 0.33143192529678345,
"learning_rate": 9.973220548842598e-05,
"loss": 0.9705,
"step": 930
},
{
"epoch": 0.34,
"grad_norm": 0.3112998306751251,
"learning_rate": 9.972629296146035e-05,
"loss": 0.9956,
"step": 940
},
{
"epoch": 0.34,
"grad_norm": 0.32970279455184937,
"learning_rate": 9.972031605419864e-05,
"loss": 1.0232,
"step": 950
},
{
"epoch": 0.35,
"grad_norm": 0.256101131439209,
"learning_rate": 9.971427477437914e-05,
"loss": 1.0471,
"step": 960
},
{
"epoch": 0.35,
"grad_norm": 0.4258672595024109,
"learning_rate": 9.970816912982344e-05,
"loss": 0.9652,
"step": 970
},
{
"epoch": 0.35,
"grad_norm": 0.3143826425075531,
"learning_rate": 9.970199912843648e-05,
"loss": 0.9894,
"step": 980
},
{
"epoch": 0.36,
"grad_norm": 0.2868054509162903,
"learning_rate": 9.96957647782065e-05,
"loss": 1.0437,
"step": 990
},
{
"epoch": 0.36,
"grad_norm": 0.2594622075557709,
"learning_rate": 9.968946608720511e-05,
"loss": 1.02,
"step": 1000
},
{
"epoch": 0.36,
"eval_loss": 1.0154483318328857,
"eval_runtime": 124.672,
"eval_samples_per_second": 62.572,
"eval_steps_per_second": 3.914,
"step": 1000
},
{
"epoch": 0.37,
"grad_norm": 0.2359086573123932,
"learning_rate": 9.968310306358715e-05,
"loss": 1.0676,
"step": 1010
},
{
"epoch": 0.37,
"grad_norm": 0.22080975770950317,
"learning_rate": 9.967667571559081e-05,
"loss": 1.027,
"step": 1020
},
{
"epoch": 0.37,
"grad_norm": 0.3211756944656372,
"learning_rate": 9.967018405153749e-05,
"loss": 1.0004,
"step": 1030
},
{
"epoch": 0.38,
"grad_norm": 0.3681553602218628,
"learning_rate": 9.966362807983196e-05,
"loss": 1.0395,
"step": 1040
},
{
"epoch": 0.38,
"grad_norm": 0.3180038332939148,
"learning_rate": 9.965700780896216e-05,
"loss": 0.9948,
"step": 1050
},
{
"epoch": 0.38,
"grad_norm": 0.25071969628334045,
"learning_rate": 9.965032324749932e-05,
"loss": 1.0281,
"step": 1060
},
{
"epoch": 0.39,
"grad_norm": 0.2274983674287796,
"learning_rate": 9.964357440409789e-05,
"loss": 1.0094,
"step": 1070
},
{
"epoch": 0.39,
"grad_norm": 0.24825724959373474,
"learning_rate": 9.963676128749553e-05,
"loss": 1.0272,
"step": 1080
},
{
"epoch": 0.39,
"grad_norm": 0.3256381154060364,
"learning_rate": 9.96298839065132e-05,
"loss": 1.0191,
"step": 1090
},
{
"epoch": 0.4,
"grad_norm": 0.31695234775543213,
"learning_rate": 9.962294227005493e-05,
"loss": 1.08,
"step": 1100
},
{
"epoch": 0.4,
"grad_norm": 0.288083553314209,
"learning_rate": 9.961593638710804e-05,
"loss": 0.9954,
"step": 1110
},
{
"epoch": 0.41,
"grad_norm": 0.29730525612831116,
"learning_rate": 9.960886626674302e-05,
"loss": 1.071,
"step": 1120
},
{
"epoch": 0.41,
"grad_norm": 0.2090187519788742,
"learning_rate": 9.960173191811348e-05,
"loss": 0.9725,
"step": 1130
},
{
"epoch": 0.41,
"grad_norm": 0.2811983525753021,
"learning_rate": 9.959453335045622e-05,
"loss": 1.0071,
"step": 1140
},
{
"epoch": 0.42,
"grad_norm": 0.27806761860847473,
"learning_rate": 9.958727057309115e-05,
"loss": 1.0108,
"step": 1150
},
{
"epoch": 0.42,
"grad_norm": 0.2864569127559662,
"learning_rate": 9.957994359542138e-05,
"loss": 1.0495,
"step": 1160
},
{
"epoch": 0.42,
"grad_norm": 0.3440109193325043,
"learning_rate": 9.957255242693308e-05,
"loss": 1.0015,
"step": 1170
},
{
"epoch": 0.43,
"grad_norm": 0.2824917435646057,
"learning_rate": 9.956509707719555e-05,
"loss": 1.0559,
"step": 1180
},
{
"epoch": 0.43,
"grad_norm": 0.3080492317676544,
"learning_rate": 9.955757755586119e-05,
"loss": 1.0134,
"step": 1190
},
{
"epoch": 0.43,
"grad_norm": 0.2890901565551758,
"learning_rate": 9.954999387266546e-05,
"loss": 0.9492,
"step": 1200
},
{
"epoch": 0.43,
"eval_loss": 1.0133627653121948,
"eval_runtime": 124.7104,
"eval_samples_per_second": 62.553,
"eval_steps_per_second": 3.913,
"step": 1200
},
{
"epoch": 0.44,
"grad_norm": 0.33987322449684143,
"learning_rate": 9.95423460374269e-05,
"loss": 0.9629,
"step": 1210
},
{
"epoch": 0.44,
"grad_norm": 0.29403063654899597,
"learning_rate": 9.953463406004713e-05,
"loss": 1.0384,
"step": 1220
},
{
"epoch": 0.45,
"grad_norm": 0.20130111277103424,
"learning_rate": 9.952685795051077e-05,
"loss": 1.0235,
"step": 1230
},
{
"epoch": 0.45,
"grad_norm": 0.1973690539598465,
"learning_rate": 9.951901771888552e-05,
"loss": 1.0395,
"step": 1240
},
{
"epoch": 0.45,
"grad_norm": 0.24519580602645874,
"learning_rate": 9.951111337532205e-05,
"loss": 1.0914,
"step": 1250
},
{
"epoch": 0.46,
"grad_norm": 0.2706618309020996,
"learning_rate": 9.950314493005408e-05,
"loss": 1.0714,
"step": 1260
},
{
"epoch": 0.46,
"grad_norm": 0.23367558419704437,
"learning_rate": 9.949511239339831e-05,
"loss": 1.0224,
"step": 1270
},
{
"epoch": 0.46,
"grad_norm": 0.30005407333374023,
"learning_rate": 9.948701577575439e-05,
"loss": 1.0152,
"step": 1280
},
{
"epoch": 0.47,
"grad_norm": 0.3130083382129669,
"learning_rate": 9.947885508760496e-05,
"loss": 0.8988,
"step": 1290
},
{
"epoch": 0.47,
"grad_norm": 0.23657679557800293,
"learning_rate": 9.94706303395156e-05,
"loss": 1.0242,
"step": 1300
},
{
"epoch": 0.47,
"grad_norm": 0.40966659784317017,
"learning_rate": 9.946234154213487e-05,
"loss": 1.0145,
"step": 1310
},
{
"epoch": 0.48,
"grad_norm": 0.35292962193489075,
"learning_rate": 9.94539887061942e-05,
"loss": 1.0197,
"step": 1320
},
{
"epoch": 0.48,
"grad_norm": 0.38793638348579407,
"learning_rate": 9.944557184250794e-05,
"loss": 1.0273,
"step": 1330
},
{
"epoch": 0.48,
"grad_norm": 0.27373677492141724,
"learning_rate": 9.943709096197335e-05,
"loss": 0.9561,
"step": 1340
},
{
"epoch": 0.49,
"grad_norm": 0.24536257982254028,
"learning_rate": 9.942854607557057e-05,
"loss": 0.9678,
"step": 1350
},
{
"epoch": 0.49,
"grad_norm": 0.4609609842300415,
"learning_rate": 9.941993719436262e-05,
"loss": 1.0429,
"step": 1360
},
{
"epoch": 0.5,
"grad_norm": 0.27118805050849915,
"learning_rate": 9.941126432949535e-05,
"loss": 1.0506,
"step": 1370
},
{
"epoch": 0.5,
"grad_norm": 0.27538400888442993,
"learning_rate": 9.940252749219746e-05,
"loss": 1.0326,
"step": 1380
},
{
"epoch": 0.5,
"grad_norm": 0.2451954036951065,
"learning_rate": 9.939372669378048e-05,
"loss": 1.0413,
"step": 1390
},
{
"epoch": 0.51,
"grad_norm": 0.2622232437133789,
"learning_rate": 9.938486194563875e-05,
"loss": 1.0051,
"step": 1400
},
{
"epoch": 0.51,
"eval_loss": 1.011703372001648,
"eval_runtime": 124.6726,
"eval_samples_per_second": 62.572,
"eval_steps_per_second": 3.914,
"step": 1400
},
{
"epoch": 0.51,
"grad_norm": 0.2616746425628662,
"learning_rate": 9.937593325924937e-05,
"loss": 1.0277,
"step": 1410
},
{
"epoch": 0.51,
"grad_norm": 0.2952045202255249,
"learning_rate": 9.936694064617227e-05,
"loss": 0.9802,
"step": 1420
},
{
"epoch": 0.52,
"grad_norm": 0.2611790895462036,
"learning_rate": 9.935788411805011e-05,
"loss": 0.9811,
"step": 1430
},
{
"epoch": 0.52,
"grad_norm": 0.3291374742984772,
"learning_rate": 9.934876368660836e-05,
"loss": 0.9972,
"step": 1440
},
{
"epoch": 0.52,
"grad_norm": 0.32888704538345337,
"learning_rate": 9.933957936365515e-05,
"loss": 1.1006,
"step": 1450
},
{
"epoch": 0.53,
"grad_norm": 0.20011785626411438,
"learning_rate": 9.933033116108134e-05,
"loss": 1.0139,
"step": 1460
},
{
"epoch": 0.53,
"grad_norm": 0.3157961666584015,
"learning_rate": 9.932101909086056e-05,
"loss": 0.993,
"step": 1470
},
{
"epoch": 0.54,
"grad_norm": 0.22981207072734833,
"learning_rate": 9.931164316504904e-05,
"loss": 1.0539,
"step": 1480
},
{
"epoch": 0.54,
"grad_norm": 0.23787029087543488,
"learning_rate": 9.930220339578576e-05,
"loss": 0.9599,
"step": 1490
},
{
"epoch": 0.54,
"grad_norm": 0.2633046507835388,
"learning_rate": 9.929269979529232e-05,
"loss": 0.9813,
"step": 1500
},
{
"epoch": 0.55,
"grad_norm": 0.2666633725166321,
"learning_rate": 9.928313237587296e-05,
"loss": 0.9637,
"step": 1510
},
{
"epoch": 0.55,
"grad_norm": 0.26092538237571716,
"learning_rate": 9.927350114991456e-05,
"loss": 1.0375,
"step": 1520
},
{
"epoch": 0.55,
"grad_norm": 0.2837240397930145,
"learning_rate": 9.92638061298866e-05,
"loss": 1.0053,
"step": 1530
},
{
"epoch": 0.56,
"grad_norm": 0.2586491107940674,
"learning_rate": 9.925404732834117e-05,
"loss": 1.0631,
"step": 1540
},
{
"epoch": 0.56,
"grad_norm": 0.43321874737739563,
"learning_rate": 9.924422475791288e-05,
"loss": 1.0134,
"step": 1550
},
{
"epoch": 0.56,
"grad_norm": 0.19062629342079163,
"learning_rate": 9.923433843131901e-05,
"loss": 0.9989,
"step": 1560
},
{
"epoch": 0.57,
"grad_norm": 0.34545308351516724,
"learning_rate": 9.922438836135928e-05,
"loss": 1.0896,
"step": 1570
},
{
"epoch": 0.57,
"grad_norm": 0.2846600115299225,
"learning_rate": 9.921437456091596e-05,
"loss": 0.9954,
"step": 1580
},
{
"epoch": 0.58,
"grad_norm": 0.25403323769569397,
"learning_rate": 9.920429704295391e-05,
"loss": 0.9937,
"step": 1590
},
{
"epoch": 0.58,
"grad_norm": 0.23549498617649078,
"learning_rate": 9.919415582052036e-05,
"loss": 1.0469,
"step": 1600
},
{
"epoch": 0.58,
"eval_loss": 1.0105613470077515,
"eval_runtime": 124.7139,
"eval_samples_per_second": 62.551,
"eval_steps_per_second": 3.913,
"step": 1600
},
{
"epoch": 0.58,
"grad_norm": 0.21466514468193054,
"learning_rate": 9.918395090674514e-05,
"loss": 1.0408,
"step": 1610
},
{
"epoch": 0.59,
"grad_norm": 0.21247586607933044,
"learning_rate": 9.917368231484045e-05,
"loss": 0.9893,
"step": 1620
},
{
"epoch": 0.59,
"grad_norm": 0.26590731739997864,
"learning_rate": 9.916335005810095e-05,
"loss": 1.0563,
"step": 1630
},
{
"epoch": 0.59,
"grad_norm": 0.2346472591161728,
"learning_rate": 9.91529541499038e-05,
"loss": 1.0061,
"step": 1640
},
{
"epoch": 0.6,
"grad_norm": 0.27766481041908264,
"learning_rate": 9.914249460370846e-05,
"loss": 0.9639,
"step": 1650
},
{
"epoch": 0.6,
"grad_norm": 0.24883978068828583,
"learning_rate": 9.913197143305684e-05,
"loss": 1.0289,
"step": 1660
},
{
"epoch": 0.6,
"grad_norm": 0.2379382699728012,
"learning_rate": 9.912138465157325e-05,
"loss": 1.0154,
"step": 1670
},
{
"epoch": 0.61,
"grad_norm": 0.17160119116306305,
"learning_rate": 9.91107342729643e-05,
"loss": 1.0002,
"step": 1680
},
{
"epoch": 0.61,
"grad_norm": 0.2804344892501831,
"learning_rate": 9.910002031101895e-05,
"loss": 0.9887,
"step": 1690
},
{
"epoch": 0.62,
"grad_norm": 0.2296508252620697,
"learning_rate": 9.908924277960854e-05,
"loss": 1.0703,
"step": 1700
},
{
"epoch": 0.62,
"grad_norm": 0.22265523672103882,
"learning_rate": 9.907840169268662e-05,
"loss": 0.9495,
"step": 1710
},
{
"epoch": 0.62,
"grad_norm": 0.3383825123310089,
"learning_rate": 9.90674970642891e-05,
"loss": 0.9878,
"step": 1720
},
{
"epoch": 0.63,
"grad_norm": 0.2603285312652588,
"learning_rate": 9.905652890853411e-05,
"loss": 1.0351,
"step": 1730
},
{
"epoch": 0.63,
"grad_norm": 0.27001509070396423,
"learning_rate": 9.904549723962206e-05,
"loss": 1.0528,
"step": 1740
},
{
"epoch": 0.63,
"grad_norm": 0.34035804867744446,
"learning_rate": 9.903440207183558e-05,
"loss": 1.0159,
"step": 1750
},
{
"epoch": 0.64,
"grad_norm": 0.3518404960632324,
"learning_rate": 9.90232434195395e-05,
"loss": 0.9879,
"step": 1760
},
{
"epoch": 0.64,
"grad_norm": 0.24958577752113342,
"learning_rate": 9.901202129718086e-05,
"loss": 1.0221,
"step": 1770
},
{
"epoch": 0.64,
"grad_norm": 0.23898568749427795,
"learning_rate": 9.900073571928886e-05,
"loss": 1.037,
"step": 1780
},
{
"epoch": 0.65,
"grad_norm": 0.22275009751319885,
"learning_rate": 9.898938670047486e-05,
"loss": 1.0008,
"step": 1790
},
{
"epoch": 0.65,
"grad_norm": 0.2770971655845642,
"learning_rate": 9.897797425543236e-05,
"loss": 0.9994,
"step": 1800
},
{
"epoch": 0.65,
"eval_loss": 1.0094062089920044,
"eval_runtime": 124.6598,
"eval_samples_per_second": 62.578,
"eval_steps_per_second": 3.915,
"step": 1800
},
{
"epoch": 0.66,
"grad_norm": 0.2470710575580597,
"learning_rate": 9.896649839893699e-05,
"loss": 1.0093,
"step": 1810
},
{
"epoch": 0.66,
"grad_norm": 0.31282275915145874,
"learning_rate": 9.895495914584643e-05,
"loss": 1.0124,
"step": 1820
},
{
"epoch": 0.66,
"grad_norm": 0.2757389545440674,
"learning_rate": 9.894335651110051e-05,
"loss": 1.0197,
"step": 1830
},
{
"epoch": 0.67,
"grad_norm": 0.3123573362827301,
"learning_rate": 9.893169050972106e-05,
"loss": 0.9469,
"step": 1840
},
{
"epoch": 0.67,
"grad_norm": 0.4073740839958191,
"learning_rate": 9.8919961156812e-05,
"loss": 1.0153,
"step": 1850
},
{
"epoch": 0.67,
"grad_norm": 0.24388962984085083,
"learning_rate": 9.89081684675592e-05,
"loss": 1.0124,
"step": 1860
},
{
"epoch": 0.68,
"grad_norm": 0.27508777379989624,
"learning_rate": 9.88963124572306e-05,
"loss": 0.96,
"step": 1870
},
{
"epoch": 0.68,
"grad_norm": 0.2843553125858307,
"learning_rate": 9.88843931411761e-05,
"loss": 1.0448,
"step": 1880
},
{
"epoch": 0.68,
"grad_norm": 0.25155389308929443,
"learning_rate": 9.887241053482757e-05,
"loss": 1.0362,
"step": 1890
},
{
"epoch": 0.69,
"grad_norm": 0.21977895498275757,
"learning_rate": 9.886036465369877e-05,
"loss": 1.0658,
"step": 1900
},
{
"epoch": 0.69,
"grad_norm": 0.22326160967350006,
"learning_rate": 9.884825551338546e-05,
"loss": 1.0068,
"step": 1910
},
{
"epoch": 0.69,
"grad_norm": 0.3339684307575226,
"learning_rate": 9.883608312956524e-05,
"loss": 1.0147,
"step": 1920
},
{
"epoch": 0.7,
"grad_norm": 0.26512840390205383,
"learning_rate": 9.882384751799762e-05,
"loss": 0.9421,
"step": 1930
},
{
"epoch": 0.7,
"grad_norm": 0.313123881816864,
"learning_rate": 9.881154869452395e-05,
"loss": 1.0032,
"step": 1940
},
{
"epoch": 0.71,
"grad_norm": 0.3562926948070526,
"learning_rate": 9.879918667506748e-05,
"loss": 1.0491,
"step": 1950
},
{
"epoch": 0.71,
"grad_norm": 0.373032808303833,
"learning_rate": 9.87867614756332e-05,
"loss": 0.9823,
"step": 1960
},
{
"epoch": 0.71,
"grad_norm": 0.2701728641986847,
"learning_rate": 9.87742731123079e-05,
"loss": 1.0326,
"step": 1970
},
{
"epoch": 0.72,
"grad_norm": 0.4167492687702179,
"learning_rate": 9.876172160126024e-05,
"loss": 1.0256,
"step": 1980
},
{
"epoch": 0.72,
"grad_norm": 0.2636062800884247,
"learning_rate": 9.874910695874053e-05,
"loss": 1.0301,
"step": 1990
},
{
"epoch": 0.72,
"grad_norm": 0.27048760652542114,
"learning_rate": 9.873642920108091e-05,
"loss": 1.0141,
"step": 2000
},
{
"epoch": 0.72,
"eval_loss": 1.0082145929336548,
"eval_runtime": 124.7209,
"eval_samples_per_second": 62.548,
"eval_steps_per_second": 3.913,
"step": 2000
},
{
"epoch": 0.73,
"grad_norm": 0.26596397161483765,
"learning_rate": 9.872368834469514e-05,
"loss": 0.9554,
"step": 2010
},
{
"epoch": 0.73,
"grad_norm": 0.3881726861000061,
"learning_rate": 9.871088440607874e-05,
"loss": 1.0374,
"step": 2020
},
{
"epoch": 0.73,
"grad_norm": 0.345869243144989,
"learning_rate": 9.869801740180889e-05,
"loss": 1.01,
"step": 2030
},
{
"epoch": 0.74,
"grad_norm": 0.3740908205509186,
"learning_rate": 9.86850873485444e-05,
"loss": 1.0244,
"step": 2040
},
{
"epoch": 0.74,
"grad_norm": 0.3265666663646698,
"learning_rate": 9.867209426302572e-05,
"loss": 0.9303,
"step": 2050
},
{
"epoch": 0.75,
"grad_norm": 0.381783664226532,
"learning_rate": 9.865903816207493e-05,
"loss": 1.0851,
"step": 2060
},
{
"epoch": 0.75,
"grad_norm": 0.30846527218818665,
"learning_rate": 9.864591906259568e-05,
"loss": 1.0042,
"step": 2070
},
{
"epoch": 0.75,
"grad_norm": 0.36899617314338684,
"learning_rate": 9.863273698157315e-05,
"loss": 0.9866,
"step": 2080
},
{
"epoch": 0.76,
"grad_norm": 0.25415265560150146,
"learning_rate": 9.861949193607411e-05,
"loss": 1.056,
"step": 2090
},
{
"epoch": 0.76,
"grad_norm": 0.3369081914424896,
"learning_rate": 9.860618394324682e-05,
"loss": 0.9988,
"step": 2100
},
{
"epoch": 0.76,
"grad_norm": 0.19644911587238312,
"learning_rate": 9.859281302032106e-05,
"loss": 0.9562,
"step": 2110
},
{
"epoch": 0.77,
"grad_norm": 0.3449130356311798,
"learning_rate": 9.857937918460808e-05,
"loss": 1.0325,
"step": 2120
},
{
"epoch": 0.77,
"grad_norm": 0.2639143764972687,
"learning_rate": 9.856588245350056e-05,
"loss": 1.0458,
"step": 2130
},
{
"epoch": 0.77,
"grad_norm": 0.2752164602279663,
"learning_rate": 9.855232284447262e-05,
"loss": 1.089,
"step": 2140
},
{
"epoch": 0.78,
"grad_norm": 0.31700417399406433,
"learning_rate": 9.853870037507983e-05,
"loss": 1.0398,
"step": 2150
},
{
"epoch": 0.78,
"grad_norm": 0.24685466289520264,
"learning_rate": 9.852501506295907e-05,
"loss": 1.0038,
"step": 2160
},
{
"epoch": 0.79,
"grad_norm": 0.28860118985176086,
"learning_rate": 9.851126692582864e-05,
"loss": 1.0343,
"step": 2170
},
{
"epoch": 0.79,
"grad_norm": 0.2774854898452759,
"learning_rate": 9.849745598148817e-05,
"loss": 0.9986,
"step": 2180
},
{
"epoch": 0.79,
"grad_norm": 0.28867611289024353,
"learning_rate": 9.848358224781857e-05,
"loss": 1.035,
"step": 2190
},
{
"epoch": 0.8,
"grad_norm": 0.2703929841518402,
"learning_rate": 9.84696457427821e-05,
"loss": 1.0891,
"step": 2200
},
{
"epoch": 0.8,
"eval_loss": 1.0072919130325317,
"eval_runtime": 125.0779,
"eval_samples_per_second": 62.369,
"eval_steps_per_second": 3.902,
"step": 2200
},
{
"epoch": 0.8,
"grad_norm": 0.3247489035129547,
"learning_rate": 9.845564648442222e-05,
"loss": 1.0259,
"step": 2210
},
{
"epoch": 0.8,
"grad_norm": 0.2535197138786316,
"learning_rate": 9.844158449086371e-05,
"loss": 1.0457,
"step": 2220
},
{
"epoch": 0.81,
"grad_norm": 0.26780492067337036,
"learning_rate": 9.842745978031253e-05,
"loss": 0.9869,
"step": 2230
},
{
"epoch": 0.81,
"grad_norm": 0.29711589217185974,
"learning_rate": 9.841327237105585e-05,
"loss": 1.0158,
"step": 2240
},
{
"epoch": 0.81,
"grad_norm": 0.239434614777565,
"learning_rate": 9.8399022281462e-05,
"loss": 0.997,
"step": 2250
},
{
"epoch": 0.82,
"grad_norm": 0.2368830293416977,
"learning_rate": 9.838470952998049e-05,
"loss": 1.0148,
"step": 2260
},
{
"epoch": 0.82,
"grad_norm": 0.2554934322834015,
"learning_rate": 9.837033413514191e-05,
"loss": 0.9787,
"step": 2270
},
{
"epoch": 0.83,
"grad_norm": 0.2310570627450943,
"learning_rate": 9.835589611555805e-05,
"loss": 0.9656,
"step": 2280
},
{
"epoch": 0.83,
"grad_norm": 0.22654668986797333,
"learning_rate": 9.834139548992165e-05,
"loss": 0.9837,
"step": 2290
},
{
"epoch": 0.83,
"grad_norm": 0.25957950949668884,
"learning_rate": 9.832683227700661e-05,
"loss": 1.0513,
"step": 2300
},
{
"epoch": 0.84,
"grad_norm": 0.20669637620449066,
"learning_rate": 9.831220649566782e-05,
"loss": 0.9649,
"step": 2310
},
{
"epoch": 0.84,
"grad_norm": 0.24330663681030273,
"learning_rate": 9.829751816484116e-05,
"loss": 1.0208,
"step": 2320
},
{
"epoch": 0.84,
"grad_norm": 0.28211724758148193,
"learning_rate": 9.828276730354353e-05,
"loss": 0.9512,
"step": 2330
},
{
"epoch": 0.85,
"grad_norm": 0.23784276843070984,
"learning_rate": 9.826795393087278e-05,
"loss": 0.976,
"step": 2340
},
{
"epoch": 0.85,
"grad_norm": 0.2881389260292053,
"learning_rate": 9.825307806600765e-05,
"loss": 1.0036,
"step": 2350
},
{
"epoch": 0.85,
"grad_norm": 0.27906882762908936,
"learning_rate": 9.823813972820786e-05,
"loss": 1.0555,
"step": 2360
},
{
"epoch": 0.86,
"grad_norm": 0.25142115354537964,
"learning_rate": 9.822313893681397e-05,
"loss": 1.0483,
"step": 2370
},
{
"epoch": 0.86,
"grad_norm": 0.244681715965271,
"learning_rate": 9.820807571124738e-05,
"loss": 1.0102,
"step": 2380
},
{
"epoch": 0.87,
"grad_norm": 0.3696367144584656,
"learning_rate": 9.819295007101035e-05,
"loss": 1.0626,
"step": 2390
},
{
"epoch": 0.87,
"grad_norm": 0.26112619042396545,
"learning_rate": 9.817776203568596e-05,
"loss": 1.0141,
"step": 2400
},
{
"epoch": 0.87,
"eval_loss": 1.0063296556472778,
"eval_runtime": 125.7335,
"eval_samples_per_second": 62.044,
"eval_steps_per_second": 3.881,
"step": 2400
},
{
"epoch": 0.87,
"grad_norm": 0.25221410393714905,
"learning_rate": 9.816251162493804e-05,
"loss": 1.0222,
"step": 2410
},
{
"epoch": 0.88,
"grad_norm": 0.19672074913978577,
"learning_rate": 9.814719885851121e-05,
"loss": 0.9891,
"step": 2420
},
{
"epoch": 0.88,
"grad_norm": 0.3084292411804199,
"learning_rate": 9.81318237562308e-05,
"loss": 0.9785,
"step": 2430
},
{
"epoch": 0.88,
"grad_norm": 0.3434545397758484,
"learning_rate": 9.811638633800287e-05,
"loss": 0.9357,
"step": 2440
},
{
"epoch": 0.89,
"grad_norm": 0.23335447907447815,
"learning_rate": 9.81008866238141e-05,
"loss": 1.0485,
"step": 2450
},
{
"epoch": 0.89,
"grad_norm": 0.2942172586917877,
"learning_rate": 9.808532463373188e-05,
"loss": 1.0138,
"step": 2460
},
{
"epoch": 0.89,
"grad_norm": 0.22536420822143555,
"learning_rate": 9.806970038790423e-05,
"loss": 1.0421,
"step": 2470
},
{
"epoch": 0.9,
"grad_norm": 0.30886924266815186,
"learning_rate": 9.805401390655975e-05,
"loss": 0.9926,
"step": 2480
},
{
"epoch": 0.9,
"grad_norm": 0.34105512499809265,
"learning_rate": 9.803826521000761e-05,
"loss": 1.0013,
"step": 2490
},
{
"epoch": 0.9,
"grad_norm": 0.261643648147583,
"learning_rate": 9.802245431863757e-05,
"loss": 0.9937,
"step": 2500
},
{
"epoch": 0.91,
"grad_norm": 0.3864617347717285,
"learning_rate": 9.800658125291984e-05,
"loss": 0.9986,
"step": 2510
},
{
"epoch": 0.91,
"grad_norm": 0.31850436329841614,
"learning_rate": 9.79906460334052e-05,
"loss": 0.9984,
"step": 2520
},
{
"epoch": 0.92,
"grad_norm": 0.25421255826950073,
"learning_rate": 9.797464868072488e-05,
"loss": 1.0273,
"step": 2530
},
{
"epoch": 0.92,
"grad_norm": 0.34440311789512634,
"learning_rate": 9.795858921559052e-05,
"loss": 1.0346,
"step": 2540
},
{
"epoch": 0.92,
"grad_norm": 0.33147209882736206,
"learning_rate": 9.79424676587942e-05,
"loss": 1.0691,
"step": 2550
},
{
"epoch": 0.93,
"grad_norm": 0.2778458893299103,
"learning_rate": 9.792628403120842e-05,
"loss": 1.009,
"step": 2560
},
{
"epoch": 0.93,
"grad_norm": 0.29282572865486145,
"learning_rate": 9.791003835378598e-05,
"loss": 1.0015,
"step": 2570
},
{
"epoch": 0.93,
"grad_norm": 0.25391730666160583,
"learning_rate": 9.789373064756008e-05,
"loss": 1.0177,
"step": 2580
},
{
"epoch": 0.94,
"grad_norm": 0.23779381811618805,
"learning_rate": 9.787736093364416e-05,
"loss": 1.0935,
"step": 2590
},
{
"epoch": 0.94,
"grad_norm": 0.2965840995311737,
"learning_rate": 9.786092923323203e-05,
"loss": 1.0002,
"step": 2600
},
{
"epoch": 0.94,
"eval_loss": 1.005922555923462,
"eval_runtime": 125.0587,
"eval_samples_per_second": 62.379,
"eval_steps_per_second": 3.902,
"step": 2600
},
{
"epoch": 0.94,
"grad_norm": 0.23760788142681122,
"learning_rate": 9.784443556759766e-05,
"loss": 1.0305,
"step": 2610
},
{
"epoch": 0.95,
"grad_norm": 0.22895409166812897,
"learning_rate": 9.78278799580953e-05,
"loss": 1.0427,
"step": 2620
},
{
"epoch": 0.95,
"grad_norm": 0.36007368564605713,
"learning_rate": 9.781126242615939e-05,
"loss": 1.0059,
"step": 2630
},
{
"epoch": 0.96,
"grad_norm": 0.2813151776790619,
"learning_rate": 9.779458299330452e-05,
"loss": 1.0418,
"step": 2640
},
{
"epoch": 0.96,
"grad_norm": 0.27038782835006714,
"learning_rate": 9.777784168112545e-05,
"loss": 1.0092,
"step": 2650
},
{
"epoch": 0.96,
"grad_norm": 0.22898097336292267,
"learning_rate": 9.776103851129706e-05,
"loss": 0.9883,
"step": 2660
},
{
"epoch": 0.97,
"grad_norm": 0.2213810682296753,
"learning_rate": 9.774417350557428e-05,
"loss": 1.0753,
"step": 2670
},
{
"epoch": 0.97,
"grad_norm": 0.22410623729228973,
"learning_rate": 9.772724668579212e-05,
"loss": 1.0524,
"step": 2680
},
{
"epoch": 0.97,
"grad_norm": 0.3005650043487549,
"learning_rate": 9.771025807386562e-05,
"loss": 1.0562,
"step": 2690
},
{
"epoch": 0.98,
"grad_norm": 0.3941683769226074,
"learning_rate": 9.769320769178983e-05,
"loss": 0.9925,
"step": 2700
},
{
"epoch": 0.98,
"grad_norm": 0.2829142212867737,
"learning_rate": 9.767609556163977e-05,
"loss": 1.014,
"step": 2710
},
{
"epoch": 0.98,
"grad_norm": 0.29680418968200684,
"learning_rate": 9.765892170557038e-05,
"loss": 0.9677,
"step": 2720
},
{
"epoch": 0.99,
"grad_norm": 0.22002767026424408,
"learning_rate": 9.764168614581655e-05,
"loss": 0.9954,
"step": 2730
},
{
"epoch": 0.99,
"grad_norm": 0.2758820354938507,
"learning_rate": 9.762438890469304e-05,
"loss": 1.0029,
"step": 2740
},
{
"epoch": 1.0,
"grad_norm": 0.2981850802898407,
"learning_rate": 9.760703000459446e-05,
"loss": 1.0555,
"step": 2750
},
{
"epoch": 1.0,
"grad_norm": 0.22340857982635498,
"learning_rate": 9.758960946799528e-05,
"loss": 1.0394,
"step": 2760
},
{
"epoch": 1.0,
"grad_norm": 0.19991633296012878,
"learning_rate": 9.757212731744974e-05,
"loss": 0.9325,
"step": 2770
},
{
"epoch": 1.01,
"grad_norm": 0.30030888319015503,
"learning_rate": 9.755458357559186e-05,
"loss": 0.9711,
"step": 2780
},
{
"epoch": 1.01,
"grad_norm": 0.3804832696914673,
"learning_rate": 9.753697826513541e-05,
"loss": 0.9651,
"step": 2790
},
{
"epoch": 1.01,
"grad_norm": 0.46047547459602356,
"learning_rate": 9.751931140887387e-05,
"loss": 0.9686,
"step": 2800
},
{
"epoch": 1.01,
"eval_loss": 1.0086077451705933,
"eval_runtime": 124.6354,
"eval_samples_per_second": 62.591,
"eval_steps_per_second": 3.915,
"step": 2800
},
{
"epoch": 1.02,
"grad_norm": 0.30646952986717224,
"learning_rate": 9.750158302968039e-05,
"loss": 0.9267,
"step": 2810
},
{
"epoch": 1.02,
"grad_norm": 0.3007545471191406,
"learning_rate": 9.748379315050778e-05,
"loss": 1.0193,
"step": 2820
},
{
"epoch": 1.02,
"grad_norm": 0.2814784049987793,
"learning_rate": 9.74659417943885e-05,
"loss": 0.8893,
"step": 2830
},
{
"epoch": 1.03,
"grad_norm": 0.2728348970413208,
"learning_rate": 9.744802898443456e-05,
"loss": 0.937,
"step": 2840
},
{
"epoch": 1.03,
"grad_norm": 0.2994844913482666,
"learning_rate": 9.743005474383755e-05,
"loss": 0.949,
"step": 2850
},
{
"epoch": 1.04,
"grad_norm": 0.43111738562583923,
"learning_rate": 9.741201909586861e-05,
"loss": 0.9897,
"step": 2860
},
{
"epoch": 1.04,
"grad_norm": 0.29551658034324646,
"learning_rate": 9.739392206387838e-05,
"loss": 0.9393,
"step": 2870
},
{
"epoch": 1.04,
"grad_norm": 0.40380623936653137,
"learning_rate": 9.737576367129694e-05,
"loss": 0.9365,
"step": 2880
},
{
"epoch": 1.05,
"grad_norm": 0.2757427394390106,
"learning_rate": 9.735754394163386e-05,
"loss": 1.0074,
"step": 2890
},
{
"epoch": 1.05,
"grad_norm": 0.35594430565834045,
"learning_rate": 9.73392628984781e-05,
"loss": 0.9682,
"step": 2900
},
{
"epoch": 1.05,
"grad_norm": 0.32288888096809387,
"learning_rate": 9.732092056549799e-05,
"loss": 0.9753,
"step": 2910
},
{
"epoch": 1.06,
"grad_norm": 0.3491690158843994,
"learning_rate": 9.730251696644122e-05,
"loss": 0.926,
"step": 2920
},
{
"epoch": 1.06,
"grad_norm": 0.41806405782699585,
"learning_rate": 9.728405212513483e-05,
"loss": 0.9993,
"step": 2930
},
{
"epoch": 1.06,
"grad_norm": 0.4885188043117523,
"learning_rate": 9.726552606548512e-05,
"loss": 0.9879,
"step": 2940
},
{
"epoch": 1.07,
"grad_norm": 0.41796302795410156,
"learning_rate": 9.724693881147761e-05,
"loss": 0.9626,
"step": 2950
},
{
"epoch": 1.07,
"grad_norm": 0.39677631855010986,
"learning_rate": 9.722829038717717e-05,
"loss": 0.9767,
"step": 2960
},
{
"epoch": 1.07,
"grad_norm": 0.5329232215881348,
"learning_rate": 9.720958081672773e-05,
"loss": 0.9357,
"step": 2970
},
{
"epoch": 1.08,
"grad_norm": 0.4468931257724762,
"learning_rate": 9.719081012435247e-05,
"loss": 0.9705,
"step": 2980
},
{
"epoch": 1.08,
"grad_norm": 0.4029316306114197,
"learning_rate": 9.717197833435367e-05,
"loss": 0.9727,
"step": 2990
},
{
"epoch": 1.09,
"grad_norm": 0.37598028779029846,
"learning_rate": 9.715308547111273e-05,
"loss": 0.9767,
"step": 3000
},
{
"epoch": 1.09,
"eval_loss": 1.014098048210144,
"eval_runtime": 125.4232,
"eval_samples_per_second": 62.197,
"eval_steps_per_second": 3.891,
"step": 3000
},
{
"epoch": 1.09,
"grad_norm": 0.3833357095718384,
"learning_rate": 9.713413155909009e-05,
"loss": 0.9605,
"step": 3010
},
{
"epoch": 1.09,
"grad_norm": 0.4391871988773346,
"learning_rate": 9.711511662282527e-05,
"loss": 0.9611,
"step": 3020
},
{
"epoch": 1.1,
"grad_norm": 0.39860454201698303,
"learning_rate": 9.709604068693679e-05,
"loss": 0.9222,
"step": 3030
},
{
"epoch": 1.1,
"grad_norm": 0.33882561326026917,
"learning_rate": 9.707690377612211e-05,
"loss": 0.9369,
"step": 3040
},
{
"epoch": 1.1,
"grad_norm": 0.3763039708137512,
"learning_rate": 9.705770591515768e-05,
"loss": 0.8864,
"step": 3050
},
{
"epoch": 1.11,
"grad_norm": 0.3221600353717804,
"learning_rate": 9.703844712889884e-05,
"loss": 0.9753,
"step": 3060
},
{
"epoch": 1.11,
"grad_norm": 0.3342023491859436,
"learning_rate": 9.701912744227979e-05,
"loss": 0.9233,
"step": 3070
},
{
"epoch": 1.11,
"grad_norm": 0.4082651734352112,
"learning_rate": 9.699974688031363e-05,
"loss": 0.987,
"step": 3080
},
{
"epoch": 1.12,
"grad_norm": 0.4198564291000366,
"learning_rate": 9.69803054680922e-05,
"loss": 0.8833,
"step": 3090
},
{
"epoch": 1.12,
"grad_norm": 0.3833492398262024,
"learning_rate": 9.696080323078621e-05,
"loss": 0.9894,
"step": 3100
},
{
"epoch": 1.13,
"grad_norm": 0.35935208201408386,
"learning_rate": 9.694124019364505e-05,
"loss": 0.9417,
"step": 3110
},
{
"epoch": 1.13,
"grad_norm": 0.3433043658733368,
"learning_rate": 9.692161638199686e-05,
"loss": 0.9251,
"step": 3120
},
{
"epoch": 1.13,
"grad_norm": 0.30163127183914185,
"learning_rate": 9.690193182124844e-05,
"loss": 0.9447,
"step": 3130
},
{
"epoch": 1.14,
"grad_norm": 0.4361821711063385,
"learning_rate": 9.68821865368853e-05,
"loss": 0.9984,
"step": 3140
},
{
"epoch": 1.14,
"grad_norm": 0.4263075888156891,
"learning_rate": 9.686238055447148e-05,
"loss": 0.9422,
"step": 3150
},
{
"epoch": 1.14,
"grad_norm": 0.33963072299957275,
"learning_rate": 9.684251389964967e-05,
"loss": 0.9199,
"step": 3160
},
{
"epoch": 1.15,
"grad_norm": 0.41040754318237305,
"learning_rate": 9.68225865981411e-05,
"loss": 0.9249,
"step": 3170
},
{
"epoch": 1.15,
"grad_norm": 0.3697950839996338,
"learning_rate": 9.680259867574552e-05,
"loss": 0.947,
"step": 3180
},
{
"epoch": 1.15,
"grad_norm": 0.3211696743965149,
"learning_rate": 9.678255015834112e-05,
"loss": 0.9956,
"step": 3190
},
{
"epoch": 1.16,
"grad_norm": 0.4463675022125244,
"learning_rate": 9.676244107188463e-05,
"loss": 0.9494,
"step": 3200
},
{
"epoch": 1.16,
"eval_loss": 1.0160499811172485,
"eval_runtime": 124.6588,
"eval_samples_per_second": 62.579,
"eval_steps_per_second": 3.915,
"step": 3200
},
{
"epoch": 1.16,
"step": 3200,
"total_flos": 8.146148608211681e+17,
"train_loss": 1.015018144249916,
"train_runtime": 4695.8401,
"train_samples_per_second": 94.128,
"train_steps_per_second": 5.884
}
],
"logging_steps": 10,
"max_steps": 27630,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"total_flos": 8.146148608211681e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}