llama2_chatdoctor_finetuned / trainer_state.json
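What follows is the raw Trainer state logged during the LoRA fine-tuning run (best eval loss 0.6168 at /Lora_models/checkpoint-5000, evaluated every 200 steps). Before the JSON itself, a minimal sketch of how the log_history entries can be read back to recover the training and evaluation loss curves; it assumes only that this file has been saved locally as trainer_state.json (the local path is an assumption, not part of the upload):

# Minimal sketch (not part of the uploaded files): parse trainer_state.json and
# pull out the loss curves that the Hugging Face Trainer logged below.
import json

with open("trainer_state.json") as f:   # assumed local path to this file
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print("best_metric:", state["best_metric"])                    # 0.6168... (best eval loss)
print("best_model_checkpoint:", state["best_model_checkpoint"])
print("last logged train point (step, loss):", train_points[-1])
print("last logged eval point (step, eval_loss):", eval_points[-1])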
{
"best_metric": 0.6168031096458435,
"best_model_checkpoint": "/Lora_models/checkpoint-5000",
"epoch": 0.9752438109527382,
"eval_steps": 200,
"global_step": 5200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018754688672168042,
"grad_norm": 0.9346357583999634,
"learning_rate": 2.7e-06,
"loss": 2.5803,
"step": 10
},
{
"epoch": 0.0037509377344336083,
"grad_norm": 1.251865267753601,
"learning_rate": 5.7000000000000005e-06,
"loss": 2.4539,
"step": 20
},
{
"epoch": 0.005626406601650412,
"grad_norm": 1.055527687072754,
"learning_rate": 8.7e-06,
"loss": 2.5107,
"step": 30
},
{
"epoch": 0.007501875468867217,
"grad_norm": 1.1238517761230469,
"learning_rate": 1.1700000000000001e-05,
"loss": 2.4512,
"step": 40
},
{
"epoch": 0.009377344336084021,
"grad_norm": 1.20820152759552,
"learning_rate": 1.47e-05,
"loss": 2.4078,
"step": 50
},
{
"epoch": 0.011252813203300824,
"grad_norm": 2.085667133331299,
"learning_rate": 1.77e-05,
"loss": 2.2828,
"step": 60
},
{
"epoch": 0.01312828207051763,
"grad_norm": 1.357437252998352,
"learning_rate": 2.07e-05,
"loss": 2.0614,
"step": 70
},
{
"epoch": 0.015003750937734433,
"grad_norm": 1.6979730129241943,
"learning_rate": 2.37e-05,
"loss": 1.9482,
"step": 80
},
{
"epoch": 0.01687921980495124,
"grad_norm": 1.7537882328033447,
"learning_rate": 2.6700000000000002e-05,
"loss": 1.775,
"step": 90
},
{
"epoch": 0.018754688672168042,
"grad_norm": 1.8001635074615479,
"learning_rate": 2.97e-05,
"loss": 1.6462,
"step": 100
},
{
"epoch": 0.020630157539384845,
"grad_norm": 1.744318962097168,
"learning_rate": 2.9948394495412847e-05,
"loss": 1.3796,
"step": 110
},
{
"epoch": 0.02250562640660165,
"grad_norm": 1.6778439283370972,
"learning_rate": 2.989105504587156e-05,
"loss": 1.2642,
"step": 120
},
{
"epoch": 0.024381095273818456,
"grad_norm": 2.0165181159973145,
"learning_rate": 2.9833715596330273e-05,
"loss": 0.9405,
"step": 130
},
{
"epoch": 0.02625656414103526,
"grad_norm": 1.1453020572662354,
"learning_rate": 2.9776376146788993e-05,
"loss": 0.8365,
"step": 140
},
{
"epoch": 0.028132033008252063,
"grad_norm": 0.9237515330314636,
"learning_rate": 2.9719036697247706e-05,
"loss": 0.9324,
"step": 150
},
{
"epoch": 0.030007501875468866,
"grad_norm": 1.1040199995040894,
"learning_rate": 2.9661697247706423e-05,
"loss": 0.8905,
"step": 160
},
{
"epoch": 0.03188297074268567,
"grad_norm": 1.052095651626587,
"learning_rate": 2.9604357798165136e-05,
"loss": 0.7226,
"step": 170
},
{
"epoch": 0.03375843960990248,
"grad_norm": 1.4509942531585693,
"learning_rate": 2.9547018348623853e-05,
"loss": 0.7401,
"step": 180
},
{
"epoch": 0.03563390847711928,
"grad_norm": 1.3202496767044067,
"learning_rate": 2.948967889908257e-05,
"loss": 0.6498,
"step": 190
},
{
"epoch": 0.037509377344336084,
"grad_norm": 1.257535457611084,
"learning_rate": 2.9432339449541286e-05,
"loss": 0.8931,
"step": 200
},
{
"epoch": 0.037509377344336084,
"eval_loss": 0.8721055388450623,
"eval_runtime": 5.4276,
"eval_samples_per_second": 22.109,
"eval_steps_per_second": 2.764,
"step": 200
},
{
"epoch": 0.03938484621155289,
"grad_norm": 1.1922121047973633,
"learning_rate": 2.9375e-05,
"loss": 0.9556,
"step": 210
},
{
"epoch": 0.04126031507876969,
"grad_norm": 1.1076873540878296,
"learning_rate": 2.9317660550458716e-05,
"loss": 0.8148,
"step": 220
},
{
"epoch": 0.043135783945986494,
"grad_norm": 1.3961315155029297,
"learning_rate": 2.9260321100917432e-05,
"loss": 0.7225,
"step": 230
},
{
"epoch": 0.0450112528132033,
"grad_norm": 1.3916189670562744,
"learning_rate": 2.920298165137615e-05,
"loss": 0.8899,
"step": 240
},
{
"epoch": 0.04688672168042011,
"grad_norm": 1.4073107242584229,
"learning_rate": 2.9145642201834862e-05,
"loss": 0.7895,
"step": 250
},
{
"epoch": 0.04876219054763691,
"grad_norm": 1.4054927825927734,
"learning_rate": 2.908830275229358e-05,
"loss": 0.8662,
"step": 260
},
{
"epoch": 0.050637659414853715,
"grad_norm": 1.0531301498413086,
"learning_rate": 2.9030963302752292e-05,
"loss": 0.7619,
"step": 270
},
{
"epoch": 0.05251312828207052,
"grad_norm": 1.3712563514709473,
"learning_rate": 2.8973623853211012e-05,
"loss": 0.8608,
"step": 280
},
{
"epoch": 0.05438859714928732,
"grad_norm": 1.4002102613449097,
"learning_rate": 2.8916284403669725e-05,
"loss": 0.7368,
"step": 290
},
{
"epoch": 0.056264066016504126,
"grad_norm": 1.381103754043579,
"learning_rate": 2.8858944954128442e-05,
"loss": 0.8059,
"step": 300
},
{
"epoch": 0.05813953488372093,
"grad_norm": 1.9642342329025269,
"learning_rate": 2.8801605504587155e-05,
"loss": 0.8149,
"step": 310
},
{
"epoch": 0.06001500375093773,
"grad_norm": 1.4538882970809937,
"learning_rate": 2.8744266055045875e-05,
"loss": 0.6891,
"step": 320
},
{
"epoch": 0.061890472618154536,
"grad_norm": 1.47075617313385,
"learning_rate": 2.8686926605504588e-05,
"loss": 0.8458,
"step": 330
},
{
"epoch": 0.06376594148537135,
"grad_norm": 1.1955201625823975,
"learning_rate": 2.8629587155963305e-05,
"loss": 0.7404,
"step": 340
},
{
"epoch": 0.06564141035258815,
"grad_norm": 1.151567816734314,
"learning_rate": 2.8572247706422018e-05,
"loss": 0.6032,
"step": 350
},
{
"epoch": 0.06751687921980495,
"grad_norm": 1.3864506483078003,
"learning_rate": 2.8514908256880738e-05,
"loss": 0.7951,
"step": 360
},
{
"epoch": 0.06939234808702176,
"grad_norm": 1.3162195682525635,
"learning_rate": 2.845756880733945e-05,
"loss": 0.698,
"step": 370
},
{
"epoch": 0.07126781695423856,
"grad_norm": 1.0740330219268799,
"learning_rate": 2.8400229357798164e-05,
"loss": 0.7008,
"step": 380
},
{
"epoch": 0.07314328582145536,
"grad_norm": 1.4195873737335205,
"learning_rate": 2.834288990825688e-05,
"loss": 0.7324,
"step": 390
},
{
"epoch": 0.07501875468867217,
"grad_norm": 1.6425796747207642,
"learning_rate": 2.8285550458715594e-05,
"loss": 0.6954,
"step": 400
},
{
"epoch": 0.07501875468867217,
"eval_loss": 0.790172815322876,
"eval_runtime": 5.5372,
"eval_samples_per_second": 21.671,
"eval_steps_per_second": 2.709,
"step": 400
},
{
"epoch": 0.07689422355588897,
"grad_norm": 1.4094221591949463,
"learning_rate": 2.8228211009174314e-05,
"loss": 0.6533,
"step": 410
},
{
"epoch": 0.07876969242310577,
"grad_norm": 1.7052984237670898,
"learning_rate": 2.8170871559633027e-05,
"loss": 0.8291,
"step": 420
},
{
"epoch": 0.08064516129032258,
"grad_norm": 1.766396164894104,
"learning_rate": 2.8113532110091744e-05,
"loss": 0.5917,
"step": 430
},
{
"epoch": 0.08252063015753938,
"grad_norm": 1.3280906677246094,
"learning_rate": 2.8056192660550457e-05,
"loss": 0.5834,
"step": 440
},
{
"epoch": 0.08439609902475619,
"grad_norm": 1.472038984298706,
"learning_rate": 2.7998853211009177e-05,
"loss": 0.6189,
"step": 450
},
{
"epoch": 0.08627156789197299,
"grad_norm": 2.434629440307617,
"learning_rate": 2.794151376146789e-05,
"loss": 0.6107,
"step": 460
},
{
"epoch": 0.08814703675918979,
"grad_norm": 1.7748132944107056,
"learning_rate": 2.7884174311926607e-05,
"loss": 0.508,
"step": 470
},
{
"epoch": 0.0900225056264066,
"grad_norm": 1.7380709648132324,
"learning_rate": 2.782683486238532e-05,
"loss": 0.6482,
"step": 480
},
{
"epoch": 0.0918979744936234,
"grad_norm": 1.1493253707885742,
"learning_rate": 2.7769495412844037e-05,
"loss": 0.6531,
"step": 490
},
{
"epoch": 0.09377344336084022,
"grad_norm": 1.384508728981018,
"learning_rate": 2.7712155963302753e-05,
"loss": 0.7061,
"step": 500
},
{
"epoch": 0.09564891222805702,
"grad_norm": 1.792687177658081,
"learning_rate": 2.765481651376147e-05,
"loss": 0.6,
"step": 510
},
{
"epoch": 0.09752438109527382,
"grad_norm": 1.657291054725647,
"learning_rate": 2.7597477064220183e-05,
"loss": 0.612,
"step": 520
},
{
"epoch": 0.09939984996249063,
"grad_norm": 1.2928940057754517,
"learning_rate": 2.75401376146789e-05,
"loss": 0.7446,
"step": 530
},
{
"epoch": 0.10127531882970743,
"grad_norm": 1.3647221326828003,
"learning_rate": 2.7482798165137616e-05,
"loss": 0.6422,
"step": 540
},
{
"epoch": 0.10315078769692423,
"grad_norm": 1.7979224920272827,
"learning_rate": 2.7425458715596333e-05,
"loss": 0.5261,
"step": 550
},
{
"epoch": 0.10502625656414104,
"grad_norm": 1.4330195188522339,
"learning_rate": 2.7368119266055046e-05,
"loss": 0.6801,
"step": 560
},
{
"epoch": 0.10690172543135784,
"grad_norm": 1.4820642471313477,
"learning_rate": 2.7310779816513763e-05,
"loss": 0.6767,
"step": 570
},
{
"epoch": 0.10877719429857464,
"grad_norm": 1.6445374488830566,
"learning_rate": 2.7253440366972476e-05,
"loss": 0.6727,
"step": 580
},
{
"epoch": 0.11065266316579145,
"grad_norm": 1.5297715663909912,
"learning_rate": 2.7196100917431196e-05,
"loss": 0.5425,
"step": 590
},
{
"epoch": 0.11252813203300825,
"grad_norm": 2.9595024585723877,
"learning_rate": 2.713876146788991e-05,
"loss": 0.5758,
"step": 600
},
{
"epoch": 0.11252813203300825,
"eval_loss": 0.7191774249076843,
"eval_runtime": 5.5393,
"eval_samples_per_second": 21.663,
"eval_steps_per_second": 2.708,
"step": 600
},
{
"epoch": 0.11440360090022506,
"grad_norm": 1.300517201423645,
"learning_rate": 2.7081422018348626e-05,
"loss": 0.6598,
"step": 610
},
{
"epoch": 0.11627906976744186,
"grad_norm": 1.4118940830230713,
"learning_rate": 2.702408256880734e-05,
"loss": 0.7422,
"step": 620
},
{
"epoch": 0.11815453863465866,
"grad_norm": 1.1897056102752686,
"learning_rate": 2.6966743119266055e-05,
"loss": 0.6301,
"step": 630
},
{
"epoch": 0.12003000750187547,
"grad_norm": 1.3802241086959839,
"learning_rate": 2.6909403669724772e-05,
"loss": 0.7331,
"step": 640
},
{
"epoch": 0.12190547636909227,
"grad_norm": 1.3182717561721802,
"learning_rate": 2.6852064220183485e-05,
"loss": 0.6309,
"step": 650
},
{
"epoch": 0.12378094523630907,
"grad_norm": 1.1293184757232666,
"learning_rate": 2.6794724770642202e-05,
"loss": 0.657,
"step": 660
},
{
"epoch": 0.12565641410352588,
"grad_norm": 1.525172472000122,
"learning_rate": 2.6737385321100915e-05,
"loss": 0.7025,
"step": 670
},
{
"epoch": 0.1275318829707427,
"grad_norm": 1.1699976921081543,
"learning_rate": 2.6680045871559635e-05,
"loss": 0.6213,
"step": 680
},
{
"epoch": 0.12940735183795948,
"grad_norm": 1.5956225395202637,
"learning_rate": 2.6622706422018348e-05,
"loss": 0.6628,
"step": 690
},
{
"epoch": 0.1312828207051763,
"grad_norm": 1.3606040477752686,
"learning_rate": 2.6565366972477065e-05,
"loss": 0.6042,
"step": 700
},
{
"epoch": 0.1331582895723931,
"grad_norm": 1.310490369796753,
"learning_rate": 2.6508027522935778e-05,
"loss": 0.6976,
"step": 710
},
{
"epoch": 0.1350337584396099,
"grad_norm": 1.0546784400939941,
"learning_rate": 2.6450688073394498e-05,
"loss": 0.6681,
"step": 720
},
{
"epoch": 0.1369092273068267,
"grad_norm": 1.977148175239563,
"learning_rate": 2.639334862385321e-05,
"loss": 0.723,
"step": 730
},
{
"epoch": 0.13878469617404351,
"grad_norm": 1.526698350906372,
"learning_rate": 2.6336009174311928e-05,
"loss": 0.6595,
"step": 740
},
{
"epoch": 0.1406601650412603,
"grad_norm": 1.302213191986084,
"learning_rate": 2.627866972477064e-05,
"loss": 0.6826,
"step": 750
},
{
"epoch": 0.14253563390847712,
"grad_norm": 1.4109597206115723,
"learning_rate": 2.6221330275229358e-05,
"loss": 0.679,
"step": 760
},
{
"epoch": 0.1444111027756939,
"grad_norm": 1.390410304069519,
"learning_rate": 2.6163990825688074e-05,
"loss": 0.5305,
"step": 770
},
{
"epoch": 0.14628657164291073,
"grad_norm": 1.778104543685913,
"learning_rate": 2.610665137614679e-05,
"loss": 0.5224,
"step": 780
},
{
"epoch": 0.14816204051012752,
"grad_norm": 1.4633052349090576,
"learning_rate": 2.6049311926605504e-05,
"loss": 0.6479,
"step": 790
},
{
"epoch": 0.15003750937734434,
"grad_norm": 1.4608770608901978,
"learning_rate": 2.599197247706422e-05,
"loss": 0.6688,
"step": 800
},
{
"epoch": 0.15003750937734434,
"eval_loss": 0.7000935673713684,
"eval_runtime": 5.5538,
"eval_samples_per_second": 21.607,
"eval_steps_per_second": 2.701,
"step": 800
},
{
"epoch": 0.15191297824456115,
"grad_norm": 1.419257640838623,
"learning_rate": 2.5934633027522937e-05,
"loss": 0.4972,
"step": 810
},
{
"epoch": 0.15378844711177794,
"grad_norm": 1.383324146270752,
"learning_rate": 2.5877293577981654e-05,
"loss": 0.6802,
"step": 820
},
{
"epoch": 0.15566391597899476,
"grad_norm": 1.4699604511260986,
"learning_rate": 2.5819954128440367e-05,
"loss": 0.5997,
"step": 830
},
{
"epoch": 0.15753938484621155,
"grad_norm": 1.7882672548294067,
"learning_rate": 2.5762614678899084e-05,
"loss": 0.5495,
"step": 840
},
{
"epoch": 0.15941485371342837,
"grad_norm": 1.4856760501861572,
"learning_rate": 2.57052752293578e-05,
"loss": 0.6453,
"step": 850
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.4340794086456299,
"learning_rate": 2.5647935779816517e-05,
"loss": 0.6263,
"step": 860
},
{
"epoch": 0.16316579144786197,
"grad_norm": 1.7416778802871704,
"learning_rate": 2.559059633027523e-05,
"loss": 0.5994,
"step": 870
},
{
"epoch": 0.16504126031507876,
"grad_norm": 1.2337450981140137,
"learning_rate": 2.5533256880733947e-05,
"loss": 0.6288,
"step": 880
},
{
"epoch": 0.16691672918229558,
"grad_norm": 1.6339170932769775,
"learning_rate": 2.547591743119266e-05,
"loss": 0.6181,
"step": 890
},
{
"epoch": 0.16879219804951237,
"grad_norm": 1.7602070569992065,
"learning_rate": 2.5418577981651376e-05,
"loss": 0.5742,
"step": 900
},
{
"epoch": 0.1706676669167292,
"grad_norm": 1.3282010555267334,
"learning_rate": 2.5361238532110093e-05,
"loss": 0.6869,
"step": 910
},
{
"epoch": 0.17254313578394598,
"grad_norm": 1.471281886100769,
"learning_rate": 2.5303899082568806e-05,
"loss": 0.6788,
"step": 920
},
{
"epoch": 0.1744186046511628,
"grad_norm": 1.4900857210159302,
"learning_rate": 2.5246559633027523e-05,
"loss": 0.5706,
"step": 930
},
{
"epoch": 0.17629407351837958,
"grad_norm": 1.903571605682373,
"learning_rate": 2.518922018348624e-05,
"loss": 0.7056,
"step": 940
},
{
"epoch": 0.1781695423855964,
"grad_norm": 1.7407480478286743,
"learning_rate": 2.5131880733944956e-05,
"loss": 0.5102,
"step": 950
},
{
"epoch": 0.1800450112528132,
"grad_norm": 1.6534910202026367,
"learning_rate": 2.507454128440367e-05,
"loss": 0.6654,
"step": 960
},
{
"epoch": 0.18192048012003,
"grad_norm": 1.7608113288879395,
"learning_rate": 2.5017201834862386e-05,
"loss": 0.6793,
"step": 970
},
{
"epoch": 0.1837959489872468,
"grad_norm": 1.737579107284546,
"learning_rate": 2.49598623853211e-05,
"loss": 0.5787,
"step": 980
},
{
"epoch": 0.18567141785446362,
"grad_norm": 1.7096258401870728,
"learning_rate": 2.490252293577982e-05,
"loss": 0.7254,
"step": 990
},
{
"epoch": 0.18754688672168043,
"grad_norm": 1.7333779335021973,
"learning_rate": 2.4845183486238532e-05,
"loss": 0.7332,
"step": 1000
},
{
"epoch": 0.18754688672168043,
"eval_loss": 0.6944581270217896,
"eval_runtime": 5.5505,
"eval_samples_per_second": 21.62,
"eval_steps_per_second": 2.702,
"step": 1000
},
{
"epoch": 0.18942235558889722,
"grad_norm": 1.235772967338562,
"learning_rate": 2.478784403669725e-05,
"loss": 0.684,
"step": 1010
},
{
"epoch": 0.19129782445611404,
"grad_norm": 1.3532825708389282,
"learning_rate": 2.4730504587155962e-05,
"loss": 0.5977,
"step": 1020
},
{
"epoch": 0.19317329332333083,
"grad_norm": 1.7143605947494507,
"learning_rate": 2.4673165137614682e-05,
"loss": 0.6109,
"step": 1030
},
{
"epoch": 0.19504876219054765,
"grad_norm": 1.4067035913467407,
"learning_rate": 2.4615825688073395e-05,
"loss": 0.6823,
"step": 1040
},
{
"epoch": 0.19692423105776444,
"grad_norm": 1.713149070739746,
"learning_rate": 2.4558486238532112e-05,
"loss": 0.5852,
"step": 1050
},
{
"epoch": 0.19879969992498125,
"grad_norm": 1.5876083374023438,
"learning_rate": 2.4501146788990825e-05,
"loss": 0.6451,
"step": 1060
},
{
"epoch": 0.20067516879219804,
"grad_norm": 1.3769148588180542,
"learning_rate": 2.444380733944954e-05,
"loss": 0.5712,
"step": 1070
},
{
"epoch": 0.20255063765941486,
"grad_norm": 1.408957839012146,
"learning_rate": 2.4386467889908258e-05,
"loss": 0.5704,
"step": 1080
},
{
"epoch": 0.20442610652663165,
"grad_norm": 1.3725367784500122,
"learning_rate": 2.4329128440366975e-05,
"loss": 0.4405,
"step": 1090
},
{
"epoch": 0.20630157539384847,
"grad_norm": 1.6715686321258545,
"learning_rate": 2.4271788990825688e-05,
"loss": 0.5433,
"step": 1100
},
{
"epoch": 0.20817704426106526,
"grad_norm": 1.4740937948226929,
"learning_rate": 2.4214449541284405e-05,
"loss": 0.6363,
"step": 1110
},
{
"epoch": 0.21005251312828208,
"grad_norm": 1.640724778175354,
"learning_rate": 2.415711009174312e-05,
"loss": 0.6302,
"step": 1120
},
{
"epoch": 0.21192798199549887,
"grad_norm": 1.8040657043457031,
"learning_rate": 2.4099770642201838e-05,
"loss": 0.6726,
"step": 1130
},
{
"epoch": 0.21380345086271568,
"grad_norm": 1.6836594343185425,
"learning_rate": 2.404243119266055e-05,
"loss": 0.6706,
"step": 1140
},
{
"epoch": 0.21567891972993247,
"grad_norm": 1.9649243354797363,
"learning_rate": 2.3985091743119264e-05,
"loss": 0.5189,
"step": 1150
},
{
"epoch": 0.2175543885971493,
"grad_norm": 1.5541070699691772,
"learning_rate": 2.392775229357798e-05,
"loss": 0.7497,
"step": 1160
},
{
"epoch": 0.21942985746436608,
"grad_norm": 1.9473050832748413,
"learning_rate": 2.3870412844036697e-05,
"loss": 0.5373,
"step": 1170
},
{
"epoch": 0.2213053263315829,
"grad_norm": 1.8983582258224487,
"learning_rate": 2.3813073394495414e-05,
"loss": 0.6523,
"step": 1180
},
{
"epoch": 0.2231807951987997,
"grad_norm": 1.6753871440887451,
"learning_rate": 2.3755733944954127e-05,
"loss": 0.6294,
"step": 1190
},
{
"epoch": 0.2250562640660165,
"grad_norm": 1.706829309463501,
"learning_rate": 2.3698394495412844e-05,
"loss": 0.6741,
"step": 1200
},
{
"epoch": 0.2250562640660165,
"eval_loss": 0.6810731887817383,
"eval_runtime": 5.4298,
"eval_samples_per_second": 22.1,
"eval_steps_per_second": 2.763,
"step": 1200
},
{
"epoch": 0.22693173293323332,
"grad_norm": 1.6627461910247803,
"learning_rate": 2.364105504587156e-05,
"loss": 0.6354,
"step": 1210
},
{
"epoch": 0.2288072018004501,
"grad_norm": 2.973555564880371,
"learning_rate": 2.3583715596330277e-05,
"loss": 0.5482,
"step": 1220
},
{
"epoch": 0.23068267066766693,
"grad_norm": 1.9262856245040894,
"learning_rate": 2.352637614678899e-05,
"loss": 0.7558,
"step": 1230
},
{
"epoch": 0.23255813953488372,
"grad_norm": 1.816595435142517,
"learning_rate": 2.3469036697247707e-05,
"loss": 0.6039,
"step": 1240
},
{
"epoch": 0.23443360840210054,
"grad_norm": 1.9634557962417603,
"learning_rate": 2.341169724770642e-05,
"loss": 0.5126,
"step": 1250
},
{
"epoch": 0.23630907726931732,
"grad_norm": 1.7136008739471436,
"learning_rate": 2.335435779816514e-05,
"loss": 0.6365,
"step": 1260
},
{
"epoch": 0.23818454613653414,
"grad_norm": 1.4523965120315552,
"learning_rate": 2.3297018348623853e-05,
"loss": 0.5995,
"step": 1270
},
{
"epoch": 0.24006001500375093,
"grad_norm": 1.6242806911468506,
"learning_rate": 2.323967889908257e-05,
"loss": 0.6412,
"step": 1280
},
{
"epoch": 0.24193548387096775,
"grad_norm": 1.7888171672821045,
"learning_rate": 2.3182339449541283e-05,
"loss": 0.6565,
"step": 1290
},
{
"epoch": 0.24381095273818454,
"grad_norm": 1.6343475580215454,
"learning_rate": 2.3125000000000003e-05,
"loss": 0.6212,
"step": 1300
},
{
"epoch": 0.24568642160540136,
"grad_norm": 1.3897461891174316,
"learning_rate": 2.3067660550458716e-05,
"loss": 0.5839,
"step": 1310
},
{
"epoch": 0.24756189047261815,
"grad_norm": 1.502485752105713,
"learning_rate": 2.3010321100917433e-05,
"loss": 0.6725,
"step": 1320
},
{
"epoch": 0.24943735933983496,
"grad_norm": 1.3770966529846191,
"learning_rate": 2.2952981651376146e-05,
"loss": 0.5998,
"step": 1330
},
{
"epoch": 0.25131282820705175,
"grad_norm": 1.7012661695480347,
"learning_rate": 2.2895642201834863e-05,
"loss": 0.6668,
"step": 1340
},
{
"epoch": 0.25318829707426854,
"grad_norm": 1.747942566871643,
"learning_rate": 2.283830275229358e-05,
"loss": 0.6948,
"step": 1350
},
{
"epoch": 0.2550637659414854,
"grad_norm": 1.4288934469223022,
"learning_rate": 2.2780963302752296e-05,
"loss": 0.6063,
"step": 1360
},
{
"epoch": 0.2569392348087022,
"grad_norm": 1.6301014423370361,
"learning_rate": 2.272362385321101e-05,
"loss": 0.677,
"step": 1370
},
{
"epoch": 0.25881470367591897,
"grad_norm": 1.3200469017028809,
"learning_rate": 2.2666284403669726e-05,
"loss": 0.653,
"step": 1380
},
{
"epoch": 0.26069017254313576,
"grad_norm": 1.5794614553451538,
"learning_rate": 2.2608944954128442e-05,
"loss": 0.6477,
"step": 1390
},
{
"epoch": 0.2625656414103526,
"grad_norm": 1.5092536211013794,
"learning_rate": 2.2551605504587155e-05,
"loss": 0.6202,
"step": 1400
},
{
"epoch": 0.2625656414103526,
"eval_loss": 0.6741260290145874,
"eval_runtime": 5.5003,
"eval_samples_per_second": 21.817,
"eval_steps_per_second": 2.727,
"step": 1400
},
{
"epoch": 0.2644411102775694,
"grad_norm": 1.5101447105407715,
"learning_rate": 2.2494266055045872e-05,
"loss": 0.583,
"step": 1410
},
{
"epoch": 0.2663165791447862,
"grad_norm": 1.5355420112609863,
"learning_rate": 2.2436926605504585e-05,
"loss": 0.5422,
"step": 1420
},
{
"epoch": 0.268192048012003,
"grad_norm": 1.5322073698043823,
"learning_rate": 2.2379587155963305e-05,
"loss": 0.6067,
"step": 1430
},
{
"epoch": 0.2700675168792198,
"grad_norm": 1.5003911256790161,
"learning_rate": 2.232224770642202e-05,
"loss": 0.5578,
"step": 1440
},
{
"epoch": 0.2719429857464366,
"grad_norm": 1.4054975509643555,
"learning_rate": 2.2264908256880735e-05,
"loss": 0.5819,
"step": 1450
},
{
"epoch": 0.2738184546136534,
"grad_norm": 1.7100839614868164,
"learning_rate": 2.2207568807339448e-05,
"loss": 0.7076,
"step": 1460
},
{
"epoch": 0.27569392348087024,
"grad_norm": 1.6358684301376343,
"learning_rate": 2.2150229357798165e-05,
"loss": 0.5748,
"step": 1470
},
{
"epoch": 0.27756939234808703,
"grad_norm": 1.8648029565811157,
"learning_rate": 2.209288990825688e-05,
"loss": 0.5488,
"step": 1480
},
{
"epoch": 0.2794448612153038,
"grad_norm": 2.0715155601501465,
"learning_rate": 2.2035550458715598e-05,
"loss": 0.6121,
"step": 1490
},
{
"epoch": 0.2813203300825206,
"grad_norm": 1.4680354595184326,
"learning_rate": 2.197821100917431e-05,
"loss": 0.5775,
"step": 1500
},
{
"epoch": 0.28319579894973745,
"grad_norm": 1.646637201309204,
"learning_rate": 2.1920871559633028e-05,
"loss": 0.6433,
"step": 1510
},
{
"epoch": 0.28507126781695424,
"grad_norm": 1.9596463441848755,
"learning_rate": 2.1863532110091744e-05,
"loss": 0.6534,
"step": 1520
},
{
"epoch": 0.28694673668417103,
"grad_norm": 2.375546455383301,
"learning_rate": 2.180619266055046e-05,
"loss": 0.5802,
"step": 1530
},
{
"epoch": 0.2888222055513878,
"grad_norm": 1.2877148389816284,
"learning_rate": 2.1748853211009174e-05,
"loss": 0.6626,
"step": 1540
},
{
"epoch": 0.29069767441860467,
"grad_norm": 1.3704779148101807,
"learning_rate": 2.169151376146789e-05,
"loss": 0.7177,
"step": 1550
},
{
"epoch": 0.29257314328582146,
"grad_norm": 1.9320201873779297,
"learning_rate": 2.1634174311926604e-05,
"loss": 0.6648,
"step": 1560
},
{
"epoch": 0.29444861215303825,
"grad_norm": 2.351738452911377,
"learning_rate": 2.1576834862385324e-05,
"loss": 0.5823,
"step": 1570
},
{
"epoch": 0.29632408102025504,
"grad_norm": 1.6075841188430786,
"learning_rate": 2.1519495412844037e-05,
"loss": 0.6363,
"step": 1580
},
{
"epoch": 0.2981995498874719,
"grad_norm": 1.7780178785324097,
"learning_rate": 2.1462155963302754e-05,
"loss": 0.7098,
"step": 1590
},
{
"epoch": 0.30007501875468867,
"grad_norm": 1.8664710521697998,
"learning_rate": 2.1404816513761467e-05,
"loss": 0.5582,
"step": 1600
},
{
"epoch": 0.30007501875468867,
"eval_loss": 0.6711069345474243,
"eval_runtime": 5.5099,
"eval_samples_per_second": 21.779,
"eval_steps_per_second": 2.722,
"step": 1600
},
{
"epoch": 0.30195048762190546,
"grad_norm": 1.7083989381790161,
"learning_rate": 2.1347477064220187e-05,
"loss": 0.6628,
"step": 1610
},
{
"epoch": 0.3038259564891223,
"grad_norm": 1.7052229642868042,
"learning_rate": 2.12901376146789e-05,
"loss": 0.5975,
"step": 1620
},
{
"epoch": 0.3057014253563391,
"grad_norm": 1.5098538398742676,
"learning_rate": 2.1232798165137617e-05,
"loss": 0.5729,
"step": 1630
},
{
"epoch": 0.3075768942235559,
"grad_norm": 1.6489193439483643,
"learning_rate": 2.117545871559633e-05,
"loss": 0.5064,
"step": 1640
},
{
"epoch": 0.3094523630907727,
"grad_norm": 1.9127089977264404,
"learning_rate": 2.1118119266055043e-05,
"loss": 0.582,
"step": 1650
},
{
"epoch": 0.3113278319579895,
"grad_norm": 1.801680326461792,
"learning_rate": 2.1060779816513763e-05,
"loss": 0.6894,
"step": 1660
},
{
"epoch": 0.3132033008252063,
"grad_norm": 1.622673511505127,
"learning_rate": 2.1003440366972476e-05,
"loss": 0.6085,
"step": 1670
},
{
"epoch": 0.3150787696924231,
"grad_norm": 1.9467750787734985,
"learning_rate": 2.0946100917431193e-05,
"loss": 0.6822,
"step": 1680
},
{
"epoch": 0.3169542385596399,
"grad_norm": 1.5031330585479736,
"learning_rate": 2.0888761467889906e-05,
"loss": 0.6694,
"step": 1690
},
{
"epoch": 0.31882970742685673,
"grad_norm": 1.68521249294281,
"learning_rate": 2.0831422018348626e-05,
"loss": 0.6556,
"step": 1700
},
{
"epoch": 0.3207051762940735,
"grad_norm": 1.8257548809051514,
"learning_rate": 2.077408256880734e-05,
"loss": 0.5684,
"step": 1710
},
{
"epoch": 0.3225806451612903,
"grad_norm": 1.6865085363388062,
"learning_rate": 2.0716743119266056e-05,
"loss": 0.6543,
"step": 1720
},
{
"epoch": 0.3244561140285071,
"grad_norm": 1.7781134843826294,
"learning_rate": 2.065940366972477e-05,
"loss": 0.5249,
"step": 1730
},
{
"epoch": 0.32633158289572395,
"grad_norm": 1.9172645807266235,
"learning_rate": 2.0602064220183486e-05,
"loss": 0.564,
"step": 1740
},
{
"epoch": 0.32820705176294074,
"grad_norm": 1.9964970350265503,
"learning_rate": 2.0544724770642202e-05,
"loss": 0.5688,
"step": 1750
},
{
"epoch": 0.3300825206301575,
"grad_norm": 2.0303592681884766,
"learning_rate": 2.048738532110092e-05,
"loss": 0.6081,
"step": 1760
},
{
"epoch": 0.3319579894973743,
"grad_norm": 2.4410409927368164,
"learning_rate": 2.0430045871559632e-05,
"loss": 0.542,
"step": 1770
},
{
"epoch": 0.33383345836459116,
"grad_norm": 1.7117453813552856,
"learning_rate": 2.037270642201835e-05,
"loss": 0.4778,
"step": 1780
},
{
"epoch": 0.33570892723180795,
"grad_norm": 1.5781958103179932,
"learning_rate": 2.0315366972477065e-05,
"loss": 0.5451,
"step": 1790
},
{
"epoch": 0.33758439609902474,
"grad_norm": 1.601178526878357,
"learning_rate": 2.0258027522935782e-05,
"loss": 0.5371,
"step": 1800
},
{
"epoch": 0.33758439609902474,
"eval_loss": 0.667210042476654,
"eval_runtime": 5.5955,
"eval_samples_per_second": 21.446,
"eval_steps_per_second": 2.681,
"step": 1800
},
{
"epoch": 0.3394598649662416,
"grad_norm": 1.520401954650879,
"learning_rate": 2.0200688073394495e-05,
"loss": 0.7463,
"step": 1810
},
{
"epoch": 0.3413353338334584,
"grad_norm": 1.5495413541793823,
"learning_rate": 2.0143348623853212e-05,
"loss": 0.5746,
"step": 1820
},
{
"epoch": 0.34321080270067517,
"grad_norm": 1.656015157699585,
"learning_rate": 2.0086009174311925e-05,
"loss": 0.5587,
"step": 1830
},
{
"epoch": 0.34508627156789196,
"grad_norm": 1.7179194688796997,
"learning_rate": 2.0028669724770645e-05,
"loss": 0.6316,
"step": 1840
},
{
"epoch": 0.3469617404351088,
"grad_norm": 2.026876926422119,
"learning_rate": 1.9971330275229358e-05,
"loss": 0.5859,
"step": 1850
},
{
"epoch": 0.3488372093023256,
"grad_norm": 1.675175428390503,
"learning_rate": 1.9913990825688075e-05,
"loss": 0.5499,
"step": 1860
},
{
"epoch": 0.3507126781695424,
"grad_norm": 1.3794666528701782,
"learning_rate": 1.9856651376146788e-05,
"loss": 0.651,
"step": 1870
},
{
"epoch": 0.35258814703675917,
"grad_norm": 1.6561700105667114,
"learning_rate": 1.9799311926605508e-05,
"loss": 0.52,
"step": 1880
},
{
"epoch": 0.354463615903976,
"grad_norm": 1.9196125268936157,
"learning_rate": 1.974197247706422e-05,
"loss": 0.5643,
"step": 1890
},
{
"epoch": 0.3563390847711928,
"grad_norm": 2.157627820968628,
"learning_rate": 1.9684633027522934e-05,
"loss": 0.5726,
"step": 1900
},
{
"epoch": 0.3582145536384096,
"grad_norm": 1.8069156408309937,
"learning_rate": 1.962729357798165e-05,
"loss": 0.6398,
"step": 1910
},
{
"epoch": 0.3600900225056264,
"grad_norm": 1.7318720817565918,
"learning_rate": 1.9569954128440368e-05,
"loss": 0.4835,
"step": 1920
},
{
"epoch": 0.36196549137284323,
"grad_norm": 2.1636054515838623,
"learning_rate": 1.9512614678899084e-05,
"loss": 0.587,
"step": 1930
},
{
"epoch": 0.36384096024006,
"grad_norm": 2.062150478363037,
"learning_rate": 1.9455275229357797e-05,
"loss": 0.763,
"step": 1940
},
{
"epoch": 0.3657164291072768,
"grad_norm": 1.6775376796722412,
"learning_rate": 1.9397935779816514e-05,
"loss": 0.6575,
"step": 1950
},
{
"epoch": 0.3675918979744936,
"grad_norm": 1.5422090291976929,
"learning_rate": 1.9340596330275227e-05,
"loss": 0.6128,
"step": 1960
},
{
"epoch": 0.36946736684171044,
"grad_norm": 1.7209275960922241,
"learning_rate": 1.9283256880733947e-05,
"loss": 0.5796,
"step": 1970
},
{
"epoch": 0.37134283570892723,
"grad_norm": 1.5626654624938965,
"learning_rate": 1.922591743119266e-05,
"loss": 0.5237,
"step": 1980
},
{
"epoch": 0.373218304576144,
"grad_norm": 1.6950414180755615,
"learning_rate": 1.9168577981651377e-05,
"loss": 0.5983,
"step": 1990
},
{
"epoch": 0.37509377344336087,
"grad_norm": 1.5081120729446411,
"learning_rate": 1.911123853211009e-05,
"loss": 0.5603,
"step": 2000
},
{
"epoch": 0.37509377344336087,
"eval_loss": 0.6599423885345459,
"eval_runtime": 5.5471,
"eval_samples_per_second": 21.633,
"eval_steps_per_second": 2.704,
"step": 2000
},
{
"epoch": 0.37696924231057766,
"grad_norm": 1.7430557012557983,
"learning_rate": 1.905389908256881e-05,
"loss": 0.5582,
"step": 2010
},
{
"epoch": 0.37884471117779445,
"grad_norm": 1.8989301919937134,
"learning_rate": 1.8996559633027523e-05,
"loss": 0.6411,
"step": 2020
},
{
"epoch": 0.38072018004501124,
"grad_norm": 1.9164332151412964,
"learning_rate": 1.893922018348624e-05,
"loss": 0.5733,
"step": 2030
},
{
"epoch": 0.3825956489122281,
"grad_norm": 1.9230120182037354,
"learning_rate": 1.8881880733944953e-05,
"loss": 0.6592,
"step": 2040
},
{
"epoch": 0.38447111777944487,
"grad_norm": 1.9948559999465942,
"learning_rate": 1.882454128440367e-05,
"loss": 0.5918,
"step": 2050
},
{
"epoch": 0.38634658664666166,
"grad_norm": 1.8086504936218262,
"learning_rate": 1.8767201834862386e-05,
"loss": 0.5939,
"step": 2060
},
{
"epoch": 0.38822205551387845,
"grad_norm": 1.715736985206604,
"learning_rate": 1.8709862385321103e-05,
"loss": 0.6624,
"step": 2070
},
{
"epoch": 0.3900975243810953,
"grad_norm": 2.9393413066864014,
"learning_rate": 1.8652522935779816e-05,
"loss": 0.5954,
"step": 2080
},
{
"epoch": 0.3919729932483121,
"grad_norm": 2.3764209747314453,
"learning_rate": 1.8595183486238533e-05,
"loss": 0.6181,
"step": 2090
},
{
"epoch": 0.3938484621155289,
"grad_norm": 1.7462408542633057,
"learning_rate": 1.853784403669725e-05,
"loss": 0.4992,
"step": 2100
},
{
"epoch": 0.39572393098274566,
"grad_norm": 2.006526470184326,
"learning_rate": 1.8480504587155966e-05,
"loss": 0.5331,
"step": 2110
},
{
"epoch": 0.3975993998499625,
"grad_norm": 2.453961133956909,
"learning_rate": 1.842316513761468e-05,
"loss": 0.5828,
"step": 2120
},
{
"epoch": 0.3994748687171793,
"grad_norm": 1.9606050252914429,
"learning_rate": 1.8365825688073396e-05,
"loss": 0.596,
"step": 2130
},
{
"epoch": 0.4013503375843961,
"grad_norm": 1.776755690574646,
"learning_rate": 1.830848623853211e-05,
"loss": 0.6688,
"step": 2140
},
{
"epoch": 0.4032258064516129,
"grad_norm": 1.6970465183258057,
"learning_rate": 1.8251146788990826e-05,
"loss": 0.6403,
"step": 2150
},
{
"epoch": 0.4051012753188297,
"grad_norm": 2.1834471225738525,
"learning_rate": 1.8193807339449542e-05,
"loss": 0.6169,
"step": 2160
},
{
"epoch": 0.4069767441860465,
"grad_norm": 1.4596108198165894,
"learning_rate": 1.8136467889908255e-05,
"loss": 0.5783,
"step": 2170
},
{
"epoch": 0.4088522130532633,
"grad_norm": 1.808875560760498,
"learning_rate": 1.8079128440366972e-05,
"loss": 0.6834,
"step": 2180
},
{
"epoch": 0.41072768192048015,
"grad_norm": 2.0414464473724365,
"learning_rate": 1.802178899082569e-05,
"loss": 0.5546,
"step": 2190
},
{
"epoch": 0.41260315078769694,
"grad_norm": 1.7231241464614868,
"learning_rate": 1.7964449541284405e-05,
"loss": 0.5875,
"step": 2200
},
{
"epoch": 0.41260315078769694,
"eval_loss": 0.6564481258392334,
"eval_runtime": 5.5564,
"eval_samples_per_second": 21.597,
"eval_steps_per_second": 2.7,
"step": 2200
},
{
"epoch": 0.4144786196549137,
"grad_norm": 1.5646345615386963,
"learning_rate": 1.790711009174312e-05,
"loss": 0.4631,
"step": 2210
},
{
"epoch": 0.4163540885221305,
"grad_norm": 2.1417741775512695,
"learning_rate": 1.7849770642201835e-05,
"loss": 0.5978,
"step": 2220
},
{
"epoch": 0.41822955738934736,
"grad_norm": 1.5909672975540161,
"learning_rate": 1.7792431192660548e-05,
"loss": 0.6276,
"step": 2230
},
{
"epoch": 0.42010502625656415,
"grad_norm": 1.5815021991729736,
"learning_rate": 1.7735091743119268e-05,
"loss": 0.5655,
"step": 2240
},
{
"epoch": 0.42198049512378094,
"grad_norm": 2.173349618911743,
"learning_rate": 1.767775229357798e-05,
"loss": 0.5172,
"step": 2250
},
{
"epoch": 0.42385596399099773,
"grad_norm": 1.611697793006897,
"learning_rate": 1.7620412844036698e-05,
"loss": 0.5828,
"step": 2260
},
{
"epoch": 0.4257314328582146,
"grad_norm": 2.148935556411743,
"learning_rate": 1.756307339449541e-05,
"loss": 0.5796,
"step": 2270
},
{
"epoch": 0.42760690172543137,
"grad_norm": 2.8221611976623535,
"learning_rate": 1.750573394495413e-05,
"loss": 0.6098,
"step": 2280
},
{
"epoch": 0.42948237059264815,
"grad_norm": 1.8515477180480957,
"learning_rate": 1.7448394495412844e-05,
"loss": 0.6519,
"step": 2290
},
{
"epoch": 0.43135783945986494,
"grad_norm": 1.9033889770507812,
"learning_rate": 1.739105504587156e-05,
"loss": 0.5771,
"step": 2300
},
{
"epoch": 0.4332333083270818,
"grad_norm": 2.1629979610443115,
"learning_rate": 1.7333715596330274e-05,
"loss": 0.5308,
"step": 2310
},
{
"epoch": 0.4351087771942986,
"grad_norm": 1.713036060333252,
"learning_rate": 1.727637614678899e-05,
"loss": 0.6036,
"step": 2320
},
{
"epoch": 0.43698424606151537,
"grad_norm": 1.626887559890747,
"learning_rate": 1.7219036697247707e-05,
"loss": 0.5932,
"step": 2330
},
{
"epoch": 0.43885971492873216,
"grad_norm": 2.026658535003662,
"learning_rate": 1.7161697247706424e-05,
"loss": 0.509,
"step": 2340
},
{
"epoch": 0.440735183795949,
"grad_norm": 1.617053508758545,
"learning_rate": 1.7104357798165137e-05,
"loss": 0.5841,
"step": 2350
},
{
"epoch": 0.4426106526631658,
"grad_norm": 1.8023245334625244,
"learning_rate": 1.7047018348623854e-05,
"loss": 0.5244,
"step": 2360
},
{
"epoch": 0.4444861215303826,
"grad_norm": 2.0502309799194336,
"learning_rate": 1.698967889908257e-05,
"loss": 0.5936,
"step": 2370
},
{
"epoch": 0.4463615903975994,
"grad_norm": 2.410144567489624,
"learning_rate": 1.6932339449541287e-05,
"loss": 0.6206,
"step": 2380
},
{
"epoch": 0.4482370592648162,
"grad_norm": 2.0925815105438232,
"learning_rate": 1.6875e-05,
"loss": 0.5086,
"step": 2390
},
{
"epoch": 0.450112528132033,
"grad_norm": 1.8199101686477661,
"learning_rate": 1.6817660550458713e-05,
"loss": 0.583,
"step": 2400
},
{
"epoch": 0.450112528132033,
"eval_loss": 0.6492409110069275,
"eval_runtime": 5.5179,
"eval_samples_per_second": 21.747,
"eval_steps_per_second": 2.718,
"step": 2400
},
{
"epoch": 0.4519879969992498,
"grad_norm": 1.7940239906311035,
"learning_rate": 1.6760321100917433e-05,
"loss": 0.6154,
"step": 2410
},
{
"epoch": 0.45386346586646664,
"grad_norm": 2.281325340270996,
"learning_rate": 1.6702981651376147e-05,
"loss": 0.5542,
"step": 2420
},
{
"epoch": 0.45573893473368343,
"grad_norm": 1.8717613220214844,
"learning_rate": 1.6645642201834863e-05,
"loss": 0.5242,
"step": 2430
},
{
"epoch": 0.4576144036009002,
"grad_norm": 2.2120072841644287,
"learning_rate": 1.6588302752293576e-05,
"loss": 0.63,
"step": 2440
},
{
"epoch": 0.459489872468117,
"grad_norm": 2.10752272605896,
"learning_rate": 1.6530963302752293e-05,
"loss": 0.5712,
"step": 2450
},
{
"epoch": 0.46136534133533386,
"grad_norm": 2.3129327297210693,
"learning_rate": 1.647362385321101e-05,
"loss": 0.7044,
"step": 2460
},
{
"epoch": 0.46324081020255065,
"grad_norm": 1.424224853515625,
"learning_rate": 1.6416284403669726e-05,
"loss": 0.5588,
"step": 2470
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.6627572774887085,
"learning_rate": 1.635894495412844e-05,
"loss": 0.4543,
"step": 2480
},
{
"epoch": 0.4669917479369842,
"grad_norm": 1.6522067785263062,
"learning_rate": 1.6301605504587156e-05,
"loss": 0.6003,
"step": 2490
},
{
"epoch": 0.46886721680420107,
"grad_norm": 2.2070651054382324,
"learning_rate": 1.6244266055045873e-05,
"loss": 0.6294,
"step": 2500
},
{
"epoch": 0.47074268567141786,
"grad_norm": 2.1523821353912354,
"learning_rate": 1.618692660550459e-05,
"loss": 0.6067,
"step": 2510
},
{
"epoch": 0.47261815453863465,
"grad_norm": 2.468892812728882,
"learning_rate": 1.6129587155963302e-05,
"loss": 0.6267,
"step": 2520
},
{
"epoch": 0.47449362340585144,
"grad_norm": 1.9735854864120483,
"learning_rate": 1.607224770642202e-05,
"loss": 0.6124,
"step": 2530
},
{
"epoch": 0.4763690922730683,
"grad_norm": 1.7900265455245972,
"learning_rate": 1.6014908256880732e-05,
"loss": 0.5845,
"step": 2540
},
{
"epoch": 0.4782445611402851,
"grad_norm": 2.2069602012634277,
"learning_rate": 1.5957568807339452e-05,
"loss": 0.6071,
"step": 2550
},
{
"epoch": 0.48012003000750186,
"grad_norm": 2.3752589225769043,
"learning_rate": 1.5900229357798165e-05,
"loss": 0.6074,
"step": 2560
},
{
"epoch": 0.48199549887471865,
"grad_norm": 1.8114852905273438,
"learning_rate": 1.5842889908256882e-05,
"loss": 0.5358,
"step": 2570
},
{
"epoch": 0.4838709677419355,
"grad_norm": 1.6503331661224365,
"learning_rate": 1.5785550458715595e-05,
"loss": 0.6664,
"step": 2580
},
{
"epoch": 0.4857464366091523,
"grad_norm": 1.7421520948410034,
"learning_rate": 1.5728211009174315e-05,
"loss": 0.6137,
"step": 2590
},
{
"epoch": 0.4876219054763691,
"grad_norm": 1.865038275718689,
"learning_rate": 1.567087155963303e-05,
"loss": 0.6328,
"step": 2600
},
{
"epoch": 0.4876219054763691,
"eval_loss": 0.6427852511405945,
"eval_runtime": 5.6723,
"eval_samples_per_second": 21.155,
"eval_steps_per_second": 2.644,
"step": 2600
},
{
"epoch": 0.4894973743435859,
"grad_norm": 2.0670528411865234,
"learning_rate": 1.5613532110091745e-05,
"loss": 0.5686,
"step": 2610
},
{
"epoch": 0.4913728432108027,
"grad_norm": 2.00549054145813,
"learning_rate": 1.5556192660550458e-05,
"loss": 0.5701,
"step": 2620
},
{
"epoch": 0.4932483120780195,
"grad_norm": 2.3382251262664795,
"learning_rate": 1.5498853211009175e-05,
"loss": 0.6212,
"step": 2630
},
{
"epoch": 0.4951237809452363,
"grad_norm": 1.849523901939392,
"learning_rate": 1.544151376146789e-05,
"loss": 0.5844,
"step": 2640
},
{
"epoch": 0.49699924981245314,
"grad_norm": 2.0589709281921387,
"learning_rate": 1.5384174311926605e-05,
"loss": 0.5736,
"step": 2650
},
{
"epoch": 0.4988747186796699,
"grad_norm": 2.3713736534118652,
"learning_rate": 1.532683486238532e-05,
"loss": 0.5543,
"step": 2660
},
{
"epoch": 0.5007501875468867,
"grad_norm": 1.4133175611495972,
"learning_rate": 1.5269495412844034e-05,
"loss": 0.5916,
"step": 2670
},
{
"epoch": 0.5026256564141035,
"grad_norm": 1.828869104385376,
"learning_rate": 1.5212155963302753e-05,
"loss": 0.6424,
"step": 2680
},
{
"epoch": 0.5045011252813203,
"grad_norm": 1.8340333700180054,
"learning_rate": 1.5154816513761468e-05,
"loss": 0.5335,
"step": 2690
},
{
"epoch": 0.5063765941485371,
"grad_norm": 2.287064790725708,
"learning_rate": 1.5097477064220184e-05,
"loss": 0.5604,
"step": 2700
},
{
"epoch": 0.508252063015754,
"grad_norm": 2.0678904056549072,
"learning_rate": 1.5040137614678897e-05,
"loss": 0.6822,
"step": 2710
},
{
"epoch": 0.5101275318829708,
"grad_norm": 1.848810076713562,
"learning_rate": 1.4982798165137616e-05,
"loss": 0.5741,
"step": 2720
},
{
"epoch": 0.5120030007501876,
"grad_norm": 1.8436052799224854,
"learning_rate": 1.492545871559633e-05,
"loss": 0.592,
"step": 2730
},
{
"epoch": 0.5138784696174044,
"grad_norm": 1.8554112911224365,
"learning_rate": 1.4868119266055047e-05,
"loss": 0.5372,
"step": 2740
},
{
"epoch": 0.5157539384846211,
"grad_norm": 1.7678755521774292,
"learning_rate": 1.4810779816513762e-05,
"loss": 0.5748,
"step": 2750
},
{
"epoch": 0.5176294073518379,
"grad_norm": 1.71146821975708,
"learning_rate": 1.4753440366972479e-05,
"loss": 0.6795,
"step": 2760
},
{
"epoch": 0.5195048762190547,
"grad_norm": 1.6599249839782715,
"learning_rate": 1.4696100917431192e-05,
"loss": 0.5559,
"step": 2770
},
{
"epoch": 0.5213803450862715,
"grad_norm": 2.273698568344116,
"learning_rate": 1.4638761467889908e-05,
"loss": 0.4864,
"step": 2780
},
{
"epoch": 0.5232558139534884,
"grad_norm": 2.400425434112549,
"learning_rate": 1.4581422018348623e-05,
"loss": 0.6152,
"step": 2790
},
{
"epoch": 0.5251312828207052,
"grad_norm": 2.1009607315063477,
"learning_rate": 1.452408256880734e-05,
"loss": 0.5518,
"step": 2800
},
{
"epoch": 0.5251312828207052,
"eval_loss": 0.6440523266792297,
"eval_runtime": 5.5215,
"eval_samples_per_second": 21.733,
"eval_steps_per_second": 2.717,
"step": 2800
},
{
"epoch": 0.527006751687922,
"grad_norm": 1.724177360534668,
"learning_rate": 1.4466743119266055e-05,
"loss": 0.5635,
"step": 2810
},
{
"epoch": 0.5288822205551388,
"grad_norm": 1.6806992292404175,
"learning_rate": 1.440940366972477e-05,
"loss": 0.6072,
"step": 2820
},
{
"epoch": 0.5307576894223556,
"grad_norm": 1.914863109588623,
"learning_rate": 1.4352064220183486e-05,
"loss": 0.5919,
"step": 2830
},
{
"epoch": 0.5326331582895724,
"grad_norm": 1.9246379137039185,
"learning_rate": 1.4294724770642201e-05,
"loss": 0.6282,
"step": 2840
},
{
"epoch": 0.5345086271567892,
"grad_norm": 2.0513482093811035,
"learning_rate": 1.4237385321100918e-05,
"loss": 0.5117,
"step": 2850
},
{
"epoch": 0.536384096024006,
"grad_norm": 2.160053253173828,
"learning_rate": 1.4180045871559633e-05,
"loss": 0.5916,
"step": 2860
},
{
"epoch": 0.5382595648912228,
"grad_norm": 1.3989676237106323,
"learning_rate": 1.412270642201835e-05,
"loss": 0.6169,
"step": 2870
},
{
"epoch": 0.5401350337584396,
"grad_norm": 1.9387221336364746,
"learning_rate": 1.4065366972477064e-05,
"loss": 0.6374,
"step": 2880
},
{
"epoch": 0.5420105026256564,
"grad_norm": 2.054593563079834,
"learning_rate": 1.4008027522935781e-05,
"loss": 0.5947,
"step": 2890
},
{
"epoch": 0.5438859714928732,
"grad_norm": 1.8106393814086914,
"learning_rate": 1.3950688073394496e-05,
"loss": 0.6232,
"step": 2900
},
{
"epoch": 0.54576144036009,
"grad_norm": 2.042513132095337,
"learning_rate": 1.389334862385321e-05,
"loss": 0.481,
"step": 2910
},
{
"epoch": 0.5476369092273068,
"grad_norm": 1.6872574090957642,
"learning_rate": 1.3836009174311927e-05,
"loss": 0.5128,
"step": 2920
},
{
"epoch": 0.5495123780945236,
"grad_norm": 1.8918819427490234,
"learning_rate": 1.3778669724770642e-05,
"loss": 0.6205,
"step": 2930
},
{
"epoch": 0.5513878469617405,
"grad_norm": 2.6372804641723633,
"learning_rate": 1.3721330275229359e-05,
"loss": 0.4981,
"step": 2940
},
{
"epoch": 0.5532633158289573,
"grad_norm": 1.8915632963180542,
"learning_rate": 1.3663990825688074e-05,
"loss": 0.5481,
"step": 2950
},
{
"epoch": 0.5551387846961741,
"grad_norm": 2.0230934619903564,
"learning_rate": 1.360665137614679e-05,
"loss": 0.5806,
"step": 2960
},
{
"epoch": 0.5570142535633908,
"grad_norm": 2.1508560180664062,
"learning_rate": 1.3549311926605505e-05,
"loss": 0.6057,
"step": 2970
},
{
"epoch": 0.5588897224306076,
"grad_norm": 1.7368062734603882,
"learning_rate": 1.3491972477064222e-05,
"loss": 0.5743,
"step": 2980
},
{
"epoch": 0.5607651912978244,
"grad_norm": 1.9738160371780396,
"learning_rate": 1.3434633027522937e-05,
"loss": 0.6063,
"step": 2990
},
{
"epoch": 0.5626406601650412,
"grad_norm": 1.9070963859558105,
"learning_rate": 1.3377293577981652e-05,
"loss": 0.5965,
"step": 3000
},
{
"epoch": 0.5626406601650412,
"eval_loss": 0.637257993221283,
"eval_runtime": 5.3838,
"eval_samples_per_second": 22.289,
"eval_steps_per_second": 2.786,
"step": 3000
},
{
"epoch": 0.5645161290322581,
"grad_norm": 1.9798016548156738,
"learning_rate": 1.3319954128440368e-05,
"loss": 0.5758,
"step": 3010
},
{
"epoch": 0.5663915978994749,
"grad_norm": 1.68988037109375,
"learning_rate": 1.3262614678899081e-05,
"loss": 0.5435,
"step": 3020
},
{
"epoch": 0.5682670667666917,
"grad_norm": 1.9612882137298584,
"learning_rate": 1.3205275229357798e-05,
"loss": 0.7064,
"step": 3030
},
{
"epoch": 0.5701425356339085,
"grad_norm": 1.9069509506225586,
"learning_rate": 1.3147935779816513e-05,
"loss": 0.6531,
"step": 3040
},
{
"epoch": 0.5720180045011253,
"grad_norm": 2.185046434402466,
"learning_rate": 1.309059633027523e-05,
"loss": 0.548,
"step": 3050
},
{
"epoch": 0.5738934733683421,
"grad_norm": 1.6375807523727417,
"learning_rate": 1.3033256880733944e-05,
"loss": 0.5555,
"step": 3060
},
{
"epoch": 0.5757689422355589,
"grad_norm": 2.4809699058532715,
"learning_rate": 1.2975917431192661e-05,
"loss": 0.5071,
"step": 3070
},
{
"epoch": 0.5776444111027756,
"grad_norm": 2.071410894393921,
"learning_rate": 1.2918577981651376e-05,
"loss": 0.6192,
"step": 3080
},
{
"epoch": 0.5795198799699925,
"grad_norm": 1.9961457252502441,
"learning_rate": 1.2861238532110092e-05,
"loss": 0.6463,
"step": 3090
},
{
"epoch": 0.5813953488372093,
"grad_norm": 1.7288352251052856,
"learning_rate": 1.2803899082568807e-05,
"loss": 0.5121,
"step": 3100
},
{
"epoch": 0.5832708177044261,
"grad_norm": 2.855468988418579,
"learning_rate": 1.2746559633027522e-05,
"loss": 0.5797,
"step": 3110
},
{
"epoch": 0.5851462865716429,
"grad_norm": 2.2987215518951416,
"learning_rate": 1.2689220183486239e-05,
"loss": 0.5607,
"step": 3120
},
{
"epoch": 0.5870217554388597,
"grad_norm": 1.4077903032302856,
"learning_rate": 1.2631880733944954e-05,
"loss": 0.6127,
"step": 3130
},
{
"epoch": 0.5888972243060765,
"grad_norm": 2.1426985263824463,
"learning_rate": 1.257454128440367e-05,
"loss": 0.5774,
"step": 3140
},
{
"epoch": 0.5907726931732933,
"grad_norm": 1.681693196296692,
"learning_rate": 1.2517201834862385e-05,
"loss": 0.5311,
"step": 3150
},
{
"epoch": 0.5926481620405101,
"grad_norm": 2.1285390853881836,
"learning_rate": 1.2459862385321102e-05,
"loss": 0.7334,
"step": 3160
},
{
"epoch": 0.594523630907727,
"grad_norm": 1.7066893577575684,
"learning_rate": 1.2402522935779817e-05,
"loss": 0.4741,
"step": 3170
},
{
"epoch": 0.5963990997749438,
"grad_norm": 2.3069071769714355,
"learning_rate": 1.2345183486238533e-05,
"loss": 0.6068,
"step": 3180
},
{
"epoch": 0.5982745686421606,
"grad_norm": 1.898915410041809,
"learning_rate": 1.2287844036697248e-05,
"loss": 0.4881,
"step": 3190
},
{
"epoch": 0.6001500375093773,
"grad_norm": 1.9187260866165161,
"learning_rate": 1.2230504587155963e-05,
"loss": 0.5603,
"step": 3200
},
{
"epoch": 0.6001500375093773,
"eval_loss": 0.6345093250274658,
"eval_runtime": 5.5367,
"eval_samples_per_second": 21.674,
"eval_steps_per_second": 2.709,
"step": 3200
},
{
"epoch": 0.6020255063765941,
"grad_norm": 1.7056176662445068,
"learning_rate": 1.217316513761468e-05,
"loss": 0.4889,
"step": 3210
},
{
"epoch": 0.6039009752438109,
"grad_norm": 1.7351319789886475,
"learning_rate": 1.2115825688073395e-05,
"loss": 0.5233,
"step": 3220
},
{
"epoch": 0.6057764441110277,
"grad_norm": 3.0656421184539795,
"learning_rate": 1.2058486238532111e-05,
"loss": 0.5427,
"step": 3230
},
{
"epoch": 0.6076519129782446,
"grad_norm": 2.4634621143341064,
"learning_rate": 1.2001146788990826e-05,
"loss": 0.5901,
"step": 3240
},
{
"epoch": 0.6095273818454614,
"grad_norm": 1.7477375268936157,
"learning_rate": 1.1943807339449543e-05,
"loss": 0.6393,
"step": 3250
},
{
"epoch": 0.6114028507126782,
"grad_norm": 2.034407377243042,
"learning_rate": 1.1886467889908258e-05,
"loss": 0.4688,
"step": 3260
},
{
"epoch": 0.613278319579895,
"grad_norm": 1.604066014289856,
"learning_rate": 1.1829128440366974e-05,
"loss": 0.5928,
"step": 3270
},
{
"epoch": 0.6151537884471118,
"grad_norm": 1.9047834873199463,
"learning_rate": 1.1771788990825687e-05,
"loss": 0.512,
"step": 3280
},
{
"epoch": 0.6170292573143286,
"grad_norm": 2.166414737701416,
"learning_rate": 1.1714449541284404e-05,
"loss": 0.6807,
"step": 3290
},
{
"epoch": 0.6189047261815454,
"grad_norm": 2.463648796081543,
"learning_rate": 1.1657110091743119e-05,
"loss": 0.7143,
"step": 3300
},
{
"epoch": 0.6207801950487621,
"grad_norm": 1.8840951919555664,
"learning_rate": 1.1599770642201834e-05,
"loss": 0.6137,
"step": 3310
},
{
"epoch": 0.622655663915979,
"grad_norm": 2.49739408493042,
"learning_rate": 1.154243119266055e-05,
"loss": 0.6415,
"step": 3320
},
{
"epoch": 0.6245311327831958,
"grad_norm": 2.0638840198516846,
"learning_rate": 1.1485091743119265e-05,
"loss": 0.5118,
"step": 3330
},
{
"epoch": 0.6264066016504126,
"grad_norm": 2.0733895301818848,
"learning_rate": 1.1427752293577982e-05,
"loss": 0.6573,
"step": 3340
},
{
"epoch": 0.6282820705176294,
"grad_norm": 2.006185293197632,
"learning_rate": 1.1370412844036697e-05,
"loss": 0.4634,
"step": 3350
},
{
"epoch": 0.6301575393848462,
"grad_norm": 2.2666101455688477,
"learning_rate": 1.1313073394495413e-05,
"loss": 0.6536,
"step": 3360
},
{
"epoch": 0.632033008252063,
"grad_norm": 2.7148234844207764,
"learning_rate": 1.1255733944954128e-05,
"loss": 0.7032,
"step": 3370
},
{
"epoch": 0.6339084771192798,
"grad_norm": 1.6289362907409668,
"learning_rate": 1.1198394495412845e-05,
"loss": 0.5175,
"step": 3380
},
{
"epoch": 0.6357839459864967,
"grad_norm": 2.742385149002075,
"learning_rate": 1.114105504587156e-05,
"loss": 0.582,
"step": 3390
},
{
"epoch": 0.6376594148537135,
"grad_norm": 2.092541217803955,
"learning_rate": 1.1083715596330275e-05,
"loss": 0.6501,
"step": 3400
},
{
"epoch": 0.6376594148537135,
"eval_loss": 0.631032407283783,
"eval_runtime": 5.5482,
"eval_samples_per_second": 21.629,
"eval_steps_per_second": 2.704,
"step": 3400
},
{
"epoch": 0.6395348837209303,
"grad_norm": 1.8964581489562988,
"learning_rate": 1.1026376146788991e-05,
"loss": 0.5905,
"step": 3410
},
{
"epoch": 0.641410352588147,
"grad_norm": 1.6054551601409912,
"learning_rate": 1.0969036697247706e-05,
"loss": 0.4636,
"step": 3420
},
{
"epoch": 0.6432858214553638,
"grad_norm": 2.0726969242095947,
"learning_rate": 1.0911697247706423e-05,
"loss": 0.6542,
"step": 3430
},
{
"epoch": 0.6451612903225806,
"grad_norm": 3.5420382022857666,
"learning_rate": 1.0854357798165138e-05,
"loss": 0.5573,
"step": 3440
},
{
"epoch": 0.6470367591897974,
"grad_norm": 2.462528705596924,
"learning_rate": 1.0797018348623854e-05,
"loss": 0.5605,
"step": 3450
},
{
"epoch": 0.6489122280570142,
"grad_norm": 2.0307133197784424,
"learning_rate": 1.073967889908257e-05,
"loss": 0.5594,
"step": 3460
},
{
"epoch": 0.6507876969242311,
"grad_norm": 2.2088277339935303,
"learning_rate": 1.0682339449541286e-05,
"loss": 0.6143,
"step": 3470
},
{
"epoch": 0.6526631657914479,
"grad_norm": 1.4962677955627441,
"learning_rate": 1.0625e-05,
"loss": 0.5801,
"step": 3480
},
{
"epoch": 0.6545386346586647,
"grad_norm": 1.796766996383667,
"learning_rate": 1.0567660550458716e-05,
"loss": 0.6032,
"step": 3490
},
{
"epoch": 0.6564141035258815,
"grad_norm": 2.6135787963867188,
"learning_rate": 1.0510321100917432e-05,
"loss": 0.5422,
"step": 3500
},
{
"epoch": 0.6582895723930983,
"grad_norm": 2.0830154418945312,
"learning_rate": 1.0452981651376147e-05,
"loss": 0.5509,
"step": 3510
},
{
"epoch": 0.660165041260315,
"grad_norm": 2.061523675918579,
"learning_rate": 1.0395642201834864e-05,
"loss": 0.5258,
"step": 3520
},
{
"epoch": 0.6620405101275318,
"grad_norm": 1.8006651401519775,
"learning_rate": 1.0338302752293577e-05,
"loss": 0.5546,
"step": 3530
},
{
"epoch": 0.6639159789947486,
"grad_norm": 2.187450647354126,
"learning_rate": 1.0280963302752294e-05,
"loss": 0.598,
"step": 3540
},
{
"epoch": 0.6657914478619655,
"grad_norm": 1.984383463859558,
"learning_rate": 1.0223623853211008e-05,
"loss": 0.5181,
"step": 3550
},
{
"epoch": 0.6676669167291823,
"grad_norm": 2.5804004669189453,
"learning_rate": 1.0166284403669725e-05,
"loss": 0.4769,
"step": 3560
},
{
"epoch": 0.6695423855963991,
"grad_norm": 2.4561312198638916,
"learning_rate": 1.010894495412844e-05,
"loss": 0.5985,
"step": 3570
},
{
"epoch": 0.6714178544636159,
"grad_norm": 2.456256866455078,
"learning_rate": 1.0051605504587157e-05,
"loss": 0.6127,
"step": 3580
},
{
"epoch": 0.6732933233308327,
"grad_norm": 2.1540181636810303,
"learning_rate": 9.994266055045871e-06,
"loss": 0.6621,
"step": 3590
},
{
"epoch": 0.6751687921980495,
"grad_norm": 2.0861988067626953,
"learning_rate": 9.936926605504586e-06,
"loss": 0.5981,
"step": 3600
},
{
"epoch": 0.6751687921980495,
"eval_loss": 0.6278895735740662,
"eval_runtime": 5.5329,
"eval_samples_per_second": 21.688,
"eval_steps_per_second": 2.711,
"step": 3600
},
{
"epoch": 0.6770442610652663,
"grad_norm": 2.0881967544555664,
"learning_rate": 9.879587155963303e-06,
"loss": 0.5829,
"step": 3610
},
{
"epoch": 0.6789197299324832,
"grad_norm": 1.6255912780761719,
"learning_rate": 9.822247706422018e-06,
"loss": 0.5327,
"step": 3620
},
{
"epoch": 0.6807951987997,
"grad_norm": 1.970249891281128,
"learning_rate": 9.764908256880734e-06,
"loss": 0.5841,
"step": 3630
},
{
"epoch": 0.6826706676669168,
"grad_norm": 2.4903528690338135,
"learning_rate": 9.70756880733945e-06,
"loss": 0.6067,
"step": 3640
},
{
"epoch": 0.6845461365341335,
"grad_norm": 1.9478775262832642,
"learning_rate": 9.650229357798166e-06,
"loss": 0.5565,
"step": 3650
},
{
"epoch": 0.6864216054013503,
"grad_norm": 1.8559181690216064,
"learning_rate": 9.592889908256881e-06,
"loss": 0.5934,
"step": 3660
},
{
"epoch": 0.6882970742685671,
"grad_norm": 1.7717185020446777,
"learning_rate": 9.535550458715597e-06,
"loss": 0.5408,
"step": 3670
},
{
"epoch": 0.6901725431357839,
"grad_norm": 2.0449588298797607,
"learning_rate": 9.478211009174312e-06,
"loss": 0.5795,
"step": 3680
},
{
"epoch": 0.6920480120030007,
"grad_norm": 2.3706321716308594,
"learning_rate": 9.420871559633027e-06,
"loss": 0.5183,
"step": 3690
},
{
"epoch": 0.6939234808702176,
"grad_norm": 1.7281607389450073,
"learning_rate": 9.363532110091744e-06,
"loss": 0.494,
"step": 3700
},
{
"epoch": 0.6957989497374344,
"grad_norm": 3.5256292819976807,
"learning_rate": 9.306192660550459e-06,
"loss": 0.5757,
"step": 3710
},
{
"epoch": 0.6976744186046512,
"grad_norm": 1.58797025680542,
"learning_rate": 9.248853211009175e-06,
"loss": 0.5399,
"step": 3720
},
{
"epoch": 0.699549887471868,
"grad_norm": 1.9900200366973877,
"learning_rate": 9.19151376146789e-06,
"loss": 0.6236,
"step": 3730
},
{
"epoch": 0.7014253563390848,
"grad_norm": 1.7843225002288818,
"learning_rate": 9.134174311926607e-06,
"loss": 0.549,
"step": 3740
},
{
"epoch": 0.7033008252063015,
"grad_norm": 1.9925148487091064,
"learning_rate": 9.076834862385322e-06,
"loss": 0.49,
"step": 3750
},
{
"epoch": 0.7051762940735183,
"grad_norm": 2.0657670497894287,
"learning_rate": 9.019495412844038e-06,
"loss": 0.5305,
"step": 3760
},
{
"epoch": 0.7070517629407351,
"grad_norm": 2.2417612075805664,
"learning_rate": 8.962155963302753e-06,
"loss": 0.6312,
"step": 3770
},
{
"epoch": 0.708927231807952,
"grad_norm": 2.196537733078003,
"learning_rate": 8.904816513761468e-06,
"loss": 0.6512,
"step": 3780
},
{
"epoch": 0.7108027006751688,
"grad_norm": 1.830484390258789,
"learning_rate": 8.847477064220183e-06,
"loss": 0.5919,
"step": 3790
},
{
"epoch": 0.7126781695423856,
"grad_norm": 2.0573606491088867,
"learning_rate": 8.790137614678898e-06,
"loss": 0.5749,
"step": 3800
},
{
"epoch": 0.7126781695423856,
"eval_loss": 0.6254046559333801,
"eval_runtime": 5.5087,
"eval_samples_per_second": 21.784,
"eval_steps_per_second": 2.723,
"step": 3800
},
{
"epoch": 0.7145536384096024,
"grad_norm": 1.9237205982208252,
"learning_rate": 8.732798165137615e-06,
"loss": 0.5669,
"step": 3810
},
{
"epoch": 0.7164291072768192,
"grad_norm": 1.9309799671173096,
"learning_rate": 8.67545871559633e-06,
"loss": 0.4759,
"step": 3820
},
{
"epoch": 0.718304576144036,
"grad_norm": 1.7976388931274414,
"learning_rate": 8.618119266055046e-06,
"loss": 0.5962,
"step": 3830
},
{
"epoch": 0.7201800450112528,
"grad_norm": 2.3641951084136963,
"learning_rate": 8.560779816513761e-06,
"loss": 0.627,
"step": 3840
},
{
"epoch": 0.7220555138784697,
"grad_norm": 1.5216801166534424,
"learning_rate": 8.503440366972478e-06,
"loss": 0.6098,
"step": 3850
},
{
"epoch": 0.7239309827456865,
"grad_norm": 1.8570992946624756,
"learning_rate": 8.446100917431192e-06,
"loss": 0.5813,
"step": 3860
},
{
"epoch": 0.7258064516129032,
"grad_norm": 3.1294426918029785,
"learning_rate": 8.388761467889909e-06,
"loss": 0.5876,
"step": 3870
},
{
"epoch": 0.72768192048012,
"grad_norm": 2.678264617919922,
"learning_rate": 8.331422018348624e-06,
"loss": 0.6336,
"step": 3880
},
{
"epoch": 0.7295573893473368,
"grad_norm": 1.5208237171173096,
"learning_rate": 8.274082568807339e-06,
"loss": 0.5342,
"step": 3890
},
{
"epoch": 0.7314328582145536,
"grad_norm": 2.246694326400757,
"learning_rate": 8.216743119266055e-06,
"loss": 0.58,
"step": 3900
},
{
"epoch": 0.7333083270817704,
"grad_norm": 1.5300601720809937,
"learning_rate": 8.15940366972477e-06,
"loss": 0.5734,
"step": 3910
},
{
"epoch": 0.7351837959489872,
"grad_norm": 2.032264471054077,
"learning_rate": 8.102064220183487e-06,
"loss": 0.5499,
"step": 3920
},
{
"epoch": 0.7370592648162041,
"grad_norm": 2.2106308937072754,
"learning_rate": 8.044724770642202e-06,
"loss": 0.5158,
"step": 3930
},
{
"epoch": 0.7389347336834209,
"grad_norm": 1.91170334815979,
"learning_rate": 7.987385321100918e-06,
"loss": 0.6973,
"step": 3940
},
{
"epoch": 0.7408102025506377,
"grad_norm": 1.750429391860962,
"learning_rate": 7.930045871559633e-06,
"loss": 0.5268,
"step": 3950
},
{
"epoch": 0.7426856714178545,
"grad_norm": 3.0469017028808594,
"learning_rate": 7.87270642201835e-06,
"loss": 0.6026,
"step": 3960
},
{
"epoch": 0.7445611402850713,
"grad_norm": 1.8385506868362427,
"learning_rate": 7.815366972477065e-06,
"loss": 0.6316,
"step": 3970
},
{
"epoch": 0.746436609152288,
"grad_norm": 2.0888671875,
"learning_rate": 7.75802752293578e-06,
"loss": 0.6271,
"step": 3980
},
{
"epoch": 0.7483120780195048,
"grad_norm": 2.3192808628082275,
"learning_rate": 7.700688073394496e-06,
"loss": 0.5864,
"step": 3990
},
{
"epoch": 0.7501875468867217,
"grad_norm": 1.7646706104278564,
"learning_rate": 7.643348623853211e-06,
"loss": 0.5462,
"step": 4000
},
{
"epoch": 0.7501875468867217,
"eval_loss": 0.6223539710044861,
"eval_runtime": 5.5777,
"eval_samples_per_second": 21.514,
"eval_steps_per_second": 2.689,
"step": 4000
},
{
"epoch": 0.7520630157539385,
"grad_norm": 2.0803816318511963,
"learning_rate": 7.586009174311928e-06,
"loss": 0.7064,
"step": 4010
},
{
"epoch": 0.7539384846211553,
"grad_norm": 2.42698073387146,
"learning_rate": 7.528669724770644e-06,
"loss": 0.5476,
"step": 4020
},
{
"epoch": 0.7558139534883721,
"grad_norm": 2.320164442062378,
"learning_rate": 7.471330275229358e-06,
"loss": 0.6527,
"step": 4030
},
{
"epoch": 0.7576894223555889,
"grad_norm": 2.235037088394165,
"learning_rate": 7.413990825688073e-06,
"loss": 0.6471,
"step": 4040
},
{
"epoch": 0.7595648912228057,
"grad_norm": 2.3369674682617188,
"learning_rate": 7.356651376146789e-06,
"loss": 0.6455,
"step": 4050
},
{
"epoch": 0.7614403600900225,
"grad_norm": 2.7365052700042725,
"learning_rate": 7.299311926605505e-06,
"loss": 0.6133,
"step": 4060
},
{
"epoch": 0.7633158289572393,
"grad_norm": 1.987430453300476,
"learning_rate": 7.241972477064221e-06,
"loss": 0.5892,
"step": 4070
},
{
"epoch": 0.7651912978244562,
"grad_norm": 2.2822089195251465,
"learning_rate": 7.184633027522936e-06,
"loss": 0.5558,
"step": 4080
},
{
"epoch": 0.767066766691673,
"grad_norm": 2.1317837238311768,
"learning_rate": 7.127293577981651e-06,
"loss": 0.5939,
"step": 4090
},
{
"epoch": 0.7689422355588897,
"grad_norm": 2.5483336448669434,
"learning_rate": 7.069954128440367e-06,
"loss": 0.6171,
"step": 4100
},
{
"epoch": 0.7708177044261065,
"grad_norm": 1.9714287519454956,
"learning_rate": 7.012614678899083e-06,
"loss": 0.6518,
"step": 4110
},
{
"epoch": 0.7726931732933233,
"grad_norm": 1.9111765623092651,
"learning_rate": 6.9552752293577985e-06,
"loss": 0.5786,
"step": 4120
},
{
"epoch": 0.7745686421605401,
"grad_norm": 1.9817109107971191,
"learning_rate": 6.8979357798165134e-06,
"loss": 0.5463,
"step": 4130
},
{
"epoch": 0.7764441110277569,
"grad_norm": 1.834665060043335,
"learning_rate": 6.840596330275229e-06,
"loss": 0.5541,
"step": 4140
},
{
"epoch": 0.7783195798949737,
"grad_norm": 2.018120765686035,
"learning_rate": 6.783256880733945e-06,
"loss": 0.5399,
"step": 4150
},
{
"epoch": 0.7801950487621906,
"grad_norm": 2.5197436809539795,
"learning_rate": 6.725917431192661e-06,
"loss": 0.5581,
"step": 4160
},
{
"epoch": 0.7820705176294074,
"grad_norm": 2.2083163261413574,
"learning_rate": 6.6685779816513764e-06,
"loss": 0.5535,
"step": 4170
},
{
"epoch": 0.7839459864966242,
"grad_norm": 2.2999789714813232,
"learning_rate": 6.611238532110092e-06,
"loss": 0.5212,
"step": 4180
},
{
"epoch": 0.785821455363841,
"grad_norm": 2.2333500385284424,
"learning_rate": 6.553899082568808e-06,
"loss": 0.6867,
"step": 4190
},
{
"epoch": 0.7876969242310577,
"grad_norm": 2.5943992137908936,
"learning_rate": 6.496559633027524e-06,
"loss": 0.4554,
"step": 4200
},
{
"epoch": 0.7876969242310577,
"eval_loss": 0.6219611763954163,
"eval_runtime": 5.5648,
"eval_samples_per_second": 21.564,
"eval_steps_per_second": 2.696,
"step": 4200
},
{
"epoch": 0.7895723930982745,
"grad_norm": 2.9401698112487793,
"learning_rate": 6.4392201834862394e-06,
"loss": 0.6103,
"step": 4210
},
{
"epoch": 0.7914478619654913,
"grad_norm": 2.275641679763794,
"learning_rate": 6.381880733944954e-06,
"loss": 0.5384,
"step": 4220
},
{
"epoch": 0.7933233308327082,
"grad_norm": 1.5987143516540527,
"learning_rate": 6.324541284403669e-06,
"loss": 0.6324,
"step": 4230
},
{
"epoch": 0.795198799699925,
"grad_norm": 1.6601738929748535,
"learning_rate": 6.267201834862385e-06,
"loss": 0.5049,
"step": 4240
},
{
"epoch": 0.7970742685671418,
"grad_norm": 2.5912208557128906,
"learning_rate": 6.209862385321101e-06,
"loss": 0.6013,
"step": 4250
},
{
"epoch": 0.7989497374343586,
"grad_norm": 2.051008701324463,
"learning_rate": 6.1525229357798165e-06,
"loss": 0.6521,
"step": 4260
},
{
"epoch": 0.8008252063015754,
"grad_norm": 2.331805467605591,
"learning_rate": 6.095183486238532e-06,
"loss": 0.5396,
"step": 4270
},
{
"epoch": 0.8027006751687922,
"grad_norm": 2.048785924911499,
"learning_rate": 6.037844036697248e-06,
"loss": 0.5918,
"step": 4280
},
{
"epoch": 0.804576144036009,
"grad_norm": 2.387164354324341,
"learning_rate": 5.980504587155964e-06,
"loss": 0.624,
"step": 4290
},
{
"epoch": 0.8064516129032258,
"grad_norm": 1.7921018600463867,
"learning_rate": 5.9231651376146795e-06,
"loss": 0.5066,
"step": 4300
},
{
"epoch": 0.8083270817704427,
"grad_norm": 1.3692150115966797,
"learning_rate": 5.865825688073395e-06,
"loss": 0.509,
"step": 4310
},
{
"epoch": 0.8102025506376594,
"grad_norm": 1.9718056917190552,
"learning_rate": 5.80848623853211e-06,
"loss": 0.6208,
"step": 4320
},
{
"epoch": 0.8120780195048762,
"grad_norm": 1.9130088090896606,
"learning_rate": 5.751146788990826e-06,
"loss": 0.5508,
"step": 4330
},
{
"epoch": 0.813953488372093,
"grad_norm": 2.5534584522247314,
"learning_rate": 5.693807339449541e-06,
"loss": 0.6473,
"step": 4340
},
{
"epoch": 0.8158289572393098,
"grad_norm": 2.3137259483337402,
"learning_rate": 5.6364678899082565e-06,
"loss": 0.5723,
"step": 4350
},
{
"epoch": 0.8177044261065266,
"grad_norm": 2.2267236709594727,
"learning_rate": 5.579128440366972e-06,
"loss": 0.516,
"step": 4360
},
{
"epoch": 0.8195798949737434,
"grad_norm": 2.8468329906463623,
"learning_rate": 5.521788990825688e-06,
"loss": 0.6196,
"step": 4370
},
{
"epoch": 0.8214553638409603,
"grad_norm": 1.7340741157531738,
"learning_rate": 5.464449541284404e-06,
"loss": 0.5489,
"step": 4380
},
{
"epoch": 0.8233308327081771,
"grad_norm": 1.9742332696914673,
"learning_rate": 5.4071100917431195e-06,
"loss": 0.585,
"step": 4390
},
{
"epoch": 0.8252063015753939,
"grad_norm": 2.408601999282837,
"learning_rate": 5.349770642201835e-06,
"loss": 0.5685,
"step": 4400
},
{
"epoch": 0.8252063015753939,
"eval_loss": 0.6204274296760559,
"eval_runtime": 5.5455,
"eval_samples_per_second": 21.639,
"eval_steps_per_second": 2.705,
"step": 4400
},
{
"epoch": 0.8270817704426107,
"grad_norm": 2.1270008087158203,
"learning_rate": 5.292431192660551e-06,
"loss": 0.5759,
"step": 4410
},
{
"epoch": 0.8289572393098275,
"grad_norm": 2.048781156539917,
"learning_rate": 5.235091743119266e-06,
"loss": 0.5268,
"step": 4420
},
{
"epoch": 0.8308327081770442,
"grad_norm": 1.643114686012268,
"learning_rate": 5.177752293577982e-06,
"loss": 0.5481,
"step": 4430
},
{
"epoch": 0.832708177044261,
"grad_norm": 1.9851353168487549,
"learning_rate": 5.120412844036697e-06,
"loss": 0.6492,
"step": 4440
},
{
"epoch": 0.8345836459114778,
"grad_norm": 2.3454835414886475,
"learning_rate": 5.063073394495413e-06,
"loss": 0.5475,
"step": 4450
},
{
"epoch": 0.8364591147786947,
"grad_norm": 2.1236870288848877,
"learning_rate": 5.005733944954129e-06,
"loss": 0.4889,
"step": 4460
},
{
"epoch": 0.8383345836459115,
"grad_norm": 2.490607738494873,
"learning_rate": 4.948394495412844e-06,
"loss": 0.6648,
"step": 4470
},
{
"epoch": 0.8402100525131283,
"grad_norm": 2.781184434890747,
"learning_rate": 4.8910550458715596e-06,
"loss": 0.6362,
"step": 4480
},
{
"epoch": 0.8420855213803451,
"grad_norm": 1.488677740097046,
"learning_rate": 4.833715596330275e-06,
"loss": 0.6213,
"step": 4490
},
{
"epoch": 0.8439609902475619,
"grad_norm": 1.9841208457946777,
"learning_rate": 4.776376146788991e-06,
"loss": 0.6166,
"step": 4500
},
{
"epoch": 0.8458364591147787,
"grad_norm": 1.4909323453903198,
"learning_rate": 4.719036697247707e-06,
"loss": 0.4612,
"step": 4510
},
{
"epoch": 0.8477119279819955,
"grad_norm": 1.927198886871338,
"learning_rate": 4.661697247706422e-06,
"loss": 0.5697,
"step": 4520
},
{
"epoch": 0.8495873968492123,
"grad_norm": 2.1951193809509277,
"learning_rate": 4.6043577981651375e-06,
"loss": 0.6029,
"step": 4530
},
{
"epoch": 0.8514628657164292,
"grad_norm": 1.6474297046661377,
"learning_rate": 4.547018348623853e-06,
"loss": 0.5997,
"step": 4540
},
{
"epoch": 0.8533383345836459,
"grad_norm": 2.8692142963409424,
"learning_rate": 4.489678899082569e-06,
"loss": 0.5052,
"step": 4550
},
{
"epoch": 0.8552138034508627,
"grad_norm": 2.2251393795013428,
"learning_rate": 4.432339449541285e-06,
"loss": 0.5406,
"step": 4560
},
{
"epoch": 0.8570892723180795,
"grad_norm": 1.9672750234603882,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.6556,
"step": 4570
},
{
"epoch": 0.8589647411852963,
"grad_norm": 1.9112441539764404,
"learning_rate": 4.317660550458716e-06,
"loss": 0.6307,
"step": 4580
},
{
"epoch": 0.8608402100525131,
"grad_norm": 2.0552773475646973,
"learning_rate": 4.260321100917432e-06,
"loss": 0.5682,
"step": 4590
},
{
"epoch": 0.8627156789197299,
"grad_norm": 1.927811622619629,
"learning_rate": 4.202981651376147e-06,
"loss": 0.5006,
"step": 4600
},
{
"epoch": 0.8627156789197299,
"eval_loss": 0.6183449625968933,
"eval_runtime": 5.5214,
"eval_samples_per_second": 21.734,
"eval_steps_per_second": 2.717,
"step": 4600
},
{
"epoch": 0.8645911477869468,
"grad_norm": 2.3974733352661133,
"learning_rate": 4.145642201834863e-06,
"loss": 0.6954,
"step": 4610
},
{
"epoch": 0.8664666166541636,
"grad_norm": 2.214097738265991,
"learning_rate": 4.0883027522935775e-06,
"loss": 0.619,
"step": 4620
},
{
"epoch": 0.8683420855213804,
"grad_norm": 2.094970464706421,
"learning_rate": 4.030963302752293e-06,
"loss": 0.5883,
"step": 4630
},
{
"epoch": 0.8702175543885972,
"grad_norm": 1.908461570739746,
"learning_rate": 3.973623853211009e-06,
"loss": 0.4951,
"step": 4640
},
{
"epoch": 0.872093023255814,
"grad_norm": 2.1103639602661133,
"learning_rate": 3.916284403669725e-06,
"loss": 0.5969,
"step": 4650
},
{
"epoch": 0.8739684921230307,
"grad_norm": 1.8500175476074219,
"learning_rate": 3.8589449541284405e-06,
"loss": 0.5824,
"step": 4660
},
{
"epoch": 0.8758439609902475,
"grad_norm": 2.222599506378174,
"learning_rate": 3.8016055045871563e-06,
"loss": 0.654,
"step": 4670
},
{
"epoch": 0.8777194298574643,
"grad_norm": 2.0447375774383545,
"learning_rate": 3.744266055045872e-06,
"loss": 0.5774,
"step": 4680
},
{
"epoch": 0.8795948987246812,
"grad_norm": 2.4672482013702393,
"learning_rate": 3.686926605504587e-06,
"loss": 0.6135,
"step": 4690
},
{
"epoch": 0.881470367591898,
"grad_norm": 2.1856000423431396,
"learning_rate": 3.6295871559633027e-06,
"loss": 0.5876,
"step": 4700
},
{
"epoch": 0.8833458364591148,
"grad_norm": 2.358637809753418,
"learning_rate": 3.5722477064220184e-06,
"loss": 0.5665,
"step": 4710
},
{
"epoch": 0.8852213053263316,
"grad_norm": 1.8287360668182373,
"learning_rate": 3.514908256880734e-06,
"loss": 0.5067,
"step": 4720
},
{
"epoch": 0.8870967741935484,
"grad_norm": 2.045971155166626,
"learning_rate": 3.45756880733945e-06,
"loss": 0.54,
"step": 4730
},
{
"epoch": 0.8889722430607652,
"grad_norm": 2.5090229511260986,
"learning_rate": 3.4002293577981652e-06,
"loss": 0.6294,
"step": 4740
},
{
"epoch": 0.890847711927982,
"grad_norm": 2.9200639724731445,
"learning_rate": 3.3428899082568806e-06,
"loss": 0.5443,
"step": 4750
},
{
"epoch": 0.8927231807951987,
"grad_norm": 2.0221188068389893,
"learning_rate": 3.2855504587155963e-06,
"loss": 0.628,
"step": 4760
},
{
"epoch": 0.8945986496624156,
"grad_norm": 2.6036345958709717,
"learning_rate": 3.228211009174312e-06,
"loss": 0.6387,
"step": 4770
},
{
"epoch": 0.8964741185296324,
"grad_norm": 3.309267044067383,
"learning_rate": 3.170871559633028e-06,
"loss": 0.5863,
"step": 4780
},
{
"epoch": 0.8983495873968492,
"grad_norm": 3.4704477787017822,
"learning_rate": 3.113532110091743e-06,
"loss": 0.5955,
"step": 4790
},
{
"epoch": 0.900225056264066,
"grad_norm": 2.056976556777954,
"learning_rate": 3.056192660550459e-06,
"loss": 0.5984,
"step": 4800
},
{
"epoch": 0.900225056264066,
"eval_loss": 0.6168529987335205,
"eval_runtime": 5.5639,
"eval_samples_per_second": 21.568,
"eval_steps_per_second": 2.696,
"step": 4800
},
{
"epoch": 0.9021005251312828,
"grad_norm": 2.358440399169922,
"learning_rate": 2.9988532110091746e-06,
"loss": 0.5657,
"step": 4810
},
{
"epoch": 0.9039759939984996,
"grad_norm": 2.124436140060425,
"learning_rate": 2.94151376146789e-06,
"loss": 0.642,
"step": 4820
},
{
"epoch": 0.9058514628657164,
"grad_norm": 1.5845674276351929,
"learning_rate": 2.8841743119266057e-06,
"loss": 0.5054,
"step": 4830
},
{
"epoch": 0.9077269317329333,
"grad_norm": 2.296250820159912,
"learning_rate": 2.8325688073394495e-06,
"loss": 0.6108,
"step": 4840
},
{
"epoch": 0.9096024006001501,
"grad_norm": 1.7618379592895508,
"learning_rate": 2.7752293577981653e-06,
"loss": 0.6623,
"step": 4850
},
{
"epoch": 0.9114778694673669,
"grad_norm": 2.3502273559570312,
"learning_rate": 2.7178899082568806e-06,
"loss": 0.5688,
"step": 4860
},
{
"epoch": 0.9133533383345837,
"grad_norm": 2.141451597213745,
"learning_rate": 2.6605504587155964e-06,
"loss": 0.6076,
"step": 4870
},
{
"epoch": 0.9152288072018004,
"grad_norm": 2.2488343715667725,
"learning_rate": 2.603211009174312e-06,
"loss": 0.6399,
"step": 4880
},
{
"epoch": 0.9171042760690172,
"grad_norm": 2.0450565814971924,
"learning_rate": 2.545871559633028e-06,
"loss": 0.5864,
"step": 4890
},
{
"epoch": 0.918979744936234,
"grad_norm": 2.490226984024048,
"learning_rate": 2.488532110091743e-06,
"loss": 0.5644,
"step": 4900
},
{
"epoch": 0.9208552138034508,
"grad_norm": 2.630089282989502,
"learning_rate": 2.4311926605504585e-06,
"loss": 0.6637,
"step": 4910
},
{
"epoch": 0.9227306826706677,
"grad_norm": 2.2584402561187744,
"learning_rate": 2.3738532110091743e-06,
"loss": 0.5784,
"step": 4920
},
{
"epoch": 0.9246061515378845,
"grad_norm": 2.9330437183380127,
"learning_rate": 2.31651376146789e-06,
"loss": 0.5286,
"step": 4930
},
{
"epoch": 0.9264816204051013,
"grad_norm": 2.6167702674865723,
"learning_rate": 2.2591743119266058e-06,
"loss": 0.5273,
"step": 4940
},
{
"epoch": 0.9283570892723181,
"grad_norm": 2.414607286453247,
"learning_rate": 2.201834862385321e-06,
"loss": 0.5177,
"step": 4950
},
{
"epoch": 0.9302325581395349,
"grad_norm": 2.5905508995056152,
"learning_rate": 2.144495412844037e-06,
"loss": 0.5818,
"step": 4960
},
{
"epoch": 0.9321080270067517,
"grad_norm": 2.9565694332122803,
"learning_rate": 2.087155963302752e-06,
"loss": 0.5307,
"step": 4970
},
{
"epoch": 0.9339834958739685,
"grad_norm": 2.3778281211853027,
"learning_rate": 2.029816513761468e-06,
"loss": 0.5477,
"step": 4980
},
{
"epoch": 0.9358589647411854,
"grad_norm": 2.004302978515625,
"learning_rate": 1.9724770642201837e-06,
"loss": 0.55,
"step": 4990
},
{
"epoch": 0.9377344336084021,
"grad_norm": 2.098611354827881,
"learning_rate": 1.915137614678899e-06,
"loss": 0.6077,
"step": 5000
},
{
"epoch": 0.9377344336084021,
"eval_loss": 0.6168031096458435,
"eval_runtime": 5.423,
"eval_samples_per_second": 22.128,
"eval_steps_per_second": 2.766,
"step": 5000
},
{
"epoch": 0.9396099024756189,
"grad_norm": 2.273273229598999,
"learning_rate": 1.8577981651376147e-06,
"loss": 0.5192,
"step": 5010
},
{
"epoch": 0.9414853713428357,
"grad_norm": 2.40267276763916,
"learning_rate": 1.8004587155963303e-06,
"loss": 0.6307,
"step": 5020
},
{
"epoch": 0.9433608402100525,
"grad_norm": 2.205829620361328,
"learning_rate": 1.743119266055046e-06,
"loss": 0.5307,
"step": 5030
},
{
"epoch": 0.9452363090772693,
"grad_norm": 2.208779811859131,
"learning_rate": 1.6857798165137616e-06,
"loss": 0.5816,
"step": 5040
},
{
"epoch": 0.9471117779444861,
"grad_norm": 2.550372838973999,
"learning_rate": 1.628440366972477e-06,
"loss": 0.588,
"step": 5050
},
{
"epoch": 0.9489872468117029,
"grad_norm": 2.062358856201172,
"learning_rate": 1.5711009174311926e-06,
"loss": 0.6606,
"step": 5060
},
{
"epoch": 0.9508627156789198,
"grad_norm": 2.3175814151763916,
"learning_rate": 1.5137614678899084e-06,
"loss": 0.4878,
"step": 5070
},
{
"epoch": 0.9527381845461366,
"grad_norm": 3.7666046619415283,
"learning_rate": 1.456422018348624e-06,
"loss": 0.566,
"step": 5080
},
{
"epoch": 0.9546136534133534,
"grad_norm": 2.467745304107666,
"learning_rate": 1.3990825688073395e-06,
"loss": 0.6029,
"step": 5090
},
{
"epoch": 0.9564891222805701,
"grad_norm": 1.9065784215927124,
"learning_rate": 1.3417431192660552e-06,
"loss": 0.5577,
"step": 5100
},
{
"epoch": 0.9583645911477869,
"grad_norm": 2.447404623031616,
"learning_rate": 1.2844036697247705e-06,
"loss": 0.513,
"step": 5110
},
{
"epoch": 0.9602400600150037,
"grad_norm": 2.8181941509246826,
"learning_rate": 1.2270642201834863e-06,
"loss": 0.4997,
"step": 5120
},
{
"epoch": 0.9621155288822205,
"grad_norm": 2.414186954498291,
"learning_rate": 1.169724770642202e-06,
"loss": 0.5174,
"step": 5130
},
{
"epoch": 0.9639909977494373,
"grad_norm": 2.9557716846466064,
"learning_rate": 1.1123853211009173e-06,
"loss": 0.5591,
"step": 5140
},
{
"epoch": 0.9658664666166542,
"grad_norm": 1.9689189195632935,
"learning_rate": 1.055045871559633e-06,
"loss": 0.5722,
"step": 5150
},
{
"epoch": 0.967741935483871,
"grad_norm": 2.1190686225891113,
"learning_rate": 9.977064220183486e-07,
"loss": 0.558,
"step": 5160
},
{
"epoch": 0.9696174043510878,
"grad_norm": 2.7399091720581055,
"learning_rate": 9.403669724770642e-07,
"loss": 0.5672,
"step": 5170
},
{
"epoch": 0.9714928732183046,
"grad_norm": 2.2235541343688965,
"learning_rate": 8.830275229357798e-07,
"loss": 0.5904,
"step": 5180
},
{
"epoch": 0.9733683420855214,
"grad_norm": 2.248394727706909,
"learning_rate": 8.256880733944955e-07,
"loss": 0.5505,
"step": 5190
},
{
"epoch": 0.9752438109527382,
"grad_norm": 2.1956896781921387,
"learning_rate": 7.68348623853211e-07,
"loss": 0.596,
"step": 5200
},
{
"epoch": 0.9752438109527382,
"eval_loss": 0.6171349287033081,
"eval_runtime": 5.3961,
"eval_samples_per_second": 22.238,
"eval_steps_per_second": 2.78,
"step": 5200
}
],
"logging_steps": 10,
"max_steps": 5332,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.618912552812544e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}