{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.34134821811936455,
"eval_steps": 500,
"global_step": 1590,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021468441391155,
"grad_norm": 1.7282733917236328,
"learning_rate": 0.0004989265779304422,
"loss": 1.4129,
"step": 10
},
{
"epoch": 0.004293688278231,
"grad_norm": 2.1508498191833496,
"learning_rate": 0.0004978531558608846,
"loss": 1.2225,
"step": 20
},
{
"epoch": 0.006440532417346501,
"grad_norm": 1.6386512517929077,
"learning_rate": 0.0004967797337913268,
"loss": 1.1663,
"step": 30
},
{
"epoch": 0.008587376556462,
"grad_norm": 1.2367421388626099,
"learning_rate": 0.000495706311721769,
"loss": 1.1373,
"step": 40
},
{
"epoch": 0.010734220695577501,
"grad_norm": 1.2300989627838135,
"learning_rate": 0.0004946328896522112,
"loss": 1.1143,
"step": 50
},
{
"epoch": 0.012881064834693002,
"grad_norm": 1.1807990074157715,
"learning_rate": 0.0004935594675826536,
"loss": 1.0937,
"step": 60
},
{
"epoch": 0.015027908973808502,
"grad_norm": 0.9375188946723938,
"learning_rate": 0.0004924860455130958,
"loss": 1.0732,
"step": 70
},
{
"epoch": 0.017174753112924,
"grad_norm": 0.9801538586616516,
"learning_rate": 0.000491412623443538,
"loss": 1.0369,
"step": 80
},
{
"epoch": 0.019321597252039503,
"grad_norm": 0.9229792356491089,
"learning_rate": 0.0004903392013739802,
"loss": 1.0093,
"step": 90
},
{
"epoch": 0.021468441391155002,
"grad_norm": 1.011305570602417,
"learning_rate": 0.0004892657793044225,
"loss": 1.0161,
"step": 100
},
{
"epoch": 0.0236152855302705,
"grad_norm": 0.9356452822685242,
"learning_rate": 0.00048819235723486477,
"loss": 0.9939,
"step": 110
},
{
"epoch": 0.025762129669386003,
"grad_norm": 1.0092449188232422,
"learning_rate": 0.00048711893516530704,
"loss": 0.9647,
"step": 120
},
{
"epoch": 0.027908973808501502,
"grad_norm": 0.9663442373275757,
"learning_rate": 0.0004860455130957492,
"loss": 0.9595,
"step": 130
},
{
"epoch": 0.030055817947617004,
"grad_norm": 1.1502243280410767,
"learning_rate": 0.0004849720910261915,
"loss": 0.9422,
"step": 140
},
{
"epoch": 0.0322026620867325,
"grad_norm": 0.970102846622467,
"learning_rate": 0.00048389866895663376,
"loss": 0.945,
"step": 150
},
{
"epoch": 0.034349506225848,
"grad_norm": 1.2466392517089844,
"learning_rate": 0.00048282524688707604,
"loss": 0.9385,
"step": 160
},
{
"epoch": 0.0364963503649635,
"grad_norm": 1.0010186433792114,
"learning_rate": 0.00048175182481751826,
"loss": 0.9301,
"step": 170
},
{
"epoch": 0.038643194504079006,
"grad_norm": 1.2516905069351196,
"learning_rate": 0.0004806784027479605,
"loss": 0.919,
"step": 180
},
{
"epoch": 0.040790038643194505,
"grad_norm": 0.8497525453567505,
"learning_rate": 0.00047960498067840275,
"loss": 0.9054,
"step": 190
},
{
"epoch": 0.042936882782310004,
"grad_norm": 1.0371205806732178,
"learning_rate": 0.00047853155860884503,
"loss": 0.9109,
"step": 200
},
{
"epoch": 0.0450837269214255,
"grad_norm": 1.3313541412353516,
"learning_rate": 0.00047745813653928725,
"loss": 0.9131,
"step": 210
},
{
"epoch": 0.047230571060541,
"grad_norm": 0.9448315501213074,
"learning_rate": 0.0004763847144697295,
"loss": 0.9014,
"step": 220
},
{
"epoch": 0.04937741519965651,
"grad_norm": 1.274882435798645,
"learning_rate": 0.00047531129240017175,
"loss": 0.8786,
"step": 230
},
{
"epoch": 0.051524259338772006,
"grad_norm": 1.3116368055343628,
"learning_rate": 0.000474237870330614,
"loss": 0.9075,
"step": 240
},
{
"epoch": 0.053671103477887505,
"grad_norm": 0.9970440864562988,
"learning_rate": 0.00047316444826105624,
"loss": 0.8932,
"step": 250
},
{
"epoch": 0.055817947617003004,
"grad_norm": 1.698472499847412,
"learning_rate": 0.0004720910261914985,
"loss": 0.8838,
"step": 260
},
{
"epoch": 0.0579647917561185,
"grad_norm": 1.0129982233047485,
"learning_rate": 0.0004710176041219408,
"loss": 0.8779,
"step": 270
},
{
"epoch": 0.06011163589523401,
"grad_norm": 1.0594947338104248,
"learning_rate": 0.00046994418205238296,
"loss": 0.8631,
"step": 280
},
{
"epoch": 0.06225848003434951,
"grad_norm": 0.7768178582191467,
"learning_rate": 0.00046887075998282524,
"loss": 0.8666,
"step": 290
},
{
"epoch": 0.064405324173465,
"grad_norm": 0.9108049869537354,
"learning_rate": 0.0004677973379132675,
"loss": 0.8676,
"step": 300
},
{
"epoch": 0.06655216831258051,
"grad_norm": 1.4127992391586304,
"learning_rate": 0.0004667239158437098,
"loss": 0.8951,
"step": 310
},
{
"epoch": 0.068699012451696,
"grad_norm": 1.1507939100265503,
"learning_rate": 0.000465650493774152,
"loss": 0.863,
"step": 320
},
{
"epoch": 0.07084585659081151,
"grad_norm": 1.1579265594482422,
"learning_rate": 0.00046457707170459423,
"loss": 0.8716,
"step": 330
},
{
"epoch": 0.072992700729927,
"grad_norm": 0.9873006343841553,
"learning_rate": 0.0004635036496350365,
"loss": 0.8569,
"step": 340
},
{
"epoch": 0.07513954486904251,
"grad_norm": 1.1990203857421875,
"learning_rate": 0.0004624302275654788,
"loss": 0.8776,
"step": 350
},
{
"epoch": 0.07728638900815801,
"grad_norm": 1.1173065900802612,
"learning_rate": 0.000461356805495921,
"loss": 0.865,
"step": 360
},
{
"epoch": 0.0794332331472735,
"grad_norm": 1.2493510246276855,
"learning_rate": 0.0004602833834263633,
"loss": 0.8609,
"step": 370
},
{
"epoch": 0.08158007728638901,
"grad_norm": 1.1254737377166748,
"learning_rate": 0.0004592099613568055,
"loss": 0.8697,
"step": 380
},
{
"epoch": 0.0837269214255045,
"grad_norm": 1.1009331941604614,
"learning_rate": 0.0004581365392872477,
"loss": 0.8653,
"step": 390
},
{
"epoch": 0.08587376556462001,
"grad_norm": 1.3970990180969238,
"learning_rate": 0.00045706311721769,
"loss": 0.8542,
"step": 400
},
{
"epoch": 0.08802060970373551,
"grad_norm": 1.278136968612671,
"learning_rate": 0.00045598969514813227,
"loss": 0.8485,
"step": 410
},
{
"epoch": 0.090167453842851,
"grad_norm": 1.3295845985412598,
"learning_rate": 0.00045491627307857454,
"loss": 0.8501,
"step": 420
},
{
"epoch": 0.09231429798196651,
"grad_norm": 1.310677170753479,
"learning_rate": 0.0004538428510090167,
"loss": 0.849,
"step": 430
},
{
"epoch": 0.094461142121082,
"grad_norm": 1.0189110040664673,
"learning_rate": 0.000452769428939459,
"loss": 0.8566,
"step": 440
},
{
"epoch": 0.09660798626019751,
"grad_norm": 1.2950178384780884,
"learning_rate": 0.00045169600686990126,
"loss": 0.8369,
"step": 450
},
{
"epoch": 0.09875483039931301,
"grad_norm": 0.8336394429206848,
"learning_rate": 0.00045062258480034354,
"loss": 0.853,
"step": 460
},
{
"epoch": 0.1009016745384285,
"grad_norm": 1.1623280048370361,
"learning_rate": 0.00044954916273078576,
"loss": 0.8437,
"step": 470
},
{
"epoch": 0.10304851867754401,
"grad_norm": 1.5341142416000366,
"learning_rate": 0.000448475740661228,
"loss": 0.8388,
"step": 480
},
{
"epoch": 0.1051953628166595,
"grad_norm": 1.154572606086731,
"learning_rate": 0.00044740231859167025,
"loss": 0.8499,
"step": 490
},
{
"epoch": 0.10734220695577501,
"grad_norm": 1.291874885559082,
"learning_rate": 0.00044632889652211253,
"loss": 0.8508,
"step": 500
},
{
"epoch": 0.10948905109489052,
"grad_norm": 2.017030954360962,
"learning_rate": 0.00044525547445255475,
"loss": 0.8163,
"step": 510
},
{
"epoch": 0.11163589523400601,
"grad_norm": 1.2181349992752075,
"learning_rate": 0.000444182052382997,
"loss": 0.8304,
"step": 520
},
{
"epoch": 0.11378273937312151,
"grad_norm": 1.1240856647491455,
"learning_rate": 0.00044310863031343925,
"loss": 0.8339,
"step": 530
},
{
"epoch": 0.115929583512237,
"grad_norm": 1.5953660011291504,
"learning_rate": 0.00044203520824388147,
"loss": 0.8416,
"step": 540
},
{
"epoch": 0.11807642765135251,
"grad_norm": 0.9097370505332947,
"learning_rate": 0.00044096178617432374,
"loss": 0.8362,
"step": 550
},
{
"epoch": 0.12022327179046802,
"grad_norm": 1.0670212507247925,
"learning_rate": 0.000439888364104766,
"loss": 0.8395,
"step": 560
},
{
"epoch": 0.12237011592958351,
"grad_norm": 1.1179403066635132,
"learning_rate": 0.0004388149420352083,
"loss": 0.8477,
"step": 570
},
{
"epoch": 0.12451696006869901,
"grad_norm": 1.218599557876587,
"learning_rate": 0.00043774151996565046,
"loss": 0.8295,
"step": 580
},
{
"epoch": 0.1266638042078145,
"grad_norm": 0.9557531476020813,
"learning_rate": 0.00043666809789609274,
"loss": 0.8257,
"step": 590
},
{
"epoch": 0.12881064834693,
"grad_norm": 0.8345034122467041,
"learning_rate": 0.000435594675826535,
"loss": 0.8607,
"step": 600
},
{
"epoch": 0.13095749248604552,
"grad_norm": 0.9946607947349548,
"learning_rate": 0.0004345212537569773,
"loss": 0.8173,
"step": 610
},
{
"epoch": 0.13310433662516102,
"grad_norm": 1.3076237440109253,
"learning_rate": 0.0004334478316874195,
"loss": 0.8293,
"step": 620
},
{
"epoch": 0.1352511807642765,
"grad_norm": 1.6002768278121948,
"learning_rate": 0.00043237440961786173,
"loss": 0.8328,
"step": 630
},
{
"epoch": 0.137398024903392,
"grad_norm": 1.03147554397583,
"learning_rate": 0.000431300987548304,
"loss": 0.8297,
"step": 640
},
{
"epoch": 0.1395448690425075,
"grad_norm": 1.42938232421875,
"learning_rate": 0.0004302275654787463,
"loss": 0.8328,
"step": 650
},
{
"epoch": 0.14169171318162302,
"grad_norm": 1.319884181022644,
"learning_rate": 0.0004291541434091885,
"loss": 0.8496,
"step": 660
},
{
"epoch": 0.14383855732073852,
"grad_norm": 1.289533019065857,
"learning_rate": 0.0004280807213396308,
"loss": 0.8171,
"step": 670
},
{
"epoch": 0.145985401459854,
"grad_norm": 1.4401450157165527,
"learning_rate": 0.000427007299270073,
"loss": 0.8259,
"step": 680
},
{
"epoch": 0.1481322455989695,
"grad_norm": 1.403343677520752,
"learning_rate": 0.0004259338772005152,
"loss": 0.8116,
"step": 690
},
{
"epoch": 0.15027908973808501,
"grad_norm": 1.0387822389602661,
"learning_rate": 0.0004248604551309575,
"loss": 0.8233,
"step": 700
},
{
"epoch": 0.15242593387720052,
"grad_norm": 1.0579140186309814,
"learning_rate": 0.00042378703306139977,
"loss": 0.8205,
"step": 710
},
{
"epoch": 0.15457277801631603,
"grad_norm": 1.7332643270492554,
"learning_rate": 0.00042271361099184204,
"loss": 0.845,
"step": 720
},
{
"epoch": 0.1567196221554315,
"grad_norm": 1.8401075601577759,
"learning_rate": 0.0004216401889222842,
"loss": 0.8441,
"step": 730
},
{
"epoch": 0.158866466294547,
"grad_norm": 1.3133872747421265,
"learning_rate": 0.0004205667668527265,
"loss": 0.8427,
"step": 740
},
{
"epoch": 0.16101331043366252,
"grad_norm": 2.1324663162231445,
"learning_rate": 0.00041949334478316876,
"loss": 0.8298,
"step": 750
},
{
"epoch": 0.16316015457277802,
"grad_norm": 1.1304748058319092,
"learning_rate": 0.00041841992271361104,
"loss": 0.836,
"step": 760
},
{
"epoch": 0.16530699871189353,
"grad_norm": 1.1530399322509766,
"learning_rate": 0.00041734650064405326,
"loss": 0.803,
"step": 770
},
{
"epoch": 0.167453842851009,
"grad_norm": 0.8117969632148743,
"learning_rate": 0.0004162730785744955,
"loss": 0.8177,
"step": 780
},
{
"epoch": 0.1696006869901245,
"grad_norm": 1.217517375946045,
"learning_rate": 0.00041519965650493775,
"loss": 0.8383,
"step": 790
},
{
"epoch": 0.17174753112924002,
"grad_norm": 1.2580839395523071,
"learning_rate": 0.00041412623443538,
"loss": 0.8257,
"step": 800
},
{
"epoch": 0.17389437526835552,
"grad_norm": 1.7408099174499512,
"learning_rate": 0.00041305281236582225,
"loss": 0.8201,
"step": 810
},
{
"epoch": 0.17604121940747103,
"grad_norm": 1.1754316091537476,
"learning_rate": 0.0004119793902962645,
"loss": 0.8094,
"step": 820
},
{
"epoch": 0.1781880635465865,
"grad_norm": 1.5301543474197388,
"learning_rate": 0.00041090596822670675,
"loss": 0.8112,
"step": 830
},
{
"epoch": 0.180334907685702,
"grad_norm": 0.8299456834793091,
"learning_rate": 0.00040983254615714897,
"loss": 0.8518,
"step": 840
},
{
"epoch": 0.18248175182481752,
"grad_norm": 1.3171818256378174,
"learning_rate": 0.00040875912408759124,
"loss": 0.8292,
"step": 850
},
{
"epoch": 0.18462859596393302,
"grad_norm": 1.4290481805801392,
"learning_rate": 0.0004076857020180335,
"loss": 0.8147,
"step": 860
},
{
"epoch": 0.18677544010304853,
"grad_norm": 0.9816901683807373,
"learning_rate": 0.0004066122799484758,
"loss": 0.825,
"step": 870
},
{
"epoch": 0.188922284242164,
"grad_norm": 0.8896159529685974,
"learning_rate": 0.00040553885787891796,
"loss": 0.8245,
"step": 880
},
{
"epoch": 0.1910691283812795,
"grad_norm": 1.5641008615493774,
"learning_rate": 0.00040446543580936024,
"loss": 0.8204,
"step": 890
},
{
"epoch": 0.19321597252039502,
"grad_norm": 1.174325704574585,
"learning_rate": 0.0004033920137398025,
"loss": 0.8046,
"step": 900
},
{
"epoch": 0.19536281665951052,
"grad_norm": 1.0568900108337402,
"learning_rate": 0.0004023185916702448,
"loss": 0.835,
"step": 910
},
{
"epoch": 0.19750966079862603,
"grad_norm": 1.4573074579238892,
"learning_rate": 0.000401245169600687,
"loss": 0.8151,
"step": 920
},
{
"epoch": 0.1996565049377415,
"grad_norm": 1.7658246755599976,
"learning_rate": 0.00040017174753112923,
"loss": 0.8012,
"step": 930
},
{
"epoch": 0.201803349076857,
"grad_norm": 1.3144532442092896,
"learning_rate": 0.0003990983254615715,
"loss": 0.8155,
"step": 940
},
{
"epoch": 0.20395019321597252,
"grad_norm": 1.302480697631836,
"learning_rate": 0.0003980249033920137,
"loss": 0.8125,
"step": 950
},
{
"epoch": 0.20609703735508803,
"grad_norm": 1.6297829151153564,
"learning_rate": 0.000396951481322456,
"loss": 0.8157,
"step": 960
},
{
"epoch": 0.20824388149420353,
"grad_norm": 1.2462539672851562,
"learning_rate": 0.0003958780592528983,
"loss": 0.8135,
"step": 970
},
{
"epoch": 0.210390725633319,
"grad_norm": 1.3543071746826172,
"learning_rate": 0.0003948046371833405,
"loss": 0.8154,
"step": 980
},
{
"epoch": 0.21253756977243451,
"grad_norm": 1.5854978561401367,
"learning_rate": 0.0003937312151137827,
"loss": 0.7982,
"step": 990
},
{
"epoch": 0.21468441391155002,
"grad_norm": 1.0589042901992798,
"learning_rate": 0.000392657793044225,
"loss": 0.8267,
"step": 1000
},
{
"epoch": 0.21683125805066553,
"grad_norm": 1.226970911026001,
"learning_rate": 0.00039158437097466727,
"loss": 0.8055,
"step": 1010
},
{
"epoch": 0.21897810218978103,
"grad_norm": 1.390030860900879,
"learning_rate": 0.00039051094890510954,
"loss": 0.8272,
"step": 1020
},
{
"epoch": 0.2211249463288965,
"grad_norm": 1.102220892906189,
"learning_rate": 0.0003894375268355517,
"loss": 0.8246,
"step": 1030
},
{
"epoch": 0.22327179046801202,
"grad_norm": 1.094040870666504,
"learning_rate": 0.000388364104765994,
"loss": 0.814,
"step": 1040
},
{
"epoch": 0.22541863460712752,
"grad_norm": 1.4209458827972412,
"learning_rate": 0.00038729068269643626,
"loss": 0.7972,
"step": 1050
},
{
"epoch": 0.22756547874624303,
"grad_norm": 1.3925952911376953,
"learning_rate": 0.0003862172606268785,
"loss": 0.809,
"step": 1060
},
{
"epoch": 0.22971232288535853,
"grad_norm": 1.0035127401351929,
"learning_rate": 0.00038514383855732076,
"loss": 0.8012,
"step": 1070
},
{
"epoch": 0.231859167024474,
"grad_norm": 1.0175857543945312,
"learning_rate": 0.000384070416487763,
"loss": 0.7916,
"step": 1080
},
{
"epoch": 0.23400601116358952,
"grad_norm": 1.3213493824005127,
"learning_rate": 0.00038299699441820525,
"loss": 0.8084,
"step": 1090
},
{
"epoch": 0.23615285530270502,
"grad_norm": 1.4422920942306519,
"learning_rate": 0.0003819235723486475,
"loss": 0.8135,
"step": 1100
},
{
"epoch": 0.23829969944182053,
"grad_norm": 1.228966474533081,
"learning_rate": 0.00038085015027908975,
"loss": 0.8221,
"step": 1110
},
{
"epoch": 0.24044654358093603,
"grad_norm": 1.5089335441589355,
"learning_rate": 0.000379776728209532,
"loss": 0.8183,
"step": 1120
},
{
"epoch": 0.2425933877200515,
"grad_norm": 1.2208846807479858,
"learning_rate": 0.00037870330613997425,
"loss": 0.7888,
"step": 1130
},
{
"epoch": 0.24474023185916702,
"grad_norm": 1.057085633277893,
"learning_rate": 0.00037762988407041647,
"loss": 0.8064,
"step": 1140
},
{
"epoch": 0.24688707599828252,
"grad_norm": 1.746360421180725,
"learning_rate": 0.00037655646200085874,
"loss": 0.8209,
"step": 1150
},
{
"epoch": 0.24903392013739803,
"grad_norm": 1.4103171825408936,
"learning_rate": 0.000375483039931301,
"loss": 0.8161,
"step": 1160
},
{
"epoch": 0.25118076427651354,
"grad_norm": 1.0949628353118896,
"learning_rate": 0.0003744096178617433,
"loss": 0.7999,
"step": 1170
},
{
"epoch": 0.253327608415629,
"grad_norm": 1.1674295663833618,
"learning_rate": 0.00037333619579218546,
"loss": 0.7999,
"step": 1180
},
{
"epoch": 0.25547445255474455,
"grad_norm": 1.729760766029358,
"learning_rate": 0.00037226277372262774,
"loss": 0.8091,
"step": 1190
},
{
"epoch": 0.25762129669386,
"grad_norm": 1.3376595973968506,
"learning_rate": 0.00037118935165307,
"loss": 0.7909,
"step": 1200
},
{
"epoch": 0.2597681408329755,
"grad_norm": 2.1753225326538086,
"learning_rate": 0.00037011592958351223,
"loss": 0.7844,
"step": 1210
},
{
"epoch": 0.26191498497209104,
"grad_norm": 1.7476351261138916,
"learning_rate": 0.0003690425075139545,
"loss": 0.7972,
"step": 1220
},
{
"epoch": 0.2640618291112065,
"grad_norm": 1.241102933883667,
"learning_rate": 0.00036796908544439673,
"loss": 0.8046,
"step": 1230
},
{
"epoch": 0.26620867325032205,
"grad_norm": 1.7534103393554688,
"learning_rate": 0.000366895663374839,
"loss": 0.7938,
"step": 1240
},
{
"epoch": 0.2683555173894375,
"grad_norm": 1.2782504558563232,
"learning_rate": 0.0003658222413052812,
"loss": 0.7891,
"step": 1250
},
{
"epoch": 0.270502361528553,
"grad_norm": 1.1518951654434204,
"learning_rate": 0.0003647488192357235,
"loss": 0.7995,
"step": 1260
},
{
"epoch": 0.27264920566766854,
"grad_norm": 1.1520744562149048,
"learning_rate": 0.0003636753971661658,
"loss": 0.7934,
"step": 1270
},
{
"epoch": 0.274796049806784,
"grad_norm": 1.4017630815505981,
"learning_rate": 0.000362601975096608,
"loss": 0.8272,
"step": 1280
},
{
"epoch": 0.27694289394589955,
"grad_norm": 1.7796710729599,
"learning_rate": 0.0003615285530270502,
"loss": 0.7782,
"step": 1290
},
{
"epoch": 0.279089738085015,
"grad_norm": 1.5225216150283813,
"learning_rate": 0.0003604551309574925,
"loss": 0.7978,
"step": 1300
},
{
"epoch": 0.2812365822241305,
"grad_norm": 1.0838427543640137,
"learning_rate": 0.00035938170888793477,
"loss": 0.8185,
"step": 1310
},
{
"epoch": 0.28338342636324604,
"grad_norm": 1.5116959810256958,
"learning_rate": 0.000358308286818377,
"loss": 0.7929,
"step": 1320
},
{
"epoch": 0.2855302705023615,
"grad_norm": 1.2074556350708008,
"learning_rate": 0.0003572348647488192,
"loss": 0.804,
"step": 1330
},
{
"epoch": 0.28767711464147705,
"grad_norm": 1.004355788230896,
"learning_rate": 0.0003561614426792615,
"loss": 0.813,
"step": 1340
},
{
"epoch": 0.2898239587805925,
"grad_norm": 1.4230481386184692,
"learning_rate": 0.00035508802060970376,
"loss": 0.7831,
"step": 1350
},
{
"epoch": 0.291970802919708,
"grad_norm": 1.1971302032470703,
"learning_rate": 0.000354014598540146,
"loss": 0.7673,
"step": 1360
},
{
"epoch": 0.29411764705882354,
"grad_norm": 1.3551030158996582,
"learning_rate": 0.00035294117647058826,
"loss": 0.7757,
"step": 1370
},
{
"epoch": 0.296264491197939,
"grad_norm": 1.0632190704345703,
"learning_rate": 0.0003518677544010305,
"loss": 0.7824,
"step": 1380
},
{
"epoch": 0.29841133533705455,
"grad_norm": 1.5460542440414429,
"learning_rate": 0.00035079433233147275,
"loss": 0.7871,
"step": 1390
},
{
"epoch": 0.30055817947617003,
"grad_norm": 1.8900117874145508,
"learning_rate": 0.000349720910261915,
"loss": 0.7967,
"step": 1400
},
{
"epoch": 0.3027050236152855,
"grad_norm": 1.5339765548706055,
"learning_rate": 0.00034864748819235725,
"loss": 0.7759,
"step": 1410
},
{
"epoch": 0.30485186775440104,
"grad_norm": 1.721113681793213,
"learning_rate": 0.0003475740661227995,
"loss": 0.7792,
"step": 1420
},
{
"epoch": 0.3069987118935165,
"grad_norm": 1.0442615747451782,
"learning_rate": 0.0003465006440532417,
"loss": 0.7734,
"step": 1430
},
{
"epoch": 0.30914555603263205,
"grad_norm": 1.4723149538040161,
"learning_rate": 0.00034542722198368397,
"loss": 0.7839,
"step": 1440
},
{
"epoch": 0.31129240017174753,
"grad_norm": 1.4786028861999512,
"learning_rate": 0.00034435379991412624,
"loss": 0.7995,
"step": 1450
},
{
"epoch": 0.313439244310863,
"grad_norm": 1.392654538154602,
"learning_rate": 0.0003432803778445685,
"loss": 0.8046,
"step": 1460
},
{
"epoch": 0.31558608844997854,
"grad_norm": 1.730966567993164,
"learning_rate": 0.00034220695577501074,
"loss": 0.7909,
"step": 1470
},
{
"epoch": 0.317732932589094,
"grad_norm": 1.365211844444275,
"learning_rate": 0.00034113353370545296,
"loss": 0.7881,
"step": 1480
},
{
"epoch": 0.31987977672820955,
"grad_norm": 1.2406139373779297,
"learning_rate": 0.00034006011163589524,
"loss": 0.8095,
"step": 1490
},
{
"epoch": 0.32202662086732503,
"grad_norm": 2.0166332721710205,
"learning_rate": 0.0003389866895663375,
"loss": 0.7694,
"step": 1500
},
{
"epoch": 0.3241734650064405,
"grad_norm": 1.9494292736053467,
"learning_rate": 0.00033791326749677973,
"loss": 0.8033,
"step": 1510
},
{
"epoch": 0.32632030914555604,
"grad_norm": 1.6693007946014404,
"learning_rate": 0.000336839845427222,
"loss": 0.8158,
"step": 1520
},
{
"epoch": 0.3284671532846715,
"grad_norm": 1.595958948135376,
"learning_rate": 0.00033576642335766423,
"loss": 0.7974,
"step": 1530
},
{
"epoch": 0.33061399742378705,
"grad_norm": 1.8875946998596191,
"learning_rate": 0.0003346930012881065,
"loss": 0.7835,
"step": 1540
},
{
"epoch": 0.33276084156290253,
"grad_norm": 1.5482693910598755,
"learning_rate": 0.0003336195792185487,
"loss": 0.7866,
"step": 1550
},
{
"epoch": 0.334907685702018,
"grad_norm": 1.1274839639663696,
"learning_rate": 0.000332546157148991,
"loss": 0.7964,
"step": 1560
},
{
"epoch": 0.33705452984113354,
"grad_norm": 1.5397554636001587,
"learning_rate": 0.0003314727350794333,
"loss": 0.7802,
"step": 1570
},
{
"epoch": 0.339201373980249,
"grad_norm": 1.2875391244888306,
"learning_rate": 0.00033039931300987544,
"loss": 0.7764,
"step": 1580
},
{
"epoch": 0.34134821811936455,
"grad_norm": 1.0845388174057007,
"learning_rate": 0.0003293258909403177,
"loss": 0.7839,
"step": 1590
}
],
"logging_steps": 10,
"max_steps": 4658,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 30,
"total_flos": 1.2016090886217754e+17,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}