{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 5592,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00894134477825465,
"grad_norm": 0.5095388845429333,
"learning_rate": 4.4642857142857145e-08,
"loss": 1.8582,
"step": 25
},
{
"epoch": 0.0178826895565093,
"grad_norm": 0.41826013911071735,
"learning_rate": 8.928571428571429e-08,
"loss": 1.8696,
"step": 50
},
{
"epoch": 0.02682403433476395,
"grad_norm": 0.5293401027825467,
"learning_rate": 1.3392857142857142e-07,
"loss": 1.863,
"step": 75
},
{
"epoch": 0.0357653791130186,
"grad_norm": 0.5080324323077997,
"learning_rate": 1.7857142857142858e-07,
"loss": 1.8639,
"step": 100
},
{
"epoch": 0.044706723891273246,
"grad_norm": 0.5015276110428923,
"learning_rate": 2.232142857142857e-07,
"loss": 1.8466,
"step": 125
},
{
"epoch": 0.0536480686695279,
"grad_norm": 0.5192534238774392,
"learning_rate": 2.6785714285714284e-07,
"loss": 1.8646,
"step": 150
},
{
"epoch": 0.06258941344778254,
"grad_norm": 0.5036209286188108,
"learning_rate": 3.1249999999999997e-07,
"loss": 1.8602,
"step": 175
},
{
"epoch": 0.0715307582260372,
"grad_norm": 0.4912106467940169,
"learning_rate": 3.5714285714285716e-07,
"loss": 1.8529,
"step": 200
},
{
"epoch": 0.08047210300429185,
"grad_norm": 0.9830108153719349,
"learning_rate": 4.017857142857143e-07,
"loss": 1.8574,
"step": 225
},
{
"epoch": 0.08941344778254649,
"grad_norm": 0.5138072684249293,
"learning_rate": 4.464285714285714e-07,
"loss": 1.8498,
"step": 250
},
{
"epoch": 0.09835479256080114,
"grad_norm": 0.6040471717328045,
"learning_rate": 4.910714285714285e-07,
"loss": 1.853,
"step": 275
},
{
"epoch": 0.1072961373390558,
"grad_norm": 0.6457522376165349,
"learning_rate": 5.357142857142857e-07,
"loss": 1.849,
"step": 300
},
{
"epoch": 0.11623748211731044,
"grad_norm": 0.6334455155915341,
"learning_rate": 5.803571428571429e-07,
"loss": 1.8404,
"step": 325
},
{
"epoch": 0.1251788268955651,
"grad_norm": 0.6379008256976824,
"learning_rate": 6.249999999999999e-07,
"loss": 1.8591,
"step": 350
},
{
"epoch": 0.13412017167381973,
"grad_norm": 0.7299148775857892,
"learning_rate": 6.69642857142857e-07,
"loss": 1.8275,
"step": 375
},
{
"epoch": 0.1430615164520744,
"grad_norm": 0.7276311175701822,
"learning_rate": 7.142857142857143e-07,
"loss": 1.7953,
"step": 400
},
{
"epoch": 0.15200286123032905,
"grad_norm": 0.7388527300740464,
"learning_rate": 7.589285714285714e-07,
"loss": 1.7676,
"step": 425
},
{
"epoch": 0.1609442060085837,
"grad_norm": 0.8114217398767946,
"learning_rate": 8.035714285714286e-07,
"loss": 1.7693,
"step": 450
},
{
"epoch": 0.16988555078683834,
"grad_norm": 0.8300068303598062,
"learning_rate": 8.482142857142857e-07,
"loss": 1.7249,
"step": 475
},
{
"epoch": 0.17882689556509299,
"grad_norm": 0.7407086667105495,
"learning_rate": 8.928571428571428e-07,
"loss": 1.7008,
"step": 500
},
{
"epoch": 0.18776824034334763,
"grad_norm": 0.6185854806094319,
"learning_rate": 9.374999999999999e-07,
"loss": 1.6907,
"step": 525
},
{
"epoch": 0.19670958512160228,
"grad_norm": 0.6491552501216883,
"learning_rate": 9.82142857142857e-07,
"loss": 1.6586,
"step": 550
},
{
"epoch": 0.20565092989985695,
"grad_norm": 0.6005414444540486,
"learning_rate": 9.970190779014309e-07,
"loss": 1.6455,
"step": 575
},
{
"epoch": 0.2145922746781116,
"grad_norm": 0.584948620571865,
"learning_rate": 9.920508744038154e-07,
"loss": 1.6181,
"step": 600
},
{
"epoch": 0.22353361945636624,
"grad_norm": 0.5689579459890339,
"learning_rate": 9.870826709062002e-07,
"loss": 1.5861,
"step": 625
},
{
"epoch": 0.23247496423462088,
"grad_norm": 0.5479875081985119,
"learning_rate": 9.821144674085851e-07,
"loss": 1.5784,
"step": 650
},
{
"epoch": 0.24141630901287553,
"grad_norm": 0.5642004770422899,
"learning_rate": 9.771462639109697e-07,
"loss": 1.5524,
"step": 675
},
{
"epoch": 0.2503576537911302,
"grad_norm": 0.5928876776119925,
"learning_rate": 9.721780604133544e-07,
"loss": 1.5371,
"step": 700
},
{
"epoch": 0.2592989985693848,
"grad_norm": 0.5977759151905918,
"learning_rate": 9.672098569157392e-07,
"loss": 1.5132,
"step": 725
},
{
"epoch": 0.26824034334763946,
"grad_norm": 0.5575061860434638,
"learning_rate": 9.62241653418124e-07,
"loss": 1.5064,
"step": 750
},
{
"epoch": 0.2771816881258941,
"grad_norm": 0.5630188145960676,
"learning_rate": 9.572734499205087e-07,
"loss": 1.4761,
"step": 775
},
{
"epoch": 0.2861230329041488,
"grad_norm": 0.5611716812815478,
"learning_rate": 9.523052464228934e-07,
"loss": 1.4614,
"step": 800
},
{
"epoch": 0.29506437768240346,
"grad_norm": 0.46795354610537326,
"learning_rate": 9.473370429252782e-07,
"loss": 1.4298,
"step": 825
},
{
"epoch": 0.3040057224606581,
"grad_norm": 0.44554190069369415,
"learning_rate": 9.423688394276629e-07,
"loss": 1.4318,
"step": 850
},
{
"epoch": 0.31294706723891275,
"grad_norm": 1.3485889211129667,
"learning_rate": 9.374006359300477e-07,
"loss": 1.4183,
"step": 875
},
{
"epoch": 0.3218884120171674,
"grad_norm": 0.5688806165123188,
"learning_rate": 9.324324324324324e-07,
"loss": 1.3918,
"step": 900
},
{
"epoch": 0.33082975679542204,
"grad_norm": 0.5874567786702063,
"learning_rate": 9.274642289348172e-07,
"loss": 1.3886,
"step": 925
},
{
"epoch": 0.3397711015736767,
"grad_norm": 0.3766300486264232,
"learning_rate": 9.224960254372018e-07,
"loss": 1.3839,
"step": 950
},
{
"epoch": 0.3487124463519313,
"grad_norm": 0.39812336031911777,
"learning_rate": 9.175278219395866e-07,
"loss": 1.3677,
"step": 975
},
{
"epoch": 0.35765379113018597,
"grad_norm": 0.3843707797396072,
"learning_rate": 9.125596184419714e-07,
"loss": 1.358,
"step": 1000
},
{
"epoch": 0.3665951359084406,
"grad_norm": 0.3847089855262342,
"learning_rate": 9.075914149443561e-07,
"loss": 1.3469,
"step": 1025
},
{
"epoch": 0.37553648068669526,
"grad_norm": 0.5315974805229721,
"learning_rate": 9.026232114467408e-07,
"loss": 1.3377,
"step": 1050
},
{
"epoch": 0.3844778254649499,
"grad_norm": 0.37212834114959115,
"learning_rate": 8.976550079491256e-07,
"loss": 1.326,
"step": 1075
},
{
"epoch": 0.39341917024320455,
"grad_norm": 0.3369886595418098,
"learning_rate": 8.926868044515103e-07,
"loss": 1.3141,
"step": 1100
},
{
"epoch": 0.40236051502145925,
"grad_norm": 0.3232053255299734,
"learning_rate": 8.877186009538951e-07,
"loss": 1.3176,
"step": 1125
},
{
"epoch": 0.4113018597997139,
"grad_norm": 0.5405167043382479,
"learning_rate": 8.827503974562798e-07,
"loss": 1.3125,
"step": 1150
},
{
"epoch": 0.42024320457796854,
"grad_norm": 0.28257166431534414,
"learning_rate": 8.777821939586645e-07,
"loss": 1.2911,
"step": 1175
},
{
"epoch": 0.4291845493562232,
"grad_norm": 0.29182342370487174,
"learning_rate": 8.728139904610492e-07,
"loss": 1.2935,
"step": 1200
},
{
"epoch": 0.43812589413447783,
"grad_norm": 0.22773442408616737,
"learning_rate": 8.678457869634341e-07,
"loss": 1.2888,
"step": 1225
},
{
"epoch": 0.4470672389127325,
"grad_norm": 0.22436263710764737,
"learning_rate": 8.628775834658187e-07,
"loss": 1.2745,
"step": 1250
},
{
"epoch": 0.4560085836909871,
"grad_norm": 0.21775793386766282,
"learning_rate": 8.579093799682035e-07,
"loss": 1.269,
"step": 1275
},
{
"epoch": 0.46494992846924177,
"grad_norm": 0.22214064111096374,
"learning_rate": 8.529411764705882e-07,
"loss": 1.267,
"step": 1300
},
{
"epoch": 0.4738912732474964,
"grad_norm": 0.20243266736371351,
"learning_rate": 8.47972972972973e-07,
"loss": 1.276,
"step": 1325
},
{
"epoch": 0.48283261802575106,
"grad_norm": 0.20705292614851298,
"learning_rate": 8.430047694753577e-07,
"loss": 1.272,
"step": 1350
},
{
"epoch": 0.4917739628040057,
"grad_norm": 0.19119742077379076,
"learning_rate": 8.380365659777425e-07,
"loss": 1.2724,
"step": 1375
},
{
"epoch": 0.5007153075822603,
"grad_norm": 0.23794007928706434,
"learning_rate": 8.330683624801271e-07,
"loss": 1.2583,
"step": 1400
},
{
"epoch": 0.509656652360515,
"grad_norm": 0.18340837203234114,
"learning_rate": 8.281001589825118e-07,
"loss": 1.2499,
"step": 1425
},
{
"epoch": 0.5185979971387696,
"grad_norm": 0.193128543688102,
"learning_rate": 8.231319554848967e-07,
"loss": 1.2612,
"step": 1450
},
{
"epoch": 0.5275393419170243,
"grad_norm": 0.1751575826741444,
"learning_rate": 8.181637519872813e-07,
"loss": 1.2502,
"step": 1475
},
{
"epoch": 0.5364806866952789,
"grad_norm": 0.2364503348095419,
"learning_rate": 8.131955484896661e-07,
"loss": 1.243,
"step": 1500
},
{
"epoch": 0.5454220314735336,
"grad_norm": 0.25652812638575234,
"learning_rate": 8.082273449920508e-07,
"loss": 1.2559,
"step": 1525
},
{
"epoch": 0.5543633762517882,
"grad_norm": 0.1737798761662478,
"learning_rate": 8.032591414944355e-07,
"loss": 1.2529,
"step": 1550
},
{
"epoch": 0.5633047210300429,
"grad_norm": 0.16909231999113233,
"learning_rate": 7.982909379968203e-07,
"loss": 1.2425,
"step": 1575
},
{
"epoch": 0.5722460658082976,
"grad_norm": 0.16621585163311706,
"learning_rate": 7.933227344992051e-07,
"loss": 1.2428,
"step": 1600
},
{
"epoch": 0.5811874105865522,
"grad_norm": 0.29941509287671747,
"learning_rate": 7.883545310015897e-07,
"loss": 1.2501,
"step": 1625
},
{
"epoch": 0.5901287553648069,
"grad_norm": 0.17035369815136836,
"learning_rate": 7.833863275039745e-07,
"loss": 1.2515,
"step": 1650
},
{
"epoch": 0.5990701001430615,
"grad_norm": 0.17629538126188918,
"learning_rate": 7.784181240063593e-07,
"loss": 1.2463,
"step": 1675
},
{
"epoch": 0.6080114449213162,
"grad_norm": 0.1697497467753232,
"learning_rate": 7.73449920508744e-07,
"loss": 1.2311,
"step": 1700
},
{
"epoch": 0.6169527896995708,
"grad_norm": 0.15981908691911512,
"learning_rate": 7.684817170111287e-07,
"loss": 1.2449,
"step": 1725
},
{
"epoch": 0.6258941344778255,
"grad_norm": 0.1529795356453295,
"learning_rate": 7.635135135135135e-07,
"loss": 1.2558,
"step": 1750
},
{
"epoch": 0.6348354792560801,
"grad_norm": 0.1609652679207339,
"learning_rate": 7.585453100158981e-07,
"loss": 1.238,
"step": 1775
},
{
"epoch": 0.6437768240343348,
"grad_norm": 0.14621942114492062,
"learning_rate": 7.53577106518283e-07,
"loss": 1.2281,
"step": 1800
},
{
"epoch": 0.6527181688125894,
"grad_norm": 0.1618220669804669,
"learning_rate": 7.486089030206677e-07,
"loss": 1.2362,
"step": 1825
},
{
"epoch": 0.6616595135908441,
"grad_norm": 0.16201225587717186,
"learning_rate": 7.436406995230524e-07,
"loss": 1.2261,
"step": 1850
},
{
"epoch": 0.6706008583690987,
"grad_norm": 0.16891334320378984,
"learning_rate": 7.386724960254371e-07,
"loss": 1.2233,
"step": 1875
},
{
"epoch": 0.6795422031473534,
"grad_norm": 0.15836253423551233,
"learning_rate": 7.33704292527822e-07,
"loss": 1.2041,
"step": 1900
},
{
"epoch": 0.6884835479256081,
"grad_norm": 0.16173299579695544,
"learning_rate": 7.287360890302066e-07,
"loss": 1.2112,
"step": 1925
},
{
"epoch": 0.6974248927038627,
"grad_norm": 0.15311569879619952,
"learning_rate": 7.237678855325914e-07,
"loss": 1.2045,
"step": 1950
},
{
"epoch": 0.7063662374821174,
"grad_norm": 0.1454174761720946,
"learning_rate": 7.187996820349761e-07,
"loss": 1.2142,
"step": 1975
},
{
"epoch": 0.7153075822603719,
"grad_norm": 0.14358617606783605,
"learning_rate": 7.138314785373608e-07,
"loss": 1.2254,
"step": 2000
},
{
"epoch": 0.7242489270386266,
"grad_norm": 0.15123594116890207,
"learning_rate": 7.088632750397456e-07,
"loss": 1.2043,
"step": 2025
},
{
"epoch": 0.7331902718168812,
"grad_norm": 0.1484178022383261,
"learning_rate": 7.038950715421304e-07,
"loss": 1.2102,
"step": 2050
},
{
"epoch": 0.7421316165951359,
"grad_norm": 0.15779752084285467,
"learning_rate": 6.98926868044515e-07,
"loss": 1.219,
"step": 2075
},
{
"epoch": 0.7510729613733905,
"grad_norm": 0.14639571745115018,
"learning_rate": 6.939586645468998e-07,
"loss": 1.2053,
"step": 2100
},
{
"epoch": 0.7600143061516452,
"grad_norm": 0.18349123256744296,
"learning_rate": 6.889904610492846e-07,
"loss": 1.2154,
"step": 2125
},
{
"epoch": 0.7689556509298998,
"grad_norm": 0.1564150328615537,
"learning_rate": 6.840222575516693e-07,
"loss": 1.2004,
"step": 2150
},
{
"epoch": 0.7778969957081545,
"grad_norm": 0.15591360624463577,
"learning_rate": 6.79054054054054e-07,
"loss": 1.2097,
"step": 2175
},
{
"epoch": 0.7868383404864091,
"grad_norm": 0.1525230509402887,
"learning_rate": 6.740858505564388e-07,
"loss": 1.2009,
"step": 2200
},
{
"epoch": 0.7957796852646638,
"grad_norm": 0.13632266793566936,
"learning_rate": 6.691176470588234e-07,
"loss": 1.2169,
"step": 2225
},
{
"epoch": 0.8047210300429185,
"grad_norm": 0.165619157435053,
"learning_rate": 6.641494435612083e-07,
"loss": 1.2032,
"step": 2250
},
{
"epoch": 0.8136623748211731,
"grad_norm": 0.1382749276518515,
"learning_rate": 6.59181240063593e-07,
"loss": 1.1998,
"step": 2275
},
{
"epoch": 0.8226037195994278,
"grad_norm": 0.15894676488709247,
"learning_rate": 6.542130365659777e-07,
"loss": 1.2046,
"step": 2300
},
{
"epoch": 0.8315450643776824,
"grad_norm": 0.14139510562282273,
"learning_rate": 6.492448330683624e-07,
"loss": 1.2022,
"step": 2325
},
{
"epoch": 0.8404864091559371,
"grad_norm": 0.16356442362082368,
"learning_rate": 6.442766295707473e-07,
"loss": 1.1888,
"step": 2350
},
{
"epoch": 0.8494277539341917,
"grad_norm": 0.13001938286669296,
"learning_rate": 6.393084260731319e-07,
"loss": 1.2036,
"step": 2375
},
{
"epoch": 0.8583690987124464,
"grad_norm": 0.1376171055331258,
"learning_rate": 6.343402225755167e-07,
"loss": 1.1967,
"step": 2400
},
{
"epoch": 0.867310443490701,
"grad_norm": 0.1555553003869712,
"learning_rate": 6.293720190779014e-07,
"loss": 1.202,
"step": 2425
},
{
"epoch": 0.8762517882689557,
"grad_norm": 0.1536545724212523,
"learning_rate": 6.24403815580286e-07,
"loss": 1.1993,
"step": 2450
},
{
"epoch": 0.8851931330472103,
"grad_norm": 0.15017510970577388,
"learning_rate": 6.194356120826709e-07,
"loss": 1.1878,
"step": 2475
},
{
"epoch": 0.894134477825465,
"grad_norm": 0.1501666434092056,
"learning_rate": 6.144674085850557e-07,
"loss": 1.1855,
"step": 2500
},
{
"epoch": 0.9030758226037195,
"grad_norm": 0.14083948154628234,
"learning_rate": 6.094992050874403e-07,
"loss": 1.1858,
"step": 2525
},
{
"epoch": 0.9120171673819742,
"grad_norm": 0.1480314047950012,
"learning_rate": 6.04531001589825e-07,
"loss": 1.1718,
"step": 2550
},
{
"epoch": 0.920958512160229,
"grad_norm": 0.14172604863631122,
"learning_rate": 5.995627980922098e-07,
"loss": 1.1819,
"step": 2575
},
{
"epoch": 0.9298998569384835,
"grad_norm": 0.12782051260136137,
"learning_rate": 5.945945945945947e-07,
"loss": 1.1956,
"step": 2600
},
{
"epoch": 0.9388412017167382,
"grad_norm": 0.1451085538055058,
"learning_rate": 5.896263910969793e-07,
"loss": 1.1744,
"step": 2625
},
{
"epoch": 0.9477825464949928,
"grad_norm": 0.40345434298159216,
"learning_rate": 5.84658187599364e-07,
"loss": 1.1783,
"step": 2650
},
{
"epoch": 0.9567238912732475,
"grad_norm": 0.13772381526739436,
"learning_rate": 5.796899841017488e-07,
"loss": 1.1839,
"step": 2675
},
{
"epoch": 0.9656652360515021,
"grad_norm": 0.15802899189862257,
"learning_rate": 5.747217806041335e-07,
"loss": 1.1792,
"step": 2700
},
{
"epoch": 0.9746065808297568,
"grad_norm": 0.1467263131203197,
"learning_rate": 5.697535771065183e-07,
"loss": 1.1881,
"step": 2725
},
{
"epoch": 0.9835479256080114,
"grad_norm": 0.18086918496034873,
"learning_rate": 5.64785373608903e-07,
"loss": 1.1847,
"step": 2750
},
{
"epoch": 0.9924892703862661,
"grad_norm": 0.13322108072315525,
"learning_rate": 5.598171701112877e-07,
"loss": 1.174,
"step": 2775
},
{
"epoch": 1.0014306151645207,
"grad_norm": 0.13758424586995027,
"learning_rate": 5.548489666136724e-07,
"loss": 1.1744,
"step": 2800
},
{
"epoch": 1.0103719599427754,
"grad_norm": 0.14695511905283026,
"learning_rate": 5.498807631160573e-07,
"loss": 1.1753,
"step": 2825
},
{
"epoch": 1.01931330472103,
"grad_norm": 0.13583673106446567,
"learning_rate": 5.449125596184419e-07,
"loss": 1.177,
"step": 2850
},
{
"epoch": 1.0282546494992848,
"grad_norm": 0.14863281353489802,
"learning_rate": 5.399443561208267e-07,
"loss": 1.1698,
"step": 2875
},
{
"epoch": 1.0371959942775393,
"grad_norm": 0.1719230111879567,
"learning_rate": 5.349761526232114e-07,
"loss": 1.17,
"step": 2900
},
{
"epoch": 1.046137339055794,
"grad_norm": 0.14198220020823107,
"learning_rate": 5.300079491255962e-07,
"loss": 1.1599,
"step": 2925
},
{
"epoch": 1.0550786838340487,
"grad_norm": 0.14656880263067942,
"learning_rate": 5.250397456279809e-07,
"loss": 1.1613,
"step": 2950
},
{
"epoch": 1.0640200286123034,
"grad_norm": 0.17984614771854038,
"learning_rate": 5.200715421303657e-07,
"loss": 1.1614,
"step": 2975
},
{
"epoch": 1.0729613733905579,
"grad_norm": 0.16125844481926355,
"learning_rate": 5.151033386327503e-07,
"loss": 1.1456,
"step": 3000
},
{
"epoch": 1.0819027181688126,
"grad_norm": 0.15302132985828396,
"learning_rate": 5.101351351351351e-07,
"loss": 1.1582,
"step": 3025
},
{
"epoch": 1.0908440629470673,
"grad_norm": 0.17416017697667222,
"learning_rate": 5.051669316375199e-07,
"loss": 1.1645,
"step": 3050
},
{
"epoch": 1.099785407725322,
"grad_norm": 0.1576441191719269,
"learning_rate": 5.001987281399046e-07,
"loss": 1.1614,
"step": 3075
},
{
"epoch": 1.1087267525035764,
"grad_norm": 0.15708931920894004,
"learning_rate": 4.952305246422893e-07,
"loss": 1.1589,
"step": 3100
},
{
"epoch": 1.1176680972818311,
"grad_norm": 0.18088098015223084,
"learning_rate": 4.902623211446741e-07,
"loss": 1.1516,
"step": 3125
},
{
"epoch": 1.1266094420600858,
"grad_norm": 0.15266497982250174,
"learning_rate": 4.852941176470588e-07,
"loss": 1.1432,
"step": 3150
},
{
"epoch": 1.1355507868383405,
"grad_norm": 0.16769944079369944,
"learning_rate": 4.803259141494435e-07,
"loss": 1.151,
"step": 3175
},
{
"epoch": 1.144492131616595,
"grad_norm": 0.183750642878192,
"learning_rate": 4.7535771065182827e-07,
"loss": 1.1629,
"step": 3200
},
{
"epoch": 1.1534334763948497,
"grad_norm": 0.15008294024377883,
"learning_rate": 4.70389507154213e-07,
"loss": 1.1497,
"step": 3225
},
{
"epoch": 1.1623748211731044,
"grad_norm": 0.1792182775394022,
"learning_rate": 4.6542130365659777e-07,
"loss": 1.1442,
"step": 3250
},
{
"epoch": 1.1713161659513591,
"grad_norm": 0.1815534124009153,
"learning_rate": 4.6045310015898247e-07,
"loss": 1.1424,
"step": 3275
},
{
"epoch": 1.1802575107296138,
"grad_norm": 0.1634530123009294,
"learning_rate": 4.5548489666136727e-07,
"loss": 1.1445,
"step": 3300
},
{
"epoch": 1.1891988555078683,
"grad_norm": 0.15595048975318404,
"learning_rate": 4.5051669316375196e-07,
"loss": 1.1471,
"step": 3325
},
{
"epoch": 1.198140200286123,
"grad_norm": 0.15375493348738128,
"learning_rate": 4.455484896661367e-07,
"loss": 1.1559,
"step": 3350
},
{
"epoch": 1.2070815450643777,
"grad_norm": 0.15595330109052555,
"learning_rate": 4.4058028616852146e-07,
"loss": 1.1454,
"step": 3375
},
{
"epoch": 1.2160228898426324,
"grad_norm": 0.1531828453320636,
"learning_rate": 4.3561208267090616e-07,
"loss": 1.1447,
"step": 3400
},
{
"epoch": 1.224964234620887,
"grad_norm": 0.13496531066929612,
"learning_rate": 4.306438791732909e-07,
"loss": 1.1228,
"step": 3425
},
{
"epoch": 1.2339055793991416,
"grad_norm": 0.16876704458729294,
"learning_rate": 4.2567567567567566e-07,
"loss": 1.1267,
"step": 3450
},
{
"epoch": 1.2428469241773963,
"grad_norm": 0.35531759007502683,
"learning_rate": 4.207074721780604e-07,
"loss": 1.1282,
"step": 3475
},
{
"epoch": 1.251788268955651,
"grad_norm": 0.14942448217724147,
"learning_rate": 4.157392686804451e-07,
"loss": 1.1321,
"step": 3500
},
{
"epoch": 1.2607296137339055,
"grad_norm": 0.16823840198797532,
"learning_rate": 4.107710651828299e-07,
"loss": 1.1297,
"step": 3525
},
{
"epoch": 1.2696709585121602,
"grad_norm": 0.1373099361223873,
"learning_rate": 4.058028616852146e-07,
"loss": 1.1289,
"step": 3550
},
{
"epoch": 1.2786123032904149,
"grad_norm": 0.1238133749616558,
"learning_rate": 4.0083465818759935e-07,
"loss": 1.1197,
"step": 3575
},
{
"epoch": 1.2875536480686696,
"grad_norm": 0.16232957883088006,
"learning_rate": 3.958664546899841e-07,
"loss": 1.1517,
"step": 3600
},
{
"epoch": 1.2964949928469243,
"grad_norm": 0.15052195486822473,
"learning_rate": 3.908982511923688e-07,
"loss": 1.1115,
"step": 3625
},
{
"epoch": 1.3054363376251787,
"grad_norm": 0.1464299159190494,
"learning_rate": 3.8593004769475355e-07,
"loss": 1.121,
"step": 3650
},
{
"epoch": 1.3143776824034334,
"grad_norm": 0.16556774569358795,
"learning_rate": 3.809618441971383e-07,
"loss": 1.1248,
"step": 3675
},
{
"epoch": 1.3233190271816881,
"grad_norm": 0.14677494246881193,
"learning_rate": 3.7599364069952305e-07,
"loss": 1.1301,
"step": 3700
},
{
"epoch": 1.3322603719599428,
"grad_norm": 0.1385988595636303,
"learning_rate": 3.7102543720190775e-07,
"loss": 1.1217,
"step": 3725
},
{
"epoch": 1.3412017167381975,
"grad_norm": 0.132770703438204,
"learning_rate": 3.6605723370429255e-07,
"loss": 1.128,
"step": 3750
},
{
"epoch": 1.350143061516452,
"grad_norm": 0.14471226652721184,
"learning_rate": 3.6108903020667724e-07,
"loss": 1.1258,
"step": 3775
},
{
"epoch": 1.3590844062947067,
"grad_norm": 0.15829087639433673,
"learning_rate": 3.5612082670906194e-07,
"loss": 1.1232,
"step": 3800
},
{
"epoch": 1.3680257510729614,
"grad_norm": 0.14881510104209672,
"learning_rate": 3.5115262321144674e-07,
"loss": 1.1326,
"step": 3825
},
{
"epoch": 1.376967095851216,
"grad_norm": 0.14588466142604242,
"learning_rate": 3.4618441971383144e-07,
"loss": 1.1164,
"step": 3850
},
{
"epoch": 1.3859084406294706,
"grad_norm": 0.13477416021173907,
"learning_rate": 3.412162162162162e-07,
"loss": 1.1285,
"step": 3875
},
{
"epoch": 1.3948497854077253,
"grad_norm": 0.21722908586608208,
"learning_rate": 3.3624801271860094e-07,
"loss": 1.1201,
"step": 3900
},
{
"epoch": 1.40379113018598,
"grad_norm": 0.13558184653823538,
"learning_rate": 3.312798092209857e-07,
"loss": 1.1156,
"step": 3925
},
{
"epoch": 1.4127324749642347,
"grad_norm": 0.14185323455679683,
"learning_rate": 3.263116057233704e-07,
"loss": 1.1177,
"step": 3950
},
{
"epoch": 1.4216738197424892,
"grad_norm": 0.14569471186135233,
"learning_rate": 3.213434022257552e-07,
"loss": 1.117,
"step": 3975
},
{
"epoch": 1.4306151645207439,
"grad_norm": 0.1303851673327878,
"learning_rate": 3.163751987281399e-07,
"loss": 1.1194,
"step": 4000
},
{
"epoch": 1.4395565092989986,
"grad_norm": 0.13823108648889798,
"learning_rate": 3.114069952305246e-07,
"loss": 1.1153,
"step": 4025
},
{
"epoch": 1.4484978540772533,
"grad_norm": 0.14544212946759183,
"learning_rate": 3.064387917329094e-07,
"loss": 1.1043,
"step": 4050
},
{
"epoch": 1.457439198855508,
"grad_norm": 0.1290933120501116,
"learning_rate": 3.014705882352941e-07,
"loss": 1.1213,
"step": 4075
},
{
"epoch": 1.4663805436337625,
"grad_norm": 0.12809510682493896,
"learning_rate": 2.965023847376789e-07,
"loss": 1.113,
"step": 4100
},
{
"epoch": 1.4753218884120172,
"grad_norm": 0.1419793306501672,
"learning_rate": 2.915341812400636e-07,
"loss": 1.1309,
"step": 4125
},
{
"epoch": 1.4842632331902719,
"grad_norm": 0.13406734402981146,
"learning_rate": 2.8656597774244833e-07,
"loss": 1.1256,
"step": 4150
},
{
"epoch": 1.4932045779685263,
"grad_norm": 0.13587793689155384,
"learning_rate": 2.815977742448331e-07,
"loss": 1.1187,
"step": 4175
},
{
"epoch": 1.5021459227467813,
"grad_norm": 0.13258607901887373,
"learning_rate": 2.766295707472178e-07,
"loss": 1.1228,
"step": 4200
},
{
"epoch": 1.5110872675250357,
"grad_norm": 0.1247874755702546,
"learning_rate": 2.716613672496025e-07,
"loss": 1.1164,
"step": 4225
},
{
"epoch": 1.5200286123032904,
"grad_norm": 0.12239855788322539,
"learning_rate": 2.666931637519873e-07,
"loss": 1.1117,
"step": 4250
},
{
"epoch": 1.5289699570815452,
"grad_norm": 0.1369183999332412,
"learning_rate": 2.61724960254372e-07,
"loss": 1.1172,
"step": 4275
},
{
"epoch": 1.5379113018597996,
"grad_norm": 1.1332621790402788,
"learning_rate": 2.567567567567567e-07,
"loss": 1.0993,
"step": 4300
},
{
"epoch": 1.5468526466380543,
"grad_norm": 0.1253338454363431,
"learning_rate": 2.517885532591415e-07,
"loss": 1.1165,
"step": 4325
},
{
"epoch": 1.555793991416309,
"grad_norm": 0.12665869326621307,
"learning_rate": 2.468203497615262e-07,
"loss": 1.1075,
"step": 4350
},
{
"epoch": 1.5647353361945635,
"grad_norm": 0.12307463927377256,
"learning_rate": 2.4185214626391097e-07,
"loss": 1.1091,
"step": 4375
},
{
"epoch": 1.5736766809728184,
"grad_norm": 0.13556347702963126,
"learning_rate": 2.368839427662957e-07,
"loss": 1.1106,
"step": 4400
},
{
"epoch": 1.582618025751073,
"grad_norm": 0.1437851683344807,
"learning_rate": 2.3191573926868044e-07,
"loss": 1.1258,
"step": 4425
},
{
"epoch": 1.5915593705293276,
"grad_norm": 0.1483471734540286,
"learning_rate": 2.269475357710652e-07,
"loss": 1.1185,
"step": 4450
},
{
"epoch": 1.6005007153075823,
"grad_norm": 0.1417713425850489,
"learning_rate": 2.2197933227344991e-07,
"loss": 1.1091,
"step": 4475
},
{
"epoch": 1.6094420600858368,
"grad_norm": 0.11258187644158707,
"learning_rate": 2.1701112877583466e-07,
"loss": 1.1141,
"step": 4500
},
{
"epoch": 1.6183834048640917,
"grad_norm": 1.4844013392798963,
"learning_rate": 2.120429252782194e-07,
"loss": 1.1083,
"step": 4525
},
{
"epoch": 1.6273247496423462,
"grad_norm": 0.14485866801127223,
"learning_rate": 2.070747217806041e-07,
"loss": 1.1128,
"step": 4550
},
{
"epoch": 1.636266094420601,
"grad_norm": 0.1310091219191068,
"learning_rate": 2.0210651828298886e-07,
"loss": 1.1081,
"step": 4575
},
{
"epoch": 1.6452074391988556,
"grad_norm": 0.1324864798139755,
"learning_rate": 1.971383147853736e-07,
"loss": 1.1218,
"step": 4600
},
{
"epoch": 1.65414878397711,
"grad_norm": 0.16408287715823638,
"learning_rate": 1.9217011128775833e-07,
"loss": 1.1128,
"step": 4625
},
{
"epoch": 1.6630901287553648,
"grad_norm": 0.13453997589781927,
"learning_rate": 1.8720190779014308e-07,
"loss": 1.109,
"step": 4650
},
{
"epoch": 1.6720314735336195,
"grad_norm": 0.11431592419320451,
"learning_rate": 1.8223370429252783e-07,
"loss": 1.1128,
"step": 4675
},
{
"epoch": 1.680972818311874,
"grad_norm": 0.12937079012352717,
"learning_rate": 1.7726550079491255e-07,
"loss": 1.107,
"step": 4700
},
{
"epoch": 1.6899141630901289,
"grad_norm": 0.13493617023722984,
"learning_rate": 1.7229729729729728e-07,
"loss": 1.1026,
"step": 4725
},
{
"epoch": 1.6988555078683834,
"grad_norm": 0.12427164940365912,
"learning_rate": 1.6732909379968203e-07,
"loss": 1.1002,
"step": 4750
},
{
"epoch": 1.707796852646638,
"grad_norm": 0.12582353400466703,
"learning_rate": 1.6236089030206675e-07,
"loss": 1.1091,
"step": 4775
},
{
"epoch": 1.7167381974248928,
"grad_norm": 0.1209250119003803,
"learning_rate": 1.573926868044515e-07,
"loss": 1.0988,
"step": 4800
},
{
"epoch": 1.7256795422031472,
"grad_norm": 0.1416217314930489,
"learning_rate": 1.5242448330683625e-07,
"loss": 1.1083,
"step": 4825
},
{
"epoch": 1.7346208869814022,
"grad_norm": 0.13057929166120621,
"learning_rate": 1.4745627980922097e-07,
"loss": 1.1164,
"step": 4850
},
{
"epoch": 1.7435622317596566,
"grad_norm": 0.13275668946657931,
"learning_rate": 1.4248807631160572e-07,
"loss": 1.1164,
"step": 4875
},
{
"epoch": 1.7525035765379113,
"grad_norm": 0.19255405164097672,
"learning_rate": 1.3751987281399047e-07,
"loss": 1.0945,
"step": 4900
},
{
"epoch": 1.761444921316166,
"grad_norm": 0.13958161437492914,
"learning_rate": 1.3255166931637517e-07,
"loss": 1.1094,
"step": 4925
},
{
"epoch": 1.7703862660944205,
"grad_norm": 0.1379908503816713,
"learning_rate": 1.2758346581875992e-07,
"loss": 1.1027,
"step": 4950
},
{
"epoch": 1.7793276108726752,
"grad_norm": 0.1146403989881727,
"learning_rate": 1.2261526232114467e-07,
"loss": 1.0966,
"step": 4975
},
{
"epoch": 1.78826895565093,
"grad_norm": 0.1296359557728573,
"learning_rate": 1.176470588235294e-07,
"loss": 1.1163,
"step": 5000
},
{
"epoch": 1.7972103004291844,
"grad_norm": 0.1242709050610396,
"learning_rate": 1.1267885532591414e-07,
"loss": 1.109,
"step": 5025
},
{
"epoch": 1.8061516452074393,
"grad_norm": 0.12401932092728496,
"learning_rate": 1.0771065182829889e-07,
"loss": 1.0986,
"step": 5050
},
{
"epoch": 1.8150929899856938,
"grad_norm": 0.1268226445976438,
"learning_rate": 1.0274244833068361e-07,
"loss": 1.1023,
"step": 5075
},
{
"epoch": 1.8240343347639485,
"grad_norm": 0.12184905639120043,
"learning_rate": 9.777424483306836e-08,
"loss": 1.0985,
"step": 5100
},
{
"epoch": 1.8329756795422032,
"grad_norm": 0.12734274404323123,
"learning_rate": 9.28060413354531e-08,
"loss": 1.1154,
"step": 5125
},
{
"epoch": 1.8419170243204577,
"grad_norm": 0.13235924797466722,
"learning_rate": 8.783783783783784e-08,
"loss": 1.098,
"step": 5150
},
{
"epoch": 1.8508583690987126,
"grad_norm": 0.11628761020569767,
"learning_rate": 8.286963434022257e-08,
"loss": 1.106,
"step": 5175
},
{
"epoch": 1.859799713876967,
"grad_norm": 0.19214819884604664,
"learning_rate": 7.790143084260731e-08,
"loss": 1.0889,
"step": 5200
},
{
"epoch": 1.8687410586552218,
"grad_norm": 0.487598636016336,
"learning_rate": 7.293322734499204e-08,
"loss": 1.1045,
"step": 5225
},
{
"epoch": 1.8776824034334765,
"grad_norm": 0.13127933734364353,
"learning_rate": 6.79650238473768e-08,
"loss": 1.1017,
"step": 5250
},
{
"epoch": 1.886623748211731,
"grad_norm": 0.1314482800625486,
"learning_rate": 6.299682034976152e-08,
"loss": 1.1044,
"step": 5275
},
{
"epoch": 1.8955650929899857,
"grad_norm": 0.12676434160993588,
"learning_rate": 5.802861685214626e-08,
"loss": 1.1043,
"step": 5300
},
{
"epoch": 1.9045064377682404,
"grad_norm": 0.1323305276547178,
"learning_rate": 5.3060413354531e-08,
"loss": 1.1026,
"step": 5325
},
{
"epoch": 1.9134477825464948,
"grad_norm": 0.13391474060102793,
"learning_rate": 4.809220985691573e-08,
"loss": 1.0969,
"step": 5350
},
{
"epoch": 1.9223891273247498,
"grad_norm": 0.12245753375271169,
"learning_rate": 4.3124006359300475e-08,
"loss": 1.1081,
"step": 5375
},
{
"epoch": 1.9313304721030042,
"grad_norm": 0.11951679306957765,
"learning_rate": 3.815580286168521e-08,
"loss": 1.1092,
"step": 5400
},
{
"epoch": 1.940271816881259,
"grad_norm": 0.11961090020470673,
"learning_rate": 3.3187599364069955e-08,
"loss": 1.1007,
"step": 5425
},
{
"epoch": 1.9492131616595136,
"grad_norm": 0.12422327512042645,
"learning_rate": 2.8219395866454688e-08,
"loss": 1.098,
"step": 5450
},
{
"epoch": 1.9581545064377681,
"grad_norm": 0.125577214373895,
"learning_rate": 2.3251192368839427e-08,
"loss": 1.1009,
"step": 5475
},
{
"epoch": 1.967095851216023,
"grad_norm": 0.12978760398009362,
"learning_rate": 1.8282988871224164e-08,
"loss": 1.0965,
"step": 5500
},
{
"epoch": 1.9760371959942775,
"grad_norm": 0.15452388925779964,
"learning_rate": 1.3314785373608903e-08,
"loss": 1.1058,
"step": 5525
},
{
"epoch": 1.9849785407725322,
"grad_norm": 0.12170241302022762,
"learning_rate": 8.346581875993641e-09,
"loss": 1.1066,
"step": 5550
},
{
"epoch": 1.993919885550787,
"grad_norm": 0.13568263553279267,
"learning_rate": 3.3783783783783785e-09,
"loss": 1.0966,
"step": 5575
}
],
"logging_steps": 25,
"max_steps": 5592,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 1.1563888596221952e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}