ss-llama3.1_8B_v1 / trainer_state.json
Upload trainer_state.json with huggingface_hub
commit 2577b4e (verified)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2282,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008764241893076249,
"grad_norm": 3.7227299213409424,
"learning_rate": 9.999654593547529e-05,
"loss": 0.8928,
"step": 10
},
{
"epoch": 0.017528483786152498,
"grad_norm": 2.602724552154541,
"learning_rate": 9.998460664009619e-05,
"loss": 0.7339,
"step": 20
},
{
"epoch": 0.026292725679228746,
"grad_norm": 2.0315356254577637,
"learning_rate": 9.996414173332143e-05,
"loss": 0.8093,
"step": 30
},
{
"epoch": 0.035056967572304996,
"grad_norm": 2.265153408050537,
"learning_rate": 9.993515509371871e-05,
"loss": 0.7516,
"step": 40
},
{
"epoch": 0.04382120946538125,
"grad_norm": 1.9205143451690674,
"learning_rate": 9.989765221491895e-05,
"loss": 0.7691,
"step": 50
},
{
"epoch": 0.05258545135845749,
"grad_norm": 2.219170331954956,
"learning_rate": 9.985164020457504e-05,
"loss": 0.7539,
"step": 60
},
{
"epoch": 0.06134969325153374,
"grad_norm": 2.3345134258270264,
"learning_rate": 9.979712778301489e-05,
"loss": 0.6932,
"step": 70
},
{
"epoch": 0.07011393514460999,
"grad_norm": 1.9078369140625,
"learning_rate": 9.973412528158862e-05,
"loss": 0.7058,
"step": 80
},
{
"epoch": 0.07887817703768624,
"grad_norm": 2.3402259349823,
"learning_rate": 9.966264464071064e-05,
"loss": 0.7181,
"step": 90
},
{
"epoch": 0.0876424189307625,
"grad_norm": 1.6266059875488281,
"learning_rate": 9.958269940759659e-05,
"loss": 0.6715,
"step": 100
},
{
"epoch": 0.09640666082383874,
"grad_norm": 1.8910496234893799,
"learning_rate": 9.94943047336958e-05,
"loss": 0.6983,
"step": 110
},
{
"epoch": 0.10517090271691498,
"grad_norm": 1.970109224319458,
"learning_rate": 9.939747737181993e-05,
"loss": 0.7639,
"step": 120
},
{
"epoch": 0.11393514460999124,
"grad_norm": 1.9310839176177979,
"learning_rate": 9.929223567296766e-05,
"loss": 0.6289,
"step": 130
},
{
"epoch": 0.12269938650306748,
"grad_norm": 1.5292866230010986,
"learning_rate": 9.917859958284699e-05,
"loss": 0.6575,
"step": 140
},
{
"epoch": 0.13146362839614373,
"grad_norm": 1.3497545719146729,
"learning_rate": 9.905659063809492e-05,
"loss": 0.6986,
"step": 150
},
{
"epoch": 0.14022787028921999,
"grad_norm": 2.1464974880218506,
"learning_rate": 9.892623196219586e-05,
"loss": 0.7055,
"step": 160
},
{
"epoch": 0.14899211218229624,
"grad_norm": 2.5107662677764893,
"learning_rate": 9.878754826109915e-05,
"loss": 0.7421,
"step": 170
},
{
"epoch": 0.15775635407537247,
"grad_norm": 1.9968948364257812,
"learning_rate": 9.864056581853674e-05,
"loss": 0.6956,
"step": 180
},
{
"epoch": 0.16652059596844873,
"grad_norm": 1.5380990505218506,
"learning_rate": 9.84853124910418e-05,
"loss": 0.7439,
"step": 190
},
{
"epoch": 0.175284837861525,
"grad_norm": 1.8293559551239014,
"learning_rate": 9.832181770266927e-05,
"loss": 0.6924,
"step": 200
},
{
"epoch": 0.18404907975460122,
"grad_norm": 1.5421850681304932,
"learning_rate": 9.815011243941939e-05,
"loss": 0.6443,
"step": 210
},
{
"epoch": 0.19281332164767748,
"grad_norm": 1.7461329698562622,
"learning_rate": 9.797022924336504e-05,
"loss": 0.7049,
"step": 220
},
{
"epoch": 0.20157756354075373,
"grad_norm": 1.9567017555236816,
"learning_rate": 9.778220220648439e-05,
"loss": 0.6797,
"step": 230
},
{
"epoch": 0.21034180543382996,
"grad_norm": 2.222736358642578,
"learning_rate": 9.75860669641996e-05,
"loss": 0.6958,
"step": 240
},
{
"epoch": 0.21910604732690622,
"grad_norm": 1.6702378988265991,
"learning_rate": 9.738186068862311e-05,
"loss": 0.6895,
"step": 250
},
{
"epoch": 0.22787028921998248,
"grad_norm": 1.9190706014633179,
"learning_rate": 9.716962208151269e-05,
"loss": 0.6747,
"step": 260
},
{
"epoch": 0.2366345311130587,
"grad_norm": 2.277719020843506,
"learning_rate": 9.69493913669366e-05,
"loss": 0.704,
"step": 270
},
{
"epoch": 0.24539877300613497,
"grad_norm": 1.6245454549789429,
"learning_rate": 9.672121028365014e-05,
"loss": 0.6023,
"step": 280
},
{
"epoch": 0.2541630148992112,
"grad_norm": 1.7889143228530884,
"learning_rate": 9.648512207718532e-05,
"loss": 0.7218,
"step": 290
},
{
"epoch": 0.26292725679228746,
"grad_norm": 2.422360897064209,
"learning_rate": 9.624117149165466e-05,
"loss": 0.6986,
"step": 300
},
{
"epoch": 0.27169149868536374,
"grad_norm": 2.14021372795105,
"learning_rate": 9.598940476127131e-05,
"loss": 0.6476,
"step": 310
},
{
"epoch": 0.28045574057843997,
"grad_norm": 1.9130802154541016,
"learning_rate": 9.57298696015866e-05,
"loss": 0.7056,
"step": 320
},
{
"epoch": 0.2892199824715162,
"grad_norm": 2.4466943740844727,
"learning_rate": 9.546261520044675e-05,
"loss": 0.6542,
"step": 330
},
{
"epoch": 0.2979842243645925,
"grad_norm": 1.6736353635787964,
"learning_rate": 9.518769220867076e-05,
"loss": 0.6648,
"step": 340
},
{
"epoch": 0.3067484662576687,
"grad_norm": 1.745496153831482,
"learning_rate": 9.490515273045085e-05,
"loss": 0.6411,
"step": 350
},
{
"epoch": 0.31551270815074495,
"grad_norm": 2.82519268989563,
"learning_rate": 9.461505031347753e-05,
"loss": 0.6597,
"step": 360
},
{
"epoch": 0.32427695004382123,
"grad_norm": 2.2791597843170166,
"learning_rate": 9.431743993879119e-05,
"loss": 0.6931,
"step": 370
},
{
"epoch": 0.33304119193689746,
"grad_norm": 1.6820751428604126,
"learning_rate": 9.401237801036176e-05,
"loss": 0.6529,
"step": 380
},
{
"epoch": 0.3418054338299737,
"grad_norm": 1.7358545064926147,
"learning_rate": 9.369992234439899e-05,
"loss": 0.7001,
"step": 390
},
{
"epoch": 0.35056967572305,
"grad_norm": 1.7271596193313599,
"learning_rate": 9.338013215839495e-05,
"loss": 0.6801,
"step": 400
},
{
"epoch": 0.3593339176161262,
"grad_norm": 1.7952642440795898,
"learning_rate": 9.305306805990093e-05,
"loss": 0.7023,
"step": 410
},
{
"epoch": 0.36809815950920244,
"grad_norm": 2.275848865509033,
"learning_rate": 9.271879203504094e-05,
"loss": 0.6641,
"step": 420
},
{
"epoch": 0.3768624014022787,
"grad_norm": 1.435059666633606,
"learning_rate": 9.237736743676386e-05,
"loss": 0.6739,
"step": 430
},
{
"epoch": 0.38562664329535495,
"grad_norm": 1.6760456562042236,
"learning_rate": 9.202885897283674e-05,
"loss": 0.6245,
"step": 440
},
{
"epoch": 0.3943908851884312,
"grad_norm": 1.5440309047698975,
"learning_rate": 9.167333269358109e-05,
"loss": 0.6436,
"step": 450
},
{
"epoch": 0.40315512708150747,
"grad_norm": 1.798532485961914,
"learning_rate": 9.131085597935487e-05,
"loss": 0.6097,
"step": 460
},
{
"epoch": 0.4119193689745837,
"grad_norm": 1.5643911361694336,
"learning_rate": 9.094149752778233e-05,
"loss": 0.7032,
"step": 470
},
{
"epoch": 0.42068361086765993,
"grad_norm": 1.9498308897018433,
"learning_rate": 9.056532734073434e-05,
"loss": 0.6429,
"step": 480
},
{
"epoch": 0.4294478527607362,
"grad_norm": 2.7362070083618164,
"learning_rate": 9.018241671106134e-05,
"loss": 0.6518,
"step": 490
},
{
"epoch": 0.43821209465381245,
"grad_norm": 1.755315899848938,
"learning_rate": 8.979283820908174e-05,
"loss": 0.6145,
"step": 500
},
{
"epoch": 0.4469763365468887,
"grad_norm": 1.6948930025100708,
"learning_rate": 8.939666566882821e-05,
"loss": 0.6809,
"step": 510
},
{
"epoch": 0.45574057843996496,
"grad_norm": 1.9926345348358154,
"learning_rate": 8.899397417405442e-05,
"loss": 0.6811,
"step": 520
},
{
"epoch": 0.4645048203330412,
"grad_norm": 2.0141072273254395,
"learning_rate": 8.858484004400496e-05,
"loss": 0.6326,
"step": 530
},
{
"epoch": 0.4732690622261174,
"grad_norm": 1.5361511707305908,
"learning_rate": 8.816934081895105e-05,
"loss": 0.6316,
"step": 540
},
{
"epoch": 0.4820333041191937,
"grad_norm": 2.0940563678741455,
"learning_rate": 8.774755524549503e-05,
"loss": 0.7468,
"step": 550
},
{
"epoch": 0.49079754601226994,
"grad_norm": 2.6678926944732666,
"learning_rate": 8.731956326164591e-05,
"loss": 0.6853,
"step": 560
},
{
"epoch": 0.49956178790534617,
"grad_norm": 2.2798871994018555,
"learning_rate": 8.688544598166935e-05,
"loss": 0.6488,
"step": 570
},
{
"epoch": 0.5083260297984225,
"grad_norm": 2.1741902828216553,
"learning_rate": 8.644528568071472e-05,
"loss": 0.7044,
"step": 580
},
{
"epoch": 0.5170902716914987,
"grad_norm": 1.9935109615325928,
"learning_rate": 8.599916577922198e-05,
"loss": 0.6689,
"step": 590
},
{
"epoch": 0.5258545135845749,
"grad_norm": 2.2509982585906982,
"learning_rate": 8.554717082711164e-05,
"loss": 0.6697,
"step": 600
},
{
"epoch": 0.5346187554776511,
"grad_norm": 1.5349043607711792,
"learning_rate": 8.508938648776062e-05,
"loss": 0.6741,
"step": 610
},
{
"epoch": 0.5433829973707275,
"grad_norm": 1.8080490827560425,
"learning_rate": 8.462589952176709e-05,
"loss": 0.6448,
"step": 620
},
{
"epoch": 0.5521472392638037,
"grad_norm": 1.4415456056594849,
"learning_rate": 8.415679777050735e-05,
"loss": 0.6585,
"step": 630
},
{
"epoch": 0.5609114811568799,
"grad_norm": 1.9184411764144897,
"learning_rate": 8.368217013948786e-05,
"loss": 0.6934,
"step": 640
},
{
"epoch": 0.5696757230499562,
"grad_norm": 2.0871620178222656,
"learning_rate": 8.320210658149562e-05,
"loss": 0.642,
"step": 650
},
{
"epoch": 0.5784399649430324,
"grad_norm": 1.835964322090149,
"learning_rate": 8.271669807955007e-05,
"loss": 0.7067,
"step": 660
},
{
"epoch": 0.5872042068361086,
"grad_norm": 2.068668842315674,
"learning_rate": 8.222603662965974e-05,
"loss": 0.6759,
"step": 670
},
{
"epoch": 0.595968448729185,
"grad_norm": 1.837086796760559,
"learning_rate": 8.173021522338687e-05,
"loss": 0.6301,
"step": 680
},
{
"epoch": 0.6047326906222612,
"grad_norm": 1.995373010635376,
"learning_rate": 8.122932783022342e-05,
"loss": 0.688,
"step": 690
},
{
"epoch": 0.6134969325153374,
"grad_norm": 1.736932635307312,
"learning_rate": 8.072346937978168e-05,
"loss": 0.6538,
"step": 700
},
{
"epoch": 0.6222611744084137,
"grad_norm": 2.074141025543213,
"learning_rate": 8.02127357438029e-05,
"loss": 0.7275,
"step": 710
},
{
"epoch": 0.6310254163014899,
"grad_norm": 2.758039951324463,
"learning_rate": 7.969722371798753e-05,
"loss": 0.7139,
"step": 720
},
{
"epoch": 0.6397896581945661,
"grad_norm": 1.8415740728378296,
"learning_rate": 7.917703100365005e-05,
"loss": 0.5433,
"step": 730
},
{
"epoch": 0.6485539000876425,
"grad_norm": 2.0133821964263916,
"learning_rate": 7.865225618920248e-05,
"loss": 0.6369,
"step": 740
},
{
"epoch": 0.6573181419807187,
"grad_norm": 1.943287968635559,
"learning_rate": 7.812299873146955e-05,
"loss": 0.6168,
"step": 750
},
{
"epoch": 0.6660823838737949,
"grad_norm": 1.7971194982528687,
"learning_rate": 7.758935893683939e-05,
"loss": 0.6518,
"step": 760
},
{
"epoch": 0.6748466257668712,
"grad_norm": 1.5026414394378662,
"learning_rate": 7.705143794225315e-05,
"loss": 0.6498,
"step": 770
},
{
"epoch": 0.6836108676599474,
"grad_norm": 1.8013416528701782,
"learning_rate": 7.65093376960372e-05,
"loss": 0.6598,
"step": 780
},
{
"epoch": 0.6923751095530236,
"grad_norm": 1.8364579677581787,
"learning_rate": 7.596316093858172e-05,
"loss": 0.6934,
"step": 790
},
{
"epoch": 0.7011393514461,
"grad_norm": 1.7535738945007324,
"learning_rate": 7.541301118286894e-05,
"loss": 0.5668,
"step": 800
},
{
"epoch": 0.7099035933391762,
"grad_norm": 1.9401272535324097,
"learning_rate": 7.485899269485506e-05,
"loss": 0.6852,
"step": 810
},
{
"epoch": 0.7186678352322524,
"grad_norm": 1.886604905128479,
"learning_rate": 7.430121047370955e-05,
"loss": 0.6845,
"step": 820
},
{
"epoch": 0.7274320771253286,
"grad_norm": 1.9620131254196167,
"learning_rate": 7.37397702319153e-05,
"loss": 0.6344,
"step": 830
},
{
"epoch": 0.7361963190184049,
"grad_norm": 1.949866771697998,
"learning_rate": 7.32314343370074e-05,
"loss": 0.6247,
"step": 840
},
{
"epoch": 0.7449605609114811,
"grad_norm": 1.5609701871871948,
"learning_rate": 7.266333756059938e-05,
"loss": 0.604,
"step": 850
},
{
"epoch": 0.7537248028045574,
"grad_norm": 2.5070557594299316,
"learning_rate": 7.209189317790467e-05,
"loss": 0.6262,
"step": 860
},
{
"epoch": 0.7624890446976337,
"grad_norm": 1.672145128250122,
"learning_rate": 7.151720949069814e-05,
"loss": 0.6188,
"step": 870
},
{
"epoch": 0.7712532865907099,
"grad_norm": 1.456264615058899,
"learning_rate": 7.093939541467697e-05,
"loss": 0.6026,
"step": 880
},
{
"epoch": 0.7800175284837861,
"grad_norm": 1.967781901359558,
"learning_rate": 7.035856045881851e-05,
"loss": 0.6649,
"step": 890
},
{
"epoch": 0.7887817703768624,
"grad_norm": 1.6206531524658203,
"learning_rate": 6.977481470462593e-05,
"loss": 0.6018,
"step": 900
},
{
"epoch": 0.7975460122699386,
"grad_norm": 2.240525960922241,
"learning_rate": 6.918826878526527e-05,
"loss": 0.5912,
"step": 910
},
{
"epoch": 0.8063102541630149,
"grad_norm": 1.8358842134475708,
"learning_rate": 6.859903386459781e-05,
"loss": 0.6435,
"step": 920
},
{
"epoch": 0.8150744960560912,
"grad_norm": 2.574652671813965,
"learning_rate": 6.80072216161121e-05,
"loss": 0.6789,
"step": 930
},
{
"epoch": 0.8238387379491674,
"grad_norm": 1.668445348739624,
"learning_rate": 6.741294420175927e-05,
"loss": 0.6103,
"step": 940
},
{
"epoch": 0.8326029798422436,
"grad_norm": 1.7513461112976074,
"learning_rate": 6.681631425069566e-05,
"loss": 0.5696,
"step": 950
},
{
"epoch": 0.8413672217353199,
"grad_norm": 2.508164405822754,
"learning_rate": 6.621744483793715e-05,
"loss": 0.653,
"step": 960
},
{
"epoch": 0.8501314636283961,
"grad_norm": 1.9786324501037598,
"learning_rate": 6.56164494629288e-05,
"loss": 0.635,
"step": 970
},
{
"epoch": 0.8588957055214724,
"grad_norm": 1.8781009912490845,
"learning_rate": 6.501344202803414e-05,
"loss": 0.7071,
"step": 980
},
{
"epoch": 0.8676599474145487,
"grad_norm": 1.819648027420044,
"learning_rate": 6.440853681694801e-05,
"loss": 0.5952,
"step": 990
},
{
"epoch": 0.8764241893076249,
"grad_norm": 1.8697113990783691,
"learning_rate": 6.380184847303727e-05,
"loss": 0.6179,
"step": 1000
},
{
"epoch": 0.8851884312007011,
"grad_norm": 1.6153459548950195,
"learning_rate": 6.319349197761317e-05,
"loss": 0.6071,
"step": 1010
},
{
"epoch": 0.8939526730937774,
"grad_norm": 1.5980494022369385,
"learning_rate": 6.25835826281398e-05,
"loss": 0.588,
"step": 1020
},
{
"epoch": 0.9027169149868537,
"grad_norm": 1.509323239326477,
"learning_rate": 6.197223601638266e-05,
"loss": 0.6636,
"step": 1030
},
{
"epoch": 0.9114811568799299,
"grad_norm": 1.91567862033844,
"learning_rate": 6.135956800650128e-05,
"loss": 0.6727,
"step": 1040
},
{
"epoch": 0.9202453987730062,
"grad_norm": 1.7963509559631348,
"learning_rate": 6.074569471309032e-05,
"loss": 0.5768,
"step": 1050
},
{
"epoch": 0.9290096406660824,
"grad_norm": 1.8217496871948242,
"learning_rate": 6.013073247917326e-05,
"loss": 0.6243,
"step": 1060
},
{
"epoch": 0.9377738825591586,
"grad_norm": 1.648887276649475,
"learning_rate": 5.951479785415266e-05,
"loss": 0.6717,
"step": 1070
},
{
"epoch": 0.9465381244522348,
"grad_norm": 1.668747067451477,
"learning_rate": 5.889800757172146e-05,
"loss": 0.5823,
"step": 1080
},
{
"epoch": 0.9553023663453112,
"grad_norm": 2.224945545196533,
"learning_rate": 5.8280478527739235e-05,
"loss": 0.6203,
"step": 1090
},
{
"epoch": 0.9640666082383874,
"grad_norm": NaN,
"learning_rate": 5.7724167474641534e-05,
"loss": 0.6271,
"step": 1100
},
{
"epoch": 0.9728308501314636,
"grad_norm": 2.2267634868621826,
"learning_rate": 5.7105557315385284e-05,
"loss": 0.6614,
"step": 1110
},
{
"epoch": 0.9815950920245399,
"grad_norm": 1.7963807582855225,
"learning_rate": 5.6486548104880555e-05,
"loss": 0.7113,
"step": 1120
},
{
"epoch": 0.9903593339176161,
"grad_norm": 2.0616729259490967,
"learning_rate": 5.586725715952452e-05,
"loss": 0.6077,
"step": 1130
},
{
"epoch": 0.9991235758106923,
"grad_norm": 1.7366951704025269,
"learning_rate": 5.5247801849109526e-05,
"loss": 0.634,
"step": 1140
},
{
"epoch": 1.0078878177037687,
"grad_norm": 1.4753001928329468,
"learning_rate": 5.462829957457888e-05,
"loss": 0.44,
"step": 1150
},
{
"epoch": 1.016652059596845,
"grad_norm": 1.493257999420166,
"learning_rate": 5.400886774577667e-05,
"loss": 0.3635,
"step": 1160
},
{
"epoch": 1.0254163014899211,
"grad_norm": 1.7157448530197144,
"learning_rate": 5.338962375919589e-05,
"loss": 0.3525,
"step": 1170
},
{
"epoch": 1.0341805433829974,
"grad_norm": 1.7190569639205933,
"learning_rate": 5.277068497572914e-05,
"loss": 0.3751,
"step": 1180
},
{
"epoch": 1.0429447852760736,
"grad_norm": 1.5113599300384521,
"learning_rate": 5.215216869842604e-05,
"loss": 0.3857,
"step": 1190
},
{
"epoch": 1.0517090271691498,
"grad_norm": 1.6809605360031128,
"learning_rate": 5.1534192150261676e-05,
"loss": 0.3611,
"step": 1200
},
{
"epoch": 1.060473269062226,
"grad_norm": 1.6623985767364502,
"learning_rate": 5.091687245192006e-05,
"loss": 0.3558,
"step": 1210
},
{
"epoch": 1.0692375109553023,
"grad_norm": 2.009783983230591,
"learning_rate": 5.030032659959722e-05,
"loss": 0.3654,
"step": 1220
},
{
"epoch": 1.0780017528483785,
"grad_norm": 1.6117240190505981,
"learning_rate": 4.968467144282759e-05,
"loss": 0.3715,
"step": 1230
},
{
"epoch": 1.086765994741455,
"grad_norm": 1.3283199071884155,
"learning_rate": 4.9070023662338523e-05,
"loss": 0.3736,
"step": 1240
},
{
"epoch": 1.0955302366345312,
"grad_norm": 1.8791778087615967,
"learning_rate": 4.8456499747936465e-05,
"loss": 0.4082,
"step": 1250
},
{
"epoch": 1.1042944785276074,
"grad_norm": 1.7714508771896362,
"learning_rate": 4.7844215976429576e-05,
"loss": 0.3743,
"step": 1260
},
{
"epoch": 1.1130587204206837,
"grad_norm": 1.773903489112854,
"learning_rate": 4.723328838959057e-05,
"loss": 0.3858,
"step": 1270
},
{
"epoch": 1.1218229623137599,
"grad_norm": 1.4871599674224854,
"learning_rate": 4.662383277216418e-05,
"loss": 0.3486,
"step": 1280
},
{
"epoch": 1.1305872042068361,
"grad_norm": 1.3975831270217896,
"learning_rate": 4.601596462992326e-05,
"loss": 0.3657,
"step": 1290
},
{
"epoch": 1.1393514460999123,
"grad_norm": 1.5370299816131592,
"learning_rate": 4.540979916777783e-05,
"loss": 0.3371,
"step": 1300
},
{
"epoch": 1.1481156879929886,
"grad_norm": 1.5495011806488037,
"learning_rate": 4.480545126794115e-05,
"loss": 0.3357,
"step": 1310
},
{
"epoch": 1.1568799298860648,
"grad_norm": 1.694589376449585,
"learning_rate": 4.420303546815678e-05,
"loss": 0.3645,
"step": 1320
},
{
"epoch": 1.165644171779141,
"grad_norm": 1.985825777053833,
"learning_rate": 4.360266593999124e-05,
"loss": 0.3546,
"step": 1330
},
{
"epoch": 1.1744084136722173,
"grad_norm": 1.5808460712432861,
"learning_rate": 4.300445646719573e-05,
"loss": 0.3638,
"step": 1340
},
{
"epoch": 1.1831726555652935,
"grad_norm": 1.7759652137756348,
"learning_rate": 4.240852042414162e-05,
"loss": 0.4059,
"step": 1350
},
{
"epoch": 1.19193689745837,
"grad_norm": 1.7563869953155518,
"learning_rate": 4.181497075433334e-05,
"loss": 0.3446,
"step": 1360
},
{
"epoch": 1.2007011393514462,
"grad_norm": 1.9143909215927124,
"learning_rate": 4.1223919949003045e-05,
"loss": 0.3487,
"step": 1370
},
{
"epoch": 1.2094653812445224,
"grad_norm": 1.8407511711120605,
"learning_rate": 4.0635480025790926e-05,
"loss": 0.3649,
"step": 1380
},
{
"epoch": 1.2182296231375986,
"grad_norm": 1.7706880569458008,
"learning_rate": 4.0049762507515355e-05,
"loss": 0.3612,
"step": 1390
},
{
"epoch": 1.2269938650306749,
"grad_norm": 1.0870561599731445,
"learning_rate": 3.9466878401036686e-05,
"loss": 0.3401,
"step": 1400
},
{
"epoch": 1.235758106923751,
"grad_norm": 1.91828453540802,
"learning_rate": 3.8886938176219024e-05,
"loss": 0.3327,
"step": 1410
},
{
"epoch": 1.2445223488168273,
"grad_norm": 1.61056649684906,
"learning_rate": 3.8310051744993514e-05,
"loss": 0.3386,
"step": 1420
},
{
"epoch": 1.2532865907099036,
"grad_norm": 2.071869373321533,
"learning_rate": 3.773632844052767e-05,
"loss": 0.363,
"step": 1430
},
{
"epoch": 1.2620508326029798,
"grad_norm": 1.671288251876831,
"learning_rate": 3.7165876996504125e-05,
"loss": 0.3828,
"step": 1440
},
{
"epoch": 1.270815074496056,
"grad_norm": 1.8811005353927612,
"learning_rate": 3.659880552651317e-05,
"loss": 0.3551,
"step": 1450
},
{
"epoch": 1.2795793163891322,
"grad_norm": 1.5208740234375,
"learning_rate": 3.6035221503562775e-05,
"loss": 0.3566,
"step": 1460
},
{
"epoch": 1.2883435582822087,
"grad_norm": 1.7736235857009888,
"learning_rate": 3.547523173970989e-05,
"loss": 0.3629,
"step": 1470
},
{
"epoch": 1.2971078001752847,
"grad_norm": 1.6049748659133911,
"learning_rate": 3.491894236581728e-05,
"loss": 0.3359,
"step": 1480
},
{
"epoch": 1.3058720420683612,
"grad_norm": 1.4260120391845703,
"learning_rate": 3.436645881143918e-05,
"loss": 0.3758,
"step": 1490
},
{
"epoch": 1.3146362839614374,
"grad_norm": 1.9971890449523926,
"learning_rate": 3.3817885784839986e-05,
"loss": 0.314,
"step": 1500
},
{
"epoch": 1.3234005258545136,
"grad_norm": 1.7832164764404297,
"learning_rate": 3.327332725314974e-05,
"loss": 0.3901,
"step": 1510
},
{
"epoch": 1.3321647677475899,
"grad_norm": 1.6533173322677612,
"learning_rate": 3.273288642265985e-05,
"loss": 0.3324,
"step": 1520
},
{
"epoch": 1.340929009640666,
"grad_norm": 1.5945855379104614,
"learning_rate": 3.2196665719263266e-05,
"loss": 0.3435,
"step": 1530
},
{
"epoch": 1.3496932515337423,
"grad_norm": 1.51680588722229,
"learning_rate": 3.166476676904235e-05,
"loss": 0.3714,
"step": 1540
},
{
"epoch": 1.3584574934268185,
"grad_norm": 1.6684399843215942,
"learning_rate": 3.113729037900843e-05,
"loss": 0.333,
"step": 1550
},
{
"epoch": 1.3672217353198948,
"grad_norm": 1.5105247497558594,
"learning_rate": 3.0614336517996576e-05,
"loss": 0.3615,
"step": 1560
},
{
"epoch": 1.375985977212971,
"grad_norm": 2.113157272338867,
"learning_rate": 3.0096004297719205e-05,
"loss": 0.3002,
"step": 1570
},
{
"epoch": 1.3847502191060475,
"grad_norm": 1.1173641681671143,
"learning_rate": 2.958239195398217e-05,
"loss": 0.3571,
"step": 1580
},
{
"epoch": 1.3935144609991235,
"grad_norm": 1.4548070430755615,
"learning_rate": 2.90735968280668e-05,
"loss": 0.3252,
"step": 1590
},
{
"epoch": 1.4022787028922,
"grad_norm": 1.7521406412124634,
"learning_rate": 2.8569715348281547e-05,
"loss": 0.3409,
"step": 1600
},
{
"epoch": 1.4110429447852761,
"grad_norm": 1.7292704582214355,
"learning_rate": 2.807084301168652e-05,
"loss": 0.3282,
"step": 1610
},
{
"epoch": 1.4198071866783524,
"grad_norm": 1.7453776597976685,
"learning_rate": 2.7577074365994747e-05,
"loss": 0.3217,
"step": 1620
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.02482271194458,
"learning_rate": 2.70885029916531e-05,
"loss": 0.3355,
"step": 1630
},
{
"epoch": 1.4373356704645048,
"grad_norm": 1.927150845527649,
"learning_rate": 2.660522148410675e-05,
"loss": 0.3528,
"step": 1640
},
{
"epoch": 1.446099912357581,
"grad_norm": 1.5125665664672852,
"learning_rate": 2.6127321436250117e-05,
"loss": 0.3355,
"step": 1650
},
{
"epoch": 1.4548641542506573,
"grad_norm": 1.771646499633789,
"learning_rate": 2.565489342106805e-05,
"loss": 0.3375,
"step": 1660
},
{
"epoch": 1.4636283961437335,
"grad_norm": 2.540931463241577,
"learning_rate": 2.518802697447003e-05,
"loss": 0.3601,
"step": 1670
},
{
"epoch": 1.4723926380368098,
"grad_norm": 1.5853700637817383,
"learning_rate": 2.472681057832121e-05,
"loss": 0.3207,
"step": 1680
},
{
"epoch": 1.481156879929886,
"grad_norm": 2.11691951751709,
"learning_rate": 2.427133164367296e-05,
"loss": 0.3357,
"step": 1690
},
{
"epoch": 1.4899211218229622,
"grad_norm": 1.55061674118042,
"learning_rate": 2.3821676494196572e-05,
"loss": 0.3314,
"step": 1700
},
{
"epoch": 1.4986853637160387,
"grad_norm": 1.5113292932510376,
"learning_rate": 2.3377930349822856e-05,
"loss": 0.3162,
"step": 1710
},
{
"epoch": 1.5074496056091147,
"grad_norm": 2.0338950157165527,
"learning_rate": 2.2940177310591113e-05,
"loss": 0.3693,
"step": 1720
},
{
"epoch": 1.5162138475021911,
"grad_norm": 2.0127060413360596,
"learning_rate": 2.250850034071016e-05,
"loss": 0.324,
"step": 1730
},
{
"epoch": 1.5249780893952674,
"grad_norm": 1.9195280075073242,
"learning_rate": 2.20829812528348e-05,
"loss": 0.3559,
"step": 1740
},
{
"epoch": 1.5337423312883436,
"grad_norm": 1.6400375366210938,
"learning_rate": 2.1663700692560373e-05,
"loss": 0.3371,
"step": 1750
},
{
"epoch": 1.5425065731814198,
"grad_norm": 1.9201463460922241,
"learning_rate": 2.1250738123138665e-05,
"loss": 0.3536,
"step": 1760
},
{
"epoch": 1.551270815074496,
"grad_norm": 1.8780487775802612,
"learning_rate": 2.084417181041769e-05,
"loss": 0.3829,
"step": 1770
},
{
"epoch": 1.5600350569675723,
"grad_norm": 1.8463397026062012,
"learning_rate": 2.0444078808008655e-05,
"loss": 0.3431,
"step": 1780
},
{
"epoch": 1.5687992988606485,
"grad_norm": 1.8080838918685913,
"learning_rate": 2.005053494268241e-05,
"loss": 0.3748,
"step": 1790
},
{
"epoch": 1.577563540753725,
"grad_norm": 1.9387633800506592,
"learning_rate": 1.9663614799998635e-05,
"loss": 0.3586,
"step": 1800
},
{
"epoch": 1.586327782646801,
"grad_norm": 1.7084999084472656,
"learning_rate": 1.928339171017015e-05,
"loss": 0.357,
"step": 1810
},
{
"epoch": 1.5950920245398774,
"grad_norm": 2.1515519618988037,
"learning_rate": 1.8909937734165107e-05,
"loss": 0.3333,
"step": 1820
},
{
"epoch": 1.6038562664329534,
"grad_norm": 1.4635021686553955,
"learning_rate": 1.8543323650049864e-05,
"loss": 0.3337,
"step": 1830
},
{
"epoch": 1.6126205083260299,
"grad_norm": 1.7444740533828735,
"learning_rate": 1.8183618939574904e-05,
"loss": 0.3686,
"step": 1840
},
{
"epoch": 1.6213847502191059,
"grad_norm": 1.587889552116394,
"learning_rate": 1.7830891775006396e-05,
"loss": 0.3167,
"step": 1850
},
{
"epoch": 1.6301489921121823,
"grad_norm": 1.9329102039337158,
"learning_rate": 1.748520900620609e-05,
"loss": 0.3349,
"step": 1860
},
{
"epoch": 1.6389132340052586,
"grad_norm": 1.610293984413147,
"learning_rate": 1.714663614796167e-05,
"loss": 0.3841,
"step": 1870
},
{
"epoch": 1.6476774758983348,
"grad_norm": 1.9500341415405273,
"learning_rate": 1.6815237367570197e-05,
"loss": 0.3575,
"step": 1880
},
{
"epoch": 1.656441717791411,
"grad_norm": 1.717809796333313,
"learning_rate": 1.6491075472677016e-05,
"loss": 0.2917,
"step": 1890
},
{
"epoch": 1.6652059596844873,
"grad_norm": 1.2370789051055908,
"learning_rate": 1.6174211899372175e-05,
"loss": 0.3535,
"step": 1900
},
{
"epoch": 1.6739702015775635,
"grad_norm": 1.6636922359466553,
"learning_rate": 1.5864706700546955e-05,
"loss": 0.3242,
"step": 1910
},
{
"epoch": 1.6827344434706397,
"grad_norm": 1.3746954202651978,
"learning_rate": 1.5562618534512428e-05,
"loss": 0.3463,
"step": 1920
},
{
"epoch": 1.6914986853637162,
"grad_norm": 1.9031116962432861,
"learning_rate": 1.5268004653882406e-05,
"loss": 0.3448,
"step": 1930
},
{
"epoch": 1.7002629272567922,
"grad_norm": 1.8231595754623413,
"learning_rate": 1.4980920894722692e-05,
"loss": 0.3327,
"step": 1940
},
{
"epoch": 1.7090271691498686,
"grad_norm": 2.2330541610717773,
"learning_rate": 1.4701421665969001e-05,
"loss": 0.344,
"step": 1950
},
{
"epoch": 1.7177914110429446,
"grad_norm": 1.5624206066131592,
"learning_rate": 1.442955993911505e-05,
"loss": 0.3194,
"step": 1960
},
{
"epoch": 1.726555652936021,
"grad_norm": 1.7255851030349731,
"learning_rate": 1.4165387238173399e-05,
"loss": 0.3122,
"step": 1970
},
{
"epoch": 1.7353198948290973,
"grad_norm": 1.7800745964050293,
"learning_rate": 1.3908953629910376e-05,
"loss": 0.3005,
"step": 1980
},
{
"epoch": 1.7440841367221736,
"grad_norm": 2.472658395767212,
"learning_rate": 1.3660307714357338e-05,
"loss": 0.3571,
"step": 1990
},
{
"epoch": 1.7528483786152498,
"grad_norm": 1.6620064973831177,
"learning_rate": 1.3419496615599805e-05,
"loss": 0.3522,
"step": 2000
},
{
"epoch": 1.761612620508326,
"grad_norm": 1.5490673780441284,
"learning_rate": 1.318656597284643e-05,
"loss": 0.2967,
"step": 2010
},
{
"epoch": 1.7703768624014022,
"grad_norm": 2.1536474227905273,
"learning_rate": 1.2961559931779257e-05,
"loss": 0.3703,
"step": 2020
},
{
"epoch": 1.7791411042944785,
"grad_norm": 1.490907073020935,
"learning_rate": 1.274452113618716e-05,
"loss": 0.3076,
"step": 2030
},
{
"epoch": 1.787905346187555,
"grad_norm": 1.4357167482376099,
"learning_rate": 1.2535490719883835e-05,
"loss": 0.3265,
"step": 2040
},
{
"epoch": 1.796669588080631,
"grad_norm": 1.5666704177856445,
"learning_rate": 1.233450829891203e-05,
"loss": 0.324,
"step": 2050
},
{
"epoch": 1.8054338299737074,
"grad_norm": 1.3453813791275024,
"learning_rate": 1.2141611964035366e-05,
"loss": 0.2977,
"step": 2060
},
{
"epoch": 1.8141980718667834,
"grad_norm": 1.782593011856079,
"learning_rate": 1.195683827351931e-05,
"loss": 0.2795,
"step": 2070
},
{
"epoch": 1.8229623137598598,
"grad_norm": 1.9441533088684082,
"learning_rate": 1.1780222246202494e-05,
"loss": 0.3166,
"step": 2080
},
{
"epoch": 1.831726555652936,
"grad_norm": 1.778911828994751,
"learning_rate": 1.1611797354859892e-05,
"loss": 0.3493,
"step": 2090
},
{
"epoch": 1.8404907975460123,
"grad_norm": 1.7141963243484497,
"learning_rate": 1.145159551985894e-05,
"loss": 0.3313,
"step": 2100
},
{
"epoch": 1.8492550394390885,
"grad_norm": 3.3165926933288574,
"learning_rate": 1.1299647103109908e-05,
"loss": 0.356,
"step": 2110
},
{
"epoch": 1.8580192813321648,
"grad_norm": 1.787851095199585,
"learning_rate": 1.11559809023116e-05,
"loss": 0.3219,
"step": 2120
},
{
"epoch": 1.866783523225241,
"grad_norm": 1.7090057134628296,
"learning_rate": 1.1020624145493572e-05,
"loss": 0.3445,
"step": 2130
},
{
"epoch": 1.8755477651183172,
"grad_norm": 1.5748744010925293,
"learning_rate": 1.0893602485855766e-05,
"loss": 0.3253,
"step": 2140
},
{
"epoch": 1.8843120070113937,
"grad_norm": 1.9979685544967651,
"learning_rate": 1.0774939996906644e-05,
"loss": 0.3177,
"step": 2150
},
{
"epoch": 1.8930762489044697,
"grad_norm": 1.936296820640564,
"learning_rate": 1.0664659167900723e-05,
"loss": 0.3304,
"step": 2160
},
{
"epoch": 1.9018404907975461,
"grad_norm": 1.8121618032455444,
"learning_rate": 1.0562780899576344e-05,
"loss": 0.3493,
"step": 2170
},
{
"epoch": 1.9106047326906221,
"grad_norm": 1.6870180368423462,
"learning_rate": 1.046932450019448e-05,
"loss": 0.328,
"step": 2180
},
{
"epoch": 1.9193689745836986,
"grad_norm": 2.0619869232177734,
"learning_rate": 1.0384307681879428e-05,
"loss": 0.3845,
"step": 2190
},
{
"epoch": 1.9281332164767746,
"grad_norm": 2.1429038047790527,
"learning_rate": 1.030774655726191e-05,
"loss": 0.3143,
"step": 2200
},
{
"epoch": 1.936897458369851,
"grad_norm": 1.9444646835327148,
"learning_rate": 1.0239655636425374e-05,
"loss": 0.3135,
"step": 2210
},
{
"epoch": 1.9456617002629273,
"grad_norm": 1.6065791845321655,
"learning_rate": 1.0180047824156011e-05,
"loss": 0.3142,
"step": 2220
},
{
"epoch": 1.9544259421560035,
"grad_norm": 2.215041160583496,
"learning_rate": 1.0128934417497004e-05,
"loss": 0.3234,
"step": 2230
},
{
"epoch": 1.9631901840490797,
"grad_norm": 1.766499638557434,
"learning_rate": 1.008632510360747e-05,
"loss": 0.3395,
"step": 2240
},
{
"epoch": 1.971954425942156,
"grad_norm": 2.355278491973877,
"learning_rate": 1.0052227957926518e-05,
"loss": 0.3476,
"step": 2250
},
{
"epoch": 1.9807186678352322,
"grad_norm": 1.6923573017120361,
"learning_rate": 1.0026649442642785e-05,
"loss": 0.386,
"step": 2260
},
{
"epoch": 1.9894829097283084,
"grad_norm": 1.454087495803833,
"learning_rate": 1.0009594405469695e-05,
"loss": 0.3059,
"step": 2270
},
{
"epoch": 1.9982471516213849,
"grad_norm": 1.5868600606918335,
"learning_rate": 1.0001066078726703e-05,
"loss": 0.3474,
"step": 2280
}
],
"logging_steps": 10,
"max_steps": 2282,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7769146165323366e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
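
A minimal sketch of how one might inspect this file, assuming it has been downloaded locally as "trainer_state.json" and that matplotlib is installed (both are assumptions, not part of the upload). Each entry in "log_history" carries "step", "loss", "learning_rate", and "grad_norm"; note that Python's json module parses the bare NaN grad_norm logged at step 1100 as float("nan") without any special handling.

# Sketch: load the trainer state and plot loss / learning rate over training.
import json
import math

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]
lrs = [entry["learning_rate"] for entry in history]
grad_norms = [entry["grad_norm"] for entry in history]

# Flag any non-finite gradient norms (e.g. the NaN logged at step 1100).
bad_steps = [s for s, g in zip(steps, grad_norms) if not math.isfinite(g)]
if bad_steps:
    print(f"Non-finite grad_norm at steps: {bad_steps}")

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.suptitle(f"{state['epoch']:.1f} epochs, {state['global_step']} steps")
fig.tight_layout()
plt.show()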