Qwen-sft-la-v0.1 / trainer_state.json
sci-m-wang's picture
Upload 7 files
9b7fea5 verified
raw
history blame
116 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.996258885147775,
"eval_steps": 500,
"global_step": 3340,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014964459408903853,
"grad_norm": 0.5450117588043213,
"learning_rate": 4.999972352489418e-05,
"loss": 1.3208,
"step": 5
},
{
"epoch": 0.029928918817807706,
"grad_norm": 0.5009211301803589,
"learning_rate": 4.9998894105691785e-05,
"loss": 1.2903,
"step": 10
},
{
"epoch": 0.04489337822671156,
"grad_norm": 0.45117729902267456,
"learning_rate": 4.9997511760737915e-05,
"loss": 1.2271,
"step": 15
},
{
"epoch": 0.05985783763561541,
"grad_norm": 0.5625576376914978,
"learning_rate": 4.999557652060729e-05,
"loss": 1.186,
"step": 20
},
{
"epoch": 0.07482229704451927,
"grad_norm": 0.519900381565094,
"learning_rate": 4.999308842810357e-05,
"loss": 1.1302,
"step": 25
},
{
"epoch": 0.08978675645342311,
"grad_norm": 0.593724250793457,
"learning_rate": 4.999004753825842e-05,
"loss": 1.1372,
"step": 30
},
{
"epoch": 0.10475121586232697,
"grad_norm": 0.663527250289917,
"learning_rate": 4.998645391833024e-05,
"loss": 1.0359,
"step": 35
},
{
"epoch": 0.11971567527123082,
"grad_norm": 0.6744909286499023,
"learning_rate": 4.9982307647802765e-05,
"loss": 1.0511,
"step": 40
},
{
"epoch": 0.13468013468013468,
"grad_norm": 0.5474820137023926,
"learning_rate": 4.9977608818383226e-05,
"loss": 0.9909,
"step": 45
},
{
"epoch": 0.14964459408903855,
"grad_norm": 0.5713778734207153,
"learning_rate": 4.9972357534000394e-05,
"loss": 1.0139,
"step": 50
},
{
"epoch": 0.1646090534979424,
"grad_norm": 0.6148533225059509,
"learning_rate": 4.99665539108022e-05,
"loss": 1.0156,
"step": 55
},
{
"epoch": 0.17957351290684623,
"grad_norm": 0.6597582697868347,
"learning_rate": 4.996019807715324e-05,
"loss": 0.995,
"step": 60
},
{
"epoch": 0.1945379723157501,
"grad_norm": 0.6145315170288086,
"learning_rate": 4.9953290173631896e-05,
"loss": 0.9641,
"step": 65
},
{
"epoch": 0.20950243172465394,
"grad_norm": 0.7690613865852356,
"learning_rate": 4.994583035302723e-05,
"loss": 0.9934,
"step": 70
},
{
"epoch": 0.2244668911335578,
"grad_norm": 0.7555385828018188,
"learning_rate": 4.9937818780335646e-05,
"loss": 0.946,
"step": 75
},
{
"epoch": 0.23943135054246165,
"grad_norm": 0.7178649306297302,
"learning_rate": 4.992925563275714e-05,
"loss": 0.91,
"step": 80
},
{
"epoch": 0.2543958099513655,
"grad_norm": 0.7035357356071472,
"learning_rate": 4.99201410996915e-05,
"loss": 0.9612,
"step": 85
},
{
"epoch": 0.26936026936026936,
"grad_norm": 0.7382842898368835,
"learning_rate": 4.9910475382734034e-05,
"loss": 0.8687,
"step": 90
},
{
"epoch": 0.2843247287691732,
"grad_norm": 0.8056157231330872,
"learning_rate": 4.990025869567117e-05,
"loss": 0.9038,
"step": 95
},
{
"epoch": 0.2992891881780771,
"grad_norm": 0.9074241518974304,
"learning_rate": 4.988949126447567e-05,
"loss": 0.9063,
"step": 100
},
{
"epoch": 0.31425364758698093,
"grad_norm": 0.7684347033500671,
"learning_rate": 4.987817332730166e-05,
"loss": 0.9065,
"step": 105
},
{
"epoch": 0.3292181069958848,
"grad_norm": 0.8751540184020996,
"learning_rate": 4.986630513447938e-05,
"loss": 0.9492,
"step": 110
},
{
"epoch": 0.3441825664047886,
"grad_norm": 0.8586133718490601,
"learning_rate": 4.985388694850963e-05,
"loss": 0.9085,
"step": 115
},
{
"epoch": 0.35914702581369246,
"grad_norm": 0.7148111462593079,
"learning_rate": 4.984091904405793e-05,
"loss": 0.9125,
"step": 120
},
{
"epoch": 0.37411148522259635,
"grad_norm": 0.7763463258743286,
"learning_rate": 4.9827401707948504e-05,
"loss": 0.9019,
"step": 125
},
{
"epoch": 0.3890759446315002,
"grad_norm": 0.9074414372444153,
"learning_rate": 4.981333523915792e-05,
"loss": 0.8188,
"step": 130
},
{
"epoch": 0.40404040404040403,
"grad_norm": 1.0170953273773193,
"learning_rate": 4.979871994880845e-05,
"loss": 0.8757,
"step": 135
},
{
"epoch": 0.4190048634493079,
"grad_norm": 0.8414100408554077,
"learning_rate": 4.97835561601612e-05,
"loss": 0.8711,
"step": 140
},
{
"epoch": 0.43396932285821177,
"grad_norm": 0.9481101036071777,
"learning_rate": 4.9767844208608984e-05,
"loss": 0.8371,
"step": 145
},
{
"epoch": 0.4489337822671156,
"grad_norm": 0.8936446905136108,
"learning_rate": 4.9751584441668874e-05,
"loss": 0.8282,
"step": 150
},
{
"epoch": 0.46389824167601945,
"grad_norm": 0.906244695186615,
"learning_rate": 4.973477721897454e-05,
"loss": 0.8702,
"step": 155
},
{
"epoch": 0.4788627010849233,
"grad_norm": 0.9465859532356262,
"learning_rate": 4.971742291226827e-05,
"loss": 0.8779,
"step": 160
},
{
"epoch": 0.49382716049382713,
"grad_norm": 0.752061128616333,
"learning_rate": 4.969952190539276e-05,
"loss": 0.8855,
"step": 165
},
{
"epoch": 0.508791619902731,
"grad_norm": 0.9224424958229065,
"learning_rate": 4.968107459428265e-05,
"loss": 0.8211,
"step": 170
},
{
"epoch": 0.5237560793116348,
"grad_norm": 0.8061802387237549,
"learning_rate": 4.9662081386955714e-05,
"loss": 0.84,
"step": 175
},
{
"epoch": 0.5387205387205387,
"grad_norm": 1.4521487951278687,
"learning_rate": 4.964254270350387e-05,
"loss": 0.8529,
"step": 180
},
{
"epoch": 0.5536849981294426,
"grad_norm": 0.807750403881073,
"learning_rate": 4.9622458976083885e-05,
"loss": 0.8891,
"step": 185
},
{
"epoch": 0.5686494575383464,
"grad_norm": 0.9400819540023804,
"learning_rate": 4.960183064890782e-05,
"loss": 0.8705,
"step": 190
},
{
"epoch": 0.5836139169472503,
"grad_norm": 0.8715450763702393,
"learning_rate": 4.958065817823318e-05,
"loss": 0.8671,
"step": 195
},
{
"epoch": 0.5985783763561542,
"grad_norm": 0.9633534550666809,
"learning_rate": 4.955894203235284e-05,
"loss": 0.8379,
"step": 200
},
{
"epoch": 0.613542835765058,
"grad_norm": 0.8975620865821838,
"learning_rate": 4.953668269158472e-05,
"loss": 0.8086,
"step": 205
},
{
"epoch": 0.6285072951739619,
"grad_norm": 0.904045045375824,
"learning_rate": 4.9513880648261114e-05,
"loss": 0.8183,
"step": 210
},
{
"epoch": 0.6434717545828657,
"grad_norm": 1.009617805480957,
"learning_rate": 4.949053640671778e-05,
"loss": 0.8557,
"step": 215
},
{
"epoch": 0.6584362139917695,
"grad_norm": 1.1475061178207397,
"learning_rate": 4.946665048328287e-05,
"loss": 0.8809,
"step": 220
},
{
"epoch": 0.6734006734006734,
"grad_norm": 1.087480068206787,
"learning_rate": 4.944222340626543e-05,
"loss": 0.7887,
"step": 225
},
{
"epoch": 0.6883651328095772,
"grad_norm": 0.9593074321746826,
"learning_rate": 4.9417255715943766e-05,
"loss": 0.8965,
"step": 230
},
{
"epoch": 0.7033295922184811,
"grad_norm": 1.103148102760315,
"learning_rate": 4.939174796455346e-05,
"loss": 0.9189,
"step": 235
},
{
"epoch": 0.7182940516273849,
"grad_norm": 0.9249380826950073,
"learning_rate": 4.936570071627518e-05,
"loss": 0.8793,
"step": 240
},
{
"epoch": 0.7332585110362888,
"grad_norm": 1.03147554397583,
"learning_rate": 4.933911454722217e-05,
"loss": 0.8052,
"step": 245
},
{
"epoch": 0.7482229704451927,
"grad_norm": 1.0135599374771118,
"learning_rate": 4.9311990045427553e-05,
"loss": 0.8033,
"step": 250
},
{
"epoch": 0.7631874298540965,
"grad_norm": 1.0295403003692627,
"learning_rate": 4.928432781083128e-05,
"loss": 0.9045,
"step": 255
},
{
"epoch": 0.7781518892630004,
"grad_norm": 0.9905064105987549,
"learning_rate": 4.92561284552669e-05,
"loss": 0.8486,
"step": 260
},
{
"epoch": 0.7931163486719043,
"grad_norm": 0.9656111598014832,
"learning_rate": 4.9227392602447996e-05,
"loss": 0.8324,
"step": 265
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.9249575734138489,
"learning_rate": 4.91981208879544e-05,
"loss": 0.8172,
"step": 270
},
{
"epoch": 0.823045267489712,
"grad_norm": 0.904988706111908,
"learning_rate": 4.9168313959218135e-05,
"loss": 0.8258,
"step": 275
},
{
"epoch": 0.8380097268986157,
"grad_norm": 1.060915231704712,
"learning_rate": 4.913797247550912e-05,
"loss": 0.867,
"step": 280
},
{
"epoch": 0.8529741863075196,
"grad_norm": 1.017268180847168,
"learning_rate": 4.910709710792054e-05,
"loss": 0.7974,
"step": 285
},
{
"epoch": 0.8679386457164235,
"grad_norm": 1.0362051725387573,
"learning_rate": 4.9075688539354025e-05,
"loss": 0.8596,
"step": 290
},
{
"epoch": 0.8829031051253273,
"grad_norm": 0.9945353269577026,
"learning_rate": 4.904374746450459e-05,
"loss": 0.8076,
"step": 295
},
{
"epoch": 0.8978675645342312,
"grad_norm": 0.986596941947937,
"learning_rate": 4.901127458984516e-05,
"loss": 0.8126,
"step": 300
},
{
"epoch": 0.912832023943135,
"grad_norm": 1.016927719116211,
"learning_rate": 4.8978270633611086e-05,
"loss": 0.817,
"step": 305
},
{
"epoch": 0.9277964833520389,
"grad_norm": 1.0122638940811157,
"learning_rate": 4.8944736325784136e-05,
"loss": 0.9226,
"step": 310
},
{
"epoch": 0.9427609427609428,
"grad_norm": 1.04526948928833,
"learning_rate": 4.891067240807641e-05,
"loss": 0.7878,
"step": 315
},
{
"epoch": 0.9577254021698466,
"grad_norm": 0.8837614059448242,
"learning_rate": 4.887607963391394e-05,
"loss": 0.8187,
"step": 320
},
{
"epoch": 0.9726898615787505,
"grad_norm": 1.1496906280517578,
"learning_rate": 4.884095876841999e-05,
"loss": 0.8531,
"step": 325
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.9902486205101013,
"learning_rate": 4.880531058839816e-05,
"loss": 0.7615,
"step": 330
},
{
"epoch": 1.0026187803965583,
"grad_norm": 1.0682637691497803,
"learning_rate": 4.87691358823152e-05,
"loss": 0.8132,
"step": 335
},
{
"epoch": 1.017583239805462,
"grad_norm": 1.0381455421447754,
"learning_rate": 4.8732435450283565e-05,
"loss": 0.7877,
"step": 340
},
{
"epoch": 1.0325476992143658,
"grad_norm": 0.9033912420272827,
"learning_rate": 4.869521010404373e-05,
"loss": 0.7892,
"step": 345
},
{
"epoch": 1.0475121586232696,
"grad_norm": 0.9539169669151306,
"learning_rate": 4.86574606669462e-05,
"loss": 0.8192,
"step": 350
},
{
"epoch": 1.0624766180321736,
"grad_norm": 0.8774738907814026,
"learning_rate": 4.861918797393336e-05,
"loss": 0.753,
"step": 355
},
{
"epoch": 1.0774410774410774,
"grad_norm": 1.1601481437683105,
"learning_rate": 4.8580392871520946e-05,
"loss": 0.8113,
"step": 360
},
{
"epoch": 1.0924055368499812,
"grad_norm": 0.9350395202636719,
"learning_rate": 4.854107621777938e-05,
"loss": 0.8731,
"step": 365
},
{
"epoch": 1.1073699962588852,
"grad_norm": 1.0182013511657715,
"learning_rate": 4.8501238882314715e-05,
"loss": 0.8649,
"step": 370
},
{
"epoch": 1.122334455667789,
"grad_norm": 1.158241629600525,
"learning_rate": 4.84608817462495e-05,
"loss": 0.82,
"step": 375
},
{
"epoch": 1.1372989150766928,
"grad_norm": 1.0360873937606812,
"learning_rate": 4.8420005702203196e-05,
"loss": 0.8236,
"step": 380
},
{
"epoch": 1.1522633744855968,
"grad_norm": 1.1079692840576172,
"learning_rate": 4.83786116542725e-05,
"loss": 0.7584,
"step": 385
},
{
"epoch": 1.1672278338945006,
"grad_norm": 1.0275564193725586,
"learning_rate": 4.833670051801131e-05,
"loss": 0.7847,
"step": 390
},
{
"epoch": 1.1821922933034044,
"grad_norm": 1.1246060132980347,
"learning_rate": 4.829427322041049e-05,
"loss": 0.7597,
"step": 395
},
{
"epoch": 1.1971567527123081,
"grad_norm": 1.0290372371673584,
"learning_rate": 4.825133069987737e-05,
"loss": 0.733,
"step": 400
},
{
"epoch": 1.2121212121212122,
"grad_norm": 1.1085125207901,
"learning_rate": 4.820787390621499e-05,
"loss": 0.7729,
"step": 405
},
{
"epoch": 1.227085671530116,
"grad_norm": 1.2001007795333862,
"learning_rate": 4.816390380060108e-05,
"loss": 0.769,
"step": 410
},
{
"epoch": 1.24205013093902,
"grad_norm": 1.1357756853103638,
"learning_rate": 4.8119421355566796e-05,
"loss": 0.8017,
"step": 415
},
{
"epoch": 1.2570145903479237,
"grad_norm": 1.0524897575378418,
"learning_rate": 4.807442755497524e-05,
"loss": 0.7916,
"step": 420
},
{
"epoch": 1.2719790497568275,
"grad_norm": 1.1913483142852783,
"learning_rate": 4.802892339399967e-05,
"loss": 0.8058,
"step": 425
},
{
"epoch": 1.2869435091657313,
"grad_norm": 1.2256290912628174,
"learning_rate": 4.7982909879101515e-05,
"loss": 0.8267,
"step": 430
},
{
"epoch": 1.3019079685746353,
"grad_norm": 1.0073180198669434,
"learning_rate": 4.7936388028008084e-05,
"loss": 0.8316,
"step": 435
},
{
"epoch": 1.316872427983539,
"grad_norm": 1.030885934829712,
"learning_rate": 4.7889358869690056e-05,
"loss": 0.7874,
"step": 440
},
{
"epoch": 1.3318368873924429,
"grad_norm": 1.1515103578567505,
"learning_rate": 4.784182344433878e-05,
"loss": 0.8268,
"step": 445
},
{
"epoch": 1.3468013468013469,
"grad_norm": 0.8818157315254211,
"learning_rate": 4.779378280334318e-05,
"loss": 0.8366,
"step": 450
},
{
"epoch": 1.3617658062102507,
"grad_norm": 0.9439221024513245,
"learning_rate": 4.7745238009266556e-05,
"loss": 0.8279,
"step": 455
},
{
"epoch": 1.3767302656191545,
"grad_norm": 0.9476526379585266,
"learning_rate": 4.7696190135823094e-05,
"loss": 0.807,
"step": 460
},
{
"epoch": 1.3916947250280582,
"grad_norm": 0.9290974736213684,
"learning_rate": 4.764664026785405e-05,
"loss": 0.8259,
"step": 465
},
{
"epoch": 1.4066591844369623,
"grad_norm": 1.007653832435608,
"learning_rate": 4.759658950130385e-05,
"loss": 0.8344,
"step": 470
},
{
"epoch": 1.421623643845866,
"grad_norm": 1.1456594467163086,
"learning_rate": 4.7546038943195736e-05,
"loss": 0.7565,
"step": 475
},
{
"epoch": 1.43658810325477,
"grad_norm": 1.0801887512207031,
"learning_rate": 4.749498971160742e-05,
"loss": 0.7771,
"step": 480
},
{
"epoch": 1.4515525626636738,
"grad_norm": 0.9851332306861877,
"learning_rate": 4.744344293564621e-05,
"loss": 0.7803,
"step": 485
},
{
"epoch": 1.4665170220725776,
"grad_norm": 1.1050950288772583,
"learning_rate": 4.739139975542415e-05,
"loss": 0.8118,
"step": 490
},
{
"epoch": 1.4814814814814814,
"grad_norm": 1.030402421951294,
"learning_rate": 4.7338861322032726e-05,
"loss": 0.849,
"step": 495
},
{
"epoch": 1.4964459408903854,
"grad_norm": 0.9448444843292236,
"learning_rate": 4.7285828797517465e-05,
"loss": 0.7255,
"step": 500
},
{
"epoch": 1.5114104002992892,
"grad_norm": 1.0439302921295166,
"learning_rate": 4.723230335485218e-05,
"loss": 0.7413,
"step": 505
},
{
"epoch": 1.5263748597081932,
"grad_norm": 1.234944462776184,
"learning_rate": 4.717828617791308e-05,
"loss": 0.7648,
"step": 510
},
{
"epoch": 1.541339319117097,
"grad_norm": 1.1755177974700928,
"learning_rate": 4.7123778461452536e-05,
"loss": 0.7203,
"step": 515
},
{
"epoch": 1.5563037785260008,
"grad_norm": 0.9818833470344543,
"learning_rate": 4.7068781411072686e-05,
"loss": 0.7813,
"step": 520
},
{
"epoch": 1.5712682379349046,
"grad_norm": 1.286037802696228,
"learning_rate": 4.7013296243198746e-05,
"loss": 0.8098,
"step": 525
},
{
"epoch": 1.5862326973438083,
"grad_norm": 1.2840811014175415,
"learning_rate": 4.695732418505214e-05,
"loss": 0.7752,
"step": 530
},
{
"epoch": 1.6011971567527123,
"grad_norm": 0.9121096730232239,
"learning_rate": 4.690086647462331e-05,
"loss": 0.8124,
"step": 535
},
{
"epoch": 1.6161616161616161,
"grad_norm": 1.1097527742385864,
"learning_rate": 4.684392436064439e-05,
"loss": 0.8453,
"step": 540
},
{
"epoch": 1.6311260755705201,
"grad_norm": 1.0087509155273438,
"learning_rate": 4.678649910256152e-05,
"loss": 0.7736,
"step": 545
},
{
"epoch": 1.646090534979424,
"grad_norm": 1.028320074081421,
"learning_rate": 4.6728591970507055e-05,
"loss": 0.8248,
"step": 550
},
{
"epoch": 1.6610549943883277,
"grad_norm": 1.2182048559188843,
"learning_rate": 4.6670204245271444e-05,
"loss": 0.7903,
"step": 555
},
{
"epoch": 1.6760194537972315,
"grad_norm": 1.0399212837219238,
"learning_rate": 4.661133721827486e-05,
"loss": 0.7495,
"step": 560
},
{
"epoch": 1.6909839132061353,
"grad_norm": 1.0186119079589844,
"learning_rate": 4.655199219153873e-05,
"loss": 0.76,
"step": 565
},
{
"epoch": 1.7059483726150393,
"grad_norm": 1.2510963678359985,
"learning_rate": 4.649217047765685e-05,
"loss": 0.7618,
"step": 570
},
{
"epoch": 1.7209128320239433,
"grad_norm": 1.3667418956756592,
"learning_rate": 4.643187339976639e-05,
"loss": 0.8169,
"step": 575
},
{
"epoch": 1.735877291432847,
"grad_norm": 1.1502622365951538,
"learning_rate": 4.637110229151863e-05,
"loss": 0.8384,
"step": 580
},
{
"epoch": 1.7508417508417509,
"grad_norm": 0.9805833697319031,
"learning_rate": 4.6309858497049464e-05,
"loss": 0.757,
"step": 585
},
{
"epoch": 1.7658062102506547,
"grad_norm": 1.111222743988037,
"learning_rate": 4.6248143370949636e-05,
"loss": 0.8712,
"step": 590
},
{
"epoch": 1.7807706696595584,
"grad_norm": 1.0927162170410156,
"learning_rate": 4.618595827823486e-05,
"loss": 0.8009,
"step": 595
},
{
"epoch": 1.7957351290684624,
"grad_norm": 1.218826174736023,
"learning_rate": 4.612330459431552e-05,
"loss": 0.8323,
"step": 600
},
{
"epoch": 1.8106995884773662,
"grad_norm": 1.233129858970642,
"learning_rate": 4.606018370496633e-05,
"loss": 0.7373,
"step": 605
},
{
"epoch": 1.8256640478862702,
"grad_norm": 0.9207433462142944,
"learning_rate": 4.5996597006295655e-05,
"loss": 0.7533,
"step": 610
},
{
"epoch": 1.840628507295174,
"grad_norm": 1.1917186975479126,
"learning_rate": 4.593254590471464e-05,
"loss": 0.7831,
"step": 615
},
{
"epoch": 1.8555929667040778,
"grad_norm": 0.9462292194366455,
"learning_rate": 4.586803181690609e-05,
"loss": 0.7733,
"step": 620
},
{
"epoch": 1.8705574261129816,
"grad_norm": 0.9503220915794373,
"learning_rate": 4.580305616979314e-05,
"loss": 0.8178,
"step": 625
},
{
"epoch": 1.8855218855218854,
"grad_norm": 0.8902071118354797,
"learning_rate": 4.573762040050772e-05,
"loss": 0.8028,
"step": 630
},
{
"epoch": 1.9004863449307894,
"grad_norm": 1.1471889019012451,
"learning_rate": 4.567172595635871e-05,
"loss": 0.7499,
"step": 635
},
{
"epoch": 1.9154508043396934,
"grad_norm": 1.063602328300476,
"learning_rate": 4.560537429479998e-05,
"loss": 0.7516,
"step": 640
},
{
"epoch": 1.9304152637485972,
"grad_norm": 0.9486026763916016,
"learning_rate": 4.553856688339817e-05,
"loss": 0.7598,
"step": 645
},
{
"epoch": 1.945379723157501,
"grad_norm": 0.9863812923431396,
"learning_rate": 4.547130519980014e-05,
"loss": 0.8039,
"step": 650
},
{
"epoch": 1.9603441825664047,
"grad_norm": 1.0052098035812378,
"learning_rate": 4.54035907317004e-05,
"loss": 0.7574,
"step": 655
},
{
"epoch": 1.9753086419753085,
"grad_norm": 1.0039119720458984,
"learning_rate": 4.533542497680812e-05,
"loss": 0.7594,
"step": 660
},
{
"epoch": 1.9902731013842125,
"grad_norm": 1.0683871507644653,
"learning_rate": 4.5266809442814035e-05,
"loss": 0.7489,
"step": 665
},
{
"epoch": 2.0052375607931165,
"grad_norm": 0.964434802532196,
"learning_rate": 4.519774564735711e-05,
"loss": 0.7376,
"step": 670
},
{
"epoch": 2.0202020202020203,
"grad_norm": 1.0128512382507324,
"learning_rate": 4.512823511799098e-05,
"loss": 0.7275,
"step": 675
},
{
"epoch": 2.035166479610924,
"grad_norm": 1.0383211374282837,
"learning_rate": 4.5058279392150096e-05,
"loss": 0.749,
"step": 680
},
{
"epoch": 2.050130939019828,
"grad_norm": 0.9630417227745056,
"learning_rate": 4.4987880017115793e-05,
"loss": 0.7563,
"step": 685
},
{
"epoch": 2.0650953984287317,
"grad_norm": 1.2037107944488525,
"learning_rate": 4.491703854998207e-05,
"loss": 0.7426,
"step": 690
},
{
"epoch": 2.0800598578376355,
"grad_norm": 1.144274353981018,
"learning_rate": 4.484575655762107e-05,
"loss": 0.7323,
"step": 695
},
{
"epoch": 2.0950243172465393,
"grad_norm": 1.2053053379058838,
"learning_rate": 4.477403561664852e-05,
"loss": 0.7474,
"step": 700
},
{
"epoch": 2.1099887766554435,
"grad_norm": 1.039339542388916,
"learning_rate": 4.4701877313388784e-05,
"loss": 0.766,
"step": 705
},
{
"epoch": 2.1249532360643473,
"grad_norm": 1.0573480129241943,
"learning_rate": 4.462928324383985e-05,
"loss": 0.8314,
"step": 710
},
{
"epoch": 2.139917695473251,
"grad_norm": 1.2173911333084106,
"learning_rate": 4.455625501363794e-05,
"loss": 0.7388,
"step": 715
},
{
"epoch": 2.154882154882155,
"grad_norm": 1.1535879373550415,
"learning_rate": 4.448279423802207e-05,
"loss": 0.7698,
"step": 720
},
{
"epoch": 2.1698466142910586,
"grad_norm": 0.985844075679779,
"learning_rate": 4.44089025417983e-05,
"loss": 0.7692,
"step": 725
},
{
"epoch": 2.1848110736999624,
"grad_norm": 1.0249568223953247,
"learning_rate": 4.43345815593038e-05,
"loss": 0.7261,
"step": 730
},
{
"epoch": 2.1997755331088666,
"grad_norm": 1.305114507675171,
"learning_rate": 4.425983293437069e-05,
"loss": 0.8001,
"step": 735
},
{
"epoch": 2.2147399925177704,
"grad_norm": 1.1478573083877563,
"learning_rate": 4.4184658320289675e-05,
"loss": 0.8036,
"step": 740
},
{
"epoch": 2.229704451926674,
"grad_norm": 0.9925107359886169,
"learning_rate": 4.410905937977352e-05,
"loss": 0.7775,
"step": 745
},
{
"epoch": 2.244668911335578,
"grad_norm": 1.0552047491073608,
"learning_rate": 4.403303778492022e-05,
"loss": 0.7449,
"step": 750
},
{
"epoch": 2.259633370744482,
"grad_norm": 1.0449482202529907,
"learning_rate": 4.395659521717607e-05,
"loss": 0.7197,
"step": 755
},
{
"epoch": 2.2745978301533856,
"grad_norm": 1.0985392332077026,
"learning_rate": 4.3879733367298405e-05,
"loss": 0.7691,
"step": 760
},
{
"epoch": 2.28956228956229,
"grad_norm": 1.1123350858688354,
"learning_rate": 4.3802453935318294e-05,
"loss": 0.7501,
"step": 765
},
{
"epoch": 2.3045267489711936,
"grad_norm": 1.2243092060089111,
"learning_rate": 4.372475863050286e-05,
"loss": 0.7606,
"step": 770
},
{
"epoch": 2.3194912083800974,
"grad_norm": 0.9873678088188171,
"learning_rate": 4.364664917131751e-05,
"loss": 0.7605,
"step": 775
},
{
"epoch": 2.334455667789001,
"grad_norm": 1.1722878217697144,
"learning_rate": 4.3568127285387925e-05,
"loss": 0.7186,
"step": 780
},
{
"epoch": 2.349420127197905,
"grad_norm": 1.1724722385406494,
"learning_rate": 4.348919470946185e-05,
"loss": 0.7614,
"step": 785
},
{
"epoch": 2.3643845866068087,
"grad_norm": 1.0196776390075684,
"learning_rate": 4.340985318937066e-05,
"loss": 0.7537,
"step": 790
},
{
"epoch": 2.3793490460157125,
"grad_norm": 1.0410211086273193,
"learning_rate": 4.333010447999077e-05,
"loss": 0.7246,
"step": 795
},
{
"epoch": 2.3943135054246163,
"grad_norm": 1.0125558376312256,
"learning_rate": 4.3249950345204806e-05,
"loss": 0.7561,
"step": 800
},
{
"epoch": 2.4092779648335205,
"grad_norm": 0.9731377363204956,
"learning_rate": 4.31693925578626e-05,
"loss": 0.7418,
"step": 805
},
{
"epoch": 2.4242424242424243,
"grad_norm": 1.1109338998794556,
"learning_rate": 4.3088432899741985e-05,
"loss": 0.6956,
"step": 810
},
{
"epoch": 2.439206883651328,
"grad_norm": 1.4336915016174316,
"learning_rate": 4.3007073161509345e-05,
"loss": 0.7715,
"step": 815
},
{
"epoch": 2.454171343060232,
"grad_norm": 1.0564075708389282,
"learning_rate": 4.292531514268008e-05,
"loss": 0.7782,
"step": 820
},
{
"epoch": 2.4691358024691357,
"grad_norm": 1.1472092866897583,
"learning_rate": 4.2843160651578726e-05,
"loss": 0.7125,
"step": 825
},
{
"epoch": 2.48410026187804,
"grad_norm": 0.8819020390510559,
"learning_rate": 4.276061150529903e-05,
"loss": 0.7647,
"step": 830
},
{
"epoch": 2.4990647212869437,
"grad_norm": 1.1399636268615723,
"learning_rate": 4.267766952966369e-05,
"loss": 0.7327,
"step": 835
},
{
"epoch": 2.5140291806958475,
"grad_norm": 1.3246550559997559,
"learning_rate": 4.259433655918404e-05,
"loss": 0.7553,
"step": 840
},
{
"epoch": 2.5289936401047513,
"grad_norm": 1.1041203737258911,
"learning_rate": 4.2510614437019416e-05,
"loss": 0.7685,
"step": 845
},
{
"epoch": 2.543958099513655,
"grad_norm": 1.099228024482727,
"learning_rate": 4.242650501493642e-05,
"loss": 0.7207,
"step": 850
},
{
"epoch": 2.558922558922559,
"grad_norm": 1.140915036201477,
"learning_rate": 4.2342010153267986e-05,
"loss": 0.7253,
"step": 855
},
{
"epoch": 2.5738870183314626,
"grad_norm": 1.020257830619812,
"learning_rate": 4.2257131720872164e-05,
"loss": 0.8055,
"step": 860
},
{
"epoch": 2.5888514777403664,
"grad_norm": 1.1489384174346924,
"learning_rate": 4.2171871595090826e-05,
"loss": 0.7747,
"step": 865
},
{
"epoch": 2.6038159371492706,
"grad_norm": 1.0359724760055542,
"learning_rate": 4.2086231661708185e-05,
"loss": 0.7525,
"step": 870
},
{
"epoch": 2.6187803965581744,
"grad_norm": 1.1845808029174805,
"learning_rate": 4.200021381490899e-05,
"loss": 0.7259,
"step": 875
},
{
"epoch": 2.633744855967078,
"grad_norm": 1.0849671363830566,
"learning_rate": 4.191381995723672e-05,
"loss": 0.7267,
"step": 880
},
{
"epoch": 2.648709315375982,
"grad_norm": 1.1475163698196411,
"learning_rate": 4.182705199955144e-05,
"loss": 0.7862,
"step": 885
},
{
"epoch": 2.6636737747848858,
"grad_norm": 1.1838606595993042,
"learning_rate": 4.173991186098757e-05,
"loss": 0.8079,
"step": 890
},
{
"epoch": 2.67863823419379,
"grad_norm": 0.9914394021034241,
"learning_rate": 4.165240146891145e-05,
"loss": 0.7319,
"step": 895
},
{
"epoch": 2.6936026936026938,
"grad_norm": 1.149774193763733,
"learning_rate": 4.1564522758878656e-05,
"loss": 0.7478,
"step": 900
},
{
"epoch": 2.7085671530115976,
"grad_norm": 1.1517828702926636,
"learning_rate": 4.147627767459124e-05,
"loss": 0.8038,
"step": 905
},
{
"epoch": 2.7235316124205013,
"grad_norm": 1.1382722854614258,
"learning_rate": 4.138766816785474e-05,
"loss": 0.7596,
"step": 910
},
{
"epoch": 2.738496071829405,
"grad_norm": 0.9787604808807373,
"learning_rate": 4.1298696198534955e-05,
"loss": 0.6991,
"step": 915
},
{
"epoch": 2.753460531238309,
"grad_norm": 1.0162303447723389,
"learning_rate": 4.1209363734514674e-05,
"loss": 0.7014,
"step": 920
},
{
"epoch": 2.7684249906472127,
"grad_norm": 1.106070637702942,
"learning_rate": 4.1119672751650074e-05,
"loss": 0.8249,
"step": 925
},
{
"epoch": 2.7833894500561165,
"grad_norm": 1.2757290601730347,
"learning_rate": 4.102962523372709e-05,
"loss": 0.7091,
"step": 930
},
{
"epoch": 2.7983539094650207,
"grad_norm": 1.107681155204773,
"learning_rate": 4.093922317241748e-05,
"loss": 0.8038,
"step": 935
},
{
"epoch": 2.8133183688739245,
"grad_norm": 1.2649710178375244,
"learning_rate": 4.0848468567234796e-05,
"loss": 0.7707,
"step": 940
},
{
"epoch": 2.8282828282828283,
"grad_norm": 1.0683587789535522,
"learning_rate": 4.075736342549018e-05,
"loss": 0.7483,
"step": 945
},
{
"epoch": 2.843247287691732,
"grad_norm": 1.1959580183029175,
"learning_rate": 4.066590976224791e-05,
"loss": 0.7838,
"step": 950
},
{
"epoch": 2.858211747100636,
"grad_norm": 1.1794955730438232,
"learning_rate": 4.0574109600280886e-05,
"loss": 0.7758,
"step": 955
},
{
"epoch": 2.87317620650954,
"grad_norm": 1.0079686641693115,
"learning_rate": 4.048196497002588e-05,
"loss": 0.7591,
"step": 960
},
{
"epoch": 2.888140665918444,
"grad_norm": 1.265084981918335,
"learning_rate": 4.038947790953859e-05,
"loss": 0.7012,
"step": 965
},
{
"epoch": 2.9031051253273477,
"grad_norm": 1.0062329769134521,
"learning_rate": 4.0296650464448616e-05,
"loss": 0.8008,
"step": 970
},
{
"epoch": 2.9180695847362514,
"grad_norm": 1.1500215530395508,
"learning_rate": 4.020348468791416e-05,
"loss": 0.7492,
"step": 975
},
{
"epoch": 2.9330340441451552,
"grad_norm": 1.2765411138534546,
"learning_rate": 4.0109982640576674e-05,
"loss": 0.7736,
"step": 980
},
{
"epoch": 2.947998503554059,
"grad_norm": 1.0655264854431152,
"learning_rate": 4.001614639051521e-05,
"loss": 0.7198,
"step": 985
},
{
"epoch": 2.962962962962963,
"grad_norm": 1.269967794418335,
"learning_rate": 3.9921978013200766e-05,
"loss": 0.7513,
"step": 990
},
{
"epoch": 2.9779274223718666,
"grad_norm": 1.0883420705795288,
"learning_rate": 3.98274795914503e-05,
"loss": 0.804,
"step": 995
},
{
"epoch": 2.992891881780771,
"grad_norm": 1.0706652402877808,
"learning_rate": 3.973265321538069e-05,
"loss": 0.6987,
"step": 1000
},
{
"epoch": 3.0078563411896746,
"grad_norm": 1.1653481721878052,
"learning_rate": 3.963750098236253e-05,
"loss": 0.8132,
"step": 1005
},
{
"epoch": 3.0228208005985784,
"grad_norm": 1.1991537809371948,
"learning_rate": 3.954202499697373e-05,
"loss": 0.7291,
"step": 1010
},
{
"epoch": 3.037785260007482,
"grad_norm": 1.0241738557815552,
"learning_rate": 3.944622737095294e-05,
"loss": 0.7181,
"step": 1015
},
{
"epoch": 3.052749719416386,
"grad_norm": 1.00551438331604,
"learning_rate": 3.9350110223152844e-05,
"loss": 0.732,
"step": 1020
},
{
"epoch": 3.0677141788252897,
"grad_norm": 1.3171230554580688,
"learning_rate": 3.925367567949335e-05,
"loss": 0.8267,
"step": 1025
},
{
"epoch": 3.082678638234194,
"grad_norm": 1.0425944328308105,
"learning_rate": 3.9156925872914506e-05,
"loss": 0.6677,
"step": 1030
},
{
"epoch": 3.0976430976430978,
"grad_norm": 1.0785578489303589,
"learning_rate": 3.905986294332935e-05,
"loss": 0.7701,
"step": 1035
},
{
"epoch": 3.1126075570520015,
"grad_norm": 1.089606523513794,
"learning_rate": 3.8962489037576586e-05,
"loss": 0.6776,
"step": 1040
},
{
"epoch": 3.1275720164609053,
"grad_norm": 1.234337329864502,
"learning_rate": 3.8864806309373076e-05,
"loss": 0.7917,
"step": 1045
},
{
"epoch": 3.142536475869809,
"grad_norm": 1.1076655387878418,
"learning_rate": 3.876681691926624e-05,
"loss": 0.7032,
"step": 1050
},
{
"epoch": 3.157500935278713,
"grad_norm": 1.0013830661773682,
"learning_rate": 3.866852303458623e-05,
"loss": 0.7442,
"step": 1055
},
{
"epoch": 3.1724653946876167,
"grad_norm": 1.2173100709915161,
"learning_rate": 3.856992682939803e-05,
"loss": 0.6668,
"step": 1060
},
{
"epoch": 3.187429854096521,
"grad_norm": 1.1837962865829468,
"learning_rate": 3.847103048445333e-05,
"loss": 0.7408,
"step": 1065
},
{
"epoch": 3.2023943135054247,
"grad_norm": 1.1347203254699707,
"learning_rate": 3.837183618714233e-05,
"loss": 0.7615,
"step": 1070
},
{
"epoch": 3.2173587729143285,
"grad_norm": 1.1322319507598877,
"learning_rate": 3.827234613144533e-05,
"loss": 0.6853,
"step": 1075
},
{
"epoch": 3.2323232323232323,
"grad_norm": 1.24197256565094,
"learning_rate": 3.817256251788425e-05,
"loss": 0.6563,
"step": 1080
},
{
"epoch": 3.247287691732136,
"grad_norm": 1.1162407398223877,
"learning_rate": 3.807248755347387e-05,
"loss": 0.7744,
"step": 1085
},
{
"epoch": 3.2622521511410403,
"grad_norm": 1.0863921642303467,
"learning_rate": 3.79721234516731e-05,
"loss": 0.7257,
"step": 1090
},
{
"epoch": 3.277216610549944,
"grad_norm": 1.1099414825439453,
"learning_rate": 3.787147243233602e-05,
"loss": 0.7711,
"step": 1095
},
{
"epoch": 3.292181069958848,
"grad_norm": 1.3686941862106323,
"learning_rate": 3.77705367216627e-05,
"loss": 0.7528,
"step": 1100
},
{
"epoch": 3.3071455293677516,
"grad_norm": 1.1821480989456177,
"learning_rate": 3.766931855215006e-05,
"loss": 0.7642,
"step": 1105
},
{
"epoch": 3.3221099887766554,
"grad_norm": 1.0392365455627441,
"learning_rate": 3.756782016254242e-05,
"loss": 0.7566,
"step": 1110
},
{
"epoch": 3.337074448185559,
"grad_norm": 1.1076756715774536,
"learning_rate": 3.746604379778203e-05,
"loss": 0.6818,
"step": 1115
},
{
"epoch": 3.352038907594463,
"grad_norm": 1.2123860120773315,
"learning_rate": 3.7363991708959386e-05,
"loss": 0.7248,
"step": 1120
},
{
"epoch": 3.3670033670033668,
"grad_norm": 1.1529157161712646,
"learning_rate": 3.726166615326344e-05,
"loss": 0.7569,
"step": 1125
},
{
"epoch": 3.381967826412271,
"grad_norm": 1.0874592065811157,
"learning_rate": 3.715906939393172e-05,
"loss": 0.7775,
"step": 1130
},
{
"epoch": 3.396932285821175,
"grad_norm": 1.1039067506790161,
"learning_rate": 3.70562037002002e-05,
"loss": 0.7637,
"step": 1135
},
{
"epoch": 3.4118967452300786,
"grad_norm": 1.1319911479949951,
"learning_rate": 3.695307134725317e-05,
"loss": 0.7701,
"step": 1140
},
{
"epoch": 3.4268612046389824,
"grad_norm": 1.3674046993255615,
"learning_rate": 3.684967461617289e-05,
"loss": 0.7202,
"step": 1145
},
{
"epoch": 3.441825664047886,
"grad_norm": 1.214239478111267,
"learning_rate": 3.674601579388913e-05,
"loss": 0.736,
"step": 1150
},
{
"epoch": 3.45679012345679,
"grad_norm": 1.1035867929458618,
"learning_rate": 3.66420971731286e-05,
"loss": 0.7361,
"step": 1155
},
{
"epoch": 3.471754582865694,
"grad_norm": 1.1282587051391602,
"learning_rate": 3.653792105236422e-05,
"loss": 0.7012,
"step": 1160
},
{
"epoch": 3.486719042274598,
"grad_norm": 1.4782813787460327,
"learning_rate": 3.6433489735764334e-05,
"loss": 0.6902,
"step": 1165
},
{
"epoch": 3.5016835016835017,
"grad_norm": 1.2365137338638306,
"learning_rate": 3.6328805533141684e-05,
"loss": 0.7524,
"step": 1170
},
{
"epoch": 3.5166479610924055,
"grad_norm": 1.1180927753448486,
"learning_rate": 3.622387075990233e-05,
"loss": 0.727,
"step": 1175
},
{
"epoch": 3.5316124205013093,
"grad_norm": 1.1830908060073853,
"learning_rate": 3.611868773699449e-05,
"loss": 0.7811,
"step": 1180
},
{
"epoch": 3.546576879910213,
"grad_norm": 1.2073471546173096,
"learning_rate": 3.6013258790857154e-05,
"loss": 0.7164,
"step": 1185
},
{
"epoch": 3.561541339319117,
"grad_norm": 1.1175339221954346,
"learning_rate": 3.590758625336864e-05,
"loss": 0.7238,
"step": 1190
},
{
"epoch": 3.576505798728021,
"grad_norm": 1.2776098251342773,
"learning_rate": 3.5801672461795034e-05,
"loss": 0.7886,
"step": 1195
},
{
"epoch": 3.591470258136925,
"grad_norm": 1.021897792816162,
"learning_rate": 3.569551975873847e-05,
"loss": 0.7491,
"step": 1200
},
{
"epoch": 3.6064347175458287,
"grad_norm": 1.1327399015426636,
"learning_rate": 3.558913049208534e-05,
"loss": 0.7499,
"step": 1205
},
{
"epoch": 3.6213991769547325,
"grad_norm": 1.105021595954895,
"learning_rate": 3.548250701495432e-05,
"loss": 0.6803,
"step": 1210
},
{
"epoch": 3.6363636363636362,
"grad_norm": 1.1674951314926147,
"learning_rate": 3.537565168564442e-05,
"loss": 0.7302,
"step": 1215
},
{
"epoch": 3.6513280957725405,
"grad_norm": 1.0947383642196655,
"learning_rate": 3.526856686758269e-05,
"loss": 0.7106,
"step": 1220
},
{
"epoch": 3.6662925551814443,
"grad_norm": 1.1993179321289062,
"learning_rate": 3.5161254929272046e-05,
"loss": 0.793,
"step": 1225
},
{
"epoch": 3.681257014590348,
"grad_norm": 1.1340358257293701,
"learning_rate": 3.505371824423885e-05,
"loss": 0.8239,
"step": 1230
},
{
"epoch": 3.696221473999252,
"grad_norm": 1.7940102815628052,
"learning_rate": 3.494595919098041e-05,
"loss": 0.6556,
"step": 1235
},
{
"epoch": 3.7111859334081556,
"grad_norm": 1.231870174407959,
"learning_rate": 3.483798015291239e-05,
"loss": 0.7934,
"step": 1240
},
{
"epoch": 3.7261503928170594,
"grad_norm": 1.1797089576721191,
"learning_rate": 3.4729783518316056e-05,
"loss": 0.773,
"step": 1245
},
{
"epoch": 3.741114852225963,
"grad_norm": 1.148738980293274,
"learning_rate": 3.462137168028549e-05,
"loss": 0.7345,
"step": 1250
},
{
"epoch": 3.756079311634867,
"grad_norm": 1.1555569171905518,
"learning_rate": 3.4512747036674644e-05,
"loss": 0.7036,
"step": 1255
},
{
"epoch": 3.771043771043771,
"grad_norm": 1.172443151473999,
"learning_rate": 3.440391199004431e-05,
"loss": 0.8012,
"step": 1260
},
{
"epoch": 3.786008230452675,
"grad_norm": 1.4849857091903687,
"learning_rate": 3.4294868947608964e-05,
"loss": 0.7567,
"step": 1265
},
{
"epoch": 3.8009726898615788,
"grad_norm": 1.1539514064788818,
"learning_rate": 3.4185620321183545e-05,
"loss": 0.7258,
"step": 1270
},
{
"epoch": 3.8159371492704826,
"grad_norm": 1.3457891941070557,
"learning_rate": 3.4076168527130094e-05,
"loss": 0.7048,
"step": 1275
},
{
"epoch": 3.8309016086793863,
"grad_norm": 1.1766413450241089,
"learning_rate": 3.396651598630432e-05,
"loss": 0.7275,
"step": 1280
},
{
"epoch": 3.8458660680882906,
"grad_norm": 1.2674963474273682,
"learning_rate": 3.3856665124002054e-05,
"loss": 0.6935,
"step": 1285
},
{
"epoch": 3.8608305274971944,
"grad_norm": 1.256225347518921,
"learning_rate": 3.37466183699056e-05,
"loss": 0.7128,
"step": 1290
},
{
"epoch": 3.875794986906098,
"grad_norm": 1.098443627357483,
"learning_rate": 3.363637815802998e-05,
"loss": 0.6997,
"step": 1295
},
{
"epoch": 3.890759446315002,
"grad_norm": 1.1758556365966797,
"learning_rate": 3.352594692666915e-05,
"loss": 0.6989,
"step": 1300
},
{
"epoch": 3.9057239057239057,
"grad_norm": 1.265360951423645,
"learning_rate": 3.3415327118342015e-05,
"loss": 0.7412,
"step": 1305
},
{
"epoch": 3.9206883651328095,
"grad_norm": 1.212694525718689,
"learning_rate": 3.3304521179738437e-05,
"loss": 0.7208,
"step": 1310
},
{
"epoch": 3.9356528245417133,
"grad_norm": 1.2661161422729492,
"learning_rate": 3.319353156166509e-05,
"loss": 0.7097,
"step": 1315
},
{
"epoch": 3.950617283950617,
"grad_norm": 1.0489590167999268,
"learning_rate": 3.3082360718991304e-05,
"loss": 0.7063,
"step": 1320
},
{
"epoch": 3.9655817433595213,
"grad_norm": 1.304627537727356,
"learning_rate": 3.297101111059471e-05,
"loss": 0.7256,
"step": 1325
},
{
"epoch": 3.980546202768425,
"grad_norm": 1.1489557027816772,
"learning_rate": 3.2859485199306885e-05,
"loss": 0.7,
"step": 1330
},
{
"epoch": 3.995510662177329,
"grad_norm": 1.1346006393432617,
"learning_rate": 3.274778545185888e-05,
"loss": 0.7179,
"step": 1335
},
{
"epoch": 4.010475121586233,
"grad_norm": 1.1032986640930176,
"learning_rate": 3.263591433882666e-05,
"loss": 0.7768,
"step": 1340
},
{
"epoch": 4.025439580995137,
"grad_norm": 1.5759029388427734,
"learning_rate": 3.252387433457645e-05,
"loss": 0.6737,
"step": 1345
},
{
"epoch": 4.040404040404041,
"grad_norm": 1.208949327468872,
"learning_rate": 3.241166791721001e-05,
"loss": 0.648,
"step": 1350
},
{
"epoch": 4.0553684998129444,
"grad_norm": 1.332651972770691,
"learning_rate": 3.2299297568509835e-05,
"loss": 0.7591,
"step": 1355
},
{
"epoch": 4.070332959221848,
"grad_norm": 1.2246062755584717,
"learning_rate": 3.2186765773884245e-05,
"loss": 0.6756,
"step": 1360
},
{
"epoch": 4.085297418630752,
"grad_norm": 1.288155198097229,
"learning_rate": 3.2074075022312417e-05,
"loss": 0.7229,
"step": 1365
},
{
"epoch": 4.100261878039656,
"grad_norm": 1.2574666738510132,
"learning_rate": 3.196122780628936e-05,
"loss": 0.7267,
"step": 1370
},
{
"epoch": 4.11522633744856,
"grad_norm": 1.1441593170166016,
"learning_rate": 3.1848226621770744e-05,
"loss": 0.7363,
"step": 1375
},
{
"epoch": 4.130190796857463,
"grad_norm": 1.3331890106201172,
"learning_rate": 3.173507396811774e-05,
"loss": 0.7083,
"step": 1380
},
{
"epoch": 4.145155256266367,
"grad_norm": 1.2898328304290771,
"learning_rate": 3.162177234804168e-05,
"loss": 0.6997,
"step": 1385
},
{
"epoch": 4.160119715675271,
"grad_norm": 1.0942869186401367,
"learning_rate": 3.150832426754877e-05,
"loss": 0.7047,
"step": 1390
},
{
"epoch": 4.175084175084175,
"grad_norm": 1.1719509363174438,
"learning_rate": 3.1394732235884615e-05,
"loss": 0.6965,
"step": 1395
},
{
"epoch": 4.1900486344930785,
"grad_norm": 1.2531814575195312,
"learning_rate": 3.1280998765478727e-05,
"loss": 0.7139,
"step": 1400
},
{
"epoch": 4.205013093901983,
"grad_norm": 1.9164917469024658,
"learning_rate": 3.116712637188897e-05,
"loss": 0.7125,
"step": 1405
},
{
"epoch": 4.219977553310887,
"grad_norm": 1.1178081035614014,
"learning_rate": 3.10531175737459e-05,
"loss": 0.7247,
"step": 1410
},
{
"epoch": 4.234942012719791,
"grad_norm": 1.3899163007736206,
"learning_rate": 3.0938974892697095e-05,
"loss": 0.6983,
"step": 1415
},
{
"epoch": 4.2499064721286945,
"grad_norm": 1.175616979598999,
"learning_rate": 3.082470085335133e-05,
"loss": 0.7491,
"step": 1420
},
{
"epoch": 4.264870931537598,
"grad_norm": 1.1819936037063599,
"learning_rate": 3.071029798322279e-05,
"loss": 0.6763,
"step": 1425
},
{
"epoch": 4.279835390946502,
"grad_norm": 1.109136939048767,
"learning_rate": 3.0595768812675104e-05,
"loss": 0.7401,
"step": 1430
},
{
"epoch": 4.294799850355406,
"grad_norm": 1.1672794818878174,
"learning_rate": 3.048111587486545e-05,
"loss": 0.6849,
"step": 1435
},
{
"epoch": 4.30976430976431,
"grad_norm": 1.3153440952301025,
"learning_rate": 3.0366341705688468e-05,
"loss": 0.7617,
"step": 1440
},
{
"epoch": 4.3247287691732135,
"grad_norm": 1.1830875873565674,
"learning_rate": 3.025144884372021e-05,
"loss": 0.7097,
"step": 1445
},
{
"epoch": 4.339693228582117,
"grad_norm": 1.1907213926315308,
"learning_rate": 3.0136439830161967e-05,
"loss": 0.6899,
"step": 1450
},
{
"epoch": 4.354657687991021,
"grad_norm": 1.1969035863876343,
"learning_rate": 3.0021317208784074e-05,
"loss": 0.7034,
"step": 1455
},
{
"epoch": 4.369622147399925,
"grad_norm": 1.181558609008789,
"learning_rate": 2.990608352586965e-05,
"loss": 0.7223,
"step": 1460
},
{
"epoch": 4.3845866068088295,
"grad_norm": 1.2934932708740234,
"learning_rate": 2.979074133015827e-05,
"loss": 0.7026,
"step": 1465
},
{
"epoch": 4.399551066217733,
"grad_norm": 1.2202645540237427,
"learning_rate": 2.9675293172789583e-05,
"loss": 0.734,
"step": 1470
},
{
"epoch": 4.414515525626637,
"grad_norm": 1.2204395532608032,
"learning_rate": 2.9559741607246922e-05,
"loss": 0.7691,
"step": 1475
},
{
"epoch": 4.429479985035541,
"grad_norm": 1.2286920547485352,
"learning_rate": 2.9444089189300783e-05,
"loss": 0.7691,
"step": 1480
},
{
"epoch": 4.444444444444445,
"grad_norm": 1.1387462615966797,
"learning_rate": 2.932833847695234e-05,
"loss": 0.7064,
"step": 1485
},
{
"epoch": 4.459408903853348,
"grad_norm": 1.3988234996795654,
"learning_rate": 2.9212492030376814e-05,
"loss": 0.6983,
"step": 1490
},
{
"epoch": 4.474373363262252,
"grad_norm": 1.144126296043396,
"learning_rate": 2.90965524118669e-05,
"loss": 0.7616,
"step": 1495
},
{
"epoch": 4.489337822671156,
"grad_norm": 1.073025107383728,
"learning_rate": 2.8980522185776065e-05,
"loss": 0.7386,
"step": 1500
},
{
"epoch": 4.50430228208006,
"grad_norm": 1.3249400854110718,
"learning_rate": 2.8864403918461812e-05,
"loss": 0.6959,
"step": 1505
},
{
"epoch": 4.519266741488964,
"grad_norm": 1.3409395217895508,
"learning_rate": 2.874820017822899e-05,
"loss": 0.696,
"step": 1510
},
{
"epoch": 4.534231200897867,
"grad_norm": 1.2458475828170776,
"learning_rate": 2.8631913535272888e-05,
"loss": 0.7367,
"step": 1515
},
{
"epoch": 4.549195660306771,
"grad_norm": 1.3703510761260986,
"learning_rate": 2.8515546561622462e-05,
"loss": 0.7221,
"step": 1520
},
{
"epoch": 4.564160119715675,
"grad_norm": 1.0971307754516602,
"learning_rate": 2.839910183108342e-05,
"loss": 0.7485,
"step": 1525
},
{
"epoch": 4.57912457912458,
"grad_norm": 1.198792815208435,
"learning_rate": 2.828258191918131e-05,
"loss": 0.8012,
"step": 1530
},
{
"epoch": 4.5940890385334825,
"grad_norm": 1.2157917022705078,
"learning_rate": 2.816598940310452e-05,
"loss": 0.6885,
"step": 1535
},
{
"epoch": 4.609053497942387,
"grad_norm": 1.2653173208236694,
"learning_rate": 2.8049326861647302e-05,
"loss": 0.7332,
"step": 1540
},
{
"epoch": 4.624017957351291,
"grad_norm": 1.1794272661209106,
"learning_rate": 2.7932596875152744e-05,
"loss": 0.7952,
"step": 1545
},
{
"epoch": 4.638982416760195,
"grad_norm": 1.2640421390533447,
"learning_rate": 2.781580202545568e-05,
"loss": 0.7742,
"step": 1550
},
{
"epoch": 4.6539468761690985,
"grad_norm": 2.050365686416626,
"learning_rate": 2.7698944895825572e-05,
"loss": 0.7715,
"step": 1555
},
{
"epoch": 4.668911335578002,
"grad_norm": 1.2108561992645264,
"learning_rate": 2.7582028070909415e-05,
"loss": 0.7624,
"step": 1560
},
{
"epoch": 4.683875794986906,
"grad_norm": 1.2775709629058838,
"learning_rate": 2.746505413667452e-05,
"loss": 0.6833,
"step": 1565
},
{
"epoch": 4.69884025439581,
"grad_norm": 1.178462266921997,
"learning_rate": 2.7348025680351363e-05,
"loss": 0.6924,
"step": 1570
},
{
"epoch": 4.713804713804714,
"grad_norm": 1.1037096977233887,
"learning_rate": 2.7230945290376325e-05,
"loss": 0.6909,
"step": 1575
},
{
"epoch": 4.7287691732136174,
"grad_norm": 1.241242527961731,
"learning_rate": 2.7113815556334478e-05,
"loss": 0.7844,
"step": 1580
},
{
"epoch": 4.743733632622521,
"grad_norm": 1.220365047454834,
"learning_rate": 2.6996639068902253e-05,
"loss": 0.7149,
"step": 1585
},
{
"epoch": 4.758698092031425,
"grad_norm": 1.3249695301055908,
"learning_rate": 2.6879418419790204e-05,
"loss": 0.6882,
"step": 1590
},
{
"epoch": 4.77366255144033,
"grad_norm": 1.3471956253051758,
"learning_rate": 2.6762156201685628e-05,
"loss": 0.7442,
"step": 1595
},
{
"epoch": 4.788627010849233,
"grad_norm": 1.2055602073669434,
"learning_rate": 2.6644855008195267e-05,
"loss": 0.7078,
"step": 1600
},
{
"epoch": 4.803591470258137,
"grad_norm": 1.2006497383117676,
"learning_rate": 2.6527517433787913e-05,
"loss": 0.6789,
"step": 1605
},
{
"epoch": 4.818555929667041,
"grad_norm": 1.1846423149108887,
"learning_rate": 2.641014607373702e-05,
"loss": 0.6703,
"step": 1610
},
{
"epoch": 4.833520389075945,
"grad_norm": 1.2655390501022339,
"learning_rate": 2.6292743524063334e-05,
"loss": 0.6671,
"step": 1615
},
{
"epoch": 4.848484848484849,
"grad_norm": 1.250118374824524,
"learning_rate": 2.6175312381477442e-05,
"loss": 0.6936,
"step": 1620
},
{
"epoch": 4.863449307893752,
"grad_norm": 1.2387151718139648,
"learning_rate": 2.6057855243322344e-05,
"loss": 0.6755,
"step": 1625
},
{
"epoch": 4.878413767302656,
"grad_norm": 1.3161075115203857,
"learning_rate": 2.5940374707516015e-05,
"loss": 0.6515,
"step": 1630
},
{
"epoch": 4.89337822671156,
"grad_norm": 1.219498872756958,
"learning_rate": 2.582287337249394e-05,
"loss": 0.7108,
"step": 1635
},
{
"epoch": 4.908342686120464,
"grad_norm": 1.4078658819198608,
"learning_rate": 2.570535383715165e-05,
"loss": 0.7038,
"step": 1640
},
{
"epoch": 4.9233071455293675,
"grad_norm": 1.140682578086853,
"learning_rate": 2.558781870078722e-05,
"loss": 0.6804,
"step": 1645
},
{
"epoch": 4.938271604938271,
"grad_norm": 1.4205572605133057,
"learning_rate": 2.547027056304379e-05,
"loss": 0.7491,
"step": 1650
},
{
"epoch": 4.953236064347175,
"grad_norm": 1.2967289686203003,
"learning_rate": 2.5352712023852066e-05,
"loss": 0.7297,
"step": 1655
},
{
"epoch": 4.96820052375608,
"grad_norm": 1.2759593725204468,
"learning_rate": 2.5235145683372814e-05,
"loss": 0.6731,
"step": 1660
},
{
"epoch": 4.983164983164983,
"grad_norm": 1.1895300149917603,
"learning_rate": 2.5117574141939337e-05,
"loss": 0.7156,
"step": 1665
},
{
"epoch": 4.998129442573887,
"grad_norm": 1.1513454914093018,
"learning_rate": 2.5e-05,
"loss": 0.7455,
"step": 1670
},
{
"epoch": 5.013093901982791,
"grad_norm": 1.231416940689087,
"learning_rate": 2.4882425858060668e-05,
"loss": 0.7206,
"step": 1675
},
{
"epoch": 5.028058361391695,
"grad_norm": 1.270216941833496,
"learning_rate": 2.47648543166272e-05,
"loss": 0.6685,
"step": 1680
},
{
"epoch": 5.043022820800599,
"grad_norm": 1.4066438674926758,
"learning_rate": 2.4647287976147946e-05,
"loss": 0.6722,
"step": 1685
},
{
"epoch": 5.0579872802095025,
"grad_norm": 1.3440229892730713,
"learning_rate": 2.452972943695621e-05,
"loss": 0.7271,
"step": 1690
},
{
"epoch": 5.072951739618406,
"grad_norm": 1.1897931098937988,
"learning_rate": 2.441218129921278e-05,
"loss": 0.6775,
"step": 1695
},
{
"epoch": 5.08791619902731,
"grad_norm": 1.2431669235229492,
"learning_rate": 2.4294646162848354e-05,
"loss": 0.7324,
"step": 1700
},
{
"epoch": 5.102880658436214,
"grad_norm": 1.4123824834823608,
"learning_rate": 2.4177126627506067e-05,
"loss": 0.7041,
"step": 1705
},
{
"epoch": 5.117845117845118,
"grad_norm": 1.3087615966796875,
"learning_rate": 2.405962529248399e-05,
"loss": 0.6902,
"step": 1710
},
{
"epoch": 5.132809577254021,
"grad_norm": 1.1675366163253784,
"learning_rate": 2.394214475667767e-05,
"loss": 0.7462,
"step": 1715
},
{
"epoch": 5.147774036662925,
"grad_norm": 1.1870967149734497,
"learning_rate": 2.3824687618522567e-05,
"loss": 0.7482,
"step": 1720
},
{
"epoch": 5.162738496071829,
"grad_norm": 1.1886534690856934,
"learning_rate": 2.370725647593666e-05,
"loss": 0.7026,
"step": 1725
},
{
"epoch": 5.177702955480734,
"grad_norm": 1.3220059871673584,
"learning_rate": 2.3589853926262977e-05,
"loss": 0.681,
"step": 1730
},
{
"epoch": 5.1926674148896375,
"grad_norm": 1.2325706481933594,
"learning_rate": 2.3472482566212093e-05,
"loss": 0.7101,
"step": 1735
},
{
"epoch": 5.207631874298541,
"grad_norm": 1.1882089376449585,
"learning_rate": 2.3355144991804735e-05,
"loss": 0.6857,
"step": 1740
},
{
"epoch": 5.222596333707445,
"grad_norm": 1.3109657764434814,
"learning_rate": 2.323784379831438e-05,
"loss": 0.7127,
"step": 1745
},
{
"epoch": 5.237560793116349,
"grad_norm": 1.1432446241378784,
"learning_rate": 2.3120581580209808e-05,
"loss": 0.6823,
"step": 1750
},
{
"epoch": 5.252525252525253,
"grad_norm": 1.3307565450668335,
"learning_rate": 2.3003360931097757e-05,
"loss": 0.7118,
"step": 1755
},
{
"epoch": 5.267489711934156,
"grad_norm": 1.6253339052200317,
"learning_rate": 2.2886184443665525e-05,
"loss": 0.7521,
"step": 1760
},
{
"epoch": 5.28245417134306,
"grad_norm": 1.3010215759277344,
"learning_rate": 2.2769054709623674e-05,
"loss": 0.7331,
"step": 1765
},
{
"epoch": 5.297418630751964,
"grad_norm": 1.2219674587249756,
"learning_rate": 2.2651974319648643e-05,
"loss": 0.7031,
"step": 1770
},
{
"epoch": 5.312383090160868,
"grad_norm": 1.2299708127975464,
"learning_rate": 2.2534945863325487e-05,
"loss": 0.6622,
"step": 1775
},
{
"epoch": 5.3273475495697715,
"grad_norm": 1.1474329233169556,
"learning_rate": 2.241797192909059e-05,
"loss": 0.6662,
"step": 1780
},
{
"epoch": 5.342312008978675,
"grad_norm": 1.1639771461486816,
"learning_rate": 2.2301055104174433e-05,
"loss": 0.6913,
"step": 1785
},
{
"epoch": 5.357276468387579,
"grad_norm": 1.2043278217315674,
"learning_rate": 2.218419797454433e-05,
"loss": 0.6777,
"step": 1790
},
{
"epoch": 5.372240927796484,
"grad_norm": 1.2802300453186035,
"learning_rate": 2.206740312484726e-05,
"loss": 0.6608,
"step": 1795
},
{
"epoch": 5.3872053872053876,
"grad_norm": 1.2886018753051758,
"learning_rate": 2.19506731383527e-05,
"loss": 0.6696,
"step": 1800
},
{
"epoch": 5.402169846614291,
"grad_norm": 1.6271384954452515,
"learning_rate": 2.1834010596895487e-05,
"loss": 0.7117,
"step": 1805
},
{
"epoch": 5.417134306023195,
"grad_norm": 1.3303827047348022,
"learning_rate": 2.1717418080818696e-05,
"loss": 0.6851,
"step": 1810
},
{
"epoch": 5.432098765432099,
"grad_norm": 1.3058645725250244,
"learning_rate": 2.1600898168916584e-05,
"loss": 0.7386,
"step": 1815
},
{
"epoch": 5.447063224841003,
"grad_norm": 1.3986623287200928,
"learning_rate": 2.148445343837755e-05,
"loss": 0.6995,
"step": 1820
},
{
"epoch": 5.4620276842499065,
"grad_norm": 1.2918411493301392,
"learning_rate": 2.1368086464727125e-05,
"loss": 0.6936,
"step": 1825
},
{
"epoch": 5.47699214365881,
"grad_norm": 1.1513465642929077,
"learning_rate": 2.1251799821771012e-05,
"loss": 0.7228,
"step": 1830
},
{
"epoch": 5.491956603067714,
"grad_norm": 1.233217716217041,
"learning_rate": 2.1135596081538184e-05,
"loss": 0.77,
"step": 1835
},
{
"epoch": 5.506921062476618,
"grad_norm": 1.2311054468154907,
"learning_rate": 2.1019477814223944e-05,
"loss": 0.6844,
"step": 1840
},
{
"epoch": 5.521885521885522,
"grad_norm": 1.3642069101333618,
"learning_rate": 2.09034475881331e-05,
"loss": 0.7025,
"step": 1845
},
{
"epoch": 5.536849981294425,
"grad_norm": 1.311928391456604,
"learning_rate": 2.0787507969623192e-05,
"loss": 0.6874,
"step": 1850
},
{
"epoch": 5.55181444070333,
"grad_norm": 1.2631707191467285,
"learning_rate": 2.0671661523047663e-05,
"loss": 0.7446,
"step": 1855
},
{
"epoch": 5.566778900112233,
"grad_norm": 1.1697314977645874,
"learning_rate": 2.0555910810699223e-05,
"loss": 0.7386,
"step": 1860
},
{
"epoch": 5.581743359521138,
"grad_norm": 1.2585618495941162,
"learning_rate": 2.0440258392753084e-05,
"loss": 0.7292,
"step": 1865
},
{
"epoch": 5.596707818930041,
"grad_norm": 1.357924461364746,
"learning_rate": 2.032470682721042e-05,
"loss": 0.7167,
"step": 1870
},
{
"epoch": 5.611672278338945,
"grad_norm": 1.0884181261062622,
"learning_rate": 2.0209258669841737e-05,
"loss": 0.7249,
"step": 1875
},
{
"epoch": 5.626636737747849,
"grad_norm": 1.3644371032714844,
"learning_rate": 2.0093916474130353e-05,
"loss": 0.7436,
"step": 1880
},
{
"epoch": 5.641601197156753,
"grad_norm": 1.1837425231933594,
"learning_rate": 1.997868279121593e-05,
"loss": 0.6922,
"step": 1885
},
{
"epoch": 5.656565656565657,
"grad_norm": 1.3669867515563965,
"learning_rate": 1.9863560169838042e-05,
"loss": 0.7689,
"step": 1890
},
{
"epoch": 5.67153011597456,
"grad_norm": 1.3488072156906128,
"learning_rate": 1.97485511562798e-05,
"loss": 0.7074,
"step": 1895
},
{
"epoch": 5.686494575383464,
"grad_norm": 1.1839897632598877,
"learning_rate": 1.9633658294311535e-05,
"loss": 0.7115,
"step": 1900
},
{
"epoch": 5.701459034792368,
"grad_norm": 1.3153859376907349,
"learning_rate": 1.9518884125134556e-05,
"loss": 0.723,
"step": 1905
},
{
"epoch": 5.716423494201272,
"grad_norm": 1.2922106981277466,
"learning_rate": 1.9404231187324902e-05,
"loss": 0.6543,
"step": 1910
},
{
"epoch": 5.7313879536101755,
"grad_norm": 1.3643290996551514,
"learning_rate": 1.928970201677722e-05,
"loss": 0.7399,
"step": 1915
},
{
"epoch": 5.74635241301908,
"grad_norm": 1.188324213027954,
"learning_rate": 1.9175299146648674e-05,
"loss": 0.6795,
"step": 1920
},
{
"epoch": 5.761316872427983,
"grad_norm": 1.4890059232711792,
"learning_rate": 1.906102510730291e-05,
"loss": 0.721,
"step": 1925
},
{
"epoch": 5.776281331836888,
"grad_norm": 1.4943420886993408,
"learning_rate": 1.8946882426254105e-05,
"loss": 0.6991,
"step": 1930
},
{
"epoch": 5.7912457912457915,
"grad_norm": 1.2924257516860962,
"learning_rate": 1.8832873628111038e-05,
"loss": 0.7136,
"step": 1935
},
{
"epoch": 5.806210250654695,
"grad_norm": 1.3031319379806519,
"learning_rate": 1.8719001234521283e-05,
"loss": 0.6695,
"step": 1940
},
{
"epoch": 5.821174710063599,
"grad_norm": 1.2206610441207886,
"learning_rate": 1.860526776411539e-05,
"loss": 0.6473,
"step": 1945
},
{
"epoch": 5.836139169472503,
"grad_norm": 1.173349142074585,
"learning_rate": 1.849167573245123e-05,
"loss": 0.6412,
"step": 1950
},
{
"epoch": 5.851103628881407,
"grad_norm": 1.5744128227233887,
"learning_rate": 1.8378227651958326e-05,
"loss": 0.6956,
"step": 1955
},
{
"epoch": 5.8660680882903105,
"grad_norm": 1.170933723449707,
"learning_rate": 1.8264926031882272e-05,
"loss": 0.7798,
"step": 1960
},
{
"epoch": 5.881032547699214,
"grad_norm": 1.5066628456115723,
"learning_rate": 1.8151773378229265e-05,
"loss": 0.7011,
"step": 1965
},
{
"epoch": 5.895997007108118,
"grad_norm": 1.2198915481567383,
"learning_rate": 1.8038772193710646e-05,
"loss": 0.724,
"step": 1970
},
{
"epoch": 5.910961466517022,
"grad_norm": 1.227023959159851,
"learning_rate": 1.792592497768759e-05,
"loss": 0.6702,
"step": 1975
},
{
"epoch": 5.925925925925926,
"grad_norm": 1.3417410850524902,
"learning_rate": 1.7813234226115764e-05,
"loss": 0.747,
"step": 1980
},
{
"epoch": 5.94089038533483,
"grad_norm": 1.3337069749832153,
"learning_rate": 1.7700702431490174e-05,
"loss": 0.669,
"step": 1985
},
{
"epoch": 5.955854844743733,
"grad_norm": 1.2036738395690918,
"learning_rate": 1.7588332082789993e-05,
"loss": 0.7339,
"step": 1990
},
{
"epoch": 5.970819304152638,
"grad_norm": 1.1622107028961182,
"learning_rate": 1.747612566542356e-05,
"loss": 0.6925,
"step": 1995
},
{
"epoch": 5.985783763561542,
"grad_norm": 1.3639973402023315,
"learning_rate": 1.7364085661173347e-05,
"loss": 0.6798,
"step": 2000
},
{
"epoch": 6.000748222970445,
"grad_norm": 1.2021132707595825,
"learning_rate": 1.725221454814112e-05,
"loss": 0.7133,
"step": 2005
},
{
"epoch": 6.015712682379349,
"grad_norm": 1.4045711755752563,
"learning_rate": 1.7140514800693124e-05,
"loss": 0.6953,
"step": 2010
},
{
"epoch": 6.030677141788253,
"grad_norm": 1.2548061609268188,
"learning_rate": 1.7028988889405296e-05,
"loss": 0.6381,
"step": 2015
},
{
"epoch": 6.045641601197157,
"grad_norm": 1.1166868209838867,
"learning_rate": 1.69176392810087e-05,
"loss": 0.7127,
"step": 2020
},
{
"epoch": 6.0606060606060606,
"grad_norm": 1.2931350469589233,
"learning_rate": 1.6806468438334917e-05,
"loss": 0.7081,
"step": 2025
},
{
"epoch": 6.075570520014964,
"grad_norm": 1.365538239479065,
"learning_rate": 1.6695478820261573e-05,
"loss": 0.6766,
"step": 2030
},
{
"epoch": 6.090534979423868,
"grad_norm": 1.4035921096801758,
"learning_rate": 1.658467288165799e-05,
"loss": 0.6857,
"step": 2035
},
{
"epoch": 6.105499438832772,
"grad_norm": 1.0855042934417725,
"learning_rate": 1.647405307333085e-05,
"loss": 0.7685,
"step": 2040
},
{
"epoch": 6.120463898241676,
"grad_norm": 1.4982078075408936,
"learning_rate": 1.6363621841970022e-05,
"loss": 0.7044,
"step": 2045
},
{
"epoch": 6.1354283576505795,
"grad_norm": 1.233553171157837,
"learning_rate": 1.625338163009441e-05,
"loss": 0.6415,
"step": 2050
},
{
"epoch": 6.150392817059484,
"grad_norm": 2.476423978805542,
"learning_rate": 1.6143334875997952e-05,
"loss": 0.7047,
"step": 2055
},
{
"epoch": 6.165357276468388,
"grad_norm": 1.2853014469146729,
"learning_rate": 1.6033484013695687e-05,
"loss": 0.7164,
"step": 2060
},
{
"epoch": 6.180321735877292,
"grad_norm": 1.376776933670044,
"learning_rate": 1.5923831472869915e-05,
"loss": 0.6773,
"step": 2065
},
{
"epoch": 6.1952861952861955,
"grad_norm": 1.2735328674316406,
"learning_rate": 1.581437967881647e-05,
"loss": 0.6457,
"step": 2070
},
{
"epoch": 6.210250654695099,
"grad_norm": 1.3325200080871582,
"learning_rate": 1.5705131052391042e-05,
"loss": 0.7297,
"step": 2075
},
{
"epoch": 6.225215114104003,
"grad_norm": 1.1959949731826782,
"learning_rate": 1.5596088009955695e-05,
"loss": 0.7535,
"step": 2080
},
{
"epoch": 6.240179573512907,
"grad_norm": 1.307750940322876,
"learning_rate": 1.5487252963325362e-05,
"loss": 0.7605,
"step": 2085
},
{
"epoch": 6.255144032921811,
"grad_norm": 1.3463622331619263,
"learning_rate": 1.5378628319714512e-05,
"loss": 0.7251,
"step": 2090
},
{
"epoch": 6.270108492330714,
"grad_norm": 1.2366999387741089,
"learning_rate": 1.5270216481683953e-05,
"loss": 0.6835,
"step": 2095
},
{
"epoch": 6.285072951739618,
"grad_norm": 1.2593817710876465,
"learning_rate": 1.5162019847087617e-05,
"loss": 0.6598,
"step": 2100
},
{
"epoch": 6.300037411148522,
"grad_norm": 1.3024280071258545,
"learning_rate": 1.5054040809019584e-05,
"loss": 0.6683,
"step": 2105
},
{
"epoch": 6.315001870557426,
"grad_norm": 1.4586106538772583,
"learning_rate": 1.4946281755761152e-05,
"loss": 0.6762,
"step": 2110
},
{
"epoch": 6.32996632996633,
"grad_norm": 1.338810920715332,
"learning_rate": 1.4838745070727958e-05,
"loss": 0.6821,
"step": 2115
},
{
"epoch": 6.344930789375233,
"grad_norm": 1.425808310508728,
"learning_rate": 1.4731433132417316e-05,
"loss": 0.6303,
"step": 2120
},
{
"epoch": 6.359895248784138,
"grad_norm": 1.1587165594100952,
"learning_rate": 1.4624348314355585e-05,
"loss": 0.6306,
"step": 2125
},
{
"epoch": 6.374859708193042,
"grad_norm": 1.3677455186843872,
"learning_rate": 1.4517492985045678e-05,
"loss": 0.7352,
"step": 2130
},
{
"epoch": 6.389824167601946,
"grad_norm": 1.4579230546951294,
"learning_rate": 1.4410869507914669e-05,
"loss": 0.6911,
"step": 2135
},
{
"epoch": 6.404788627010849,
"grad_norm": 1.3865454196929932,
"learning_rate": 1.4304480241261528e-05,
"loss": 0.6651,
"step": 2140
},
{
"epoch": 6.419753086419753,
"grad_norm": 1.1365728378295898,
"learning_rate": 1.4198327538204961e-05,
"loss": 0.6779,
"step": 2145
},
{
"epoch": 6.434717545828657,
"grad_norm": 1.271693229675293,
"learning_rate": 1.409241374663136e-05,
"loss": 0.7289,
"step": 2150
},
{
"epoch": 6.449682005237561,
"grad_norm": 1.314024567604065,
"learning_rate": 1.3986741209142845e-05,
"loss": 0.6656,
"step": 2155
},
{
"epoch": 6.4646464646464645,
"grad_norm": 1.2013462781906128,
"learning_rate": 1.3881312263005519e-05,
"loss": 0.6836,
"step": 2160
},
{
"epoch": 6.479610924055368,
"grad_norm": 1.332503080368042,
"learning_rate": 1.3776129240097673e-05,
"loss": 0.7178,
"step": 2165
},
{
"epoch": 6.494575383464272,
"grad_norm": 1.4150094985961914,
"learning_rate": 1.3671194466858334e-05,
"loss": 0.6895,
"step": 2170
},
{
"epoch": 6.509539842873176,
"grad_norm": 1.3232195377349854,
"learning_rate": 1.356651026423566e-05,
"loss": 0.7292,
"step": 2175
},
{
"epoch": 6.524504302282081,
"grad_norm": 1.324210286140442,
"learning_rate": 1.3462078947635781e-05,
"loss": 0.756,
"step": 2180
},
{
"epoch": 6.5394687616909835,
"grad_norm": 1.2665998935699463,
"learning_rate": 1.335790282687141e-05,
"loss": 0.6959,
"step": 2185
},
{
"epoch": 6.554433221099888,
"grad_norm": 1.1720548868179321,
"learning_rate": 1.325398420611088e-05,
"loss": 0.7918,
"step": 2190
},
{
"epoch": 6.569397680508792,
"grad_norm": 1.0761444568634033,
"learning_rate": 1.3150325383827117e-05,
"loss": 0.679,
"step": 2195
},
{
"epoch": 6.584362139917696,
"grad_norm": 1.4445922374725342,
"learning_rate": 1.3046928652746832e-05,
"loss": 0.802,
"step": 2200
},
{
"epoch": 6.5993265993265995,
"grad_norm": 1.2890619039535522,
"learning_rate": 1.2943796299799809e-05,
"loss": 0.747,
"step": 2205
},
{
"epoch": 6.614291058735503,
"grad_norm": 1.3807190656661987,
"learning_rate": 1.2840930606068289e-05,
"loss": 0.6693,
"step": 2210
},
{
"epoch": 6.629255518144407,
"grad_norm": 1.4410628080368042,
"learning_rate": 1.273833384673656e-05,
"loss": 0.7011,
"step": 2215
},
{
"epoch": 6.644219977553311,
"grad_norm": 1.255650520324707,
"learning_rate": 1.2636008291040618e-05,
"loss": 0.7627,
"step": 2220
},
{
"epoch": 6.659184436962215,
"grad_norm": 1.2652361392974854,
"learning_rate": 1.2533956202217975e-05,
"loss": 0.6859,
"step": 2225
},
{
"epoch": 6.674148896371118,
"grad_norm": 1.2963732481002808,
"learning_rate": 1.243217983745758e-05,
"loss": 0.7204,
"step": 2230
},
{
"epoch": 6.689113355780022,
"grad_norm": 1.4088592529296875,
"learning_rate": 1.2330681447849951e-05,
"loss": 0.6392,
"step": 2235
},
{
"epoch": 6.704077815188926,
"grad_norm": 1.3027905225753784,
"learning_rate": 1.2229463278337308e-05,
"loss": 0.7128,
"step": 2240
},
{
"epoch": 6.71904227459783,
"grad_norm": 1.2761296033859253,
"learning_rate": 1.2128527567663988e-05,
"loss": 0.7145,
"step": 2245
},
{
"epoch": 6.7340067340067336,
"grad_norm": 1.4830342531204224,
"learning_rate": 1.2027876548326897e-05,
"loss": 0.6784,
"step": 2250
},
{
"epoch": 6.748971193415638,
"grad_norm": 1.2457510232925415,
"learning_rate": 1.1927512446526142e-05,
"loss": 0.6929,
"step": 2255
},
{
"epoch": 6.763935652824542,
"grad_norm": 1.4039334058761597,
"learning_rate": 1.1827437482115759e-05,
"loss": 0.7516,
"step": 2260
},
{
"epoch": 6.778900112233446,
"grad_norm": 1.3703151941299438,
"learning_rate": 1.172765386855467e-05,
"loss": 0.699,
"step": 2265
},
{
"epoch": 6.79386457164235,
"grad_norm": 1.3183362483978271,
"learning_rate": 1.1628163812857674e-05,
"loss": 0.7607,
"step": 2270
},
{
"epoch": 6.808829031051253,
"grad_norm": 1.2728744745254517,
"learning_rate": 1.1528969515546672e-05,
"loss": 0.6541,
"step": 2275
},
{
"epoch": 6.823793490460157,
"grad_norm": 1.2783997058868408,
"learning_rate": 1.1430073170601968e-05,
"loss": 0.684,
"step": 2280
},
{
"epoch": 6.838757949869061,
"grad_norm": 1.145731806755066,
"learning_rate": 1.1331476965413773e-05,
"loss": 0.7134,
"step": 2285
},
{
"epoch": 6.853722409277965,
"grad_norm": 1.3381609916687012,
"learning_rate": 1.1233183080733764e-05,
"loss": 0.7275,
"step": 2290
},
{
"epoch": 6.8686868686868685,
"grad_norm": 1.2908689975738525,
"learning_rate": 1.1135193690626925e-05,
"loss": 0.6796,
"step": 2295
},
{
"epoch": 6.883651328095772,
"grad_norm": 1.5330723524093628,
"learning_rate": 1.1037510962423425e-05,
"loss": 0.674,
"step": 2300
},
{
"epoch": 6.898615787504676,
"grad_norm": 1.3555113077163696,
"learning_rate": 1.0940137056670655e-05,
"loss": 0.6678,
"step": 2305
},
{
"epoch": 6.91358024691358,
"grad_norm": 1.2070436477661133,
"learning_rate": 1.0843074127085507e-05,
"loss": 0.6954,
"step": 2310
},
{
"epoch": 6.928544706322484,
"grad_norm": 1.4584565162658691,
"learning_rate": 1.074632432050665e-05,
"loss": 0.6517,
"step": 2315
},
{
"epoch": 6.943509165731388,
"grad_norm": 1.2838579416275024,
"learning_rate": 1.0649889776847161e-05,
"loss": 0.6424,
"step": 2320
},
{
"epoch": 6.958473625140292,
"grad_norm": 1.2093007564544678,
"learning_rate": 1.0553772629047067e-05,
"loss": 0.7396,
"step": 2325
},
{
"epoch": 6.973438084549196,
"grad_norm": 1.5044478178024292,
"learning_rate": 1.0457975003026276e-05,
"loss": 0.6806,
"step": 2330
},
{
"epoch": 6.9884025439581,
"grad_norm": 1.2098227739334106,
"learning_rate": 1.0362499017637472e-05,
"loss": 0.6835,
"step": 2335
},
{
"epoch": 7.0033670033670035,
"grad_norm": 1.259406566619873,
"learning_rate": 1.0267346784619324e-05,
"loss": 0.6672,
"step": 2340
},
{
"epoch": 7.018331462775907,
"grad_norm": 1.2552211284637451,
"learning_rate": 1.0172520408549716e-05,
"loss": 0.6341,
"step": 2345
},
{
"epoch": 7.033295922184811,
"grad_norm": 1.3525948524475098,
"learning_rate": 1.0078021986799238e-05,
"loss": 0.6665,
"step": 2350
},
{
"epoch": 7.048260381593715,
"grad_norm": 1.2309094667434692,
"learning_rate": 9.983853609484786e-06,
"loss": 0.6903,
"step": 2355
},
{
"epoch": 7.063224841002619,
"grad_norm": 1.2575538158416748,
"learning_rate": 9.890017359423325e-06,
"loss": 0.7205,
"step": 2360
},
{
"epoch": 7.078189300411522,
"grad_norm": 1.2174732685089111,
"learning_rate": 9.796515312085841e-06,
"loss": 0.6929,
"step": 2365
},
{
"epoch": 7.093153759820426,
"grad_norm": 1.4941829442977905,
"learning_rate": 9.703349535551387e-06,
"loss": 0.6346,
"step": 2370
},
{
"epoch": 7.10811821922933,
"grad_norm": 1.3313934803009033,
"learning_rate": 9.610522090461415e-06,
"loss": 0.6626,
"step": 2375
},
{
"epoch": 7.123082678638234,
"grad_norm": 1.1870646476745605,
"learning_rate": 9.518035029974126e-06,
"loss": 0.6738,
"step": 2380
},
{
"epoch": 7.138047138047138,
"grad_norm": 1.376810073852539,
"learning_rate": 9.425890399719115e-06,
"loss": 0.657,
"step": 2385
},
{
"epoch": 7.153011597456042,
"grad_norm": 1.2887132167816162,
"learning_rate": 9.334090237752094e-06,
"loss": 0.712,
"step": 2390
},
{
"epoch": 7.167976056864946,
"grad_norm": 1.4136420488357544,
"learning_rate": 9.242636574509828e-06,
"loss": 0.7623,
"step": 2395
},
{
"epoch": 7.18294051627385,
"grad_norm": 1.2454450130462646,
"learning_rate": 9.151531432765203e-06,
"loss": 0.7891,
"step": 2400
},
{
"epoch": 7.197904975682754,
"grad_norm": 1.3656915426254272,
"learning_rate": 9.060776827582529e-06,
"loss": 0.6479,
"step": 2405
},
{
"epoch": 7.212869435091657,
"grad_norm": 1.3422670364379883,
"learning_rate": 8.970374766272915e-06,
"loss": 0.7534,
"step": 2410
},
{
"epoch": 7.227833894500561,
"grad_norm": 1.4018194675445557,
"learning_rate": 8.880327248349937e-06,
"loss": 0.679,
"step": 2415
},
{
"epoch": 7.242798353909465,
"grad_norm": 1.4204267263412476,
"learning_rate": 8.790636265485334e-06,
"loss": 0.6811,
"step": 2420
},
{
"epoch": 7.257762813318369,
"grad_norm": 1.3640581369400024,
"learning_rate": 8.701303801465052e-06,
"loss": 0.6518,
"step": 2425
},
{
"epoch": 7.2727272727272725,
"grad_norm": 1.255414366722107,
"learning_rate": 8.612331832145268e-06,
"loss": 0.6485,
"step": 2430
},
{
"epoch": 7.287691732136176,
"grad_norm": 1.3959693908691406,
"learning_rate": 8.523722325408758e-06,
"loss": 0.6528,
"step": 2435
},
{
"epoch": 7.30265619154508,
"grad_norm": 1.3679065704345703,
"learning_rate": 8.435477241121353e-06,
"loss": 0.6834,
"step": 2440
},
{
"epoch": 7.317620650953984,
"grad_norm": 1.1936756372451782,
"learning_rate": 8.347598531088554e-06,
"loss": 0.6883,
"step": 2445
},
{
"epoch": 7.3325851103628885,
"grad_norm": 1.3999428749084473,
"learning_rate": 8.260088139012435e-06,
"loss": 0.6906,
"step": 2450
},
{
"epoch": 7.347549569771792,
"grad_norm": 1.3568490743637085,
"learning_rate": 8.17294800044856e-06,
"loss": 0.7172,
"step": 2455
},
{
"epoch": 7.362514029180696,
"grad_norm": 1.362327218055725,
"learning_rate": 8.086180042763283e-06,
"loss": 0.6523,
"step": 2460
},
{
"epoch": 7.3774784885896,
"grad_norm": 1.2796952724456787,
"learning_rate": 7.999786185091008e-06,
"loss": 0.7196,
"step": 2465
},
{
"epoch": 7.392442947998504,
"grad_norm": 1.339594841003418,
"learning_rate": 7.913768338291821e-06,
"loss": 0.6475,
"step": 2470
},
{
"epoch": 7.407407407407407,
"grad_norm": 1.3105710744857788,
"learning_rate": 7.828128404909171e-06,
"loss": 0.6756,
"step": 2475
},
{
"epoch": 7.422371866816311,
"grad_norm": 1.3429076671600342,
"learning_rate": 7.742868279127848e-06,
"loss": 0.6886,
"step": 2480
},
{
"epoch": 7.437336326225215,
"grad_norm": 1.4829093217849731,
"learning_rate": 7.657989846732019e-06,
"loss": 0.6894,
"step": 2485
},
{
"epoch": 7.452300785634119,
"grad_norm": 1.4806331396102905,
"learning_rate": 7.573494985063579e-06,
"loss": 0.6653,
"step": 2490
},
{
"epoch": 7.467265245043023,
"grad_norm": 1.2165873050689697,
"learning_rate": 7.489385562980589e-06,
"loss": 0.7941,
"step": 2495
},
{
"epoch": 7.482229704451926,
"grad_norm": 1.4139281511306763,
"learning_rate": 7.4056634408159685e-06,
"loss": 0.689,
"step": 2500
},
{
"epoch": 7.49719416386083,
"grad_norm": 1.307259202003479,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.6626,
"step": 2505
},
{
"epoch": 7.512158623269734,
"grad_norm": 1.5060079097747803,
"learning_rate": 7.2393884947009745e-06,
"loss": 0.7061,
"step": 2510
},
{
"epoch": 7.527123082678639,
"grad_norm": 1.623346209526062,
"learning_rate": 7.156839348421279e-06,
"loss": 0.6958,
"step": 2515
},
{
"epoch": 7.542087542087542,
"grad_norm": 1.3768142461776733,
"learning_rate": 7.074684857319927e-06,
"loss": 0.7661,
"step": 2520
},
{
"epoch": 7.557052001496446,
"grad_norm": 1.7065874338150024,
"learning_rate": 6.992926838490657e-06,
"loss": 0.6989,
"step": 2525
},
{
"epoch": 7.57201646090535,
"grad_norm": 1.4630271196365356,
"learning_rate": 6.91156710025802e-06,
"loss": 0.761,
"step": 2530
},
{
"epoch": 7.586980920314254,
"grad_norm": 1.3342783451080322,
"learning_rate": 6.830607442137405e-06,
"loss": 0.6834,
"step": 2535
},
{
"epoch": 7.6019453797231575,
"grad_norm": 1.3920519351959229,
"learning_rate": 6.7500496547951984e-06,
"loss": 0.6939,
"step": 2540
},
{
"epoch": 7.616909839132061,
"grad_norm": 1.4310715198516846,
"learning_rate": 6.6698955200092396e-06,
"loss": 0.6789,
"step": 2545
},
{
"epoch": 7.631874298540965,
"grad_norm": 1.2729769945144653,
"learning_rate": 6.590146810629347e-06,
"loss": 0.6925,
"step": 2550
},
{
"epoch": 7.646838757949869,
"grad_norm": 1.2772436141967773,
"learning_rate": 6.510805290538158e-06,
"loss": 0.6714,
"step": 2555
},
{
"epoch": 7.661803217358773,
"grad_norm": 1.3461037874221802,
"learning_rate": 6.431872714612072e-06,
"loss": 0.6973,
"step": 2560
},
{
"epoch": 7.6767676767676765,
"grad_norm": 1.2915376424789429,
"learning_rate": 6.353350828682494e-06,
"loss": 0.6669,
"step": 2565
},
{
"epoch": 7.69173213617658,
"grad_norm": 1.287246584892273,
"learning_rate": 6.275241369497142e-06,
"loss": 0.7157,
"step": 2570
},
{
"epoch": 7.706696595585484,
"grad_norm": 1.4065686464309692,
"learning_rate": 6.197546064681714e-06,
"loss": 0.7474,
"step": 2575
},
{
"epoch": 7.721661054994389,
"grad_norm": 1.5173590183258057,
"learning_rate": 6.120266632701599e-06,
"loss": 0.6442,
"step": 2580
},
{
"epoch": 7.7366255144032925,
"grad_norm": 1.2145261764526367,
"learning_rate": 6.043404782823939e-06,
"loss": 0.6729,
"step": 2585
},
{
"epoch": 7.751589973812196,
"grad_norm": 1.3860505819320679,
"learning_rate": 5.966962215079786e-06,
"loss": 0.7085,
"step": 2590
},
{
"epoch": 7.7665544332211,
"grad_norm": 1.2852251529693604,
"learning_rate": 5.890940620226479e-06,
"loss": 0.6983,
"step": 2595
},
{
"epoch": 7.781518892630004,
"grad_norm": 1.2326298952102661,
"learning_rate": 5.815341679710326e-06,
"loss": 0.6758,
"step": 2600
},
{
"epoch": 7.796483352038908,
"grad_norm": 1.2480541467666626,
"learning_rate": 5.740167065629312e-06,
"loss": 0.6605,
"step": 2605
},
{
"epoch": 7.811447811447811,
"grad_norm": 1.2479559183120728,
"learning_rate": 5.665418440696202e-06,
"loss": 0.6348,
"step": 2610
},
{
"epoch": 7.826412270856715,
"grad_norm": 1.3373992443084717,
"learning_rate": 5.591097458201699e-06,
"loss": 0.746,
"step": 2615
},
{
"epoch": 7.841376730265619,
"grad_norm": 1.3737704753875732,
"learning_rate": 5.51720576197794e-06,
"loss": 0.6511,
"step": 2620
},
{
"epoch": 7.856341189674523,
"grad_norm": 1.3783513307571411,
"learning_rate": 5.443744986362071e-06,
"loss": 0.6767,
"step": 2625
},
{
"epoch": 7.871305649083427,
"grad_norm": 1.2600133419036865,
"learning_rate": 5.370716756160157e-06,
"loss": 0.6918,
"step": 2630
},
{
"epoch": 7.88627010849233,
"grad_norm": 1.254599928855896,
"learning_rate": 5.298122686611212e-06,
"loss": 0.7017,
"step": 2635
},
{
"epoch": 7.901234567901234,
"grad_norm": 1.2620840072631836,
"learning_rate": 5.2259643833514896e-06,
"loss": 0.7181,
"step": 2640
},
{
"epoch": 7.916199027310139,
"grad_norm": 1.2185419797897339,
"learning_rate": 5.154243442378934e-06,
"loss": 0.7121,
"step": 2645
},
{
"epoch": 7.931163486719043,
"grad_norm": 1.360809564590454,
"learning_rate": 5.082961450017943e-06,
"loss": 0.6642,
"step": 2650
},
{
"epoch": 7.946127946127946,
"grad_norm": 1.3635886907577515,
"learning_rate": 5.012119982884209e-06,
"loss": 0.7676,
"step": 2655
},
{
"epoch": 7.96109240553685,
"grad_norm": 1.37740159034729,
"learning_rate": 4.9417206078499115e-06,
"loss": 0.6912,
"step": 2660
},
{
"epoch": 7.976056864945754,
"grad_norm": 1.2868249416351318,
"learning_rate": 4.871764882009025e-06,
"loss": 0.6582,
"step": 2665
},
{
"epoch": 7.991021324354658,
"grad_norm": 1.4278684854507446,
"learning_rate": 4.802254352642882e-06,
"loss": 0.6806,
"step": 2670
},
{
"epoch": 8.005985783763562,
"grad_norm": 1.2541025876998901,
"learning_rate": 4.7331905571859705e-06,
"loss": 0.6896,
"step": 2675
},
{
"epoch": 8.020950243172466,
"grad_norm": 1.2635290622711182,
"learning_rate": 4.664575023191886e-06,
"loss": 0.6491,
"step": 2680
},
{
"epoch": 8.035914702581369,
"grad_norm": 1.266473412513733,
"learning_rate": 4.5964092682996065e-06,
"loss": 0.6457,
"step": 2685
},
{
"epoch": 8.050879161990274,
"grad_norm": 1.4658360481262207,
"learning_rate": 4.528694800199859e-06,
"loss": 0.673,
"step": 2690
},
{
"epoch": 8.065843621399177,
"grad_norm": 1.3015804290771484,
"learning_rate": 4.46143311660184e-06,
"loss": 0.661,
"step": 2695
},
{
"epoch": 8.080808080808081,
"grad_norm": 1.334692358970642,
"learning_rate": 4.394625705200011e-06,
"loss": 0.7065,
"step": 2700
},
{
"epoch": 8.095772540216984,
"grad_norm": 1.2139922380447388,
"learning_rate": 4.328274043641295e-06,
"loss": 0.7074,
"step": 2705
},
{
"epoch": 8.110736999625889,
"grad_norm": 1.2450037002563477,
"learning_rate": 4.262379599492283e-06,
"loss": 0.666,
"step": 2710
},
{
"epoch": 8.125701459034792,
"grad_norm": 1.3340483903884888,
"learning_rate": 4.196943830206859e-06,
"loss": 0.6469,
"step": 2715
},
{
"epoch": 8.140665918443696,
"grad_norm": 1.3370238542556763,
"learning_rate": 4.131968183093912e-06,
"loss": 0.6642,
"step": 2720
},
{
"epoch": 8.1556303778526,
"grad_norm": 1.2851170301437378,
"learning_rate": 4.067454095285362e-06,
"loss": 0.6602,
"step": 2725
},
{
"epoch": 8.170594837261504,
"grad_norm": 1.5661766529083252,
"learning_rate": 4.003402993704353e-06,
"loss": 0.6465,
"step": 2730
},
{
"epoch": 8.185559296670407,
"grad_norm": 1.2045555114746094,
"learning_rate": 3.939816295033677e-06,
"loss": 0.6823,
"step": 2735
},
{
"epoch": 8.200523756079312,
"grad_norm": 1.3167060613632202,
"learning_rate": 3.8766954056844855e-06,
"loss": 0.7163,
"step": 2740
},
{
"epoch": 8.215488215488216,
"grad_norm": 1.3332468271255493,
"learning_rate": 3.8140417217651438e-06,
"loss": 0.7558,
"step": 2745
},
{
"epoch": 8.23045267489712,
"grad_norm": 1.344228744506836,
"learning_rate": 3.7518566290503626e-06,
"loss": 0.7451,
"step": 2750
},
{
"epoch": 8.245417134306024,
"grad_norm": 1.346323847770691,
"learning_rate": 3.690141502950542e-06,
"loss": 0.6998,
"step": 2755
},
{
"epoch": 8.260381593714927,
"grad_norm": 1.3617771863937378,
"learning_rate": 3.6288977084813767e-06,
"loss": 0.6885,
"step": 2760
},
{
"epoch": 8.275346053123831,
"grad_norm": 1.2529648542404175,
"learning_rate": 3.568126600233615e-06,
"loss": 0.6851,
"step": 2765
},
{
"epoch": 8.290310512532734,
"grad_norm": 1.4627494812011719,
"learning_rate": 3.5078295223431536e-06,
"loss": 0.7307,
"step": 2770
},
{
"epoch": 8.305274971941639,
"grad_norm": 1.3447396755218506,
"learning_rate": 3.4480078084612677e-06,
"loss": 0.6878,
"step": 2775
},
{
"epoch": 8.320239431350542,
"grad_norm": 1.2098687887191772,
"learning_rate": 3.388662781725141e-06,
"loss": 0.6968,
"step": 2780
},
{
"epoch": 8.335203890759447,
"grad_norm": 1.2697949409484863,
"learning_rate": 3.3297957547285626e-06,
"loss": 0.7097,
"step": 2785
},
{
"epoch": 8.35016835016835,
"grad_norm": 1.344548225402832,
"learning_rate": 3.2714080294929477e-06,
"loss": 0.6899,
"step": 2790
},
{
"epoch": 8.365132809577254,
"grad_norm": 1.283050298690796,
"learning_rate": 3.2135008974384874e-06,
"loss": 0.611,
"step": 2795
},
{
"epoch": 8.380097268986157,
"grad_norm": 1.4077322483062744,
"learning_rate": 3.1560756393556183e-06,
"loss": 0.6673,
"step": 2800
},
{
"epoch": 8.395061728395062,
"grad_norm": 1.4759045839309692,
"learning_rate": 3.0991335253766934e-06,
"loss": 0.7485,
"step": 2805
},
{
"epoch": 8.410026187803966,
"grad_norm": 1.3058305978775024,
"learning_rate": 3.042675814947868e-06,
"loss": 0.6873,
"step": 2810
},
{
"epoch": 8.42499064721287,
"grad_norm": 1.3055214881896973,
"learning_rate": 2.986703756801257e-06,
"loss": 0.7064,
"step": 2815
},
{
"epoch": 8.439955106621774,
"grad_norm": 1.2436131238937378,
"learning_rate": 2.931218588927315e-06,
"loss": 0.6871,
"step": 2820
},
{
"epoch": 8.454919566030677,
"grad_norm": 1.5080686807632446,
"learning_rate": 2.8762215385474633e-06,
"loss": 0.7363,
"step": 2825
},
{
"epoch": 8.469884025439582,
"grad_norm": 1.3684037923812866,
"learning_rate": 2.8217138220869187e-06,
"loss": 0.6719,
"step": 2830
},
{
"epoch": 8.484848484848484,
"grad_norm": 1.3375248908996582,
"learning_rate": 2.7676966451478214e-06,
"loss": 0.6715,
"step": 2835
},
{
"epoch": 8.499812944257389,
"grad_norm": 1.4447715282440186,
"learning_rate": 2.714171202482538e-06,
"loss": 0.6697,
"step": 2840
},
{
"epoch": 8.514777403666292,
"grad_norm": 1.4097157716751099,
"learning_rate": 2.661138677967279e-06,
"loss": 0.7199,
"step": 2845
},
{
"epoch": 8.529741863075197,
"grad_norm": 1.4371775388717651,
"learning_rate": 2.6086002445758566e-06,
"loss": 0.681,
"step": 2850
},
{
"epoch": 8.5447063224841,
"grad_norm": 1.353463053703308,
"learning_rate": 2.5565570643537954e-06,
"loss": 0.6461,
"step": 2855
},
{
"epoch": 8.559670781893004,
"grad_norm": 1.2656768560409546,
"learning_rate": 2.505010288392587e-06,
"loss": 0.723,
"step": 2860
},
{
"epoch": 8.574635241301909,
"grad_norm": 1.3458527326583862,
"learning_rate": 2.4539610568042657e-06,
"loss": 0.6481,
"step": 2865
},
{
"epoch": 8.589599700710812,
"grad_norm": 1.4183650016784668,
"learning_rate": 2.4034104986961627e-06,
"loss": 0.7229,
"step": 2870
},
{
"epoch": 8.604564160119716,
"grad_norm": 1.3535906076431274,
"learning_rate": 2.3533597321459516e-06,
"loss": 0.6762,
"step": 2875
},
{
"epoch": 8.61952861952862,
"grad_norm": 1.4276947975158691,
"learning_rate": 2.303809864176909e-06,
"loss": 0.6379,
"step": 2880
},
{
"epoch": 8.634493078937524,
"grad_norm": 1.312292218208313,
"learning_rate": 2.254761990733445e-06,
"loss": 0.6753,
"step": 2885
},
{
"epoch": 8.649457538346427,
"grad_norm": 1.3349074125289917,
"learning_rate": 2.206217196656826e-06,
"loss": 0.7395,
"step": 2890
},
{
"epoch": 8.664421997755332,
"grad_norm": 1.367660403251648,
"learning_rate": 2.1581765556612233e-06,
"loss": 0.7564,
"step": 2895
},
{
"epoch": 8.679386457164235,
"grad_norm": 1.302215337753296,
"learning_rate": 2.1106411303099455e-06,
"loss": 0.6862,
"step": 2900
},
{
"epoch": 8.69435091657314,
"grad_norm": 1.2132118940353394,
"learning_rate": 2.0636119719919246e-06,
"loss": 0.7351,
"step": 2905
},
{
"epoch": 8.709315375982042,
"grad_norm": 1.4168857336044312,
"learning_rate": 2.017090120898485e-06,
"loss": 0.6748,
"step": 2910
},
{
"epoch": 8.724279835390947,
"grad_norm": 1.5280455350875854,
"learning_rate": 1.971076606000327e-06,
"loss": 0.6935,
"step": 2915
},
{
"epoch": 8.73924429479985,
"grad_norm": 1.440262794494629,
"learning_rate": 1.9255724450247674e-06,
"loss": 0.6629,
"step": 2920
},
{
"epoch": 8.754208754208754,
"grad_norm": 1.36149263381958,
"learning_rate": 1.8805786444332092e-06,
"loss": 0.6644,
"step": 2925
},
{
"epoch": 8.769173213617659,
"grad_norm": 1.3209813833236694,
"learning_rate": 1.836096199398929e-06,
"loss": 0.6469,
"step": 2930
},
{
"epoch": 8.784137673026562,
"grad_norm": 1.3598469495773315,
"learning_rate": 1.7921260937850099e-06,
"loss": 0.646,
"step": 2935
},
{
"epoch": 8.799102132435467,
"grad_norm": 1.2802332639694214,
"learning_rate": 1.7486693001226268e-06,
"loss": 0.7487,
"step": 2940
},
{
"epoch": 8.81406659184437,
"grad_norm": 1.3110156059265137,
"learning_rate": 1.7057267795895115e-06,
"loss": 0.702,
"step": 2945
},
{
"epoch": 8.829031051253274,
"grad_norm": 1.324245572090149,
"learning_rate": 1.6632994819886977e-06,
"loss": 0.6807,
"step": 2950
},
{
"epoch": 8.843995510662177,
"grad_norm": 1.2745212316513062,
"learning_rate": 1.6213883457275065e-06,
"loss": 0.6846,
"step": 2955
},
{
"epoch": 8.858959970071082,
"grad_norm": 1.4197077751159668,
"learning_rate": 1.579994297796808e-06,
"loss": 0.7325,
"step": 2960
},
{
"epoch": 8.873924429479985,
"grad_norm": 1.3314228057861328,
"learning_rate": 1.5391182537505072e-06,
"loss": 0.6899,
"step": 2965
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.3566371202468872,
"learning_rate": 1.4987611176852878e-06,
"loss": 0.6596,
"step": 2970
},
{
"epoch": 8.903853348297792,
"grad_norm": 1.3632760047912598,
"learning_rate": 1.4589237822206282e-06,
"loss": 0.7111,
"step": 2975
},
{
"epoch": 8.918817807706697,
"grad_norm": 1.4764022827148438,
"learning_rate": 1.419607128479053e-06,
"loss": 0.7168,
"step": 2980
},
{
"epoch": 8.9337822671156,
"grad_norm": 1.1871962547302246,
"learning_rate": 1.3808120260666441e-06,
"loss": 0.7182,
"step": 2985
},
{
"epoch": 8.948746726524504,
"grad_norm": 1.2561469078063965,
"learning_rate": 1.3425393330538022e-06,
"loss": 0.6455,
"step": 2990
},
{
"epoch": 8.963711185933409,
"grad_norm": 1.4918162822723389,
"learning_rate": 1.3047898959562765e-06,
"loss": 0.7042,
"step": 2995
},
{
"epoch": 8.978675645342312,
"grad_norm": 1.3534742593765259,
"learning_rate": 1.267564549716435e-06,
"loss": 0.6742,
"step": 3000
},
{
"epoch": 8.993640104751217,
"grad_norm": 1.4959015846252441,
"learning_rate": 1.2308641176848046e-06,
"loss": 0.6838,
"step": 3005
},
{
"epoch": 9.00860456416012,
"grad_norm": 1.309097409248352,
"learning_rate": 1.1946894116018404e-06,
"loss": 0.6411,
"step": 3010
},
{
"epoch": 9.023569023569024,
"grad_norm": 1.3958250284194946,
"learning_rate": 1.159041231580016e-06,
"loss": 0.7136,
"step": 3015
},
{
"epoch": 9.038533482977927,
"grad_norm": 1.307607650756836,
"learning_rate": 1.1239203660860648e-06,
"loss": 0.7436,
"step": 3020
},
{
"epoch": 9.053497942386832,
"grad_norm": 1.273493766784668,
"learning_rate": 1.0893275919235945e-06,
"loss": 0.7149,
"step": 3025
},
{
"epoch": 9.068462401795735,
"grad_norm": 1.4512149095535278,
"learning_rate": 1.05526367421587e-06,
"loss": 0.7207,
"step": 3030
},
{
"epoch": 9.08342686120464,
"grad_norm": 1.3597697019577026,
"learning_rate": 1.0217293663889155e-06,
"loss": 0.6602,
"step": 3035
},
{
"epoch": 9.098391320613542,
"grad_norm": 1.4251606464385986,
"learning_rate": 9.88725410154842e-07,
"loss": 0.7312,
"step": 3040
},
{
"epoch": 9.113355780022447,
"grad_norm": 1.3595529794692993,
"learning_rate": 9.562525354954193e-07,
"loss": 0.7044,
"step": 3045
},
{
"epoch": 9.12832023943135,
"grad_norm": 1.2834125757217407,
"learning_rate": 9.243114606459741e-07,
"loss": 0.7221,
"step": 3050
},
{
"epoch": 9.143284698840255,
"grad_norm": 1.3886545896530151,
"learning_rate": 8.92902892079464e-07,
"loss": 0.6504,
"step": 3055
},
{
"epoch": 9.158249158249157,
"grad_norm": 1.533457636833191,
"learning_rate": 8.620275244908827e-07,
"loss": 0.6788,
"step": 3060
},
{
"epoch": 9.173213617658062,
"grad_norm": 1.493024230003357,
"learning_rate": 8.31686040781865e-07,
"loss": 0.6803,
"step": 3065
},
{
"epoch": 9.188178077066967,
"grad_norm": 1.2318785190582275,
"learning_rate": 8.018791120456087e-07,
"loss": 0.6904,
"step": 3070
},
{
"epoch": 9.20314253647587,
"grad_norm": 1.4301903247833252,
"learning_rate": 7.726073975520082e-07,
"loss": 0.6777,
"step": 3075
},
{
"epoch": 9.218106995884774,
"grad_norm": 1.322068452835083,
"learning_rate": 7.438715447331018e-07,
"loss": 0.685,
"step": 3080
},
{
"epoch": 9.233071455293677,
"grad_norm": 1.2603065967559814,
"learning_rate": 7.156721891687202e-07,
"loss": 0.6712,
"step": 3085
},
{
"epoch": 9.248035914702582,
"grad_norm": 1.4191964864730835,
"learning_rate": 6.880099545724522e-07,
"loss": 0.7124,
"step": 3090
},
{
"epoch": 9.263000374111485,
"grad_norm": 1.411106824874878,
"learning_rate": 6.608854527778319e-07,
"loss": 0.6788,
"step": 3095
},
{
"epoch": 9.27796483352039,
"grad_norm": 1.3679730892181396,
"learning_rate": 6.342992837248235e-07,
"loss": 0.69,
"step": 3100
},
{
"epoch": 9.292929292929292,
"grad_norm": 1.2826892137527466,
"learning_rate": 6.082520354465382e-07,
"loss": 0.7124,
"step": 3105
},
{
"epoch": 9.307893752338197,
"grad_norm": 1.2693568468093872,
"learning_rate": 5.82744284056233e-07,
"loss": 0.6702,
"step": 3110
},
{
"epoch": 9.3228582117471,
"grad_norm": 1.512661099433899,
"learning_rate": 5.577765937345686e-07,
"loss": 0.663,
"step": 3115
},
{
"epoch": 9.337822671156005,
"grad_norm": 1.5203378200531006,
"learning_rate": 5.333495167171353e-07,
"loss": 0.6927,
"step": 3120
},
{
"epoch": 9.352787130564908,
"grad_norm": 1.541284203529358,
"learning_rate": 5.094635932822223e-07,
"loss": 0.6629,
"step": 3125
},
{
"epoch": 9.367751589973812,
"grad_norm": 1.2277456521987915,
"learning_rate": 4.861193517388923e-07,
"loss": 0.7342,
"step": 3130
},
{
"epoch": 9.382716049382717,
"grad_norm": 1.3728615045547485,
"learning_rate": 4.6331730841527587e-07,
"loss": 0.6597,
"step": 3135
},
{
"epoch": 9.39768050879162,
"grad_norm": 1.2458422183990479,
"learning_rate": 4.4105796764715714e-07,
"loss": 0.6654,
"step": 3140
},
{
"epoch": 9.412644968200524,
"grad_norm": 1.41146981716156,
"learning_rate": 4.1934182176683045e-07,
"loss": 0.7134,
"step": 3145
},
{
"epoch": 9.427609427609427,
"grad_norm": 1.306872010231018,
"learning_rate": 3.9816935109218413e-07,
"loss": 0.6154,
"step": 3150
},
{
"epoch": 9.442573887018332,
"grad_norm": 1.3811511993408203,
"learning_rate": 3.7754102391611424e-07,
"loss": 0.6862,
"step": 3155
},
{
"epoch": 9.457538346427235,
"grad_norm": 1.2896925210952759,
"learning_rate": 3.5745729649613034e-07,
"loss": 0.6778,
"step": 3160
},
{
"epoch": 9.47250280583614,
"grad_norm": 1.1952729225158691,
"learning_rate": 3.3791861304428574e-07,
"loss": 0.6891,
"step": 3165
},
{
"epoch": 9.487467265245042,
"grad_norm": 1.3523303270339966,
"learning_rate": 3.189254057173491e-07,
"loss": 0.6576,
"step": 3170
},
{
"epoch": 9.502431724653947,
"grad_norm": 1.3650611639022827,
"learning_rate": 3.004780946072372e-07,
"loss": 0.6533,
"step": 3175
},
{
"epoch": 9.51739618406285,
"grad_norm": 1.2557883262634277,
"learning_rate": 2.825770877317363e-07,
"loss": 0.7639,
"step": 3180
},
{
"epoch": 9.532360643471755,
"grad_norm": 1.24583899974823,
"learning_rate": 2.6522278102546485e-07,
"loss": 0.6856,
"step": 3185
},
{
"epoch": 9.547325102880658,
"grad_norm": 1.48171865940094,
"learning_rate": 2.484155583311276e-07,
"loss": 0.6486,
"step": 3190
},
{
"epoch": 9.562289562289562,
"grad_norm": 1.2459851503372192,
"learning_rate": 2.3215579139101996e-07,
"loss": 0.6377,
"step": 3195
},
{
"epoch": 9.577254021698467,
"grad_norm": 1.1139715909957886,
"learning_rate": 2.1644383983880357e-07,
"loss": 0.6703,
"step": 3200
},
{
"epoch": 9.59221848110737,
"grad_norm": 1.4323070049285889,
"learning_rate": 2.012800511915547e-07,
"loss": 0.6743,
"step": 3205
},
{
"epoch": 9.607182940516275,
"grad_norm": 1.3250705003738403,
"learning_rate": 1.8666476084208129e-07,
"loss": 0.7117,
"step": 3210
},
{
"epoch": 9.622147399925177,
"grad_norm": 1.4704447984695435,
"learning_rate": 1.7259829205149568e-07,
"loss": 0.6817,
"step": 3215
},
{
"epoch": 9.637111859334082,
"grad_norm": 1.2608110904693604,
"learning_rate": 1.5908095594207583e-07,
"loss": 0.7122,
"step": 3220
},
{
"epoch": 9.652076318742985,
"grad_norm": 1.4275965690612793,
"learning_rate": 1.4611305149037358e-07,
"loss": 0.6386,
"step": 3225
},
{
"epoch": 9.66704077815189,
"grad_norm": 1.163145899772644,
"learning_rate": 1.336948655206144e-07,
"loss": 0.6882,
"step": 3230
},
{
"epoch": 9.682005237560793,
"grad_norm": 1.4491647481918335,
"learning_rate": 1.218266726983386e-07,
"loss": 0.6826,
"step": 3235
},
{
"epoch": 9.696969696969697,
"grad_norm": 1.3578821420669556,
"learning_rate": 1.1050873552433394e-07,
"loss": 0.7251,
"step": 3240
},
{
"epoch": 9.7119341563786,
"grad_norm": 1.2743161916732788,
"learning_rate": 9.974130432883199e-08,
"loss": 0.7072,
"step": 3245
},
{
"epoch": 9.726898615787505,
"grad_norm": 1.2915595769882202,
"learning_rate": 8.952461726596528e-08,
"loss": 0.6555,
"step": 3250
},
{
"epoch": 9.741863075196408,
"grad_norm": 1.2591161727905273,
"learning_rate": 7.985890030850762e-08,
"loss": 0.6642,
"step": 3255
},
{
"epoch": 9.756827534605312,
"grad_norm": 1.373780369758606,
"learning_rate": 7.074436724286704e-08,
"loss": 0.6987,
"step": 3260
},
{
"epoch": 9.771791994014217,
"grad_norm": 1.3390823602676392,
"learning_rate": 6.218121966436175e-08,
"loss": 0.7699,
"step": 3265
},
{
"epoch": 9.78675645342312,
"grad_norm": 1.4248472452163696,
"learning_rate": 5.416964697276261e-08,
"loss": 0.6654,
"step": 3270
},
{
"epoch": 9.801720912832025,
"grad_norm": 1.315335988998413,
"learning_rate": 4.670982636810761e-08,
"loss": 0.6681,
"step": 3275
},
{
"epoch": 9.816685372240928,
"grad_norm": 1.28786039352417,
"learning_rate": 3.9801922846766095e-08,
"loss": 0.7033,
"step": 3280
},
{
"epoch": 9.831649831649832,
"grad_norm": 1.4623719453811646,
"learning_rate": 3.3446089197805565e-08,
"loss": 0.6899,
"step": 3285
},
{
"epoch": 9.846614291058735,
"grad_norm": 1.390443205833435,
"learning_rate": 2.7642465999613842e-08,
"loss": 0.6837,
"step": 3290
},
{
"epoch": 9.86157875046764,
"grad_norm": 1.2957769632339478,
"learning_rate": 2.2391181616776556e-08,
"loss": 0.6578,
"step": 3295
},
{
"epoch": 9.876543209876543,
"grad_norm": 1.454103708267212,
"learning_rate": 1.7692352197240526e-08,
"loss": 0.6546,
"step": 3300
},
{
"epoch": 9.891507669285447,
"grad_norm": 1.2412161827087402,
"learning_rate": 1.354608166976301e-08,
"loss": 0.6437,
"step": 3305
},
{
"epoch": 9.90647212869435,
"grad_norm": 1.3440452814102173,
"learning_rate": 9.952461741585817e-09,
"loss": 0.726,
"step": 3310
},
{
"epoch": 9.921436588103255,
"grad_norm": 1.402801513671875,
"learning_rate": 6.9115718964257726e-09,
"loss": 0.6458,
"step": 3315
},
{
"epoch": 9.936401047512158,
"grad_norm": 1.253630518913269,
"learning_rate": 4.423479392709484e-09,
"loss": 0.6936,
"step": 3320
},
{
"epoch": 9.951365506921062,
"grad_norm": 1.3306463956832886,
"learning_rate": 2.48823926208841e-09,
"loss": 0.7048,
"step": 3325
},
{
"epoch": 9.966329966329967,
"grad_norm": 1.2049169540405273,
"learning_rate": 1.10589430822039e-09,
"loss": 0.6899,
"step": 3330
},
{
"epoch": 9.98129442573887,
"grad_norm": 1.4228817224502563,
"learning_rate": 2.764751058259574e-10,
"loss": 0.7091,
"step": 3335
},
{
"epoch": 9.996258885147775,
"grad_norm": 1.234875202178955,
"learning_rate": 0.0,
"loss": 0.6954,
"step": 3340
},
{
"epoch": 9.996258885147775,
"step": 3340,
"total_flos": 1.2597949543307674e+18,
"train_loss": 0.7376079930516775,
"train_runtime": 29450.1993,
"train_samples_per_second": 1.815,
"train_steps_per_second": 0.113
}
],
"logging_steps": 5,
"max_steps": 3340,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"total_flos": 1.2597949543307674e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}