{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.746626686656672,
"eval_steps": 500,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005997001499250375,
"grad_norm": 0.498046875,
"learning_rate": 2.553191489361702e-05,
"loss": 1.3604,
"step": 4
},
{
"epoch": 0.01199400299850075,
"grad_norm": 0.34375,
"learning_rate": 5.106382978723404e-05,
"loss": 1.395,
"step": 8
},
{
"epoch": 0.017991004497751123,
"grad_norm": 0.41796875,
"learning_rate": 7.659574468085105e-05,
"loss": 1.2836,
"step": 12
},
{
"epoch": 0.0239880059970015,
"grad_norm": 0.2890625,
"learning_rate": 0.00010212765957446807,
"loss": 1.2673,
"step": 16
},
{
"epoch": 0.029985007496251874,
"grad_norm": 0.2392578125,
"learning_rate": 0.0001276595744680851,
"loss": 1.1774,
"step": 20
},
{
"epoch": 0.035982008995502246,
"grad_norm": 0.208984375,
"learning_rate": 0.0001531914893617021,
"loss": 1.1907,
"step": 24
},
{
"epoch": 0.041979010494752625,
"grad_norm": 0.2333984375,
"learning_rate": 0.00017872340425531912,
"loss": 1.1519,
"step": 28
},
{
"epoch": 0.047976011994003,
"grad_norm": 0.2392578125,
"learning_rate": 0.00020425531914893615,
"loss": 1.1234,
"step": 32
},
{
"epoch": 0.053973013493253376,
"grad_norm": 0.2490234375,
"learning_rate": 0.00022978723404255317,
"loss": 1.1176,
"step": 36
},
{
"epoch": 0.05997001499250375,
"grad_norm": 0.25390625,
"learning_rate": 0.0002553191489361702,
"loss": 1.1349,
"step": 40
},
{
"epoch": 0.06596701649175413,
"grad_norm": 0.2412109375,
"learning_rate": 0.0002808510638297872,
"loss": 1.0681,
"step": 44
},
{
"epoch": 0.07196401799100449,
"grad_norm": 0.236328125,
"learning_rate": 0.0002999999653501698,
"loss": 1.035,
"step": 48
},
{
"epoch": 0.07796101949025487,
"grad_norm": 0.2412109375,
"learning_rate": 0.00029999913375504725,
"loss": 1.0308,
"step": 52
},
{
"epoch": 0.08395802098950525,
"grad_norm": 0.2353515625,
"learning_rate": 0.0002999971933724042,
"loss": 1.004,
"step": 56
},
{
"epoch": 0.08995502248875563,
"grad_norm": 0.2353515625,
"learning_rate": 0.00029999414421658403,
"loss": 0.974,
"step": 60
},
{
"epoch": 0.095952023988006,
"grad_norm": 0.236328125,
"learning_rate": 0.0002999899863101258,
"loss": 0.9826,
"step": 64
},
{
"epoch": 0.10194902548725637,
"grad_norm": 0.2470703125,
"learning_rate": 0.0002999847196837647,
"loss": 0.9721,
"step": 68
},
{
"epoch": 0.10794602698650675,
"grad_norm": 0.224609375,
"learning_rate": 0.00029997834437643146,
"loss": 0.9758,
"step": 72
},
{
"epoch": 0.11394302848575712,
"grad_norm": 0.28515625,
"learning_rate": 0.00029997086043525195,
"loss": 0.9551,
"step": 76
},
{
"epoch": 0.1199400299850075,
"grad_norm": 0.2333984375,
"learning_rate": 0.00029996226791554725,
"loss": 0.9514,
"step": 80
},
{
"epoch": 0.12593703148425786,
"grad_norm": 0.2734375,
"learning_rate": 0.00029995256688083294,
"loss": 0.971,
"step": 84
},
{
"epoch": 0.13193403298350825,
"grad_norm": 0.279296875,
"learning_rate": 0.0002999417574028187,
"loss": 0.9505,
"step": 88
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.259765625,
"learning_rate": 0.00029992983956140764,
"loss": 0.9274,
"step": 92
},
{
"epoch": 0.14392803598200898,
"grad_norm": 0.25390625,
"learning_rate": 0.00029991681344469605,
"loss": 0.908,
"step": 96
},
{
"epoch": 0.14992503748125938,
"grad_norm": 0.275390625,
"learning_rate": 0.0002999026791489724,
"loss": 0.8855,
"step": 100
},
{
"epoch": 0.15592203898050974,
"grad_norm": 0.25390625,
"learning_rate": 0.0002998874367787168,
"loss": 0.9112,
"step": 104
},
{
"epoch": 0.1619190404797601,
"grad_norm": 0.27734375,
"learning_rate": 0.0002998710864466004,
"loss": 0.8654,
"step": 108
},
{
"epoch": 0.1679160419790105,
"grad_norm": 0.259765625,
"learning_rate": 0.00029985362827348406,
"loss": 0.8824,
"step": 112
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.2451171875,
"learning_rate": 0.00029983506238841787,
"loss": 0.8495,
"step": 116
},
{
"epoch": 0.17991004497751126,
"grad_norm": 0.265625,
"learning_rate": 0.0002998153889286402,
"loss": 0.8686,
"step": 120
},
{
"epoch": 0.18590704647676162,
"grad_norm": 0.26171875,
"learning_rate": 0.00029979460803957635,
"loss": 0.8391,
"step": 124
},
{
"epoch": 0.191904047976012,
"grad_norm": 0.291015625,
"learning_rate": 0.00029977271987483787,
"loss": 0.8058,
"step": 128
},
{
"epoch": 0.19790104947526238,
"grad_norm": 0.26953125,
"learning_rate": 0.0002997497245962213,
"loss": 0.792,
"step": 132
},
{
"epoch": 0.20389805097451275,
"grad_norm": 0.283203125,
"learning_rate": 0.0002997256223737066,
"loss": 0.8186,
"step": 136
},
{
"epoch": 0.2098950524737631,
"grad_norm": 0.2578125,
"learning_rate": 0.00029970041338545653,
"loss": 0.7942,
"step": 140
},
{
"epoch": 0.2158920539730135,
"grad_norm": 0.28125,
"learning_rate": 0.0002996740978178149,
"loss": 0.7686,
"step": 144
},
{
"epoch": 0.22188905547226387,
"grad_norm": 0.25390625,
"learning_rate": 0.00029964667586530533,
"loss": 0.7888,
"step": 148
},
{
"epoch": 0.22788605697151423,
"grad_norm": 0.28515625,
"learning_rate": 0.00029961814773062973,
"loss": 0.7711,
"step": 152
},
{
"epoch": 0.23388305847076463,
"grad_norm": 0.2734375,
"learning_rate": 0.000299588513624667,
"loss": 0.7903,
"step": 156
},
{
"epoch": 0.239880059970015,
"grad_norm": 0.259765625,
"learning_rate": 0.00029955777376647124,
"loss": 0.7998,
"step": 160
},
{
"epoch": 0.24587706146926536,
"grad_norm": 0.251953125,
"learning_rate": 0.00029952592838327014,
"loss": 0.7503,
"step": 164
},
{
"epoch": 0.2518740629685157,
"grad_norm": 0.255859375,
"learning_rate": 0.0002994929777104636,
"loss": 0.7894,
"step": 168
},
{
"epoch": 0.25787106446776614,
"grad_norm": 0.294921875,
"learning_rate": 0.0002994589219916216,
"loss": 0.7525,
"step": 172
},
{
"epoch": 0.2638680659670165,
"grad_norm": 0.287109375,
"learning_rate": 0.0002994237614784826,
"loss": 0.7787,
"step": 176
},
{
"epoch": 0.2698650674662669,
"grad_norm": 0.28125,
"learning_rate": 0.00029938749643095176,
"loss": 0.7606,
"step": 180
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.275390625,
"learning_rate": 0.0002993501271170988,
"loss": 0.777,
"step": 184
},
{
"epoch": 0.2818590704647676,
"grad_norm": 0.279296875,
"learning_rate": 0.0002993116538131562,
"loss": 0.7596,
"step": 188
},
{
"epoch": 0.28785607196401797,
"grad_norm": 0.27734375,
"learning_rate": 0.000299272076803517,
"loss": 0.7424,
"step": 192
},
{
"epoch": 0.2938530734632684,
"grad_norm": 0.27734375,
"learning_rate": 0.000299231396380733,
"loss": 0.7254,
"step": 196
},
{
"epoch": 0.29985007496251875,
"grad_norm": 0.2734375,
"learning_rate": 0.0002991896128455121,
"loss": 0.7353,
"step": 200
},
{
"epoch": 0.3058470764617691,
"grad_norm": 0.27734375,
"learning_rate": 0.0002991467265067165,
"loss": 0.7678,
"step": 204
},
{
"epoch": 0.3118440779610195,
"grad_norm": 0.310546875,
"learning_rate": 0.00029910273768136026,
"loss": 0.7635,
"step": 208
},
{
"epoch": 0.31784107946026985,
"grad_norm": 0.2578125,
"learning_rate": 0.0002990576466946072,
"loss": 0.6941,
"step": 212
},
{
"epoch": 0.3238380809595202,
"grad_norm": 0.26953125,
"learning_rate": 0.0002990114538797678,
"loss": 0.7591,
"step": 216
},
{
"epoch": 0.32983508245877063,
"grad_norm": 0.2578125,
"learning_rate": 0.0002989641595782977,
"loss": 0.7628,
"step": 220
},
{
"epoch": 0.335832083958021,
"grad_norm": 0.251953125,
"learning_rate": 0.0002989157641397943,
"loss": 0.7194,
"step": 224
},
{
"epoch": 0.34182908545727136,
"grad_norm": 0.26953125,
"learning_rate": 0.00029886626792199476,
"loss": 0.7298,
"step": 228
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.2890625,
"learning_rate": 0.00029881567129077315,
"loss": 0.7616,
"step": 232
},
{
"epoch": 0.3538230884557721,
"grad_norm": 0.294921875,
"learning_rate": 0.0002987639746201377,
"loss": 0.7108,
"step": 236
},
{
"epoch": 0.3598200899550225,
"grad_norm": 0.29296875,
"learning_rate": 0.00029871117829222816,
"loss": 0.6867,
"step": 240
},
{
"epoch": 0.3658170914542729,
"grad_norm": 0.279296875,
"learning_rate": 0.00029865728269731274,
"loss": 0.7453,
"step": 244
},
{
"epoch": 0.37181409295352325,
"grad_norm": 0.3203125,
"learning_rate": 0.0002986022882337856,
"loss": 0.6907,
"step": 248
},
{
"epoch": 0.3778110944527736,
"grad_norm": 0.267578125,
"learning_rate": 0.0002985461953081635,
"loss": 0.7118,
"step": 252
},
{
"epoch": 0.383808095952024,
"grad_norm": 0.31640625,
"learning_rate": 0.0002984890043350831,
"loss": 0.6886,
"step": 256
},
{
"epoch": 0.38980509745127434,
"grad_norm": 0.318359375,
"learning_rate": 0.0002984307157372978,
"loss": 0.7285,
"step": 260
},
{
"epoch": 0.39580209895052476,
"grad_norm": 0.26953125,
"learning_rate": 0.0002983713299456745,
"loss": 0.6622,
"step": 264
},
{
"epoch": 0.4017991004497751,
"grad_norm": 0.3046875,
"learning_rate": 0.00029831084739919057,
"loss": 0.6718,
"step": 268
},
{
"epoch": 0.4077961019490255,
"grad_norm": 0.294921875,
"learning_rate": 0.0002982492685449306,
"loss": 0.6862,
"step": 272
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.3125,
"learning_rate": 0.0002981865938380829,
"loss": 0.7048,
"step": 276
},
{
"epoch": 0.4197901049475262,
"grad_norm": 0.283203125,
"learning_rate": 0.0002981228237419365,
"loss": 0.7153,
"step": 280
},
{
"epoch": 0.4257871064467766,
"grad_norm": 0.287109375,
"learning_rate": 0.0002980579587278771,
"loss": 0.7046,
"step": 284
},
{
"epoch": 0.431784107946027,
"grad_norm": 0.27734375,
"learning_rate": 0.00029799199927538455,
"loss": 0.687,
"step": 288
},
{
"epoch": 0.43778110944527737,
"grad_norm": 0.28125,
"learning_rate": 0.0002979249458720284,
"loss": 0.658,
"step": 292
},
{
"epoch": 0.44377811094452774,
"grad_norm": 0.287109375,
"learning_rate": 0.00029785679901346454,
"loss": 0.6552,
"step": 296
},
{
"epoch": 0.4497751124437781,
"grad_norm": 0.306640625,
"learning_rate": 0.00029778755920343186,
"loss": 0.6414,
"step": 300
},
{
"epoch": 0.45577211394302847,
"grad_norm": 0.283203125,
"learning_rate": 0.00029771722695374835,
"loss": 0.6696,
"step": 304
},
{
"epoch": 0.4617691154422789,
"grad_norm": 0.298828125,
"learning_rate": 0.00029764580278430694,
"loss": 0.6113,
"step": 308
},
{
"epoch": 0.46776611694152925,
"grad_norm": 0.296875,
"learning_rate": 0.00029757328722307234,
"loss": 0.6773,
"step": 312
},
{
"epoch": 0.4737631184407796,
"grad_norm": 0.361328125,
"learning_rate": 0.0002974996808060766,
"loss": 0.6691,
"step": 316
},
{
"epoch": 0.47976011994003,
"grad_norm": 0.291015625,
"learning_rate": 0.0002974249840774154,
"loss": 0.6465,
"step": 320
},
{
"epoch": 0.48575712143928035,
"grad_norm": 0.30078125,
"learning_rate": 0.0002973491975892439,
"loss": 0.6464,
"step": 324
},
{
"epoch": 0.4917541229385307,
"grad_norm": 0.263671875,
"learning_rate": 0.0002972723219017727,
"loss": 0.6439,
"step": 328
},
{
"epoch": 0.49775112443778113,
"grad_norm": 0.298828125,
"learning_rate": 0.0002971943575832639,
"loss": 0.6623,
"step": 332
},
{
"epoch": 0.5037481259370314,
"grad_norm": 0.318359375,
"learning_rate": 0.0002971153052100265,
"loss": 0.6793,
"step": 336
},
{
"epoch": 0.5097451274362819,
"grad_norm": 0.298828125,
"learning_rate": 0.0002970351653664125,
"loss": 0.6144,
"step": 340
},
{
"epoch": 0.5157421289355323,
"grad_norm": 0.5859375,
"learning_rate": 0.00029695393864481224,
"loss": 0.5845,
"step": 344
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.2734375,
"learning_rate": 0.0002968716256456505,
"loss": 0.6055,
"step": 348
},
{
"epoch": 0.527736131934033,
"grad_norm": 0.490234375,
"learning_rate": 0.00029678822697738153,
"loss": 0.6746,
"step": 352
},
{
"epoch": 0.5337331334332833,
"grad_norm": 0.30859375,
"learning_rate": 0.000296703743256485,
"loss": 0.6383,
"step": 356
},
{
"epoch": 0.5397301349325337,
"grad_norm": 0.283203125,
"learning_rate": 0.0002966181751074611,
"loss": 0.6634,
"step": 360
},
{
"epoch": 0.545727136431784,
"grad_norm": 0.330078125,
"learning_rate": 0.00029653152316282615,
"loss": 0.6992,
"step": 364
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.466796875,
"learning_rate": 0.00029644378806310774,
"loss": 0.6535,
"step": 368
},
{
"epoch": 0.5577211394302849,
"grad_norm": 0.376953125,
"learning_rate": 0.0002963549704568403,
"loss": 0.6474,
"step": 372
},
{
"epoch": 0.5637181409295352,
"grad_norm": 0.326171875,
"learning_rate": 0.0002962650710005599,
"loss": 0.6175,
"step": 376
},
{
"epoch": 0.5697151424287856,
"grad_norm": 0.283203125,
"learning_rate": 0.00029617409035879967,
"loss": 0.7,
"step": 380
},
{
"epoch": 0.5757121439280359,
"grad_norm": 0.287109375,
"learning_rate": 0.0002960820292040848,
"loss": 0.6635,
"step": 384
},
{
"epoch": 0.5817091454272864,
"grad_norm": 0.291015625,
"learning_rate": 0.00029598888821692776,
"loss": 0.6896,
"step": 388
},
{
"epoch": 0.5877061469265368,
"grad_norm": 0.291015625,
"learning_rate": 0.00029589466808582277,
"loss": 0.6824,
"step": 392
},
{
"epoch": 0.5937031484257871,
"grad_norm": 0.283203125,
"learning_rate": 0.00029579936950724134,
"loss": 0.598,
"step": 396
},
{
"epoch": 0.5997001499250375,
"grad_norm": 0.318359375,
"learning_rate": 0.0002957029931856267,
"loss": 0.6196,
"step": 400
},
{
"epoch": 0.6056971514242878,
"grad_norm": 1.25,
"learning_rate": 0.0002956055398333886,
"loss": 0.682,
"step": 404
},
{
"epoch": 0.6116941529235382,
"grad_norm": 0.29296875,
"learning_rate": 0.00029550701017089844,
"loss": 0.6669,
"step": 408
},
{
"epoch": 0.6176911544227887,
"grad_norm": 0.29296875,
"learning_rate": 0.00029540740492648343,
"loss": 0.6382,
"step": 412
},
{
"epoch": 0.623688155922039,
"grad_norm": 0.28515625,
"learning_rate": 0.0002953067248364214,
"loss": 0.6614,
"step": 416
},
{
"epoch": 0.6296851574212894,
"grad_norm": 0.310546875,
"learning_rate": 0.0002952049706449356,
"loss": 0.7027,
"step": 420
},
{
"epoch": 0.6356821589205397,
"grad_norm": 0.291015625,
"learning_rate": 0.00029510214310418887,
"loss": 0.6834,
"step": 424
},
{
"epoch": 0.6416791604197901,
"grad_norm": 0.26953125,
"learning_rate": 0.00029499824297427827,
"loss": 0.6876,
"step": 428
},
{
"epoch": 0.6476761619190404,
"grad_norm": 0.26953125,
"learning_rate": 0.00029489327102322926,
"loss": 0.6574,
"step": 432
},
{
"epoch": 0.6536731634182908,
"grad_norm": 0.28515625,
"learning_rate": 0.0002947872280269904,
"loss": 0.6296,
"step": 436
},
{
"epoch": 0.6596701649175413,
"grad_norm": 0.26171875,
"learning_rate": 0.000294680114769427,
"loss": 0.5848,
"step": 440
},
{
"epoch": 0.6656671664167916,
"grad_norm": 0.291015625,
"learning_rate": 0.0002945719320423161,
"loss": 0.6623,
"step": 444
},
{
"epoch": 0.671664167916042,
"grad_norm": 0.27734375,
"learning_rate": 0.00029446268064534,
"loss": 0.643,
"step": 448
},
{
"epoch": 0.6776611694152923,
"grad_norm": 0.294921875,
"learning_rate": 0.0002943523613860805,
"loss": 0.5834,
"step": 452
},
{
"epoch": 0.6836581709145427,
"grad_norm": 0.279296875,
"learning_rate": 0.0002942409750800133,
"loss": 0.6101,
"step": 456
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.279296875,
"learning_rate": 0.00029412852255050124,
"loss": 0.6145,
"step": 460
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.271484375,
"learning_rate": 0.000294015004628789,
"loss": 0.5801,
"step": 464
},
{
"epoch": 0.7016491754122939,
"grad_norm": 0.279296875,
"learning_rate": 0.0002939004221539964,
"loss": 0.6429,
"step": 468
},
{
"epoch": 0.7076461769115442,
"grad_norm": 0.306640625,
"learning_rate": 0.0002937847759731125,
"loss": 0.6359,
"step": 472
},
{
"epoch": 0.7136431784107946,
"grad_norm": 0.271484375,
"learning_rate": 0.0002936680669409891,
"loss": 0.6487,
"step": 476
},
{
"epoch": 0.719640179910045,
"grad_norm": 0.28515625,
"learning_rate": 0.00029355029592033474,
"loss": 0.6244,
"step": 480
},
{
"epoch": 0.7256371814092953,
"grad_norm": 0.27734375,
"learning_rate": 0.000293431463781708,
"loss": 0.6023,
"step": 484
},
{
"epoch": 0.7316341829085458,
"grad_norm": 0.287109375,
"learning_rate": 0.0002933115714035112,
"loss": 0.6105,
"step": 488
},
{
"epoch": 0.7376311844077961,
"grad_norm": 0.283203125,
"learning_rate": 0.00029319061967198395,
"loss": 0.6146,
"step": 492
},
{
"epoch": 0.7436281859070465,
"grad_norm": 0.29296875,
"learning_rate": 0.0002930686094811966,
"loss": 0.5759,
"step": 496
},
{
"epoch": 0.7496251874062968,
"grad_norm": 0.265625,
"learning_rate": 0.0002929455417330435,
"loss": 0.6215,
"step": 500
},
{
"epoch": 0.7556221889055472,
"grad_norm": 0.265625,
"learning_rate": 0.0002928214173372364,
"loss": 0.5969,
"step": 504
},
{
"epoch": 0.7616191904047976,
"grad_norm": 0.287109375,
"learning_rate": 0.00029269623721129797,
"loss": 0.6657,
"step": 508
},
{
"epoch": 0.767616191904048,
"grad_norm": 0.28515625,
"learning_rate": 0.00029257000228055446,
"loss": 0.5872,
"step": 512
},
{
"epoch": 0.7736131934032984,
"grad_norm": 0.26953125,
"learning_rate": 0.00029244271347812946,
"loss": 0.5736,
"step": 516
},
{
"epoch": 0.7796101949025487,
"grad_norm": 0.271484375,
"learning_rate": 0.00029231437174493654,
"loss": 0.6027,
"step": 520
},
{
"epoch": 0.7856071964017991,
"grad_norm": 0.28125,
"learning_rate": 0.00029218497802967273,
"loss": 0.6296,
"step": 524
},
{
"epoch": 0.7916041979010495,
"grad_norm": 0.291015625,
"learning_rate": 0.0002920545332888111,
"loss": 0.5929,
"step": 528
},
{
"epoch": 0.7976011994002998,
"grad_norm": 0.2734375,
"learning_rate": 0.00029192303848659377,
"loss": 0.636,
"step": 532
},
{
"epoch": 0.8035982008995503,
"grad_norm": 0.27734375,
"learning_rate": 0.0002917904945950252,
"loss": 0.6177,
"step": 536
},
{
"epoch": 0.8095952023988006,
"grad_norm": 0.259765625,
"learning_rate": 0.00029165690259386423,
"loss": 0.6226,
"step": 540
},
{
"epoch": 0.815592203898051,
"grad_norm": 0.291015625,
"learning_rate": 0.0002915222634706177,
"loss": 0.6155,
"step": 544
},
{
"epoch": 0.8215892053973014,
"grad_norm": 0.296875,
"learning_rate": 0.00029138657822053247,
"loss": 0.6098,
"step": 548
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.251953125,
"learning_rate": 0.00029124984784658844,
"loss": 0.5997,
"step": 552
},
{
"epoch": 0.8335832083958021,
"grad_norm": 0.27734375,
"learning_rate": 0.000291112073359491,
"loss": 0.6189,
"step": 556
},
{
"epoch": 0.8395802098950524,
"grad_norm": 0.25,
"learning_rate": 0.00029097325577766357,
"loss": 0.5949,
"step": 560
},
{
"epoch": 0.8455772113943029,
"grad_norm": 0.28125,
"learning_rate": 0.00029083339612724006,
"loss": 0.6277,
"step": 564
},
{
"epoch": 0.8515742128935532,
"grad_norm": 0.24609375,
"learning_rate": 0.00029069249544205744,
"loss": 0.5951,
"step": 568
},
{
"epoch": 0.8575712143928036,
"grad_norm": 0.294921875,
"learning_rate": 0.00029055055476364777,
"loss": 0.624,
"step": 572
},
{
"epoch": 0.863568215892054,
"grad_norm": 0.267578125,
"learning_rate": 0.00029040757514123077,
"loss": 0.6465,
"step": 576
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.2734375,
"learning_rate": 0.00029026355763170613,
"loss": 0.6299,
"step": 580
},
{
"epoch": 0.8755622188905547,
"grad_norm": 0.263671875,
"learning_rate": 0.00029011850329964536,
"loss": 0.6217,
"step": 584
},
{
"epoch": 0.881559220389805,
"grad_norm": 0.30078125,
"learning_rate": 0.0002899724132172842,
"loss": 0.6225,
"step": 588
},
{
"epoch": 0.8875562218890555,
"grad_norm": 0.271484375,
"learning_rate": 0.00028982528846451466,
"loss": 0.5979,
"step": 592
},
{
"epoch": 0.8935532233883059,
"grad_norm": 0.287109375,
"learning_rate": 0.000289677130128877,
"loss": 0.6094,
"step": 596
},
{
"epoch": 0.8995502248875562,
"grad_norm": 0.283203125,
"learning_rate": 0.00028952793930555156,
"loss": 0.6134,
"step": 600
},
{
"epoch": 0.9055472263868066,
"grad_norm": 0.279296875,
"learning_rate": 0.00028937771709735085,
"loss": 0.6125,
"step": 604
},
{
"epoch": 0.9115442278860569,
"grad_norm": 0.2734375,
"learning_rate": 0.00028922646461471146,
"loss": 0.6229,
"step": 608
},
{
"epoch": 0.9175412293853074,
"grad_norm": 0.267578125,
"learning_rate": 0.00028907418297568544,
"loss": 0.6114,
"step": 612
},
{
"epoch": 0.9235382308845578,
"grad_norm": 0.283203125,
"learning_rate": 0.00028892087330593263,
"loss": 0.6052,
"step": 616
},
{
"epoch": 0.9295352323838081,
"grad_norm": 0.27734375,
"learning_rate": 0.0002887665367387119,
"loss": 0.5971,
"step": 620
},
{
"epoch": 0.9355322338830585,
"grad_norm": 0.279296875,
"learning_rate": 0.00028861117441487277,
"loss": 0.563,
"step": 624
},
{
"epoch": 0.9415292353823088,
"grad_norm": 0.2578125,
"learning_rate": 0.00028845478748284743,
"loss": 0.5906,
"step": 628
},
{
"epoch": 0.9475262368815592,
"grad_norm": 0.26171875,
"learning_rate": 0.0002882973770986416,
"loss": 0.5841,
"step": 632
},
{
"epoch": 0.9535232383808095,
"grad_norm": 0.267578125,
"learning_rate": 0.00028813894442582656,
"loss": 0.6249,
"step": 636
},
{
"epoch": 0.95952023988006,
"grad_norm": 0.28515625,
"learning_rate": 0.00028797949063553014,
"loss": 0.5862,
"step": 640
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.263671875,
"learning_rate": 0.00028781901690642833,
"loss": 0.5564,
"step": 644
},
{
"epoch": 0.9715142428785607,
"grad_norm": 0.294921875,
"learning_rate": 0.0002876575244247364,
"loss": 0.6202,
"step": 648
},
{
"epoch": 0.9775112443778111,
"grad_norm": 0.28125,
"learning_rate": 0.00028749501438420034,
"loss": 0.6844,
"step": 652
},
{
"epoch": 0.9835082458770614,
"grad_norm": 0.27734375,
"learning_rate": 0.00028733148798608767,
"loss": 0.6133,
"step": 656
},
{
"epoch": 0.9895052473763118,
"grad_norm": 0.265625,
"learning_rate": 0.0002871669464391789,
"loss": 0.5914,
"step": 660
},
{
"epoch": 0.9955022488755623,
"grad_norm": 0.2578125,
"learning_rate": 0.0002870013909597586,
"loss": 0.5781,
"step": 664
},
{
"epoch": 1.0014992503748126,
"grad_norm": 0.263671875,
"learning_rate": 0.000286834822771606,
"loss": 0.5998,
"step": 668
},
{
"epoch": 1.0074962518740629,
"grad_norm": 0.265625,
"learning_rate": 0.00028666724310598657,
"loss": 0.5466,
"step": 672
},
{
"epoch": 1.0134932533733134,
"grad_norm": 0.283203125,
"learning_rate": 0.0002864986532016423,
"loss": 0.4778,
"step": 676
},
{
"epoch": 1.0194902548725637,
"grad_norm": 0.255859375,
"learning_rate": 0.00028632905430478294,
"loss": 0.4739,
"step": 680
},
{
"epoch": 1.025487256371814,
"grad_norm": 0.287109375,
"learning_rate": 0.0002861584476690767,
"loss": 0.51,
"step": 684
},
{
"epoch": 1.0314842578710646,
"grad_norm": 0.265625,
"learning_rate": 0.0002859868345556409,
"loss": 0.5517,
"step": 688
},
{
"epoch": 1.0374812593703149,
"grad_norm": 0.28125,
"learning_rate": 0.00028581421623303274,
"loss": 0.5065,
"step": 692
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.259765625,
"learning_rate": 0.0002856405939772398,
"loss": 0.5512,
"step": 696
},
{
"epoch": 1.0494752623688155,
"grad_norm": 0.25390625,
"learning_rate": 0.00028546596907167094,
"loss": 0.5293,
"step": 700
},
{
"epoch": 1.055472263868066,
"grad_norm": 0.259765625,
"learning_rate": 0.0002852903428071462,
"loss": 0.5048,
"step": 704
},
{
"epoch": 1.0614692653673163,
"grad_norm": 0.24609375,
"learning_rate": 0.00028511371648188785,
"loss": 0.5045,
"step": 708
},
{
"epoch": 1.0674662668665666,
"grad_norm": 0.291015625,
"learning_rate": 0.0002849360914015106,
"loss": 0.486,
"step": 712
},
{
"epoch": 1.0734632683658172,
"grad_norm": 0.267578125,
"learning_rate": 0.0002847574688790118,
"loss": 0.5105,
"step": 716
},
{
"epoch": 1.0794602698650675,
"grad_norm": 0.25390625,
"learning_rate": 0.00028457785023476193,
"loss": 0.5176,
"step": 720
},
{
"epoch": 1.0854572713643178,
"grad_norm": 0.279296875,
"learning_rate": 0.00028439723679649467,
"loss": 0.4982,
"step": 724
},
{
"epoch": 1.0914542728635683,
"grad_norm": 0.255859375,
"learning_rate": 0.00028421562989929726,
"loss": 0.5004,
"step": 728
},
{
"epoch": 1.0974512743628186,
"grad_norm": 0.263671875,
"learning_rate": 0.0002840330308856006,
"loss": 0.5341,
"step": 732
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.2578125,
"learning_rate": 0.0002838494411051692,
"loss": 0.5225,
"step": 736
},
{
"epoch": 1.1094452773613193,
"grad_norm": 0.291015625,
"learning_rate": 0.00028366486191509115,
"loss": 0.5249,
"step": 740
},
{
"epoch": 1.1154422788605698,
"grad_norm": 0.27734375,
"learning_rate": 0.00028347929467976843,
"loss": 0.4945,
"step": 744
},
{
"epoch": 1.12143928035982,
"grad_norm": 0.28515625,
"learning_rate": 0.00028329274077090657,
"loss": 0.4733,
"step": 748
},
{
"epoch": 1.1274362818590704,
"grad_norm": 0.267578125,
"learning_rate": 0.0002831052015675044,
"loss": 0.5443,
"step": 752
},
{
"epoch": 1.133433283358321,
"grad_norm": 0.267578125,
"learning_rate": 0.0002829166784558442,
"loss": 0.5287,
"step": 756
},
{
"epoch": 1.1394302848575713,
"grad_norm": 0.291015625,
"learning_rate": 0.0002827271728294812,
"loss": 0.4699,
"step": 760
},
{
"epoch": 1.1454272863568216,
"grad_norm": 0.265625,
"learning_rate": 0.00028253668608923323,
"loss": 0.5091,
"step": 764
},
{
"epoch": 1.1514242878560719,
"grad_norm": 0.271484375,
"learning_rate": 0.0002823452196431706,
"loss": 0.4919,
"step": 768
},
{
"epoch": 1.1574212893553224,
"grad_norm": 0.259765625,
"learning_rate": 0.0002821527749066055,
"loss": 0.5538,
"step": 772
},
{
"epoch": 1.1634182908545727,
"grad_norm": 0.25,
"learning_rate": 0.00028195935330208163,
"loss": 0.5304,
"step": 776
},
{
"epoch": 1.169415292353823,
"grad_norm": 0.26171875,
"learning_rate": 0.0002817649562593637,
"loss": 0.5099,
"step": 780
},
{
"epoch": 1.1754122938530736,
"grad_norm": 0.259765625,
"learning_rate": 0.0002815695852154267,
"loss": 0.5286,
"step": 784
},
{
"epoch": 1.1814092953523239,
"grad_norm": 0.2734375,
"learning_rate": 0.00028137324161444554,
"loss": 0.5302,
"step": 788
},
{
"epoch": 1.1874062968515742,
"grad_norm": 0.287109375,
"learning_rate": 0.00028117592690778413,
"loss": 0.489,
"step": 792
},
{
"epoch": 1.1934032983508245,
"grad_norm": 0.31640625,
"learning_rate": 0.0002809776425539848,
"loss": 0.4831,
"step": 796
},
{
"epoch": 1.199400299850075,
"grad_norm": 0.259765625,
"learning_rate": 0.00028077839001875744,
"loss": 0.5265,
"step": 800
},
{
"epoch": 1.2053973013493253,
"grad_norm": 0.29296875,
"learning_rate": 0.0002805781707749688,
"loss": 0.4821,
"step": 804
},
{
"epoch": 1.2113943028485756,
"grad_norm": 0.29296875,
"learning_rate": 0.0002803769863026313,
"loss": 0.4793,
"step": 808
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.27734375,
"learning_rate": 0.00028017483808889245,
"loss": 0.5088,
"step": 812
},
{
"epoch": 1.2233883058470765,
"grad_norm": 0.298828125,
"learning_rate": 0.0002799717276280237,
"loss": 0.5152,
"step": 816
},
{
"epoch": 1.2293853073463268,
"grad_norm": 0.263671875,
"learning_rate": 0.00027976765642140935,
"loss": 0.595,
"step": 820
},
{
"epoch": 1.235382308845577,
"grad_norm": 0.2734375,
"learning_rate": 0.00027956262597753545,
"loss": 0.536,
"step": 824
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.26171875,
"learning_rate": 0.0002793566378119787,
"loss": 0.5102,
"step": 828
},
{
"epoch": 1.247376311844078,
"grad_norm": 0.279296875,
"learning_rate": 0.00027914969344739545,
"loss": 0.5385,
"step": 832
},
{
"epoch": 1.2533733133433285,
"grad_norm": 0.26171875,
"learning_rate": 0.0002789417944135098,
"loss": 0.5201,
"step": 836
},
{
"epoch": 1.2593703148425788,
"grad_norm": 0.271484375,
"learning_rate": 0.0002787329422471032,
"loss": 0.5126,
"step": 840
},
{
"epoch": 1.265367316341829,
"grad_norm": 0.27734375,
"learning_rate": 0.0002785231384920023,
"loss": 0.4304,
"step": 844
},
{
"epoch": 1.2713643178410794,
"grad_norm": 0.298828125,
"learning_rate": 0.000278312384699068,
"loss": 0.5052,
"step": 848
},
{
"epoch": 1.2773613193403297,
"grad_norm": 0.27734375,
"learning_rate": 0.0002781006824261838,
"loss": 0.5248,
"step": 852
},
{
"epoch": 1.2833583208395802,
"grad_norm": 0.275390625,
"learning_rate": 0.0002778880332382443,
"loss": 0.5219,
"step": 856
},
{
"epoch": 1.2893553223388305,
"grad_norm": 0.255859375,
"learning_rate": 0.0002776744387071437,
"loss": 0.5177,
"step": 860
},
{
"epoch": 1.295352323838081,
"grad_norm": 0.267578125,
"learning_rate": 0.00027745990041176406,
"loss": 0.5015,
"step": 864
},
{
"epoch": 1.3013493253373314,
"grad_norm": 0.302734375,
"learning_rate": 0.00027724441993796386,
"loss": 0.5045,
"step": 868
},
{
"epoch": 1.3073463268365817,
"grad_norm": 0.259765625,
"learning_rate": 0.000277027998878566,
"loss": 0.5399,
"step": 872
},
{
"epoch": 1.313343328335832,
"grad_norm": 0.283203125,
"learning_rate": 0.0002768106388333462,
"loss": 0.4533,
"step": 876
},
{
"epoch": 1.3193403298350825,
"grad_norm": 0.275390625,
"learning_rate": 0.0002765923414090211,
"loss": 0.4942,
"step": 880
},
{
"epoch": 1.3253373313343328,
"grad_norm": 0.265625,
"learning_rate": 0.00027637310821923637,
"loss": 0.4559,
"step": 884
},
{
"epoch": 1.3313343328335832,
"grad_norm": 0.279296875,
"learning_rate": 0.00027615294088455494,
"loss": 0.4603,
"step": 888
},
{
"epoch": 1.3373313343328337,
"grad_norm": 0.26953125,
"learning_rate": 0.00027593184103244474,
"loss": 0.5045,
"step": 892
},
{
"epoch": 1.343328335832084,
"grad_norm": 0.26953125,
"learning_rate": 0.000275709810297267,
"loss": 0.5183,
"step": 896
},
{
"epoch": 1.3493253373313343,
"grad_norm": 0.271484375,
"learning_rate": 0.00027548685032026393,
"loss": 0.5529,
"step": 900
},
{
"epoch": 1.3553223388305846,
"grad_norm": 0.283203125,
"learning_rate": 0.0002752629627495466,
"loss": 0.5169,
"step": 904
},
{
"epoch": 1.3613193403298351,
"grad_norm": 0.306640625,
"learning_rate": 0.0002750381492400829,
"loss": 0.5303,
"step": 908
},
{
"epoch": 1.3673163418290855,
"grad_norm": 0.291015625,
"learning_rate": 0.0002748124114536852,
"loss": 0.5258,
"step": 912
},
{
"epoch": 1.3733133433283358,
"grad_norm": 0.283203125,
"learning_rate": 0.0002745857510589979,
"loss": 0.5352,
"step": 916
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.306640625,
"learning_rate": 0.00027435816973148564,
"loss": 0.5202,
"step": 920
},
{
"epoch": 1.3853073463268366,
"grad_norm": 0.259765625,
"learning_rate": 0.0002741296691534204,
"loss": 0.4443,
"step": 924
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.28515625,
"learning_rate": 0.0002739002510138691,
"loss": 0.4865,
"step": 928
},
{
"epoch": 1.3973013493253372,
"grad_norm": 0.2578125,
"learning_rate": 0.00027366991700868127,
"loss": 0.5044,
"step": 932
},
{
"epoch": 1.4032983508245878,
"grad_norm": 0.29296875,
"learning_rate": 0.00027343866884047674,
"loss": 0.4876,
"step": 936
},
{
"epoch": 1.409295352323838,
"grad_norm": 0.28515625,
"learning_rate": 0.0002732065082186324,
"loss": 0.5361,
"step": 940
},
{
"epoch": 1.4152923538230884,
"grad_norm": 0.291015625,
"learning_rate": 0.00027297343685927036,
"loss": 0.4938,
"step": 944
},
{
"epoch": 1.421289355322339,
"grad_norm": 0.298828125,
"learning_rate": 0.0002727394564852445,
"loss": 0.5098,
"step": 948
},
{
"epoch": 1.4272863568215892,
"grad_norm": 0.279296875,
"learning_rate": 0.0002725045688261283,
"loss": 0.5342,
"step": 952
},
{
"epoch": 1.4332833583208395,
"grad_norm": 0.27734375,
"learning_rate": 0.00027226877561820187,
"loss": 0.48,
"step": 956
},
{
"epoch": 1.4392803598200898,
"grad_norm": 0.29296875,
"learning_rate": 0.0002720320786044391,
"loss": 0.4997,
"step": 960
},
{
"epoch": 1.4452773613193404,
"grad_norm": 0.296875,
"learning_rate": 0.0002717944795344946,
"loss": 0.5382,
"step": 964
},
{
"epoch": 1.4512743628185907,
"grad_norm": 0.28515625,
"learning_rate": 0.00027155598016469115,
"loss": 0.5305,
"step": 968
},
{
"epoch": 1.4572713643178412,
"grad_norm": 0.275390625,
"learning_rate": 0.00027131658225800637,
"loss": 0.5172,
"step": 972
},
{
"epoch": 1.4632683658170915,
"grad_norm": 0.2734375,
"learning_rate": 0.00027107628758405995,
"loss": 0.5318,
"step": 976
},
{
"epoch": 1.4692653673163418,
"grad_norm": 0.287109375,
"learning_rate": 0.0002708350979191004,
"loss": 0.5143,
"step": 980
},
{
"epoch": 1.4752623688155921,
"grad_norm": 0.28515625,
"learning_rate": 0.00027059301504599187,
"loss": 0.4811,
"step": 984
},
{
"epoch": 1.4812593703148424,
"grad_norm": 0.28125,
"learning_rate": 0.0002703500407542012,
"loss": 0.4862,
"step": 988
},
{
"epoch": 1.487256371814093,
"grad_norm": 0.30859375,
"learning_rate": 0.00027010617683978456,
"loss": 0.5058,
"step": 992
},
{
"epoch": 1.4932533733133433,
"grad_norm": 0.26953125,
"learning_rate": 0.00026986142510537406,
"loss": 0.4691,
"step": 996
},
{
"epoch": 1.4992503748125938,
"grad_norm": 0.30078125,
"learning_rate": 0.0002696157873601646,
"loss": 0.5224,
"step": 1000
},
{
"epoch": 1.5052473763118441,
"grad_norm": 0.306640625,
"learning_rate": 0.00026936926541990046,
"loss": 0.5588,
"step": 1004
},
{
"epoch": 1.5112443778110944,
"grad_norm": 0.283203125,
"learning_rate": 0.00026912186110686186,
"loss": 0.486,
"step": 1008
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.28515625,
"learning_rate": 0.0002688735762498515,
"loss": 0.5366,
"step": 1012
},
{
"epoch": 1.523238380809595,
"grad_norm": 0.298828125,
"learning_rate": 0.00026862441268418085,
"loss": 0.5101,
"step": 1016
},
{
"epoch": 1.5292353823088456,
"grad_norm": 0.275390625,
"learning_rate": 0.000268374372251657,
"loss": 0.5154,
"step": 1020
},
{
"epoch": 1.535232383808096,
"grad_norm": 0.275390625,
"learning_rate": 0.00026812345680056867,
"loss": 0.5155,
"step": 1024
},
{
"epoch": 1.5412293853073464,
"grad_norm": 0.283203125,
"learning_rate": 0.00026787166818567263,
"loss": 0.5368,
"step": 1028
},
{
"epoch": 1.5472263868065967,
"grad_norm": 0.291015625,
"learning_rate": 0.00026761900826818033,
"loss": 0.537,
"step": 1032
},
{
"epoch": 1.553223388305847,
"grad_norm": 0.267578125,
"learning_rate": 0.0002673654789157435,
"loss": 0.5323,
"step": 1036
},
{
"epoch": 1.5592203898050974,
"grad_norm": 0.294921875,
"learning_rate": 0.0002671110820024408,
"loss": 0.5142,
"step": 1040
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.27734375,
"learning_rate": 0.00026685581940876396,
"loss": 0.5343,
"step": 1044
},
{
"epoch": 1.5712143928035982,
"grad_norm": 0.279296875,
"learning_rate": 0.00026659969302160377,
"loss": 0.5076,
"step": 1048
},
{
"epoch": 1.5772113943028487,
"grad_norm": 0.267578125,
"learning_rate": 0.00026634270473423606,
"loss": 0.499,
"step": 1052
},
{
"epoch": 1.583208395802099,
"grad_norm": 0.28125,
"learning_rate": 0.0002660848564463079,
"loss": 0.485,
"step": 1056
},
{
"epoch": 1.5892053973013494,
"grad_norm": 0.279296875,
"learning_rate": 0.00026582615006382333,
"loss": 0.5186,
"step": 1060
},
{
"epoch": 1.5952023988005997,
"grad_norm": 0.291015625,
"learning_rate": 0.00026556658749912944,
"loss": 0.5256,
"step": 1064
},
{
"epoch": 1.60119940029985,
"grad_norm": 0.28515625,
"learning_rate": 0.00026530617067090225,
"loss": 0.5223,
"step": 1068
},
{
"epoch": 1.6071964017991005,
"grad_norm": 0.28515625,
"learning_rate": 0.0002650449015041324,
"loss": 0.509,
"step": 1072
},
{
"epoch": 1.6131934032983508,
"grad_norm": 0.275390625,
"learning_rate": 0.0002647827819301109,
"loss": 0.5089,
"step": 1076
},
{
"epoch": 1.6191904047976013,
"grad_norm": 0.27734375,
"learning_rate": 0.0002645198138864151,
"loss": 0.4925,
"step": 1080
},
{
"epoch": 1.6251874062968517,
"grad_norm": 0.279296875,
"learning_rate": 0.0002642559993168942,
"loss": 0.5303,
"step": 1084
},
{
"epoch": 1.631184407796102,
"grad_norm": 0.275390625,
"learning_rate": 0.0002639913401716546,
"loss": 0.5077,
"step": 1088
},
{
"epoch": 1.6371814092953523,
"grad_norm": 0.28515625,
"learning_rate": 0.0002637258384070461,
"loss": 0.5554,
"step": 1092
},
{
"epoch": 1.6431784107946026,
"grad_norm": 0.263671875,
"learning_rate": 0.0002634594959856471,
"loss": 0.4447,
"step": 1096
},
{
"epoch": 1.6491754122938531,
"grad_norm": 0.296875,
"learning_rate": 0.00026319231487624984,
"loss": 0.4951,
"step": 1100
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.28125,
"learning_rate": 0.0002629242970538463,
"loss": 0.5053,
"step": 1104
},
{
"epoch": 1.661169415292354,
"grad_norm": 0.2578125,
"learning_rate": 0.0002626554444996133,
"loss": 0.4702,
"step": 1108
},
{
"epoch": 1.6671664167916043,
"grad_norm": 0.2890625,
"learning_rate": 0.0002623857592008982,
"loss": 0.477,
"step": 1112
},
{
"epoch": 1.6731634182908546,
"grad_norm": 0.2734375,
"learning_rate": 0.00026211524315120365,
"loss": 0.4858,
"step": 1116
},
{
"epoch": 1.6791604197901049,
"grad_norm": 0.279296875,
"learning_rate": 0.0002618438983501734,
"loss": 0.4938,
"step": 1120
},
{
"epoch": 1.6851574212893552,
"grad_norm": 0.294921875,
"learning_rate": 0.00026157172680357717,
"loss": 0.4687,
"step": 1124
},
{
"epoch": 1.6911544227886057,
"grad_norm": 0.294921875,
"learning_rate": 0.0002612987305232961,
"loss": 0.4976,
"step": 1128
},
{
"epoch": 1.697151424287856,
"grad_norm": 0.2890625,
"learning_rate": 0.0002610249115273075,
"loss": 0.5319,
"step": 1132
},
{
"epoch": 1.7031484257871066,
"grad_norm": 0.25,
"learning_rate": 0.0002607502718396705,
"loss": 0.5139,
"step": 1136
},
{
"epoch": 1.7091454272863569,
"grad_norm": 0.27734375,
"learning_rate": 0.0002604748134905103,
"loss": 0.4979,
"step": 1140
},
{
"epoch": 1.7151424287856072,
"grad_norm": 0.29296875,
"learning_rate": 0.00026019853851600404,
"loss": 0.5016,
"step": 1144
},
{
"epoch": 1.7211394302848575,
"grad_norm": 0.26953125,
"learning_rate": 0.00025992144895836504,
"loss": 0.4872,
"step": 1148
},
{
"epoch": 1.7271364317841078,
"grad_norm": 0.2734375,
"learning_rate": 0.0002596435468658282,
"loss": 0.5164,
"step": 1152
},
{
"epoch": 1.7331334332833583,
"grad_norm": 0.271484375,
"learning_rate": 0.00025936483429263437,
"loss": 0.4904,
"step": 1156
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.298828125,
"learning_rate": 0.00025908531329901574,
"loss": 0.5198,
"step": 1160
},
{
"epoch": 1.7451274362818592,
"grad_norm": 0.2890625,
"learning_rate": 0.0002588049859511801,
"loss": 0.5574,
"step": 1164
},
{
"epoch": 1.7511244377811095,
"grad_norm": 0.287109375,
"learning_rate": 0.00025852385432129587,
"loss": 0.5086,
"step": 1168
},
{
"epoch": 1.7571214392803598,
"grad_norm": 0.318359375,
"learning_rate": 0.0002582419204874767,
"loss": 0.5387,
"step": 1172
},
{
"epoch": 1.76311844077961,
"grad_norm": 0.294921875,
"learning_rate": 0.000257959186533766,
"loss": 0.5478,
"step": 1176
},
{
"epoch": 1.7691154422788604,
"grad_norm": 0.283203125,
"learning_rate": 0.0002576756545501218,
"loss": 0.4899,
"step": 1180
},
{
"epoch": 1.775112443778111,
"grad_norm": 0.279296875,
"learning_rate": 0.0002573913266324009,
"loss": 0.4824,
"step": 1184
},
{
"epoch": 1.7811094452773615,
"grad_norm": 0.2890625,
"learning_rate": 0.00025710620488234384,
"loss": 0.5113,
"step": 1188
},
{
"epoch": 1.7871064467766118,
"grad_norm": 0.283203125,
"learning_rate": 0.0002568202914075591,
"loss": 0.5235,
"step": 1192
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.27734375,
"learning_rate": 0.0002565335883215074,
"loss": 0.5289,
"step": 1196
},
{
"epoch": 1.7991004497751124,
"grad_norm": 0.26953125,
"learning_rate": 0.00025624609774348633,
"loss": 0.5018,
"step": 1200
},
{
"epoch": 1.8050974512743627,
"grad_norm": 0.2734375,
"learning_rate": 0.0002559578217986147,
"loss": 0.5799,
"step": 1204
},
{
"epoch": 1.811094452773613,
"grad_norm": 0.296875,
"learning_rate": 0.00025566876261781657,
"loss": 0.5077,
"step": 1208
},
{
"epoch": 1.8170914542728636,
"grad_norm": 0.2890625,
"learning_rate": 0.00025537892233780564,
"loss": 0.561,
"step": 1212
},
{
"epoch": 1.823088455772114,
"grad_norm": 0.28515625,
"learning_rate": 0.0002550883031010696,
"loss": 0.4929,
"step": 1216
},
{
"epoch": 1.8290854572713644,
"grad_norm": 0.3046875,
"learning_rate": 0.00025479690705585393,
"loss": 0.5342,
"step": 1220
},
{
"epoch": 1.8350824587706147,
"grad_norm": 0.302734375,
"learning_rate": 0.0002545047363561466,
"loss": 0.5061,
"step": 1224
},
{
"epoch": 1.841079460269865,
"grad_norm": 0.287109375,
"learning_rate": 0.00025421179316166147,
"loss": 0.5237,
"step": 1228
},
{
"epoch": 1.8470764617691153,
"grad_norm": 0.267578125,
"learning_rate": 0.00025391807963782276,
"loss": 0.4967,
"step": 1232
},
{
"epoch": 1.8530734632683659,
"grad_norm": 0.27734375,
"learning_rate": 0.000253623597955749,
"loss": 0.5285,
"step": 1236
},
{
"epoch": 1.8590704647676162,
"grad_norm": 0.263671875,
"learning_rate": 0.0002533283502922368,
"loss": 0.4559,
"step": 1240
},
{
"epoch": 1.8650674662668667,
"grad_norm": 0.28125,
"learning_rate": 0.000253032338829745,
"loss": 0.4359,
"step": 1244
},
{
"epoch": 1.871064467766117,
"grad_norm": 0.291015625,
"learning_rate": 0.00025273556575637824,
"loss": 0.4478,
"step": 1248
},
{
"epoch": 1.8770614692653673,
"grad_norm": 0.27734375,
"learning_rate": 0.00025243803326587113,
"loss": 0.4902,
"step": 1252
},
{
"epoch": 1.8830584707646176,
"grad_norm": 0.27734375,
"learning_rate": 0.0002521397435575717,
"loss": 0.4718,
"step": 1256
},
{
"epoch": 1.889055472263868,
"grad_norm": 0.283203125,
"learning_rate": 0.0002518406988364255,
"loss": 0.4678,
"step": 1260
},
{
"epoch": 1.8950524737631185,
"grad_norm": 0.298828125,
"learning_rate": 0.0002515409013129589,
"loss": 0.4982,
"step": 1264
},
{
"epoch": 1.9010494752623688,
"grad_norm": 0.271484375,
"learning_rate": 0.0002512403532032632,
"loss": 0.5777,
"step": 1268
},
{
"epoch": 1.9070464767616193,
"grad_norm": 0.279296875,
"learning_rate": 0.0002509390567289776,
"loss": 0.4771,
"step": 1272
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.279296875,
"learning_rate": 0.0002506370141172737,
"loss": 0.4811,
"step": 1276
},
{
"epoch": 1.91904047976012,
"grad_norm": 0.279296875,
"learning_rate": 0.00025033422760083814,
"loss": 0.4656,
"step": 1280
},
{
"epoch": 1.9250374812593702,
"grad_norm": 0.2890625,
"learning_rate": 0.00025003069941785647,
"loss": 0.5288,
"step": 1284
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.302734375,
"learning_rate": 0.00024972643181199694,
"loss": 0.4915,
"step": 1288
},
{
"epoch": 1.937031484257871,
"grad_norm": 0.30078125,
"learning_rate": 0.00024942142703239317,
"loss": 0.4914,
"step": 1292
},
{
"epoch": 1.9430284857571214,
"grad_norm": 0.28515625,
"learning_rate": 0.0002491156873336282,
"loss": 0.5417,
"step": 1296
},
{
"epoch": 1.949025487256372,
"grad_norm": 0.2890625,
"learning_rate": 0.0002488092149757176,
"loss": 0.5118,
"step": 1300
},
{
"epoch": 1.9550224887556222,
"grad_norm": 0.26953125,
"learning_rate": 0.00024850201222409245,
"loss": 0.4948,
"step": 1304
},
{
"epoch": 1.9610194902548725,
"grad_norm": 0.296875,
"learning_rate": 0.00024819408134958324,
"loss": 0.5132,
"step": 1308
},
{
"epoch": 1.9670164917541229,
"grad_norm": 0.27734375,
"learning_rate": 0.00024788542462840236,
"loss": 0.4743,
"step": 1312
},
{
"epoch": 1.9730134932533732,
"grad_norm": 0.279296875,
"learning_rate": 0.00024757604434212785,
"loss": 0.5555,
"step": 1316
},
{
"epoch": 1.9790104947526237,
"grad_norm": 0.287109375,
"learning_rate": 0.00024726594277768625,
"loss": 0.496,
"step": 1320
},
{
"epoch": 1.9850074962518742,
"grad_norm": 0.296875,
"learning_rate": 0.0002469551222273358,
"loss": 0.4981,
"step": 1324
},
{
"epoch": 1.9910044977511245,
"grad_norm": 0.283203125,
"learning_rate": 0.0002466435849886494,
"loss": 0.5064,
"step": 1328
},
{
"epoch": 1.9970014992503748,
"grad_norm": 0.275390625,
"learning_rate": 0.0002463313333644976,
"loss": 0.4856,
"step": 1332
},
{
"epoch": 2.002998500749625,
"grad_norm": 0.2421875,
"learning_rate": 0.0002460183696630319,
"loss": 0.4316,
"step": 1336
},
{
"epoch": 2.0089955022488755,
"grad_norm": 0.279296875,
"learning_rate": 0.0002457046961976672,
"loss": 0.4442,
"step": 1340
},
{
"epoch": 2.0149925037481258,
"grad_norm": 0.2890625,
"learning_rate": 0.0002453903152870651,
"loss": 0.3908,
"step": 1344
},
{
"epoch": 2.0209895052473765,
"grad_norm": 0.298828125,
"learning_rate": 0.00024507522925511655,
"loss": 0.3686,
"step": 1348
},
{
"epoch": 2.026986506746627,
"grad_norm": 0.279296875,
"learning_rate": 0.00024475944043092474,
"loss": 0.3864,
"step": 1352
},
{
"epoch": 2.032983508245877,
"grad_norm": 0.27734375,
"learning_rate": 0.00024444295114878787,
"loss": 0.3697,
"step": 1356
},
{
"epoch": 2.0389805097451275,
"grad_norm": 0.265625,
"learning_rate": 0.00024412576374818184,
"loss": 0.3737,
"step": 1360
},
{
"epoch": 2.0449775112443778,
"grad_norm": 0.30859375,
"learning_rate": 0.00024380788057374315,
"loss": 0.4196,
"step": 1364
},
{
"epoch": 2.050974512743628,
"grad_norm": 0.26953125,
"learning_rate": 0.00024348930397525125,
"loss": 0.3743,
"step": 1368
},
{
"epoch": 2.0569715142428784,
"grad_norm": 0.28515625,
"learning_rate": 0.00024317003630761156,
"loss": 0.3874,
"step": 1372
},
{
"epoch": 2.062968515742129,
"grad_norm": 0.27734375,
"learning_rate": 0.00024285007993083763,
"loss": 0.3758,
"step": 1376
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.322265625,
"learning_rate": 0.00024252943721003416,
"loss": 0.4214,
"step": 1380
},
{
"epoch": 2.0749625187406298,
"grad_norm": 0.271484375,
"learning_rate": 0.00024220811051537902,
"loss": 0.4145,
"step": 1384
},
{
"epoch": 2.08095952023988,
"grad_norm": 0.279296875,
"learning_rate": 0.00024188610222210624,
"loss": 0.3586,
"step": 1388
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.296875,
"learning_rate": 0.00024156341471048801,
"loss": 0.4311,
"step": 1392
},
{
"epoch": 2.0929535232383807,
"grad_norm": 0.279296875,
"learning_rate": 0.00024124005036581738,
"loss": 0.3881,
"step": 1396
},
{
"epoch": 2.098950524737631,
"grad_norm": 0.296875,
"learning_rate": 0.0002409160115783905,
"loss": 0.4552,
"step": 1400
},
{
"epoch": 2.1049475262368817,
"grad_norm": 0.28515625,
"learning_rate": 0.00024059130074348888,
"loss": 0.4048,
"step": 1404
},
{
"epoch": 2.110944527736132,
"grad_norm": 0.27734375,
"learning_rate": 0.0002402659202613619,
"loss": 0.3692,
"step": 1408
},
{
"epoch": 2.1169415292353824,
"grad_norm": 0.30859375,
"learning_rate": 0.00023993987253720896,
"loss": 0.418,
"step": 1412
},
{
"epoch": 2.1229385307346327,
"grad_norm": 0.283203125,
"learning_rate": 0.00023961315998116158,
"loss": 0.4435,
"step": 1416
},
{
"epoch": 2.128935532233883,
"grad_norm": 0.259765625,
"learning_rate": 0.0002392857850082657,
"loss": 0.4075,
"step": 1420
},
{
"epoch": 2.1349325337331333,
"grad_norm": 0.3203125,
"learning_rate": 0.00023895775003846388,
"loss": 0.4119,
"step": 1424
},
{
"epoch": 2.1409295352323836,
"grad_norm": 0.28515625,
"learning_rate": 0.00023862905749657743,
"loss": 0.3709,
"step": 1428
},
{
"epoch": 2.1469265367316344,
"grad_norm": 0.310546875,
"learning_rate": 0.0002382997098122882,
"loss": 0.379,
"step": 1432
},
{
"epoch": 2.1529235382308847,
"grad_norm": 0.291015625,
"learning_rate": 0.0002379697094201209,
"loss": 0.3731,
"step": 1436
},
{
"epoch": 2.158920539730135,
"grad_norm": 0.2890625,
"learning_rate": 0.00023763905875942516,
"loss": 0.3762,
"step": 1440
},
{
"epoch": 2.1649175412293853,
"grad_norm": 0.287109375,
"learning_rate": 0.0002373077602743572,
"loss": 0.4093,
"step": 1444
},
{
"epoch": 2.1709145427286356,
"grad_norm": 0.302734375,
"learning_rate": 0.00023697581641386208,
"loss": 0.3765,
"step": 1448
},
{
"epoch": 2.176911544227886,
"grad_norm": 0.29296875,
"learning_rate": 0.00023664322963165527,
"loss": 0.4056,
"step": 1452
},
{
"epoch": 2.1829085457271367,
"grad_norm": 0.3125,
"learning_rate": 0.00023631000238620483,
"loss": 0.4,
"step": 1456
},
{
"epoch": 2.188905547226387,
"grad_norm": 0.298828125,
"learning_rate": 0.00023597613714071308,
"loss": 0.4249,
"step": 1460
},
{
"epoch": 2.1949025487256373,
"grad_norm": 0.296875,
"learning_rate": 0.00023564163636309837,
"loss": 0.3966,
"step": 1464
},
{
"epoch": 2.2008995502248876,
"grad_norm": 0.28515625,
"learning_rate": 0.00023530650252597693,
"loss": 0.3794,
"step": 1468
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.30078125,
"learning_rate": 0.00023497073810664442,
"loss": 0.4001,
"step": 1472
},
{
"epoch": 2.212893553223388,
"grad_norm": 0.310546875,
"learning_rate": 0.00023463434558705792,
"loss": 0.4304,
"step": 1476
},
{
"epoch": 2.2188905547226385,
"grad_norm": 0.298828125,
"learning_rate": 0.00023429732745381733,
"loss": 0.3824,
"step": 1480
},
{
"epoch": 2.224887556221889,
"grad_norm": 0.298828125,
"learning_rate": 0.00023395968619814692,
"loss": 0.3911,
"step": 1484
},
{
"epoch": 2.2308845577211396,
"grad_norm": 0.302734375,
"learning_rate": 0.00023362142431587727,
"loss": 0.3931,
"step": 1488
},
{
"epoch": 2.23688155922039,
"grad_norm": 0.28515625,
"learning_rate": 0.0002332825443074265,
"loss": 0.4401,
"step": 1492
},
{
"epoch": 2.24287856071964,
"grad_norm": 0.3046875,
"learning_rate": 0.00023294304867778183,
"loss": 0.3967,
"step": 1496
},
{
"epoch": 2.2488755622188905,
"grad_norm": 0.2890625,
"learning_rate": 0.00023260293993648126,
"loss": 0.4004,
"step": 1500
},
{
"epoch": 2.254872563718141,
"grad_norm": 0.294921875,
"learning_rate": 0.00023226222059759486,
"loss": 0.3928,
"step": 1504
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.275390625,
"learning_rate": 0.00023192089317970616,
"loss": 0.3957,
"step": 1508
},
{
"epoch": 2.266866566716642,
"grad_norm": 0.291015625,
"learning_rate": 0.00023157896020589353,
"loss": 0.4173,
"step": 1512
},
{
"epoch": 2.272863568215892,
"grad_norm": 0.302734375,
"learning_rate": 0.00023123642420371177,
"loss": 0.4401,
"step": 1516
},
{
"epoch": 2.2788605697151425,
"grad_norm": 0.283203125,
"learning_rate": 0.0002308932877051731,
"loss": 0.4012,
"step": 1520
},
{
"epoch": 2.284857571214393,
"grad_norm": 0.296875,
"learning_rate": 0.0002305495532467286,
"loss": 0.4244,
"step": 1524
},
{
"epoch": 2.290854572713643,
"grad_norm": 0.310546875,
"learning_rate": 0.00023020522336924943,
"loss": 0.4158,
"step": 1528
},
{
"epoch": 2.2968515742128934,
"grad_norm": 0.318359375,
"learning_rate": 0.00022986030061800816,
"loss": 0.4394,
"step": 1532
},
{
"epoch": 2.3028485757121437,
"grad_norm": 0.31640625,
"learning_rate": 0.00022951478754265977,
"loss": 0.3715,
"step": 1536
},
{
"epoch": 2.3088455772113945,
"grad_norm": 0.302734375,
"learning_rate": 0.00022916868669722293,
"loss": 0.3814,
"step": 1540
},
{
"epoch": 2.314842578710645,
"grad_norm": 0.296875,
"learning_rate": 0.00022882200064006097,
"loss": 0.3815,
"step": 1544
},
{
"epoch": 2.320839580209895,
"grad_norm": 0.30859375,
"learning_rate": 0.00022847473193386334,
"loss": 0.3833,
"step": 1548
},
{
"epoch": 2.3268365817091454,
"grad_norm": 0.27734375,
"learning_rate": 0.00022812688314562615,
"loss": 0.3981,
"step": 1552
},
{
"epoch": 2.3328335832083957,
"grad_norm": 0.298828125,
"learning_rate": 0.0002277784568466336,
"loss": 0.4014,
"step": 1556
},
{
"epoch": 2.338830584707646,
"grad_norm": 0.29296875,
"learning_rate": 0.0002274294556124387,
"loss": 0.413,
"step": 1560
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.296875,
"learning_rate": 0.00022707988202284453,
"loss": 0.4232,
"step": 1564
},
{
"epoch": 2.350824587706147,
"grad_norm": 0.310546875,
"learning_rate": 0.00022672973866188484,
"loss": 0.4016,
"step": 1568
},
{
"epoch": 2.3568215892053974,
"grad_norm": 0.310546875,
"learning_rate": 0.0002263790281178052,
"loss": 0.4247,
"step": 1572
},
{
"epoch": 2.3628185907046477,
"grad_norm": 0.3046875,
"learning_rate": 0.00022602775298304374,
"loss": 0.393,
"step": 1576
},
{
"epoch": 2.368815592203898,
"grad_norm": 0.3125,
"learning_rate": 0.00022567591585421202,
"loss": 0.3931,
"step": 1580
},
{
"epoch": 2.3748125937031483,
"grad_norm": 0.3125,
"learning_rate": 0.00022532351933207584,
"loss": 0.3926,
"step": 1584
},
{
"epoch": 2.3808095952023987,
"grad_norm": 0.3203125,
"learning_rate": 0.00022497056602153602,
"loss": 0.3971,
"step": 1588
},
{
"epoch": 2.386806596701649,
"grad_norm": 0.306640625,
"learning_rate": 0.00022461705853160912,
"loss": 0.4126,
"step": 1592
},
{
"epoch": 2.3928035982008997,
"grad_norm": 0.291015625,
"learning_rate": 0.00022426299947540825,
"loss": 0.3858,
"step": 1596
},
{
"epoch": 2.39880059970015,
"grad_norm": 0.2890625,
"learning_rate": 0.00022390839147012353,
"loss": 0.4325,
"step": 1600
},
{
"epoch": 2.4047976011994003,
"grad_norm": 0.28515625,
"learning_rate": 0.00022355323713700302,
"loss": 0.3314,
"step": 1604
},
{
"epoch": 2.4107946026986506,
"grad_norm": 0.306640625,
"learning_rate": 0.00022319753910133314,
"loss": 0.4244,
"step": 1608
},
{
"epoch": 2.416791604197901,
"grad_norm": 0.310546875,
"learning_rate": 0.0002228412999924194,
"loss": 0.4494,
"step": 1612
},
{
"epoch": 2.4227886056971513,
"grad_norm": 0.3125,
"learning_rate": 0.00022248452244356677,
"loss": 0.4027,
"step": 1616
},
{
"epoch": 2.428785607196402,
"grad_norm": 0.318359375,
"learning_rate": 0.00022212720909206056,
"loss": 0.4296,
"step": 1620
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.30859375,
"learning_rate": 0.00022176936257914647,
"loss": 0.377,
"step": 1624
},
{
"epoch": 2.4407796101949026,
"grad_norm": 0.3046875,
"learning_rate": 0.0002214109855500115,
"loss": 0.4368,
"step": 1628
},
{
"epoch": 2.446776611694153,
"grad_norm": 0.302734375,
"learning_rate": 0.00022105208065376417,
"loss": 0.4073,
"step": 1632
},
{
"epoch": 2.4527736131934033,
"grad_norm": 0.322265625,
"learning_rate": 0.0002206926505434148,
"loss": 0.4051,
"step": 1636
},
{
"epoch": 2.4587706146926536,
"grad_norm": 0.30078125,
"learning_rate": 0.00022033269787585634,
"loss": 0.4175,
"step": 1640
},
{
"epoch": 2.464767616191904,
"grad_norm": 0.291015625,
"learning_rate": 0.00021997222531184427,
"loss": 0.4093,
"step": 1644
},
{
"epoch": 2.470764617691154,
"grad_norm": 0.314453125,
"learning_rate": 0.0002196112355159772,
"loss": 0.4557,
"step": 1648
},
{
"epoch": 2.476761619190405,
"grad_norm": 0.306640625,
"learning_rate": 0.000219249731156677,
"loss": 0.3951,
"step": 1652
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.30859375,
"learning_rate": 0.00021888771490616936,
"loss": 0.4413,
"step": 1656
},
{
"epoch": 2.4887556221889056,
"grad_norm": 0.275390625,
"learning_rate": 0.0002185251894404637,
"loss": 0.3882,
"step": 1660
},
{
"epoch": 2.494752623688156,
"grad_norm": 0.296875,
"learning_rate": 0.00021816215743933359,
"loss": 0.4303,
"step": 1664
},
{
"epoch": 2.500749625187406,
"grad_norm": 0.31640625,
"learning_rate": 0.0002177986215862968,
"loss": 0.3868,
"step": 1668
},
{
"epoch": 2.506746626686657,
"grad_norm": 0.3125,
"learning_rate": 0.0002174345845685957,
"loss": 0.4185,
"step": 1672
},
{
"epoch": 2.5127436281859072,
"grad_norm": 0.298828125,
"learning_rate": 0.00021707004907717717,
"loss": 0.4411,
"step": 1676
},
{
"epoch": 2.5187406296851576,
"grad_norm": 0.30859375,
"learning_rate": 0.00021670501780667284,
"loss": 0.449,
"step": 1680
},
{
"epoch": 2.524737631184408,
"grad_norm": 0.3046875,
"learning_rate": 0.00021633949345537895,
"loss": 0.4258,
"step": 1684
},
{
"epoch": 2.530734632683658,
"grad_norm": 0.2890625,
"learning_rate": 0.0002159734787252368,
"loss": 0.4221,
"step": 1688
},
{
"epoch": 2.5367316341829085,
"grad_norm": 0.2890625,
"learning_rate": 0.00021560697632181243,
"loss": 0.3824,
"step": 1692
},
{
"epoch": 2.542728635682159,
"grad_norm": 0.3125,
"learning_rate": 0.00021523998895427675,
"loss": 0.4164,
"step": 1696
},
{
"epoch": 2.548725637181409,
"grad_norm": 0.29296875,
"learning_rate": 0.00021487251933538547,
"loss": 0.3595,
"step": 1700
},
{
"epoch": 2.5547226386806594,
"grad_norm": 0.31640625,
"learning_rate": 0.00021450457018145925,
"loss": 0.3977,
"step": 1704
},
{
"epoch": 2.56071964017991,
"grad_norm": 0.298828125,
"learning_rate": 0.00021413614421236313,
"loss": 0.4427,
"step": 1708
},
{
"epoch": 2.5667166416791605,
"grad_norm": 0.263671875,
"learning_rate": 0.00021376724415148718,
"loss": 0.3741,
"step": 1712
},
{
"epoch": 2.572713643178411,
"grad_norm": 0.30859375,
"learning_rate": 0.00021339787272572555,
"loss": 0.3822,
"step": 1716
},
{
"epoch": 2.578710644677661,
"grad_norm": 0.30859375,
"learning_rate": 0.00021302803266545696,
"loss": 0.4308,
"step": 1720
},
{
"epoch": 2.5847076461769114,
"grad_norm": 0.322265625,
"learning_rate": 0.00021265772670452402,
"loss": 0.3995,
"step": 1724
},
{
"epoch": 2.590704647676162,
"grad_norm": 0.302734375,
"learning_rate": 0.0002122869575802135,
"loss": 0.3994,
"step": 1728
},
{
"epoch": 2.5967016491754125,
"grad_norm": 0.57421875,
"learning_rate": 0.00021191572803323571,
"loss": 0.3803,
"step": 1732
},
{
"epoch": 2.6026986506746628,
"grad_norm": 0.314453125,
"learning_rate": 0.00021154404080770447,
"loss": 0.4211,
"step": 1736
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.314453125,
"learning_rate": 0.00021117189865111664,
"loss": 0.4121,
"step": 1740
},
{
"epoch": 2.6146926536731634,
"grad_norm": 0.306640625,
"learning_rate": 0.00021079930431433197,
"loss": 0.3982,
"step": 1744
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.30078125,
"learning_rate": 0.00021042626055155266,
"loss": 0.4339,
"step": 1748
},
{
"epoch": 2.626686656671664,
"grad_norm": 0.294921875,
"learning_rate": 0.00021005277012030324,
"loss": 0.4151,
"step": 1752
},
{
"epoch": 2.6326836581709143,
"grad_norm": 0.322265625,
"learning_rate": 0.00020967883578140966,
"loss": 0.3805,
"step": 1756
},
{
"epoch": 2.638680659670165,
"grad_norm": 0.318359375,
"learning_rate": 0.0002093044602989796,
"loss": 0.4125,
"step": 1760
},
{
"epoch": 2.6446776611694154,
"grad_norm": 0.318359375,
"learning_rate": 0.0002089296464403813,
"loss": 0.4266,
"step": 1764
},
{
"epoch": 2.6506746626686657,
"grad_norm": 0.294921875,
"learning_rate": 0.00020855439697622374,
"loss": 0.4417,
"step": 1768
},
{
"epoch": 2.656671664167916,
"grad_norm": 0.294921875,
"learning_rate": 0.00020817871468033566,
"loss": 0.4165,
"step": 1772
},
{
"epoch": 2.6626686656671663,
"grad_norm": 0.291015625,
"learning_rate": 0.00020780260232974545,
"loss": 0.4082,
"step": 1776
},
{
"epoch": 2.668665667166417,
"grad_norm": 0.318359375,
"learning_rate": 0.00020742606270466026,
"loss": 0.4115,
"step": 1780
},
{
"epoch": 2.6746626686656674,
"grad_norm": 0.3203125,
"learning_rate": 0.0002070490985884459,
"loss": 0.3905,
"step": 1784
},
{
"epoch": 2.6806596701649177,
"grad_norm": 0.3046875,
"learning_rate": 0.00020667171276760567,
"loss": 0.3935,
"step": 1788
},
{
"epoch": 2.686656671664168,
"grad_norm": 0.326171875,
"learning_rate": 0.00020629390803176046,
"loss": 0.4366,
"step": 1792
},
{
"epoch": 2.6926536731634183,
"grad_norm": 0.3046875,
"learning_rate": 0.0002059156871736274,
"loss": 0.4184,
"step": 1796
},
{
"epoch": 2.6986506746626686,
"grad_norm": 0.310546875,
"learning_rate": 0.0002055370529889999,
"loss": 0.395,
"step": 1800
},
{
"epoch": 2.704647676161919,
"grad_norm": 0.27734375,
"learning_rate": 0.00020515800827672638,
"loss": 0.3656,
"step": 1804
},
{
"epoch": 2.7106446776611692,
"grad_norm": 0.2890625,
"learning_rate": 0.00020477855583869015,
"loss": 0.4209,
"step": 1808
},
{
"epoch": 2.7166416791604195,
"grad_norm": 0.3046875,
"learning_rate": 0.0002043986984797881,
"loss": 0.4143,
"step": 1812
},
{
"epoch": 2.7226386806596703,
"grad_norm": 0.3203125,
"learning_rate": 0.00020401843900791055,
"loss": 0.4105,
"step": 1816
},
{
"epoch": 2.7286356821589206,
"grad_norm": 0.30078125,
"learning_rate": 0.00020363778023392,
"loss": 0.4174,
"step": 1820
},
{
"epoch": 2.734632683658171,
"grad_norm": 0.291015625,
"learning_rate": 0.00020325672497163087,
"loss": 0.4063,
"step": 1824
},
{
"epoch": 2.7406296851574212,
"grad_norm": 0.298828125,
"learning_rate": 0.00020287527603778804,
"loss": 0.4233,
"step": 1828
},
{
"epoch": 2.7466266866566715,
"grad_norm": 0.31640625,
"learning_rate": 0.0002024934362520467,
"loss": 0.4659,
"step": 1832
},
{
"epoch": 2.7526236881559223,
"grad_norm": 0.314453125,
"learning_rate": 0.000202111208436951,
"loss": 0.4075,
"step": 1836
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.296875,
"learning_rate": 0.00020172859541791352,
"loss": 0.4011,
"step": 1840
},
{
"epoch": 2.764617691154423,
"grad_norm": 0.326171875,
"learning_rate": 0.00020134560002319418,
"loss": 0.4006,
"step": 1844
},
{
"epoch": 2.770614692653673,
"grad_norm": 0.30078125,
"learning_rate": 0.00020096222508387938,
"loss": 0.4012,
"step": 1848
},
{
"epoch": 2.7766116941529235,
"grad_norm": 0.33984375,
"learning_rate": 0.00020057847343386124,
"loss": 0.4657,
"step": 1852
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.306640625,
"learning_rate": 0.0002001943479098163,
"loss": 0.3579,
"step": 1856
},
{
"epoch": 2.788605697151424,
"grad_norm": 0.3046875,
"learning_rate": 0.0001998098513511849,
"loss": 0.4232,
"step": 1860
},
{
"epoch": 2.7946026986506745,
"grad_norm": 0.3125,
"learning_rate": 0.0001994249866001501,
"loss": 0.4228,
"step": 1864
},
{
"epoch": 2.8005997001499248,
"grad_norm": 0.302734375,
"learning_rate": 0.00019903975650161648,
"loss": 0.4214,
"step": 1868
},
{
"epoch": 2.8065967016491755,
"grad_norm": 0.326171875,
"learning_rate": 0.00019865416390318935,
"loss": 0.4308,
"step": 1872
},
{
"epoch": 2.812593703148426,
"grad_norm": 0.279296875,
"learning_rate": 0.0001982682116551536,
"loss": 0.3585,
"step": 1876
},
{
"epoch": 2.818590704647676,
"grad_norm": 0.30859375,
"learning_rate": 0.00019788190261045248,
"loss": 0.4224,
"step": 1880
},
{
"epoch": 2.8245877061469264,
"grad_norm": 0.326171875,
"learning_rate": 0.000197495239624667,
"loss": 0.4206,
"step": 1884
},
{
"epoch": 2.8305847076461768,
"grad_norm": 0.322265625,
"learning_rate": 0.00019710822555599417,
"loss": 0.4052,
"step": 1888
},
{
"epoch": 2.8365817091454275,
"grad_norm": 0.294921875,
"learning_rate": 0.00019672086326522634,
"loss": 0.399,
"step": 1892
},
{
"epoch": 2.842578710644678,
"grad_norm": 0.31640625,
"learning_rate": 0.0001963331556157298,
"loss": 0.387,
"step": 1896
},
{
"epoch": 2.848575712143928,
"grad_norm": 0.318359375,
"learning_rate": 0.0001959451054734239,
"loss": 0.3893,
"step": 1900
},
{
"epoch": 2.8545727136431784,
"grad_norm": 0.302734375,
"learning_rate": 0.00019555671570675953,
"loss": 0.3967,
"step": 1904
},
{
"epoch": 2.8605697151424287,
"grad_norm": 0.31640625,
"learning_rate": 0.00019516798918669807,
"loss": 0.4241,
"step": 1908
},
{
"epoch": 2.866566716641679,
"grad_norm": 0.3203125,
"learning_rate": 0.00019477892878669021,
"loss": 0.4166,
"step": 1912
},
{
"epoch": 2.8725637181409294,
"grad_norm": 0.30859375,
"learning_rate": 0.00019438953738265479,
"loss": 0.3727,
"step": 1916
},
{
"epoch": 2.8785607196401797,
"grad_norm": 0.30859375,
"learning_rate": 0.0001939998178529571,
"loss": 0.3908,
"step": 1920
},
{
"epoch": 2.8845577211394304,
"grad_norm": 0.33203125,
"learning_rate": 0.00019360977307838833,
"loss": 0.3843,
"step": 1924
},
{
"epoch": 2.8905547226386807,
"grad_norm": 0.330078125,
"learning_rate": 0.0001932194059421435,
"loss": 0.4424,
"step": 1928
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.3046875,
"learning_rate": 0.0001928287193298007,
"loss": 0.3926,
"step": 1932
},
{
"epoch": 2.9025487256371814,
"grad_norm": 0.341796875,
"learning_rate": 0.00019243771612929955,
"loss": 0.4391,
"step": 1936
},
{
"epoch": 2.9085457271364317,
"grad_norm": 0.314453125,
"learning_rate": 0.0001920463992309199,
"loss": 0.4248,
"step": 1940
},
{
"epoch": 2.9145427286356824,
"grad_norm": 0.298828125,
"learning_rate": 0.00019165477152726035,
"loss": 0.4236,
"step": 1944
},
{
"epoch": 2.9205397301349327,
"grad_norm": 0.314453125,
"learning_rate": 0.0001912628359132171,
"loss": 0.4503,
"step": 1948
},
{
"epoch": 2.926536731634183,
"grad_norm": 0.326171875,
"learning_rate": 0.00019087059528596223,
"loss": 0.4249,
"step": 1952
},
{
"epoch": 2.9325337331334334,
"grad_norm": 0.349609375,
"learning_rate": 0.00019047805254492265,
"loss": 0.4145,
"step": 1956
},
{
"epoch": 2.9385307346326837,
"grad_norm": 0.30859375,
"learning_rate": 0.0001900852105917584,
"loss": 0.3811,
"step": 1960
},
{
"epoch": 2.944527736131934,
"grad_norm": 0.27734375,
"learning_rate": 0.00018969207233034127,
"loss": 0.3733,
"step": 1964
},
{
"epoch": 2.9505247376311843,
"grad_norm": 0.318359375,
"learning_rate": 0.0001892986406667333,
"loss": 0.4685,
"step": 1968
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.31640625,
"learning_rate": 0.0001889049185091655,
"loss": 0.4259,
"step": 1972
},
{
"epoch": 2.962518740629685,
"grad_norm": 0.30078125,
"learning_rate": 0.00018851090876801605,
"loss": 0.4425,
"step": 1976
},
{
"epoch": 2.9685157421289357,
"grad_norm": 0.30078125,
"learning_rate": 0.00018811661435578903,
"loss": 0.3932,
"step": 1980
},
{
"epoch": 2.974512743628186,
"grad_norm": 0.310546875,
"learning_rate": 0.00018772203818709273,
"loss": 0.4028,
"step": 1984
},
{
"epoch": 2.9805097451274363,
"grad_norm": 0.318359375,
"learning_rate": 0.0001873271831786183,
"loss": 0.4215,
"step": 1988
},
{
"epoch": 2.9865067466266866,
"grad_norm": 0.306640625,
"learning_rate": 0.00018693205224911777,
"loss": 0.4076,
"step": 1992
},
{
"epoch": 2.992503748125937,
"grad_norm": 0.291015625,
"learning_rate": 0.00018653664831938318,
"loss": 0.4261,
"step": 1996
},
{
"epoch": 2.9985007496251876,
"grad_norm": 0.3046875,
"learning_rate": 0.00018614097431222425,
"loss": 0.4096,
"step": 2000
},
{
"epoch": 3.004497751124438,
"grad_norm": 0.30859375,
"learning_rate": 0.00018574503315244722,
"loss": 0.3218,
"step": 2004
},
{
"epoch": 3.0104947526236883,
"grad_norm": 0.26953125,
"learning_rate": 0.0001853488277668331,
"loss": 0.2858,
"step": 2008
},
{
"epoch": 3.0164917541229386,
"grad_norm": 0.326171875,
"learning_rate": 0.0001849523610841161,
"loss": 0.33,
"step": 2012
},
{
"epoch": 3.022488755622189,
"grad_norm": 0.3046875,
"learning_rate": 0.00018455563603496185,
"loss": 0.2721,
"step": 2016
},
{
"epoch": 3.028485757121439,
"grad_norm": 0.291015625,
"learning_rate": 0.0001841586555519458,
"loss": 0.3042,
"step": 2020
},
{
"epoch": 3.0344827586206895,
"grad_norm": 0.314453125,
"learning_rate": 0.00018376142256953167,
"loss": 0.3035,
"step": 2024
},
{
"epoch": 3.04047976011994,
"grad_norm": 0.296875,
"learning_rate": 0.00018336394002404954,
"loss": 0.2887,
"step": 2028
},
{
"epoch": 3.0464767616191906,
"grad_norm": 0.294921875,
"learning_rate": 0.00018296621085367424,
"loss": 0.2429,
"step": 2032
},
{
"epoch": 3.052473763118441,
"grad_norm": 0.298828125,
"learning_rate": 0.00018256823799840376,
"loss": 0.295,
"step": 2036
},
{
"epoch": 3.058470764617691,
"grad_norm": 0.294921875,
"learning_rate": 0.00018217002440003733,
"loss": 0.2938,
"step": 2040
},
{
"epoch": 3.0644677661169415,
"grad_norm": 0.30078125,
"learning_rate": 0.00018177157300215365,
"loss": 0.2914,
"step": 2044
},
{
"epoch": 3.070464767616192,
"grad_norm": 0.30859375,
"learning_rate": 0.00018137288675008938,
"loss": 0.33,
"step": 2048
},
{
"epoch": 3.076461769115442,
"grad_norm": 0.298828125,
"learning_rate": 0.00018097396859091715,
"loss": 0.2802,
"step": 2052
},
{
"epoch": 3.082458770614693,
"grad_norm": 0.32421875,
"learning_rate": 0.00018057482147342379,
"loss": 0.2736,
"step": 2056
},
{
"epoch": 3.088455772113943,
"grad_norm": 0.322265625,
"learning_rate": 0.0001801754483480887,
"loss": 0.3102,
"step": 2060
},
{
"epoch": 3.0944527736131935,
"grad_norm": 0.30078125,
"learning_rate": 0.0001797758521670617,
"loss": 0.3081,
"step": 2064
},
{
"epoch": 3.100449775112444,
"grad_norm": 0.310546875,
"learning_rate": 0.00017937603588414177,
"loss": 0.3164,
"step": 2068
},
{
"epoch": 3.106446776611694,
"grad_norm": 0.32421875,
"learning_rate": 0.00017897600245475454,
"loss": 0.3019,
"step": 2072
},
{
"epoch": 3.1124437781109444,
"grad_norm": 0.29296875,
"learning_rate": 0.0001785757548359309,
"loss": 0.2853,
"step": 2076
},
{
"epoch": 3.1184407796101947,
"grad_norm": 0.31640625,
"learning_rate": 0.00017817529598628513,
"loss": 0.2779,
"step": 2080
},
{
"epoch": 3.1244377811094455,
"grad_norm": 0.314453125,
"learning_rate": 0.00017777462886599276,
"loss": 0.3017,
"step": 2084
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.3359375,
"learning_rate": 0.00017737375643676895,
"loss": 0.3012,
"step": 2088
},
{
"epoch": 3.136431784107946,
"grad_norm": 0.30078125,
"learning_rate": 0.0001769726816618464,
"loss": 0.2831,
"step": 2092
},
{
"epoch": 3.1424287856071964,
"grad_norm": 0.30078125,
"learning_rate": 0.00017657140750595366,
"loss": 0.2922,
"step": 2096
},
{
"epoch": 3.1484257871064467,
"grad_norm": 0.3203125,
"learning_rate": 0.00017616993693529302,
"loss": 0.3342,
"step": 2100
},
{
"epoch": 3.154422788605697,
"grad_norm": 0.328125,
"learning_rate": 0.00017576827291751864,
"loss": 0.2842,
"step": 2104
},
{
"epoch": 3.1604197901049473,
"grad_norm": 0.314453125,
"learning_rate": 0.00017536641842171472,
"loss": 0.3514,
"step": 2108
},
{
"epoch": 3.166416791604198,
"grad_norm": 0.302734375,
"learning_rate": 0.0001749643764183734,
"loss": 0.3121,
"step": 2112
},
{
"epoch": 3.1724137931034484,
"grad_norm": 0.3203125,
"learning_rate": 0.00017456214987937282,
"loss": 0.3121,
"step": 2116
},
{
"epoch": 3.1784107946026987,
"grad_norm": 0.314453125,
"learning_rate": 0.00017415974177795534,
"loss": 0.3049,
"step": 2120
},
{
"epoch": 3.184407796101949,
"grad_norm": 0.306640625,
"learning_rate": 0.0001737571550887053,
"loss": 0.293,
"step": 2124
},
{
"epoch": 3.1904047976011993,
"grad_norm": 0.341796875,
"learning_rate": 0.00017335439278752727,
"loss": 0.3108,
"step": 2128
},
{
"epoch": 3.1964017991004496,
"grad_norm": 0.306640625,
"learning_rate": 0.00017295145785162377,
"loss": 0.2983,
"step": 2132
},
{
"epoch": 3.2023988005997,
"grad_norm": 0.32421875,
"learning_rate": 0.00017254835325947364,
"loss": 0.3318,
"step": 2136
},
{
"epoch": 3.2083958020989507,
"grad_norm": 0.291015625,
"learning_rate": 0.00017214508199080953,
"loss": 0.3164,
"step": 2140
},
{
"epoch": 3.214392803598201,
"grad_norm": 0.33984375,
"learning_rate": 0.00017174164702659647,
"loss": 0.3074,
"step": 2144
},
{
"epoch": 3.2203898050974513,
"grad_norm": 0.310546875,
"learning_rate": 0.00017133805134900926,
"loss": 0.2884,
"step": 2148
},
{
"epoch": 3.2263868065967016,
"grad_norm": 0.314453125,
"learning_rate": 0.00017093429794141094,
"loss": 0.3038,
"step": 2152
},
{
"epoch": 3.232383808095952,
"grad_norm": 0.361328125,
"learning_rate": 0.00017053038978833018,
"loss": 0.3217,
"step": 2156
},
{
"epoch": 3.2383808095952022,
"grad_norm": 0.333984375,
"learning_rate": 0.0001701263298754398,
"loss": 0.3117,
"step": 2160
},
{
"epoch": 3.244377811094453,
"grad_norm": 0.341796875,
"learning_rate": 0.00016972212118953426,
"loss": 0.2811,
"step": 2164
},
{
"epoch": 3.2503748125937033,
"grad_norm": 0.326171875,
"learning_rate": 0.00016931776671850785,
"loss": 0.2991,
"step": 2168
},
{
"epoch": 3.2563718140929536,
"grad_norm": 0.3046875,
"learning_rate": 0.00016891326945133237,
"loss": 0.3019,
"step": 2172
},
{
"epoch": 3.262368815592204,
"grad_norm": 0.326171875,
"learning_rate": 0.00016850863237803527,
"loss": 0.3305,
"step": 2176
},
{
"epoch": 3.2683658170914542,
"grad_norm": 0.32421875,
"learning_rate": 0.0001681038584896774,
"loss": 0.3355,
"step": 2180
},
{
"epoch": 3.2743628185907045,
"grad_norm": 0.31640625,
"learning_rate": 0.0001676989507783309,
"loss": 0.3139,
"step": 2184
},
{
"epoch": 3.280359820089955,
"grad_norm": 0.3359375,
"learning_rate": 0.00016729391223705727,
"loss": 0.2821,
"step": 2188
},
{
"epoch": 3.286356821589205,
"grad_norm": 0.27734375,
"learning_rate": 0.0001668887458598849,
"loss": 0.2992,
"step": 2192
},
{
"epoch": 3.292353823088456,
"grad_norm": 0.3203125,
"learning_rate": 0.00016648345464178723,
"loss": 0.3048,
"step": 2196
},
{
"epoch": 3.2983508245877062,
"grad_norm": 0.31640625,
"learning_rate": 0.00016607804157866066,
"loss": 0.3044,
"step": 2200
},
{
"epoch": 3.3043478260869565,
"grad_norm": 0.328125,
"learning_rate": 0.00016567250966730197,
"loss": 0.298,
"step": 2204
},
{
"epoch": 3.310344827586207,
"grad_norm": 0.3359375,
"learning_rate": 0.00016526686190538678,
"loss": 0.2494,
"step": 2208
},
{
"epoch": 3.316341829085457,
"grad_norm": 0.28125,
"learning_rate": 0.00016486110129144675,
"loss": 0.2682,
"step": 2212
},
{
"epoch": 3.3223388305847075,
"grad_norm": 0.30859375,
"learning_rate": 0.00016445523082484802,
"loss": 0.3378,
"step": 2216
},
{
"epoch": 3.3283358320839582,
"grad_norm": 0.33984375,
"learning_rate": 0.00016404925350576858,
"loss": 0.271,
"step": 2220
},
{
"epoch": 3.3343328335832085,
"grad_norm": 0.353515625,
"learning_rate": 0.00016364317233517637,
"loss": 0.326,
"step": 2224
},
{
"epoch": 3.340329835082459,
"grad_norm": 0.326171875,
"learning_rate": 0.00016323699031480686,
"loss": 0.3056,
"step": 2228
},
{
"epoch": 3.346326836581709,
"grad_norm": 0.3515625,
"learning_rate": 0.00016283071044714123,
"loss": 0.3266,
"step": 2232
},
{
"epoch": 3.3523238380809595,
"grad_norm": 0.318359375,
"learning_rate": 0.0001624243357353837,
"loss": 0.3001,
"step": 2236
},
{
"epoch": 3.3583208395802098,
"grad_norm": 0.333984375,
"learning_rate": 0.0001620178691834397,
"loss": 0.3,
"step": 2240
},
{
"epoch": 3.36431784107946,
"grad_norm": 0.302734375,
"learning_rate": 0.00016161131379589355,
"loss": 0.3292,
"step": 2244
},
{
"epoch": 3.370314842578711,
"grad_norm": 0.34765625,
"learning_rate": 0.00016120467257798614,
"loss": 0.3232,
"step": 2248
},
{
"epoch": 3.376311844077961,
"grad_norm": 0.326171875,
"learning_rate": 0.000160797948535593,
"loss": 0.3401,
"step": 2252
},
{
"epoch": 3.3823088455772115,
"grad_norm": 0.353515625,
"learning_rate": 0.00016039114467520163,
"loss": 0.2963,
"step": 2256
},
{
"epoch": 3.3883058470764618,
"grad_norm": 0.306640625,
"learning_rate": 0.00015998426400388977,
"loss": 0.3083,
"step": 2260
},
{
"epoch": 3.394302848575712,
"grad_norm": 0.32421875,
"learning_rate": 0.00015957730952930284,
"loss": 0.3113,
"step": 2264
},
{
"epoch": 3.4002998500749624,
"grad_norm": 0.33984375,
"learning_rate": 0.00015917028425963185,
"loss": 0.3149,
"step": 2268
},
{
"epoch": 3.406296851574213,
"grad_norm": 0.314453125,
"learning_rate": 0.0001587631912035911,
"loss": 0.315,
"step": 2272
},
{
"epoch": 3.4122938530734634,
"grad_norm": 0.341796875,
"learning_rate": 0.00015835603337039592,
"loss": 0.2763,
"step": 2276
},
{
"epoch": 3.4182908545727138,
"grad_norm": 0.306640625,
"learning_rate": 0.00015794881376974054,
"loss": 0.3223,
"step": 2280
},
{
"epoch": 3.424287856071964,
"grad_norm": 0.322265625,
"learning_rate": 0.00015754153541177584,
"loss": 0.2963,
"step": 2284
},
{
"epoch": 3.4302848575712144,
"grad_norm": 0.349609375,
"learning_rate": 0.00015713420130708682,
"loss": 0.3092,
"step": 2288
},
{
"epoch": 3.4362818590704647,
"grad_norm": 0.298828125,
"learning_rate": 0.0001567268144666708,
"loss": 0.2752,
"step": 2292
},
{
"epoch": 3.442278860569715,
"grad_norm": 0.33203125,
"learning_rate": 0.00015631937790191468,
"loss": 0.2993,
"step": 2296
},
{
"epoch": 3.4482758620689653,
"grad_norm": 0.31640625,
"learning_rate": 0.00015591189462457313,
"loss": 0.3338,
"step": 2300
},
{
"epoch": 3.454272863568216,
"grad_norm": 0.326171875,
"learning_rate": 0.000155504367646746,
"loss": 0.322,
"step": 2304
},
{
"epoch": 3.4602698650674664,
"grad_norm": 0.33203125,
"learning_rate": 0.00015509679998085618,
"loss": 0.3167,
"step": 2308
},
{
"epoch": 3.4662668665667167,
"grad_norm": 0.33984375,
"learning_rate": 0.00015468919463962737,
"loss": 0.3199,
"step": 2312
},
{
"epoch": 3.472263868065967,
"grad_norm": 0.328125,
"learning_rate": 0.00015428155463606178,
"loss": 0.312,
"step": 2316
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.32421875,
"learning_rate": 0.00015387388298341767,
"loss": 0.3105,
"step": 2320
},
{
"epoch": 3.4842578710644676,
"grad_norm": 0.306640625,
"learning_rate": 0.00015346618269518753,
"loss": 0.3061,
"step": 2324
},
{
"epoch": 3.4902548725637184,
"grad_norm": 0.341796875,
"learning_rate": 0.0001530584567850753,
"loss": 0.3315,
"step": 2328
},
{
"epoch": 3.4962518740629687,
"grad_norm": 0.326171875,
"learning_rate": 0.00015265070826697442,
"loss": 0.2991,
"step": 2332
},
{
"epoch": 3.502248875562219,
"grad_norm": 0.3203125,
"learning_rate": 0.0001522429401549454,
"loss": 0.3368,
"step": 2336
},
{
"epoch": 3.5082458770614693,
"grad_norm": 0.36328125,
"learning_rate": 0.00015183515546319368,
"loss": 0.3422,
"step": 2340
},
{
"epoch": 3.5142428785607196,
"grad_norm": 0.337890625,
"learning_rate": 0.000151427357206047,
"loss": 0.3261,
"step": 2344
},
{
"epoch": 3.52023988005997,
"grad_norm": 0.326171875,
"learning_rate": 0.00015101954839793377,
"loss": 0.3051,
"step": 2348
},
{
"epoch": 3.52623688155922,
"grad_norm": 0.322265625,
"learning_rate": 0.00015061173205336003,
"loss": 0.3019,
"step": 2352
},
{
"epoch": 3.5322338830584705,
"grad_norm": 0.318359375,
"learning_rate": 0.00015020391118688778,
"loss": 0.3085,
"step": 2356
},
{
"epoch": 3.5382308845577213,
"grad_norm": 0.3359375,
"learning_rate": 0.00014979608881311222,
"loss": 0.323,
"step": 2360
},
{
"epoch": 3.5442278860569716,
"grad_norm": 0.341796875,
"learning_rate": 0.00014938826794663997,
"loss": 0.3158,
"step": 2364
},
{
"epoch": 3.550224887556222,
"grad_norm": 0.326171875,
"learning_rate": 0.0001489804516020662,
"loss": 0.3029,
"step": 2368
},
{
"epoch": 3.556221889055472,
"grad_norm": 0.328125,
"learning_rate": 0.000148572642793953,
"loss": 0.3353,
"step": 2372
},
{
"epoch": 3.5622188905547225,
"grad_norm": 0.33203125,
"learning_rate": 0.00014816484453680635,
"loss": 0.3086,
"step": 2376
},
{
"epoch": 3.5682158920539733,
"grad_norm": 0.3515625,
"learning_rate": 0.00014775705984505455,
"loss": 0.3599,
"step": 2380
},
{
"epoch": 3.5742128935532236,
"grad_norm": 0.3671875,
"learning_rate": 0.00014734929173302556,
"loss": 0.2845,
"step": 2384
},
{
"epoch": 3.580209895052474,
"grad_norm": 0.32421875,
"learning_rate": 0.00014694154321492466,
"loss": 0.3228,
"step": 2388
},
{
"epoch": 3.586206896551724,
"grad_norm": 0.33984375,
"learning_rate": 0.00014653381730481247,
"loss": 0.347,
"step": 2392
},
{
"epoch": 3.5922038980509745,
"grad_norm": 0.34765625,
"learning_rate": 0.0001461261170165823,
"loss": 0.3353,
"step": 2396
},
{
"epoch": 3.598200899550225,
"grad_norm": 0.330078125,
"learning_rate": 0.00014571844536393828,
"loss": 0.3423,
"step": 2400
},
{
"epoch": 3.604197901049475,
"grad_norm": 0.328125,
"learning_rate": 0.00014531080536037263,
"loss": 0.3268,
"step": 2404
},
{
"epoch": 3.6101949025487254,
"grad_norm": 0.33203125,
"learning_rate": 0.00014490320001914384,
"loss": 0.3282,
"step": 2408
},
{
"epoch": 3.6161919040479757,
"grad_norm": 0.33984375,
"learning_rate": 0.00014449563235325403,
"loss": 0.3233,
"step": 2412
},
{
"epoch": 3.6221889055472265,
"grad_norm": 0.328125,
"learning_rate": 0.0001440881053754269,
"loss": 0.277,
"step": 2416
},
{
"epoch": 3.628185907046477,
"grad_norm": 0.341796875,
"learning_rate": 0.00014368062209808532,
"loss": 0.34,
"step": 2420
},
{
"epoch": 3.634182908545727,
"grad_norm": 0.32421875,
"learning_rate": 0.0001432731855333292,
"loss": 0.3308,
"step": 2424
},
{
"epoch": 3.6401799100449774,
"grad_norm": 0.341796875,
"learning_rate": 0.00014286579869291315,
"loss": 0.3361,
"step": 2428
},
{
"epoch": 3.6461769115442277,
"grad_norm": 0.31640625,
"learning_rate": 0.00014245846458822416,
"loss": 0.2908,
"step": 2432
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.326171875,
"learning_rate": 0.00014205118623025943,
"loss": 0.3122,
"step": 2436
},
{
"epoch": 3.658170914542729,
"grad_norm": 0.32421875,
"learning_rate": 0.00014164396662960408,
"loss": 0.2552,
"step": 2440
},
{
"epoch": 3.664167916041979,
"grad_norm": 0.341796875,
"learning_rate": 0.00014123680879640893,
"loss": 0.3299,
"step": 2444
},
{
"epoch": 3.6701649175412294,
"grad_norm": 0.33203125,
"learning_rate": 0.00014082971574036815,
"loss": 0.3271,
"step": 2448
},
{
"epoch": 3.6761619190404797,
"grad_norm": 0.33203125,
"learning_rate": 0.00014042269047069718,
"loss": 0.2984,
"step": 2452
},
{
"epoch": 3.68215892053973,
"grad_norm": 0.345703125,
"learning_rate": 0.00014001573599611026,
"loss": 0.2954,
"step": 2456
},
{
"epoch": 3.6881559220389803,
"grad_norm": 0.345703125,
"learning_rate": 0.00013960885532479834,
"loss": 0.3048,
"step": 2460
},
{
"epoch": 3.6941529235382307,
"grad_norm": 0.341796875,
"learning_rate": 0.00013920205146440698,
"loss": 0.3506,
"step": 2464
},
{
"epoch": 3.7001499250374814,
"grad_norm": 0.29296875,
"learning_rate": 0.00013879532742201378,
"loss": 0.3517,
"step": 2468
},
{
"epoch": 3.7061469265367317,
"grad_norm": 0.3515625,
"learning_rate": 0.00013838868620410645,
"loss": 0.288,
"step": 2472
},
{
"epoch": 3.712143928035982,
"grad_norm": 0.310546875,
"learning_rate": 0.00013798213081656026,
"loss": 0.2907,
"step": 2476
},
{
"epoch": 3.7181409295352323,
"grad_norm": 0.357421875,
"learning_rate": 0.0001375756642646163,
"loss": 0.329,
"step": 2480
},
{
"epoch": 3.7241379310344827,
"grad_norm": 0.345703125,
"learning_rate": 0.00013716928955285874,
"loss": 0.3179,
"step": 2484
},
{
"epoch": 3.7301349325337334,
"grad_norm": 0.3203125,
"learning_rate": 0.0001367630096851931,
"loss": 0.287,
"step": 2488
},
{
"epoch": 3.7361319340329837,
"grad_norm": 0.30859375,
"learning_rate": 0.00013635682766482363,
"loss": 0.2958,
"step": 2492
},
{
"epoch": 3.742128935532234,
"grad_norm": 0.349609375,
"learning_rate": 0.00013595074649423144,
"loss": 0.3526,
"step": 2496
},
{
"epoch": 3.7481259370314843,
"grad_norm": 0.3203125,
"learning_rate": 0.00013554476917515199,
"loss": 0.2866,
"step": 2500
},
{
"epoch": 3.7541229385307346,
"grad_norm": 0.30078125,
"learning_rate": 0.00013513889870855322,
"loss": 0.335,
"step": 2504
},
{
"epoch": 3.760119940029985,
"grad_norm": 0.328125,
"learning_rate": 0.00013473313809461324,
"loss": 0.3568,
"step": 2508
},
{
"epoch": 3.7661169415292353,
"grad_norm": 0.408203125,
"learning_rate": 0.00013432749033269798,
"loss": 0.3101,
"step": 2512
},
{
"epoch": 3.7721139430284856,
"grad_norm": 0.33203125,
"learning_rate": 0.00013392195842133934,
"loss": 0.3066,
"step": 2516
},
{
"epoch": 3.778110944527736,
"grad_norm": 0.318359375,
"learning_rate": 0.00013351654535821275,
"loss": 0.3164,
"step": 2520
},
{
"epoch": 3.7841079460269866,
"grad_norm": 0.328125,
"learning_rate": 0.00013311125414011511,
"loss": 0.3246,
"step": 2524
},
{
"epoch": 3.790104947526237,
"grad_norm": 0.33203125,
"learning_rate": 0.00013270608776294276,
"loss": 0.3198,
"step": 2528
},
{
"epoch": 3.7961019490254873,
"grad_norm": 0.328125,
"learning_rate": 0.0001323010492216691,
"loss": 0.3005,
"step": 2532
},
{
"epoch": 3.8020989505247376,
"grad_norm": 0.330078125,
"learning_rate": 0.0001318961415103226,
"loss": 0.305,
"step": 2536
},
{
"epoch": 3.808095952023988,
"grad_norm": 0.302734375,
"learning_rate": 0.00013149136762196474,
"loss": 0.326,
"step": 2540
},
{
"epoch": 3.8140929535232386,
"grad_norm": 0.345703125,
"learning_rate": 0.00013108673054866763,
"loss": 0.3226,
"step": 2544
},
{
"epoch": 3.820089955022489,
"grad_norm": 0.34765625,
"learning_rate": 0.0001306822332814921,
"loss": 0.3224,
"step": 2548
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.328125,
"learning_rate": 0.0001302778788104657,
"loss": 0.295,
"step": 2552
},
{
"epoch": 3.8320839580209896,
"grad_norm": 0.357421875,
"learning_rate": 0.00012987367012456014,
"loss": 0.3086,
"step": 2556
},
{
"epoch": 3.83808095952024,
"grad_norm": 0.3046875,
"learning_rate": 0.00012946961021166983,
"loss": 0.3273,
"step": 2560
},
{
"epoch": 3.84407796101949,
"grad_norm": 0.33203125,
"learning_rate": 0.00012906570205858906,
"loss": 0.308,
"step": 2564
},
{
"epoch": 3.8500749625187405,
"grad_norm": 0.3359375,
"learning_rate": 0.00012866194865099074,
"loss": 0.2829,
"step": 2568
},
{
"epoch": 3.856071964017991,
"grad_norm": 0.328125,
"learning_rate": 0.00012825835297340353,
"loss": 0.3349,
"step": 2572
},
{
"epoch": 3.862068965517241,
"grad_norm": 0.353515625,
"learning_rate": 0.0001278549180091905,
"loss": 0.3356,
"step": 2576
},
{
"epoch": 3.868065967016492,
"grad_norm": 0.33203125,
"learning_rate": 0.0001274516467405264,
"loss": 0.3379,
"step": 2580
},
{
"epoch": 3.874062968515742,
"grad_norm": 0.326171875,
"learning_rate": 0.00012704854214837618,
"loss": 0.3108,
"step": 2584
},
{
"epoch": 3.8800599700149925,
"grad_norm": 0.34375,
"learning_rate": 0.0001266456072124727,
"loss": 0.3004,
"step": 2588
},
{
"epoch": 3.886056971514243,
"grad_norm": 0.30859375,
"learning_rate": 0.00012624284491129464,
"loss": 0.304,
"step": 2592
},
{
"epoch": 3.892053973013493,
"grad_norm": 0.3125,
"learning_rate": 0.00012584025822204466,
"loss": 0.2709,
"step": 2596
},
{
"epoch": 3.898050974512744,
"grad_norm": 0.333984375,
"learning_rate": 0.00012543785012062716,
"loss": 0.2899,
"step": 2600
},
{
"epoch": 3.904047976011994,
"grad_norm": 0.3203125,
"learning_rate": 0.00012503562358162664,
"loss": 0.2571,
"step": 2604
},
{
"epoch": 3.9100449775112445,
"grad_norm": 0.314453125,
"learning_rate": 0.00012463358157828528,
"loss": 0.3106,
"step": 2608
},
{
"epoch": 3.9160419790104948,
"grad_norm": 0.31640625,
"learning_rate": 0.00012423172708248136,
"loss": 0.2812,
"step": 2612
},
{
"epoch": 3.922038980509745,
"grad_norm": 0.298828125,
"learning_rate": 0.000123830063064707,
"loss": 0.3079,
"step": 2616
},
{
"epoch": 3.9280359820089954,
"grad_norm": 0.328125,
"learning_rate": 0.00012342859249404636,
"loss": 0.3603,
"step": 2620
},
{
"epoch": 3.9340329835082457,
"grad_norm": 0.333984375,
"learning_rate": 0.0001230273183381536,
"loss": 0.3429,
"step": 2624
},
{
"epoch": 3.940029985007496,
"grad_norm": 0.349609375,
"learning_rate": 0.00012262624356323105,
"loss": 0.3389,
"step": 2628
},
{
"epoch": 3.9460269865067468,
"grad_norm": 0.3359375,
"learning_rate": 0.00012222537113400724,
"loss": 0.3027,
"step": 2632
},
{
"epoch": 3.952023988005997,
"grad_norm": 0.35546875,
"learning_rate": 0.00012182470401371487,
"loss": 0.3059,
"step": 2636
},
{
"epoch": 3.9580209895052474,
"grad_norm": 0.326171875,
"learning_rate": 0.0001214242451640691,
"loss": 0.3146,
"step": 2640
},
{
"epoch": 3.9640179910044977,
"grad_norm": 0.34375,
"learning_rate": 0.00012102399754524547,
"loss": 0.3037,
"step": 2644
},
{
"epoch": 3.970014992503748,
"grad_norm": 0.3203125,
"learning_rate": 0.00012062396411585825,
"loss": 0.354,
"step": 2648
},
{
"epoch": 3.9760119940029988,
"grad_norm": 0.3046875,
"learning_rate": 0.00012022414783293825,
"loss": 0.2754,
"step": 2652
},
{
"epoch": 3.982008995502249,
"grad_norm": 0.40625,
"learning_rate": 0.00011982455165191132,
"loss": 0.3144,
"step": 2656
},
{
"epoch": 3.9880059970014994,
"grad_norm": 0.3515625,
"learning_rate": 0.00011942517852657619,
"loss": 0.3208,
"step": 2660
},
{
"epoch": 3.9940029985007497,
"grad_norm": 0.310546875,
"learning_rate": 0.00011902603140908281,
"loss": 0.3026,
"step": 2664
},
{
"epoch": 4.0,
"grad_norm": 0.466796875,
"learning_rate": 0.00011862711324991058,
"loss": 0.2802,
"step": 2668
},
{
"epoch": 4.00599700149925,
"grad_norm": 0.27734375,
"learning_rate": 0.00011822842699784631,
"loss": 0.2288,
"step": 2672
},
{
"epoch": 4.011994002998501,
"grad_norm": 0.31640625,
"learning_rate": 0.00011782997559996267,
"loss": 0.2148,
"step": 2676
},
{
"epoch": 4.017991004497751,
"grad_norm": 0.318359375,
"learning_rate": 0.00011743176200159619,
"loss": 0.2308,
"step": 2680
},
{
"epoch": 4.023988005997001,
"grad_norm": 0.3515625,
"learning_rate": 0.00011703378914632574,
"loss": 0.2583,
"step": 2684
},
{
"epoch": 4.0299850074962515,
"grad_norm": 0.3125,
"learning_rate": 0.00011663605997595045,
"loss": 0.2436,
"step": 2688
},
{
"epoch": 4.035982008995502,
"grad_norm": 0.302734375,
"learning_rate": 0.00011623857743046834,
"loss": 0.2802,
"step": 2692
},
{
"epoch": 4.041979010494753,
"grad_norm": 0.31640625,
"learning_rate": 0.00011584134444805418,
"loss": 0.2094,
"step": 2696
},
{
"epoch": 4.047976011994003,
"grad_norm": 0.33203125,
"learning_rate": 0.00011544436396503816,
"loss": 0.1985,
"step": 2700
},
{
"epoch": 4.053973013493254,
"grad_norm": 0.314453125,
"learning_rate": 0.00011504763891588389,
"loss": 0.2294,
"step": 2704
},
{
"epoch": 4.059970014992504,
"grad_norm": 0.291015625,
"learning_rate": 0.00011465117223316685,
"loss": 0.2212,
"step": 2708
},
{
"epoch": 4.065967016491754,
"grad_norm": 0.2890625,
"learning_rate": 0.00011425496684755278,
"loss": 0.2316,
"step": 2712
},
{
"epoch": 4.071964017991005,
"grad_norm": 0.322265625,
"learning_rate": 0.00011385902568777574,
"loss": 0.2127,
"step": 2716
},
{
"epoch": 4.077961019490255,
"grad_norm": 0.298828125,
"learning_rate": 0.00011346335168061682,
"loss": 0.2041,
"step": 2720
},
{
"epoch": 4.083958020989505,
"grad_norm": 0.3203125,
"learning_rate": 0.00011306794775088218,
"loss": 0.2162,
"step": 2724
},
{
"epoch": 4.0899550224887555,
"grad_norm": 0.3125,
"learning_rate": 0.00011267281682138175,
"loss": 0.223,
"step": 2728
},
{
"epoch": 4.095952023988006,
"grad_norm": 0.3125,
"learning_rate": 0.00011227796181290724,
"loss": 0.2364,
"step": 2732
},
{
"epoch": 4.101949025487256,
"grad_norm": 0.33203125,
"learning_rate": 0.00011188338564421098,
"loss": 0.2462,
"step": 2736
},
{
"epoch": 4.1079460269865065,
"grad_norm": 0.322265625,
"learning_rate": 0.00011148909123198395,
"loss": 0.2335,
"step": 2740
},
{
"epoch": 4.113943028485757,
"grad_norm": 0.294921875,
"learning_rate": 0.00011109508149083453,
"loss": 0.2305,
"step": 2744
},
{
"epoch": 4.119940029985007,
"grad_norm": 0.30078125,
"learning_rate": 0.00011070135933326671,
"loss": 0.2231,
"step": 2748
},
{
"epoch": 4.125937031484258,
"grad_norm": 0.328125,
"learning_rate": 0.0001103079276696587,
"loss": 0.2242,
"step": 2752
},
{
"epoch": 4.131934032983509,
"grad_norm": 0.33203125,
"learning_rate": 0.0001099147894082416,
"loss": 0.2473,
"step": 2756
},
{
"epoch": 4.137931034482759,
"grad_norm": 0.302734375,
"learning_rate": 0.00010952194745507728,
"loss": 0.2219,
"step": 2760
},
{
"epoch": 4.143928035982009,
"grad_norm": 0.330078125,
"learning_rate": 0.00010912940471403777,
"loss": 0.1971,
"step": 2764
},
{
"epoch": 4.1499250374812595,
"grad_norm": 0.28125,
"learning_rate": 0.00010873716408678288,
"loss": 0.2007,
"step": 2768
},
{
"epoch": 4.15592203898051,
"grad_norm": 0.3046875,
"learning_rate": 0.00010834522847273966,
"loss": 0.23,
"step": 2772
},
{
"epoch": 4.16191904047976,
"grad_norm": 0.294921875,
"learning_rate": 0.0001079536007690801,
"loss": 0.2149,
"step": 2776
},
{
"epoch": 4.1679160419790104,
"grad_norm": 0.302734375,
"learning_rate": 0.00010756228387070046,
"loss": 0.2343,
"step": 2780
},
{
"epoch": 4.173913043478261,
"grad_norm": 0.296875,
"learning_rate": 0.00010717128067019929,
"loss": 0.2125,
"step": 2784
},
{
"epoch": 4.179910044977511,
"grad_norm": 0.318359375,
"learning_rate": 0.00010678059405785647,
"loss": 0.194,
"step": 2788
},
{
"epoch": 4.185907046476761,
"grad_norm": 0.318359375,
"learning_rate": 0.00010639022692161167,
"loss": 0.2039,
"step": 2792
},
{
"epoch": 4.191904047976012,
"grad_norm": 0.29296875,
"learning_rate": 0.00010600018214704283,
"loss": 0.2133,
"step": 2796
},
{
"epoch": 4.197901049475262,
"grad_norm": 0.326171875,
"learning_rate": 0.00010561046261734522,
"loss": 0.2073,
"step": 2800
},
{
"epoch": 4.203898050974512,
"grad_norm": 0.32421875,
"learning_rate": 0.00010522107121330975,
"loss": 0.2046,
"step": 2804
},
{
"epoch": 4.2098950524737635,
"grad_norm": 0.326171875,
"learning_rate": 0.00010483201081330194,
"loss": 0.2083,
"step": 2808
},
{
"epoch": 4.215892053973014,
"grad_norm": 0.333984375,
"learning_rate": 0.00010444328429324048,
"loss": 0.2455,
"step": 2812
},
{
"epoch": 4.221889055472264,
"grad_norm": 0.3046875,
"learning_rate": 0.0001040548945265761,
"loss": 0.2274,
"step": 2816
},
{
"epoch": 4.227886056971514,
"grad_norm": 0.328125,
"learning_rate": 0.00010366684438427018,
"loss": 0.2318,
"step": 2820
},
{
"epoch": 4.233883058470765,
"grad_norm": 0.337890625,
"learning_rate": 0.0001032791367347737,
"loss": 0.2193,
"step": 2824
},
{
"epoch": 4.239880059970015,
"grad_norm": 0.302734375,
"learning_rate": 0.00010289177444400583,
"loss": 0.2116,
"step": 2828
},
{
"epoch": 4.245877061469265,
"grad_norm": 0.3125,
"learning_rate": 0.00010250476037533299,
"loss": 0.222,
"step": 2832
},
{
"epoch": 4.251874062968516,
"grad_norm": 0.390625,
"learning_rate": 0.00010211809738954748,
"loss": 0.1968,
"step": 2836
},
{
"epoch": 4.257871064467766,
"grad_norm": 0.33984375,
"learning_rate": 0.00010173178834484643,
"loss": 0.235,
"step": 2840
},
{
"epoch": 4.263868065967016,
"grad_norm": 0.326171875,
"learning_rate": 0.00010134583609681065,
"loss": 0.2511,
"step": 2844
},
{
"epoch": 4.269865067466267,
"grad_norm": 0.341796875,
"learning_rate": 0.00010096024349838352,
"loss": 0.2757,
"step": 2848
},
{
"epoch": 4.275862068965517,
"grad_norm": 0.330078125,
"learning_rate": 0.0001005750133998499,
"loss": 0.2311,
"step": 2852
},
{
"epoch": 4.281859070464767,
"grad_norm": 0.302734375,
"learning_rate": 0.00010019014864881507,
"loss": 0.2427,
"step": 2856
},
{
"epoch": 4.287856071964018,
"grad_norm": 0.330078125,
"learning_rate": 9.980565209018374e-05,
"loss": 0.2064,
"step": 2860
},
{
"epoch": 4.293853073463269,
"grad_norm": 0.333984375,
"learning_rate": 9.942152656613876e-05,
"loss": 0.2334,
"step": 2864
},
{
"epoch": 4.299850074962519,
"grad_norm": 0.34765625,
"learning_rate": 9.903777491612056e-05,
"loss": 0.1884,
"step": 2868
},
{
"epoch": 4.305847076461769,
"grad_norm": 0.32421875,
"learning_rate": 9.865439997680582e-05,
"loss": 0.2225,
"step": 2872
},
{
"epoch": 4.31184407796102,
"grad_norm": 0.333984375,
"learning_rate": 9.827140458208643e-05,
"loss": 0.225,
"step": 2876
},
{
"epoch": 4.31784107946027,
"grad_norm": 0.33984375,
"learning_rate": 9.788879156304896e-05,
"loss": 0.2365,
"step": 2880
},
{
"epoch": 4.32383808095952,
"grad_norm": 0.296875,
"learning_rate": 9.750656374795327e-05,
"loss": 0.2335,
"step": 2884
},
{
"epoch": 4.329835082458771,
"grad_norm": 0.353515625,
"learning_rate": 9.712472396221193e-05,
"loss": 0.2408,
"step": 2888
},
{
"epoch": 4.335832083958021,
"grad_norm": 0.34375,
"learning_rate": 9.674327502836913e-05,
"loss": 0.257,
"step": 2892
},
{
"epoch": 4.341829085457271,
"grad_norm": 0.3046875,
"learning_rate": 9.636221976607995e-05,
"loss": 0.1954,
"step": 2896
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.337890625,
"learning_rate": 9.598156099208947e-05,
"loss": 0.2215,
"step": 2900
},
{
"epoch": 4.353823088455772,
"grad_norm": 0.341796875,
"learning_rate": 9.560130152021191e-05,
"loss": 0.2466,
"step": 2904
},
{
"epoch": 4.359820089955022,
"grad_norm": 0.318359375,
"learning_rate": 9.522144416130987e-05,
"loss": 0.2279,
"step": 2908
},
{
"epoch": 4.365817091454273,
"grad_norm": 0.34765625,
"learning_rate": 9.484199172327358e-05,
"loss": 0.205,
"step": 2912
},
{
"epoch": 4.371814092953524,
"grad_norm": 0.31640625,
"learning_rate": 9.446294701100011e-05,
"loss": 0.2063,
"step": 2916
},
{
"epoch": 4.377811094452774,
"grad_norm": 0.333984375,
"learning_rate": 9.408431282637256e-05,
"loss": 0.2412,
"step": 2920
},
{
"epoch": 4.383808095952024,
"grad_norm": 0.37109375,
"learning_rate": 9.370609196823956e-05,
"loss": 0.254,
"step": 2924
},
{
"epoch": 4.389805097451275,
"grad_norm": 0.361328125,
"learning_rate": 9.33282872323943e-05,
"loss": 0.2525,
"step": 2928
},
{
"epoch": 4.395802098950525,
"grad_norm": 0.330078125,
"learning_rate": 9.295090141155415e-05,
"loss": 0.2227,
"step": 2932
},
{
"epoch": 4.401799100449775,
"grad_norm": 0.318359375,
"learning_rate": 9.257393729533971e-05,
"loss": 0.2173,
"step": 2936
},
{
"epoch": 4.4077961019490255,
"grad_norm": 0.36328125,
"learning_rate": 9.219739767025461e-05,
"loss": 0.2299,
"step": 2940
},
{
"epoch": 4.413793103448276,
"grad_norm": 0.357421875,
"learning_rate": 9.182128531966434e-05,
"loss": 0.2214,
"step": 2944
},
{
"epoch": 4.419790104947526,
"grad_norm": 0.33203125,
"learning_rate": 9.144560302377629e-05,
"loss": 0.2443,
"step": 2948
},
{
"epoch": 4.425787106446776,
"grad_norm": 0.345703125,
"learning_rate": 9.107035355961867e-05,
"loss": 0.205,
"step": 2952
},
{
"epoch": 4.431784107946027,
"grad_norm": 0.376953125,
"learning_rate": 9.069553970102035e-05,
"loss": 0.2666,
"step": 2956
},
{
"epoch": 4.437781109445277,
"grad_norm": 0.30859375,
"learning_rate": 9.03211642185903e-05,
"loss": 0.1858,
"step": 2960
},
{
"epoch": 4.443778110944527,
"grad_norm": 0.337890625,
"learning_rate": 8.994722987969674e-05,
"loss": 0.2402,
"step": 2964
},
{
"epoch": 4.449775112443778,
"grad_norm": 0.333984375,
"learning_rate": 8.957373944844733e-05,
"loss": 0.2283,
"step": 2968
},
{
"epoch": 4.455772113943029,
"grad_norm": 0.328125,
"learning_rate": 8.920069568566804e-05,
"loss": 0.2357,
"step": 2972
},
{
"epoch": 4.461769115442279,
"grad_norm": 0.337890625,
"learning_rate": 8.882810134888341e-05,
"loss": 0.2099,
"step": 2976
},
{
"epoch": 4.4677661169415295,
"grad_norm": 0.322265625,
"learning_rate": 8.845595919229552e-05,
"loss": 0.2315,
"step": 2980
},
{
"epoch": 4.47376311844078,
"grad_norm": 0.349609375,
"learning_rate": 8.808427196676429e-05,
"loss": 0.2123,
"step": 2984
},
{
"epoch": 4.47976011994003,
"grad_norm": 0.3515625,
"learning_rate": 8.771304241978647e-05,
"loss": 0.2223,
"step": 2988
},
{
"epoch": 4.48575712143928,
"grad_norm": 0.318359375,
"learning_rate": 8.734227329547592e-05,
"loss": 0.1933,
"step": 2992
},
{
"epoch": 4.491754122938531,
"grad_norm": 0.359375,
"learning_rate": 8.697196733454305e-05,
"loss": 0.2669,
"step": 2996
},
{
"epoch": 4.497751124437781,
"grad_norm": 0.3359375,
"learning_rate": 8.660212727427438e-05,
"loss": 0.2182,
"step": 3000
},
{
"epoch": 4.503748125937031,
"grad_norm": 0.3046875,
"learning_rate": 8.623275584851283e-05,
"loss": 0.2159,
"step": 3004
},
{
"epoch": 4.509745127436282,
"grad_norm": 0.3203125,
"learning_rate": 8.58638557876368e-05,
"loss": 0.233,
"step": 3008
},
{
"epoch": 4.515742128935532,
"grad_norm": 0.328125,
"learning_rate": 8.549542981854078e-05,
"loss": 0.2061,
"step": 3012
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.330078125,
"learning_rate": 8.512748066461446e-05,
"loss": 0.2279,
"step": 3016
},
{
"epoch": 4.527736131934033,
"grad_norm": 0.328125,
"learning_rate": 8.47600110457233e-05,
"loss": 0.2274,
"step": 3020
},
{
"epoch": 4.533733133433284,
"grad_norm": 0.306640625,
"learning_rate": 8.439302367818756e-05,
"loss": 0.2008,
"step": 3024
},
{
"epoch": 4.539730134932534,
"grad_norm": 0.33203125,
"learning_rate": 8.40265212747632e-05,
"loss": 0.2722,
"step": 3028
},
{
"epoch": 4.545727136431784,
"grad_norm": 0.34375,
"learning_rate": 8.366050654462102e-05,
"loss": 0.2094,
"step": 3032
},
{
"epoch": 4.551724137931035,
"grad_norm": 0.328125,
"learning_rate": 8.329498219332716e-05,
"loss": 0.204,
"step": 3036
},
{
"epoch": 4.557721139430285,
"grad_norm": 0.33203125,
"learning_rate": 8.29299509228228e-05,
"loss": 0.2176,
"step": 3040
},
{
"epoch": 4.563718140929535,
"grad_norm": 0.328125,
"learning_rate": 8.256541543140424e-05,
"loss": 0.2103,
"step": 3044
},
{
"epoch": 4.569715142428786,
"grad_norm": 0.328125,
"learning_rate": 8.220137841370316e-05,
"loss": 0.2291,
"step": 3048
},
{
"epoch": 4.575712143928036,
"grad_norm": 0.345703125,
"learning_rate": 8.183784256066643e-05,
"loss": 0.2572,
"step": 3052
},
{
"epoch": 4.581709145427286,
"grad_norm": 0.318359375,
"learning_rate": 8.147481055953629e-05,
"loss": 0.1984,
"step": 3056
},
{
"epoch": 4.5877061469265366,
"grad_norm": 0.3046875,
"learning_rate": 8.111228509383057e-05,
"loss": 0.2025,
"step": 3060
},
{
"epoch": 4.593703148425787,
"grad_norm": 0.3359375,
"learning_rate": 8.075026884332297e-05,
"loss": 0.228,
"step": 3064
},
{
"epoch": 4.599700149925037,
"grad_norm": 0.318359375,
"learning_rate": 8.038876448402282e-05,
"loss": 0.2427,
"step": 3068
},
{
"epoch": 4.6056971514242875,
"grad_norm": 0.333984375,
"learning_rate": 8.002777468815569e-05,
"loss": 0.2203,
"step": 3072
},
{
"epoch": 4.611694152923539,
"grad_norm": 0.296875,
"learning_rate": 7.966730212414362e-05,
"loss": 0.2291,
"step": 3076
},
{
"epoch": 4.617691154422789,
"grad_norm": 0.3359375,
"learning_rate": 7.930734945658519e-05,
"loss": 0.2482,
"step": 3080
},
{
"epoch": 4.623688155922039,
"grad_norm": 0.310546875,
"learning_rate": 7.894791934623587e-05,
"loss": 0.2045,
"step": 3084
},
{
"epoch": 4.62968515742129,
"grad_norm": 0.3203125,
"learning_rate": 7.858901444998846e-05,
"loss": 0.2065,
"step": 3088
},
{
"epoch": 4.63568215892054,
"grad_norm": 0.341796875,
"learning_rate": 7.82306374208535e-05,
"loss": 0.2197,
"step": 3092
},
{
"epoch": 4.64167916041979,
"grad_norm": 0.314453125,
"learning_rate": 7.787279090793946e-05,
"loss": 0.2139,
"step": 3096
},
{
"epoch": 4.6476761619190405,
"grad_norm": 0.353515625,
"learning_rate": 7.751547755643325e-05,
"loss": 0.2555,
"step": 3100
},
{
"epoch": 4.653673163418291,
"grad_norm": 0.328125,
"learning_rate": 7.715870000758061e-05,
"loss": 0.2481,
"step": 3104
},
{
"epoch": 4.659670164917541,
"grad_norm": 0.369140625,
"learning_rate": 7.680246089866683e-05,
"loss": 0.22,
"step": 3108
},
{
"epoch": 4.6656671664167915,
"grad_norm": 0.357421875,
"learning_rate": 7.644676286299698e-05,
"loss": 0.2324,
"step": 3112
},
{
"epoch": 4.671664167916042,
"grad_norm": 0.310546875,
"learning_rate": 7.609160852987643e-05,
"loss": 0.2062,
"step": 3116
},
{
"epoch": 4.677661169415292,
"grad_norm": 0.30859375,
"learning_rate": 7.573700052459173e-05,
"loss": 0.2048,
"step": 3120
},
{
"epoch": 4.683658170914542,
"grad_norm": 0.34765625,
"learning_rate": 7.53829414683908e-05,
"loss": 0.2774,
"step": 3124
},
{
"epoch": 4.689655172413794,
"grad_norm": 0.396484375,
"learning_rate": 7.5029433978464e-05,
"loss": 0.2455,
"step": 3128
},
{
"epoch": 4.695652173913043,
"grad_norm": 0.349609375,
"learning_rate": 7.467648066792415e-05,
"loss": 0.2411,
"step": 3132
},
{
"epoch": 4.701649175412294,
"grad_norm": 0.33984375,
"learning_rate": 7.432408414578798e-05,
"loss": 0.2107,
"step": 3136
},
{
"epoch": 4.7076461769115445,
"grad_norm": 0.3359375,
"learning_rate": 7.397224701695622e-05,
"loss": 0.2526,
"step": 3140
},
{
"epoch": 4.713643178410795,
"grad_norm": 0.3203125,
"learning_rate": 7.362097188219476e-05,
"loss": 0.2119,
"step": 3144
},
{
"epoch": 4.719640179910045,
"grad_norm": 0.337890625,
"learning_rate": 7.327026133811515e-05,
"loss": 0.2031,
"step": 3148
},
{
"epoch": 4.7256371814092955,
"grad_norm": 0.310546875,
"learning_rate": 7.292011797715548e-05,
"loss": 0.2313,
"step": 3152
},
{
"epoch": 4.731634182908546,
"grad_norm": 0.33203125,
"learning_rate": 7.257054438756125e-05,
"loss": 0.2361,
"step": 3156
},
{
"epoch": 4.737631184407796,
"grad_norm": 0.32421875,
"learning_rate": 7.222154315336641e-05,
"loss": 0.2032,
"step": 3160
},
{
"epoch": 4.743628185907046,
"grad_norm": 0.34765625,
"learning_rate": 7.187311685437385e-05,
"loss": 0.249,
"step": 3164
},
{
"epoch": 4.749625187406297,
"grad_norm": 0.326171875,
"learning_rate": 7.152526806613663e-05,
"loss": 0.2215,
"step": 3168
},
{
"epoch": 4.755622188905547,
"grad_norm": 0.3671875,
"learning_rate": 7.1177999359939e-05,
"loss": 0.229,
"step": 3172
},
{
"epoch": 4.761619190404797,
"grad_norm": 0.36328125,
"learning_rate": 7.083131330277711e-05,
"loss": 0.2435,
"step": 3176
},
{
"epoch": 4.767616191904048,
"grad_norm": 0.30078125,
"learning_rate": 7.048521245734027e-05,
"loss": 0.217,
"step": 3180
},
{
"epoch": 4.773613193403298,
"grad_norm": 0.330078125,
"learning_rate": 7.013969938199183e-05,
"loss": 0.2311,
"step": 3184
},
{
"epoch": 4.779610194902549,
"grad_norm": 0.314453125,
"learning_rate": 6.979477663075056e-05,
"loss": 0.2059,
"step": 3188
},
{
"epoch": 4.785607196401799,
"grad_norm": 0.341796875,
"learning_rate": 6.945044675327143e-05,
"loss": 0.2165,
"step": 3192
},
{
"epoch": 4.79160419790105,
"grad_norm": 0.361328125,
"learning_rate": 6.910671229482687e-05,
"loss": 0.2198,
"step": 3196
},
{
"epoch": 4.7976011994003,
"grad_norm": 0.353515625,
"learning_rate": 6.87635757962882e-05,
"loss": 0.2366,
"step": 3200
},
{
"epoch": 4.80359820089955,
"grad_norm": 0.306640625,
"learning_rate": 6.842103979410638e-05,
"loss": 0.1956,
"step": 3204
},
{
"epoch": 4.809595202398801,
"grad_norm": 0.328125,
"learning_rate": 6.807910682029387e-05,
"loss": 0.2408,
"step": 3208
},
{
"epoch": 4.815592203898051,
"grad_norm": 0.33984375,
"learning_rate": 6.77377794024051e-05,
"loss": 0.2167,
"step": 3212
},
{
"epoch": 4.821589205397301,
"grad_norm": 0.328125,
"learning_rate": 6.739706006351873e-05,
"loss": 0.2393,
"step": 3216
},
{
"epoch": 4.827586206896552,
"grad_norm": 0.3515625,
"learning_rate": 6.705695132221815e-05,
"loss": 0.2243,
"step": 3220
},
{
"epoch": 4.833583208395802,
"grad_norm": 0.30078125,
"learning_rate": 6.671745569257357e-05,
"loss": 0.1713,
"step": 3224
},
{
"epoch": 4.839580209895052,
"grad_norm": 0.302734375,
"learning_rate": 6.637857568412272e-05,
"loss": 0.2091,
"step": 3228
},
{
"epoch": 4.8455772113943025,
"grad_norm": 0.34765625,
"learning_rate": 6.604031380185308e-05,
"loss": 0.2094,
"step": 3232
},
{
"epoch": 4.851574212893553,
"grad_norm": 0.333984375,
"learning_rate": 6.570267254618266e-05,
"loss": 0.2276,
"step": 3236
},
{
"epoch": 4.857571214392804,
"grad_norm": 0.34375,
"learning_rate": 6.536565441294204e-05,
"loss": 0.2228,
"step": 3240
},
{
"epoch": 4.863568215892054,
"grad_norm": 0.330078125,
"learning_rate": 6.502926189335556e-05,
"loss": 0.2045,
"step": 3244
},
{
"epoch": 4.869565217391305,
"grad_norm": 0.3515625,
"learning_rate": 6.469349747402306e-05,
"loss": 0.2182,
"step": 3248
},
{
"epoch": 4.875562218890555,
"grad_norm": 0.349609375,
"learning_rate": 6.43583636369016e-05,
"loss": 0.2321,
"step": 3252
},
{
"epoch": 4.881559220389805,
"grad_norm": 0.326171875,
"learning_rate": 6.402386285928692e-05,
"loss": 0.2101,
"step": 3256
},
{
"epoch": 4.887556221889056,
"grad_norm": 0.349609375,
"learning_rate": 6.368999761379517e-05,
"loss": 0.2074,
"step": 3260
},
{
"epoch": 4.893553223388306,
"grad_norm": 0.322265625,
"learning_rate": 6.33567703683447e-05,
"loss": 0.1884,
"step": 3264
},
{
"epoch": 4.899550224887556,
"grad_norm": 0.375,
"learning_rate": 6.302418358613792e-05,
"loss": 0.2224,
"step": 3268
},
{
"epoch": 4.9055472263868065,
"grad_norm": 0.3359375,
"learning_rate": 6.269223972564277e-05,
"loss": 0.2385,
"step": 3272
},
{
"epoch": 4.911544227886057,
"grad_norm": 0.375,
"learning_rate": 6.236094124057479e-05,
"loss": 0.2544,
"step": 3276
},
{
"epoch": 4.917541229385307,
"grad_norm": 0.3125,
"learning_rate": 6.203029057987905e-05,
"loss": 0.2074,
"step": 3280
},
{
"epoch": 4.923538230884557,
"grad_norm": 0.333984375,
"learning_rate": 6.17002901877118e-05,
"loss": 0.1999,
"step": 3284
},
{
"epoch": 4.929535232383808,
"grad_norm": 0.36328125,
"learning_rate": 6.137094250342257e-05,
"loss": 0.2107,
"step": 3288
},
{
"epoch": 4.935532233883059,
"grad_norm": 0.345703125,
"learning_rate": 6.104224996153605e-05,
"loss": 0.2358,
"step": 3292
},
{
"epoch": 4.941529235382308,
"grad_norm": 0.369140625,
"learning_rate": 6.0714214991734276e-05,
"loss": 0.2614,
"step": 3296
},
{
"epoch": 4.94752623688156,
"grad_norm": 0.322265625,
"learning_rate": 6.038684001883845e-05,
"loss": 0.2255,
"step": 3300
},
{
"epoch": 4.95352323838081,
"grad_norm": 0.326171875,
"learning_rate": 6.0060127462791065e-05,
"loss": 0.2323,
"step": 3304
},
{
"epoch": 4.95952023988006,
"grad_norm": 0.36328125,
"learning_rate": 5.9734079738638064e-05,
"loss": 0.2579,
"step": 3308
},
{
"epoch": 4.9655172413793105,
"grad_norm": 0.337890625,
"learning_rate": 5.9408699256511124e-05,
"loss": 0.2105,
"step": 3312
},
{
"epoch": 4.971514242878561,
"grad_norm": 0.32421875,
"learning_rate": 5.9083988421609544e-05,
"loss": 0.222,
"step": 3316
},
{
"epoch": 4.977511244377811,
"grad_norm": 0.349609375,
"learning_rate": 5.875994963418259e-05,
"loss": 0.2258,
"step": 3320
},
{
"epoch": 4.983508245877061,
"grad_norm": 0.359375,
"learning_rate": 5.8436585289511966e-05,
"loss": 0.2295,
"step": 3324
},
{
"epoch": 4.989505247376312,
"grad_norm": 0.341796875,
"learning_rate": 5.811389777789372e-05,
"loss": 0.2235,
"step": 3328
},
{
"epoch": 4.995502248875562,
"grad_norm": 0.318359375,
"learning_rate": 5.779188948462099e-05,
"loss": 0.2327,
"step": 3332
},
{
"epoch": 5.001499250374812,
"grad_norm": 0.279296875,
"learning_rate": 5.747056278996586e-05,
"loss": 0.2092,
"step": 3336
},
{
"epoch": 5.007496251874063,
"grad_norm": 0.26953125,
"learning_rate": 5.714992006916236e-05,
"loss": 0.207,
"step": 3340
},
{
"epoch": 5.013493253373313,
"grad_norm": 0.298828125,
"learning_rate": 5.682996369238843e-05,
"loss": 0.1806,
"step": 3344
},
{
"epoch": 5.019490254872563,
"grad_norm": 0.283203125,
"learning_rate": 5.6510696024748734e-05,
"loss": 0.188,
"step": 3348
},
{
"epoch": 5.0254872563718145,
"grad_norm": 0.29296875,
"learning_rate": 5.619211942625687e-05,
"loss": 0.1713,
"step": 3352
},
{
"epoch": 5.031484257871065,
"grad_norm": 0.337890625,
"learning_rate": 5.5874236251818124e-05,
"loss": 0.1948,
"step": 3356
},
{
"epoch": 5.037481259370315,
"grad_norm": 0.3203125,
"learning_rate": 5.555704885121213e-05,
"loss": 0.1687,
"step": 3360
},
{
"epoch": 5.043478260869565,
"grad_norm": 0.298828125,
"learning_rate": 5.5240559569075246e-05,
"loss": 0.1593,
"step": 3364
},
{
"epoch": 5.049475262368816,
"grad_norm": 0.287109375,
"learning_rate": 5.4924770744883434e-05,
"loss": 0.1625,
"step": 3368
},
{
"epoch": 5.055472263868066,
"grad_norm": 0.3046875,
"learning_rate": 5.4609684712934855e-05,
"loss": 0.2044,
"step": 3372
},
{
"epoch": 5.061469265367316,
"grad_norm": 0.28125,
"learning_rate": 5.4295303802332786e-05,
"loss": 0.201,
"step": 3376
},
{
"epoch": 5.067466266866567,
"grad_norm": 0.291015625,
"learning_rate": 5.3981630336968104e-05,
"loss": 0.1713,
"step": 3380
},
{
"epoch": 5.073463268365817,
"grad_norm": 0.341796875,
"learning_rate": 5.3668666635502397e-05,
"loss": 0.1783,
"step": 3384
},
{
"epoch": 5.079460269865067,
"grad_norm": 0.298828125,
"learning_rate": 5.3356415011350605e-05,
"loss": 0.2147,
"step": 3388
},
{
"epoch": 5.085457271364318,
"grad_norm": 0.3125,
"learning_rate": 5.304487777266418e-05,
"loss": 0.1921,
"step": 3392
},
{
"epoch": 5.091454272863568,
"grad_norm": 0.30859375,
"learning_rate": 5.2734057222313714e-05,
"loss": 0.1801,
"step": 3396
},
{
"epoch": 5.097451274362818,
"grad_norm": 0.318359375,
"learning_rate": 5.242395565787209e-05,
"loss": 0.2036,
"step": 3400
},
{
"epoch": 5.103448275862069,
"grad_norm": 0.31640625,
"learning_rate": 5.211457537159761e-05,
"loss": 0.1686,
"step": 3404
},
{
"epoch": 5.10944527736132,
"grad_norm": 0.33203125,
"learning_rate": 5.1805918650416706e-05,
"loss": 0.2032,
"step": 3408
},
{
"epoch": 5.11544227886057,
"grad_norm": 0.291015625,
"learning_rate": 5.1497987775907514e-05,
"loss": 0.1512,
"step": 3412
},
{
"epoch": 5.12143928035982,
"grad_norm": 0.322265625,
"learning_rate": 5.1190785024282385e-05,
"loss": 0.1644,
"step": 3416
},
{
"epoch": 5.127436281859071,
"grad_norm": 0.267578125,
"learning_rate": 5.088431266637177e-05,
"loss": 0.1709,
"step": 3420
},
{
"epoch": 5.133433283358321,
"grad_norm": 0.275390625,
"learning_rate": 5.05785729676068e-05,
"loss": 0.1417,
"step": 3424
},
{
"epoch": 5.139430284857571,
"grad_norm": 0.2890625,
"learning_rate": 5.027356818800312e-05,
"loss": 0.1518,
"step": 3428
},
{
"epoch": 5.145427286356822,
"grad_norm": 0.3046875,
"learning_rate": 4.996930058214351e-05,
"loss": 0.1861,
"step": 3432
},
{
"epoch": 5.151424287856072,
"grad_norm": 0.28125,
"learning_rate": 4.96657723991619e-05,
"loss": 0.1766,
"step": 3436
},
{
"epoch": 5.157421289355322,
"grad_norm": 0.30859375,
"learning_rate": 4.936298588272626e-05,
"loss": 0.1931,
"step": 3440
},
{
"epoch": 5.1634182908545725,
"grad_norm": 0.326171875,
"learning_rate": 4.906094327102233e-05,
"loss": 0.1589,
"step": 3444
},
{
"epoch": 5.169415292353823,
"grad_norm": 0.32421875,
"learning_rate": 4.8759646796736814e-05,
"loss": 0.1664,
"step": 3448
},
{
"epoch": 5.175412293853073,
"grad_norm": 0.296875,
"learning_rate": 4.845909868704102e-05,
"loss": 0.1806,
"step": 3452
},
{
"epoch": 5.181409295352323,
"grad_norm": 0.310546875,
"learning_rate": 4.815930116357448e-05,
"loss": 0.1722,
"step": 3456
},
{
"epoch": 5.187406296851575,
"grad_norm": 0.318359375,
"learning_rate": 4.786025644242828e-05,
"loss": 0.1689,
"step": 3460
},
{
"epoch": 5.193403298350825,
"grad_norm": 0.29296875,
"learning_rate": 4.756196673412891e-05,
"loss": 0.1683,
"step": 3464
},
{
"epoch": 5.199400299850075,
"grad_norm": 0.306640625,
"learning_rate": 4.726443424362174e-05,
"loss": 0.1673,
"step": 3468
},
{
"epoch": 5.2053973013493255,
"grad_norm": 0.3125,
"learning_rate": 4.696766117025499e-05,
"loss": 0.1806,
"step": 3472
},
{
"epoch": 5.211394302848576,
"grad_norm": 0.30078125,
"learning_rate": 4.667164970776316e-05,
"loss": 0.1878,
"step": 3476
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.33984375,
"learning_rate": 4.637640204425095e-05,
"loss": 0.1947,
"step": 3480
},
{
"epoch": 5.2233883058470765,
"grad_norm": 0.33203125,
"learning_rate": 4.608192036217719e-05,
"loss": 0.1852,
"step": 3484
},
{
"epoch": 5.229385307346327,
"grad_norm": 0.33203125,
"learning_rate": 4.5788206838338526e-05,
"loss": 0.1878,
"step": 3488
},
{
"epoch": 5.235382308845577,
"grad_norm": 0.330078125,
"learning_rate": 4.5495263643853396e-05,
"loss": 0.1675,
"step": 3492
},
{
"epoch": 5.241379310344827,
"grad_norm": 0.28515625,
"learning_rate": 4.520309294414603e-05,
"loss": 0.1613,
"step": 3496
},
{
"epoch": 5.247376311844078,
"grad_norm": 0.287109375,
"learning_rate": 4.491169689893045e-05,
"loss": 0.1876,
"step": 3500
},
{
"epoch": 5.253373313343328,
"grad_norm": 0.3125,
"learning_rate": 4.462107766219441e-05,
"loss": 0.1874,
"step": 3504
},
{
"epoch": 5.259370314842578,
"grad_norm": 0.330078125,
"learning_rate": 4.4331237382183496e-05,
"loss": 0.1597,
"step": 3508
},
{
"epoch": 5.265367316341829,
"grad_norm": 0.333984375,
"learning_rate": 4.4042178201385305e-05,
"loss": 0.2056,
"step": 3512
},
{
"epoch": 5.27136431784108,
"grad_norm": 0.291015625,
"learning_rate": 4.375390225651366e-05,
"loss": 0.1552,
"step": 3516
},
{
"epoch": 5.27736131934033,
"grad_norm": 0.314453125,
"learning_rate": 4.346641167849264e-05,
"loss": 0.1765,
"step": 3520
},
{
"epoch": 5.2833583208395805,
"grad_norm": 0.287109375,
"learning_rate": 4.31797085924409e-05,
"loss": 0.1917,
"step": 3524
},
{
"epoch": 5.289355322338831,
"grad_norm": 0.283203125,
"learning_rate": 4.2893795117656135e-05,
"loss": 0.1761,
"step": 3528
},
{
"epoch": 5.295352323838081,
"grad_norm": 0.279296875,
"learning_rate": 4.260867336759905e-05,
"loss": 0.1688,
"step": 3532
},
{
"epoch": 5.301349325337331,
"grad_norm": 0.318359375,
"learning_rate": 4.232434544987825e-05,
"loss": 0.1692,
"step": 3536
},
{
"epoch": 5.307346326836582,
"grad_norm": 0.30859375,
"learning_rate": 4.2040813466233966e-05,
"loss": 0.1563,
"step": 3540
},
{
"epoch": 5.313343328335832,
"grad_norm": 0.30859375,
"learning_rate": 4.17580795125233e-05,
"loss": 0.1555,
"step": 3544
},
{
"epoch": 5.319340329835082,
"grad_norm": 0.30859375,
"learning_rate": 4.1476145678704066e-05,
"loss": 0.1778,
"step": 3548
},
{
"epoch": 5.325337331334333,
"grad_norm": 0.322265625,
"learning_rate": 4.119501404881986e-05,
"loss": 0.1586,
"step": 3552
},
{
"epoch": 5.331334332833583,
"grad_norm": 0.287109375,
"learning_rate": 4.091468670098424e-05,
"loss": 0.1762,
"step": 3556
},
{
"epoch": 5.337331334332833,
"grad_norm": 0.3203125,
"learning_rate": 4.063516570736558e-05,
"loss": 0.1775,
"step": 3560
},
{
"epoch": 5.3433283358320836,
"grad_norm": 0.3046875,
"learning_rate": 4.0356453134171805e-05,
"loss": 0.1796,
"step": 3564
},
{
"epoch": 5.349325337331335,
"grad_norm": 0.326171875,
"learning_rate": 4.007855104163492e-05,
"loss": 0.1778,
"step": 3568
},
{
"epoch": 5.355322338830585,
"grad_norm": 0.298828125,
"learning_rate": 3.980146148399597e-05,
"loss": 0.164,
"step": 3572
},
{
"epoch": 5.361319340329835,
"grad_norm": 0.333984375,
"learning_rate": 3.952518650948966e-05,
"loss": 0.1757,
"step": 3576
},
{
"epoch": 5.367316341829086,
"grad_norm": 0.259765625,
"learning_rate": 3.924972816032953e-05,
"loss": 0.1457,
"step": 3580
},
{
"epoch": 5.373313343328336,
"grad_norm": 0.3046875,
"learning_rate": 3.8975088472692475e-05,
"loss": 0.1562,
"step": 3584
},
{
"epoch": 5.379310344827586,
"grad_norm": 0.3359375,
"learning_rate": 3.870126947670392e-05,
"loss": 0.199,
"step": 3588
},
{
"epoch": 5.385307346326837,
"grad_norm": 0.302734375,
"learning_rate": 3.84282731964228e-05,
"loss": 0.1527,
"step": 3592
},
{
"epoch": 5.391304347826087,
"grad_norm": 0.306640625,
"learning_rate": 3.81561016498266e-05,
"loss": 0.1487,
"step": 3596
},
{
"epoch": 5.397301349325337,
"grad_norm": 0.318359375,
"learning_rate": 3.788475684879635e-05,
"loss": 0.1573,
"step": 3600
},
{
"epoch": 5.4032983508245875,
"grad_norm": 0.3359375,
"learning_rate": 3.761424079910177e-05,
"loss": 0.1872,
"step": 3604
},
{
"epoch": 5.409295352323838,
"grad_norm": 0.302734375,
"learning_rate": 3.734455550038665e-05,
"loss": 0.1693,
"step": 3608
},
{
"epoch": 5.415292353823088,
"grad_norm": 0.306640625,
"learning_rate": 3.7075702946153665e-05,
"loss": 0.216,
"step": 3612
},
{
"epoch": 5.4212893553223385,
"grad_norm": 0.251953125,
"learning_rate": 3.680768512375017e-05,
"loss": 0.1452,
"step": 3616
},
{
"epoch": 5.42728635682159,
"grad_norm": 0.3359375,
"learning_rate": 3.654050401435287e-05,
"loss": 0.1779,
"step": 3620
},
{
"epoch": 5.43328335832084,
"grad_norm": 0.349609375,
"learning_rate": 3.627416159295384e-05,
"loss": 0.1939,
"step": 3624
},
{
"epoch": 5.43928035982009,
"grad_norm": 0.3203125,
"learning_rate": 3.600865982834536e-05,
"loss": 0.1767,
"step": 3628
},
{
"epoch": 5.445277361319341,
"grad_norm": 0.314453125,
"learning_rate": 3.574400068310587e-05,
"loss": 0.1565,
"step": 3632
},
{
"epoch": 5.451274362818591,
"grad_norm": 0.33984375,
"learning_rate": 3.548018611358486e-05,
"loss": 0.2105,
"step": 3636
},
{
"epoch": 5.457271364317841,
"grad_norm": 0.30078125,
"learning_rate": 3.521721806988911e-05,
"loss": 0.1788,
"step": 3640
},
{
"epoch": 5.4632683658170915,
"grad_norm": 0.302734375,
"learning_rate": 3.4955098495867603e-05,
"loss": 0.168,
"step": 3644
},
{
"epoch": 5.469265367316342,
"grad_norm": 0.322265625,
"learning_rate": 3.469382932909774e-05,
"loss": 0.1773,
"step": 3648
},
{
"epoch": 5.475262368815592,
"grad_norm": 0.31640625,
"learning_rate": 3.443341250087055e-05,
"loss": 0.1772,
"step": 3652
},
{
"epoch": 5.4812593703148424,
"grad_norm": 0.314453125,
"learning_rate": 3.417384993617664e-05,
"loss": 0.182,
"step": 3656
},
{
"epoch": 5.487256371814093,
"grad_norm": 0.30078125,
"learning_rate": 3.3915143553692076e-05,
"loss": 0.1597,
"step": 3660
},
{
"epoch": 5.493253373313343,
"grad_norm": 0.35546875,
"learning_rate": 3.3657295265763906e-05,
"loss": 0.1546,
"step": 3664
},
{
"epoch": 5.499250374812593,
"grad_norm": 0.318359375,
"learning_rate": 3.3400306978396233e-05,
"loss": 0.193,
"step": 3668
},
{
"epoch": 5.505247376311845,
"grad_norm": 0.298828125,
"learning_rate": 3.3144180591236016e-05,
"loss": 0.1674,
"step": 3672
},
{
"epoch": 5.511244377811094,
"grad_norm": 0.328125,
"learning_rate": 3.288891799755921e-05,
"loss": 0.2008,
"step": 3676
},
{
"epoch": 5.517241379310345,
"grad_norm": 0.333984375,
"learning_rate": 3.2634521084256554e-05,
"loss": 0.1927,
"step": 3680
},
{
"epoch": 5.5232383808095955,
"grad_norm": 0.3515625,
"learning_rate": 3.2380991731819644e-05,
"loss": 0.2101,
"step": 3684
},
{
"epoch": 5.529235382308846,
"grad_norm": 0.32421875,
"learning_rate": 3.2128331814327304e-05,
"loss": 0.174,
"step": 3688
},
{
"epoch": 5.535232383808096,
"grad_norm": 0.337890625,
"learning_rate": 3.187654319943134e-05,
"loss": 0.2115,
"step": 3692
},
{
"epoch": 5.541229385307346,
"grad_norm": 0.310546875,
"learning_rate": 3.1625627748343016e-05,
"loss": 0.1934,
"step": 3696
},
{
"epoch": 5.547226386806597,
"grad_norm": 0.3046875,
"learning_rate": 3.137558731581914e-05,
"loss": 0.1807,
"step": 3700
},
{
"epoch": 5.553223388305847,
"grad_norm": 0.3125,
"learning_rate": 3.112642375014853e-05,
"loss": 0.2024,
"step": 3704
},
{
"epoch": 5.559220389805097,
"grad_norm": 0.31640625,
"learning_rate": 3.087813889313812e-05,
"loss": 0.182,
"step": 3708
},
{
"epoch": 5.565217391304348,
"grad_norm": 0.34375,
"learning_rate": 3.063073458009952e-05,
"loss": 0.1723,
"step": 3712
},
{
"epoch": 5.571214392803598,
"grad_norm": 0.306640625,
"learning_rate": 3.0384212639835382e-05,
"loss": 0.169,
"step": 3716
},
{
"epoch": 5.577211394302848,
"grad_norm": 0.322265625,
"learning_rate": 3.013857489462595e-05,
"loss": 0.1952,
"step": 3720
},
{
"epoch": 5.583208395802099,
"grad_norm": 0.291015625,
"learning_rate": 2.9893823160215446e-05,
"loss": 0.1863,
"step": 3724
},
{
"epoch": 5.589205397301349,
"grad_norm": 0.3203125,
"learning_rate": 2.964995924579875e-05,
"loss": 0.1927,
"step": 3728
},
{
"epoch": 5.5952023988006,
"grad_norm": 0.3125,
"learning_rate": 2.94069849540081e-05,
"loss": 0.1806,
"step": 3732
},
{
"epoch": 5.60119940029985,
"grad_norm": 0.28515625,
"learning_rate": 2.9164902080899573e-05,
"loss": 0.1705,
"step": 3736
},
{
"epoch": 5.607196401799101,
"grad_norm": 0.30078125,
"learning_rate": 2.8923712415940037e-05,
"loss": 0.177,
"step": 3740
},
{
"epoch": 5.613193403298351,
"grad_norm": 0.34765625,
"learning_rate": 2.86834177419936e-05,
"loss": 0.1964,
"step": 3744
},
{
"epoch": 5.619190404797601,
"grad_norm": 0.333984375,
"learning_rate": 2.844401983530887e-05,
"loss": 0.1936,
"step": 3748
},
{
"epoch": 5.625187406296852,
"grad_norm": 0.302734375,
"learning_rate": 2.8205520465505365e-05,
"loss": 0.1755,
"step": 3752
},
{
"epoch": 5.631184407796102,
"grad_norm": 0.30859375,
"learning_rate": 2.7967921395560894e-05,
"loss": 0.177,
"step": 3756
},
{
"epoch": 5.637181409295352,
"grad_norm": 0.31640625,
"learning_rate": 2.773122438179809e-05,
"loss": 0.1952,
"step": 3760
},
{
"epoch": 5.643178410794603,
"grad_norm": 0.3203125,
"learning_rate": 2.749543117387164e-05,
"loss": 0.1965,
"step": 3764
},
{
"epoch": 5.649175412293853,
"grad_norm": 0.34375,
"learning_rate": 2.7260543514755493e-05,
"loss": 0.2069,
"step": 3768
},
{
"epoch": 5.655172413793103,
"grad_norm": 0.34765625,
"learning_rate": 2.7026563140729657e-05,
"loss": 0.2158,
"step": 3772
},
{
"epoch": 5.6611694152923535,
"grad_norm": 0.328125,
"learning_rate": 2.6793491781367578e-05,
"loss": 0.1859,
"step": 3776
},
{
"epoch": 5.667166416791604,
"grad_norm": 0.326171875,
"learning_rate": 2.6561331159523247e-05,
"loss": 0.1472,
"step": 3780
},
{
"epoch": 5.673163418290855,
"grad_norm": 0.298828125,
"learning_rate": 2.633008299131868e-05,
"loss": 0.1894,
"step": 3784
},
{
"epoch": 5.679160419790105,
"grad_norm": 0.30859375,
"learning_rate": 2.609974898613093e-05,
"loss": 0.2038,
"step": 3788
},
{
"epoch": 5.685157421289356,
"grad_norm": 0.310546875,
"learning_rate": 2.5870330846579613e-05,
"loss": 0.1641,
"step": 3792
},
{
"epoch": 5.691154422788606,
"grad_norm": 0.330078125,
"learning_rate": 2.56418302685143e-05,
"loss": 0.1894,
"step": 3796
},
{
"epoch": 5.697151424287856,
"grad_norm": 0.30859375,
"learning_rate": 2.541424894100207e-05,
"loss": 0.1738,
"step": 3800
},
{
"epoch": 5.703148425787107,
"grad_norm": 0.34375,
"learning_rate": 2.5187588546314868e-05,
"loss": 0.1835,
"step": 3804
},
{
"epoch": 5.709145427286357,
"grad_norm": 0.27734375,
"learning_rate": 2.4961850759917068e-05,
"loss": 0.1637,
"step": 3808
},
{
"epoch": 5.715142428785607,
"grad_norm": 0.29296875,
"learning_rate": 2.4737037250453356e-05,
"loss": 0.1893,
"step": 3812
},
{
"epoch": 5.7211394302848575,
"grad_norm": 0.345703125,
"learning_rate": 2.4513149679736003e-05,
"loss": 0.1852,
"step": 3816
},
{
"epoch": 5.727136431784108,
"grad_norm": 0.31640625,
"learning_rate": 2.429018970273296e-05,
"loss": 0.1963,
"step": 3820
},
{
"epoch": 5.733133433283358,
"grad_norm": 0.28515625,
"learning_rate": 2.406815896755522e-05,
"loss": 0.1498,
"step": 3824
},
{
"epoch": 5.739130434782608,
"grad_norm": 0.359375,
"learning_rate": 2.3847059115445073e-05,
"loss": 0.1895,
"step": 3828
},
{
"epoch": 5.745127436281859,
"grad_norm": 0.322265625,
"learning_rate": 2.3626891780763584e-05,
"loss": 0.1848,
"step": 3832
},
{
"epoch": 5.75112443778111,
"grad_norm": 0.310546875,
"learning_rate": 2.3407658590978917e-05,
"loss": 0.187,
"step": 3836
},
{
"epoch": 5.757121439280359,
"grad_norm": 0.29296875,
"learning_rate": 2.3189361166653768e-05,
"loss": 0.1572,
"step": 3840
},
{
"epoch": 5.7631184407796106,
"grad_norm": 0.36328125,
"learning_rate": 2.2972001121433976e-05,
"loss": 0.1693,
"step": 3844
},
{
"epoch": 5.769115442278861,
"grad_norm": 0.310546875,
"learning_rate": 2.2755580062036095e-05,
"loss": 0.1786,
"step": 3848
},
{
"epoch": 5.775112443778111,
"grad_norm": 0.318359375,
"learning_rate": 2.2540099588235903e-05,
"loss": 0.1919,
"step": 3852
},
{
"epoch": 5.7811094452773615,
"grad_norm": 0.337890625,
"learning_rate": 2.2325561292856314e-05,
"loss": 0.1889,
"step": 3856
},
{
"epoch": 5.787106446776612,
"grad_norm": 0.3046875,
"learning_rate": 2.2111966761755684e-05,
"loss": 0.166,
"step": 3860
},
{
"epoch": 5.793103448275862,
"grad_norm": 0.3203125,
"learning_rate": 2.1899317573816187e-05,
"loss": 0.182,
"step": 3864
},
{
"epoch": 5.799100449775112,
"grad_norm": 0.30859375,
"learning_rate": 2.1687615300931975e-05,
"loss": 0.188,
"step": 3868
},
{
"epoch": 5.805097451274363,
"grad_norm": 0.3203125,
"learning_rate": 2.1476861507997677e-05,
"loss": 0.1971,
"step": 3872
},
{
"epoch": 5.811094452773613,
"grad_norm": 0.34765625,
"learning_rate": 2.1267057752896766e-05,
"loss": 0.1775,
"step": 3876
},
{
"epoch": 5.817091454272863,
"grad_norm": 0.341796875,
"learning_rate": 2.105820558649016e-05,
"loss": 0.2004,
"step": 3880
},
{
"epoch": 5.823088455772114,
"grad_norm": 0.32421875,
"learning_rate": 2.0850306552604568e-05,
"loss": 0.1598,
"step": 3884
},
{
"epoch": 5.829085457271364,
"grad_norm": 0.328125,
"learning_rate": 2.0643362188021218e-05,
"loss": 0.1838,
"step": 3888
},
{
"epoch": 5.835082458770614,
"grad_norm": 0.298828125,
"learning_rate": 2.0437374022464524e-05,
"loss": 0.1578,
"step": 3892
},
{
"epoch": 5.8410794602698655,
"grad_norm": 0.28515625,
"learning_rate": 2.0232343578590626e-05,
"loss": 0.154,
"step": 3896
},
{
"epoch": 5.847076461769116,
"grad_norm": 0.3046875,
"learning_rate": 2.0028272371976266e-05,
"loss": 0.1684,
"step": 3900
},
{
"epoch": 5.853073463268366,
"grad_norm": 0.306640625,
"learning_rate": 1.98251619111075e-05,
"loss": 0.1873,
"step": 3904
},
{
"epoch": 5.859070464767616,
"grad_norm": 0.333984375,
"learning_rate": 1.9623013697368694e-05,
"loss": 0.1873,
"step": 3908
},
{
"epoch": 5.865067466266867,
"grad_norm": 0.3203125,
"learning_rate": 1.942182922503122e-05,
"loss": 0.2,
"step": 3912
},
{
"epoch": 5.871064467766117,
"grad_norm": 0.32421875,
"learning_rate": 1.9221609981242553e-05,
"loss": 0.1689,
"step": 3916
},
{
"epoch": 5.877061469265367,
"grad_norm": 0.357421875,
"learning_rate": 1.9022357446015185e-05,
"loss": 0.1852,
"step": 3920
},
{
"epoch": 5.883058470764618,
"grad_norm": 0.302734375,
"learning_rate": 1.8824073092215865e-05,
"loss": 0.1719,
"step": 3924
},
{
"epoch": 5.889055472263868,
"grad_norm": 0.30859375,
"learning_rate": 1.8626758385554474e-05,
"loss": 0.1839,
"step": 3928
},
{
"epoch": 5.895052473763118,
"grad_norm": 0.298828125,
"learning_rate": 1.8430414784573287e-05,
"loss": 0.1578,
"step": 3932
},
{
"epoch": 5.901049475262369,
"grad_norm": 0.373046875,
"learning_rate": 1.8235043740636317e-05,
"loss": 0.1848,
"step": 3936
},
{
"epoch": 5.907046476761619,
"grad_norm": 0.318359375,
"learning_rate": 1.8040646697918344e-05,
"loss": 0.197,
"step": 3940
},
{
"epoch": 5.913043478260869,
"grad_norm": 0.318359375,
"learning_rate": 1.784722509339452e-05,
"loss": 0.1977,
"step": 3944
},
{
"epoch": 5.91904047976012,
"grad_norm": 0.314453125,
"learning_rate": 1.76547803568294e-05,
"loss": 0.1732,
"step": 3948
},
{
"epoch": 5.925037481259371,
"grad_norm": 0.330078125,
"learning_rate": 1.7463313910766774e-05,
"loss": 0.1901,
"step": 3952
},
{
"epoch": 5.931034482758621,
"grad_norm": 0.310546875,
"learning_rate": 1.7272827170518773e-05,
"loss": 0.1851,
"step": 3956
},
{
"epoch": 5.937031484257871,
"grad_norm": 0.30078125,
"learning_rate": 1.7083321544155738e-05,
"loss": 0.1888,
"step": 3960
},
{
"epoch": 5.943028485757122,
"grad_norm": 0.32421875,
"learning_rate": 1.6894798432495566e-05,
"loss": 0.2085,
"step": 3964
},
{
"epoch": 5.949025487256372,
"grad_norm": 0.28515625,
"learning_rate": 1.6707259229093413e-05,
"loss": 0.169,
"step": 3968
},
{
"epoch": 5.955022488755622,
"grad_norm": 0.349609375,
"learning_rate": 1.6520705320231532e-05,
"loss": 0.1875,
"step": 3972
},
{
"epoch": 5.9610194902548725,
"grad_norm": 0.3046875,
"learning_rate": 1.633513808490884e-05,
"loss": 0.1768,
"step": 3976
},
{
"epoch": 5.967016491754123,
"grad_norm": 0.322265625,
"learning_rate": 1.6150558894830816e-05,
"loss": 0.1643,
"step": 3980
},
{
"epoch": 5.973013493253373,
"grad_norm": 0.3046875,
"learning_rate": 1.596696911439934e-05,
"loss": 0.1737,
"step": 3984
},
{
"epoch": 5.9790104947526235,
"grad_norm": 0.275390625,
"learning_rate": 1.5784370100702685e-05,
"loss": 0.1728,
"step": 3988
},
{
"epoch": 5.985007496251874,
"grad_norm": 0.314453125,
"learning_rate": 1.5602763203505318e-05,
"loss": 0.1788,
"step": 3992
},
{
"epoch": 5.991004497751124,
"grad_norm": 0.306640625,
"learning_rate": 1.542214976523809e-05,
"loss": 0.1671,
"step": 3996
},
{
"epoch": 5.997001499250375,
"grad_norm": 0.328125,
"learning_rate": 1.5242531120988189e-05,
"loss": 0.2023,
"step": 4000
},
{
"epoch": 6.002998500749626,
"grad_norm": 0.3125,
"learning_rate": 1.5063908598489388e-05,
"loss": 0.1644,
"step": 4004
},
{
"epoch": 6.008995502248876,
"grad_norm": 0.306640625,
"learning_rate": 1.4886283518112136e-05,
"loss": 0.1648,
"step": 4008
},
{
"epoch": 6.014992503748126,
"grad_norm": 0.30859375,
"learning_rate": 1.4709657192853791e-05,
"loss": 0.1742,
"step": 4012
},
{
"epoch": 6.0209895052473765,
"grad_norm": 0.33984375,
"learning_rate": 1.4534030928329054e-05,
"loss": 0.1818,
"step": 4016
},
{
"epoch": 6.026986506746627,
"grad_norm": 0.32421875,
"learning_rate": 1.4359406022760105e-05,
"loss": 0.1813,
"step": 4020
},
{
"epoch": 6.032983508245877,
"grad_norm": 0.287109375,
"learning_rate": 1.4185783766967262e-05,
"loss": 0.1611,
"step": 4024
},
{
"epoch": 6.0389805097451275,
"grad_norm": 0.283203125,
"learning_rate": 1.401316544435907e-05,
"loss": 0.1616,
"step": 4028
},
{
"epoch": 6.044977511244378,
"grad_norm": 0.296875,
"learning_rate": 1.3841552330923277e-05,
"loss": 0.1549,
"step": 4032
},
{
"epoch": 6.050974512743628,
"grad_norm": 0.310546875,
"learning_rate": 1.3670945695217028e-05,
"loss": 0.1715,
"step": 4036
},
{
"epoch": 6.056971514242878,
"grad_norm": 0.287109375,
"learning_rate": 1.3501346798357714e-05,
"loss": 0.1811,
"step": 4040
},
{
"epoch": 6.062968515742129,
"grad_norm": 0.29296875,
"learning_rate": 1.3332756894013425e-05,
"loss": 0.1829,
"step": 4044
},
{
"epoch": 6.068965517241379,
"grad_norm": 0.29296875,
"learning_rate": 1.3165177228393941e-05,
"loss": 0.159,
"step": 4048
},
{
"epoch": 6.074962518740629,
"grad_norm": 0.259765625,
"learning_rate": 1.2998609040241393e-05,
"loss": 0.1612,
"step": 4052
},
{
"epoch": 6.08095952023988,
"grad_norm": 0.310546875,
"learning_rate": 1.2833053560821066e-05,
"loss": 0.1986,
"step": 4056
},
{
"epoch": 6.086956521739131,
"grad_norm": 0.302734375,
"learning_rate": 1.266851201391234e-05,
"loss": 0.174,
"step": 4060
},
{
"epoch": 6.092953523238381,
"grad_norm": 0.271484375,
"learning_rate": 1.250498561579964e-05,
"loss": 0.1619,
"step": 4064
},
{
"epoch": 6.098950524737631,
"grad_norm": 0.322265625,
"learning_rate": 1.2342475575263555e-05,
"loss": 0.1733,
"step": 4068
},
{
"epoch": 6.104947526236882,
"grad_norm": 0.294921875,
"learning_rate": 1.2180983093571656e-05,
"loss": 0.1707,
"step": 4072
},
{
"epoch": 6.110944527736132,
"grad_norm": 0.294921875,
"learning_rate": 1.202050936446986e-05,
"loss": 0.1543,
"step": 4076
},
{
"epoch": 6.116941529235382,
"grad_norm": 0.2890625,
"learning_rate": 1.1861055574173427e-05,
"loss": 0.1436,
"step": 4080
},
{
"epoch": 6.122938530734633,
"grad_norm": 0.294921875,
"learning_rate": 1.1702622901358383e-05,
"loss": 0.1772,
"step": 4084
},
{
"epoch": 6.128935532233883,
"grad_norm": 0.3359375,
"learning_rate": 1.154521251715257e-05,
"loss": 0.1667,
"step": 4088
},
{
"epoch": 6.134932533733133,
"grad_norm": 0.32421875,
"learning_rate": 1.1388825585127175e-05,
"loss": 0.1919,
"step": 4092
},
{
"epoch": 6.140929535232384,
"grad_norm": 0.29296875,
"learning_rate": 1.1233463261288111e-05,
"loss": 0.1616,
"step": 4096
},
{
"epoch": 6.146926536731634,
"grad_norm": 0.296875,
"learning_rate": 1.1079126694067359e-05,
"loss": 0.1386,
"step": 4100
},
{
"epoch": 6.152923538230884,
"grad_norm": 0.333984375,
"learning_rate": 1.0925817024314548e-05,
"loss": 0.1799,
"step": 4104
},
{
"epoch": 6.1589205397301345,
"grad_norm": 0.283203125,
"learning_rate": 1.077353538528855e-05,
"loss": 0.1693,
"step": 4108
},
{
"epoch": 6.164917541229386,
"grad_norm": 0.306640625,
"learning_rate": 1.0622282902649116e-05,
"loss": 0.1523,
"step": 4112
},
{
"epoch": 6.170914542728636,
"grad_norm": 0.2890625,
"learning_rate": 1.0472060694448442e-05,
"loss": 0.1635,
"step": 4116
},
{
"epoch": 6.176911544227886,
"grad_norm": 0.330078125,
"learning_rate": 1.032286987112299e-05,
"loss": 0.1727,
"step": 4120
},
{
"epoch": 6.182908545727137,
"grad_norm": 0.275390625,
"learning_rate": 1.0174711535485286e-05,
"loss": 0.1638,
"step": 4124
},
{
"epoch": 6.188905547226387,
"grad_norm": 0.3125,
"learning_rate": 1.0027586782715774e-05,
"loss": 0.1769,
"step": 4128
},
{
"epoch": 6.194902548725637,
"grad_norm": 0.29296875,
"learning_rate": 9.881496700354646e-06,
"loss": 0.1582,
"step": 4132
},
{
"epoch": 6.200899550224888,
"grad_norm": 0.302734375,
"learning_rate": 9.736442368293861e-06,
"loss": 0.1645,
"step": 4136
},
{
"epoch": 6.206896551724138,
"grad_norm": 0.283203125,
"learning_rate": 9.592424858769204e-06,
"loss": 0.1661,
"step": 4140
},
{
"epoch": 6.212893553223388,
"grad_norm": 0.283203125,
"learning_rate": 9.44944523635222e-06,
"loss": 0.1379,
"step": 4144
},
{
"epoch": 6.2188905547226385,
"grad_norm": 0.341796875,
"learning_rate": 9.307504557942564e-06,
"loss": 0.1912,
"step": 4148
},
{
"epoch": 6.224887556221889,
"grad_norm": 0.31640625,
"learning_rate": 9.166603872759875e-06,
"loss": 0.1775,
"step": 4152
},
{
"epoch": 6.230884557721139,
"grad_norm": 0.30078125,
"learning_rate": 9.026744222336403e-06,
"loss": 0.1539,
"step": 4156
},
{
"epoch": 6.2368815592203894,
"grad_norm": 0.314453125,
"learning_rate": 8.887926640508942e-06,
"loss": 0.1524,
"step": 4160
},
{
"epoch": 6.24287856071964,
"grad_norm": 0.291015625,
"learning_rate": 8.750152153411506e-06,
"loss": 0.1624,
"step": 4164
},
{
"epoch": 6.248875562218891,
"grad_norm": 0.2734375,
"learning_rate": 8.61342177946749e-06,
"loss": 0.1424,
"step": 4168
},
{
"epoch": 6.254872563718141,
"grad_norm": 0.328125,
"learning_rate": 8.477736529382262e-06,
"loss": 0.1799,
"step": 4172
},
{
"epoch": 6.260869565217392,
"grad_norm": 0.291015625,
"learning_rate": 8.343097406135723e-06,
"loss": 0.1645,
"step": 4176
},
{
"epoch": 6.266866566716642,
"grad_norm": 0.302734375,
"learning_rate": 8.20950540497481e-06,
"loss": 0.1806,
"step": 4180
},
{
"epoch": 6.272863568215892,
"grad_norm": 0.30859375,
"learning_rate": 8.076961513406177e-06,
"loss": 0.1766,
"step": 4184
},
{
"epoch": 6.2788605697151425,
"grad_norm": 0.333984375,
"learning_rate": 7.945466711188885e-06,
"loss": 0.1951,
"step": 4188
},
{
"epoch": 6.284857571214393,
"grad_norm": 0.28515625,
"learning_rate": 7.815021970327229e-06,
"loss": 0.1617,
"step": 4192
},
{
"epoch": 6.290854572713643,
"grad_norm": 0.318359375,
"learning_rate": 7.68562825506341e-06,
"loss": 0.1674,
"step": 4196
},
{
"epoch": 6.296851574212893,
"grad_norm": 0.27734375,
"learning_rate": 7.5572865218705595e-06,
"loss": 0.166,
"step": 4200
},
{
"epoch": 6.302848575712144,
"grad_norm": 0.318359375,
"learning_rate": 7.429997719445535e-06,
"loss": 0.147,
"step": 4204
},
{
"epoch": 6.308845577211394,
"grad_norm": 0.306640625,
"learning_rate": 7.30376278870205e-06,
"loss": 0.1955,
"step": 4208
},
{
"epoch": 6.314842578710644,
"grad_norm": 0.359375,
"learning_rate": 7.178582662763566e-06,
"loss": 0.1965,
"step": 4212
},
{
"epoch": 6.320839580209895,
"grad_norm": 0.310546875,
"learning_rate": 7.0544582669564975e-06,
"loss": 0.1743,
"step": 4216
},
{
"epoch": 6.326836581709145,
"grad_norm": 0.33203125,
"learning_rate": 6.931390518803387e-06,
"loss": 0.1767,
"step": 4220
},
{
"epoch": 6.332833583208396,
"grad_norm": 0.30859375,
"learning_rate": 6.8093803280160066e-06,
"loss": 0.1607,
"step": 4224
},
{
"epoch": 6.3388305847076465,
"grad_norm": 0.3046875,
"learning_rate": 6.688428596488798e-06,
"loss": 0.1645,
"step": 4228
},
{
"epoch": 6.344827586206897,
"grad_norm": 0.341796875,
"learning_rate": 6.568536218291981e-06,
"loss": 0.1841,
"step": 4232
},
{
"epoch": 6.350824587706147,
"grad_norm": 0.294921875,
"learning_rate": 6.4497040796652355e-06,
"loss": 0.171,
"step": 4236
},
{
"epoch": 6.356821589205397,
"grad_norm": 0.34375,
"learning_rate": 6.331933059010846e-06,
"loss": 0.179,
"step": 4240
},
{
"epoch": 6.362818590704648,
"grad_norm": 0.32421875,
"learning_rate": 6.215224026887505e-06,
"loss": 0.1605,
"step": 4244
},
{
"epoch": 6.368815592203898,
"grad_norm": 0.32421875,
"learning_rate": 6.099577846003567e-06,
"loss": 0.1864,
"step": 4248
},
{
"epoch": 6.374812593703148,
"grad_norm": 0.27734375,
"learning_rate": 5.984995371210971e-06,
"loss": 0.166,
"step": 4252
},
{
"epoch": 6.380809595202399,
"grad_norm": 0.3046875,
"learning_rate": 5.871477449498729e-06,
"loss": 0.1881,
"step": 4256
},
{
"epoch": 6.386806596701649,
"grad_norm": 0.345703125,
"learning_rate": 5.759024919986699e-06,
"loss": 0.2102,
"step": 4260
},
{
"epoch": 6.392803598200899,
"grad_norm": 0.306640625,
"learning_rate": 5.647638613919437e-06,
"loss": 0.1468,
"step": 4264
},
{
"epoch": 6.39880059970015,
"grad_norm": 0.306640625,
"learning_rate": 5.537319354659969e-06,
"loss": 0.203,
"step": 4268
},
{
"epoch": 6.4047976011994,
"grad_norm": 0.3125,
"learning_rate": 5.4280679576838515e-06,
"loss": 0.1715,
"step": 4272
},
{
"epoch": 6.410794602698651,
"grad_norm": 0.3125,
"learning_rate": 5.319885230572951e-06,
"loss": 0.1833,
"step": 4276
},
{
"epoch": 6.416791604197901,
"grad_norm": 0.3203125,
"learning_rate": 5.2127719730096055e-06,
"loss": 0.1797,
"step": 4280
},
{
"epoch": 6.422788605697152,
"grad_norm": 0.3125,
"learning_rate": 5.1067289767706575e-06,
"loss": 0.1667,
"step": 4284
},
{
"epoch": 6.428785607196402,
"grad_norm": 0.271484375,
"learning_rate": 5.001757025721698e-06,
"loss": 0.1717,
"step": 4288
},
{
"epoch": 6.434782608695652,
"grad_norm": 0.318359375,
"learning_rate": 4.897856895811081e-06,
"loss": 0.1724,
"step": 4292
},
{
"epoch": 6.440779610194903,
"grad_norm": 0.29296875,
"learning_rate": 4.7950293550643505e-06,
"loss": 0.1764,
"step": 4296
},
{
"epoch": 6.446776611694153,
"grad_norm": 0.328125,
"learning_rate": 4.6932751635785746e-06,
"loss": 0.205,
"step": 4300
},
{
"epoch": 6.452773613193403,
"grad_norm": 0.310546875,
"learning_rate": 4.592595073516603e-06,
"loss": 0.184,
"step": 4304
},
{
"epoch": 6.458770614692654,
"grad_norm": 0.306640625,
"learning_rate": 4.492989829101551e-06,
"loss": 0.1755,
"step": 4308
},
{
"epoch": 6.464767616191904,
"grad_norm": 0.341796875,
"learning_rate": 4.394460166611341e-06,
"loss": 0.1813,
"step": 4312
},
{
"epoch": 6.470764617691154,
"grad_norm": 0.306640625,
"learning_rate": 4.297006814373305e-06,
"loss": 0.1683,
"step": 4316
},
{
"epoch": 6.4767616191904045,
"grad_norm": 0.26953125,
"learning_rate": 4.200630492758638e-06,
"loss": 0.1257,
"step": 4320
},
{
"epoch": 6.482758620689655,
"grad_norm": 0.2890625,
"learning_rate": 4.105331914177224e-06,
"loss": 0.1559,
"step": 4324
},
{
"epoch": 6.488755622188906,
"grad_norm": 0.2734375,
"learning_rate": 4.0111117830722465e-06,
"loss": 0.1228,
"step": 4328
},
{
"epoch": 6.494752623688156,
"grad_norm": 0.294921875,
"learning_rate": 3.917970795915154e-06,
"loss": 0.1717,
"step": 4332
},
{
"epoch": 6.500749625187407,
"grad_norm": 0.314453125,
"learning_rate": 3.825909641200326e-06,
"loss": 0.1809,
"step": 4336
},
{
"epoch": 6.506746626686657,
"grad_norm": 0.296875,
"learning_rate": 3.73492899944009e-06,
"loss": 0.1642,
"step": 4340
},
{
"epoch": 6.512743628185907,
"grad_norm": 0.314453125,
"learning_rate": 3.645029543159683e-06,
"loss": 0.1672,
"step": 4344
},
{
"epoch": 6.5187406296851576,
"grad_norm": 0.291015625,
"learning_rate": 3.5562119368922006e-06,
"loss": 0.1804,
"step": 4348
},
{
"epoch": 6.524737631184408,
"grad_norm": 0.318359375,
"learning_rate": 3.46847683717385e-06,
"loss": 0.1683,
"step": 4352
},
{
"epoch": 6.530734632683658,
"grad_norm": 0.302734375,
"learning_rate": 3.3818248925388756e-06,
"loss": 0.1622,
"step": 4356
},
{
"epoch": 6.5367316341829085,
"grad_norm": 0.34765625,
"learning_rate": 3.2962567435149744e-06,
"loss": 0.1687,
"step": 4360
},
{
"epoch": 6.542728635682159,
"grad_norm": 0.3046875,
"learning_rate": 3.2117730226184358e-06,
"loss": 0.1695,
"step": 4364
},
{
"epoch": 6.548725637181409,
"grad_norm": 0.34765625,
"learning_rate": 3.128374354349494e-06,
"loss": 0.1884,
"step": 4368
},
{
"epoch": 6.554722638680659,
"grad_norm": 0.28515625,
"learning_rate": 3.0460613551877513e-06,
"loss": 0.1671,
"step": 4372
},
{
"epoch": 6.56071964017991,
"grad_norm": 0.34375,
"learning_rate": 2.9648346335875094e-06,
"loss": 0.177,
"step": 4376
},
{
"epoch": 6.566716641679161,
"grad_norm": 0.28515625,
"learning_rate": 2.884694789973463e-06,
"loss": 0.1746,
"step": 4380
},
{
"epoch": 6.57271364317841,
"grad_norm": 0.2890625,
"learning_rate": 2.805642416736048e-06,
"loss": 0.1662,
"step": 4384
},
{
"epoch": 6.5787106446776615,
"grad_norm": 0.306640625,
"learning_rate": 2.7276780982272485e-06,
"loss": 0.1771,
"step": 4388
},
{
"epoch": 6.584707646176912,
"grad_norm": 0.26171875,
"learning_rate": 2.650802410756081e-06,
"loss": 0.1639,
"step": 4392
},
{
"epoch": 6.590704647676162,
"grad_norm": 0.314453125,
"learning_rate": 2.5750159225845835e-06,
"loss": 0.16,
"step": 4396
},
{
"epoch": 6.5967016491754125,
"grad_norm": 0.298828125,
"learning_rate": 2.5003191939233668e-06,
"loss": 0.1625,
"step": 4400
},
{
"epoch": 6.602698650674663,
"grad_norm": 0.32421875,
"learning_rate": 2.4267127769276364e-06,
"loss": 0.1752,
"step": 4404
},
{
"epoch": 6.608695652173913,
"grad_norm": 0.291015625,
"learning_rate": 2.3541972156930267e-06,
"loss": 0.1614,
"step": 4408
},
{
"epoch": 6.614692653673163,
"grad_norm": 0.28515625,
"learning_rate": 2.2827730462516567e-06,
"loss": 0.1577,
"step": 4412
},
{
"epoch": 6.620689655172414,
"grad_norm": 0.291015625,
"learning_rate": 2.2124407965680825e-06,
"loss": 0.1518,
"step": 4416
},
{
"epoch": 6.626686656671664,
"grad_norm": 0.3515625,
"learning_rate": 2.1432009865354316e-06,
"loss": 0.1781,
"step": 4420
},
{
"epoch": 6.632683658170914,
"grad_norm": 0.3203125,
"learning_rate": 2.0750541279715925e-06,
"loss": 0.1576,
"step": 4424
},
{
"epoch": 6.638680659670165,
"grad_norm": 0.30859375,
"learning_rate": 2.0080007246153662e-06,
"loss": 0.1574,
"step": 4428
},
{
"epoch": 6.644677661169415,
"grad_norm": 0.306640625,
"learning_rate": 1.942041272122835e-06,
"loss": 0.1695,
"step": 4432
},
{
"epoch": 6.650674662668665,
"grad_norm": 0.32421875,
"learning_rate": 1.8771762580635508e-06,
"loss": 0.1483,
"step": 4436
},
{
"epoch": 6.6566716641679164,
"grad_norm": 0.283203125,
"learning_rate": 1.8134061619170858e-06,
"loss": 0.151,
"step": 4440
},
{
"epoch": 6.662668665667167,
"grad_norm": 0.271484375,
"learning_rate": 1.750731455069404e-06,
"loss": 0.1499,
"step": 4444
},
{
"epoch": 6.668665667166417,
"grad_norm": 0.28515625,
"learning_rate": 1.6891526008094292e-06,
"loss": 0.1633,
"step": 4448
},
{
"epoch": 6.674662668665667,
"grad_norm": 0.30078125,
"learning_rate": 1.628670054325515e-06,
"loss": 0.1664,
"step": 4452
},
{
"epoch": 6.680659670164918,
"grad_norm": 0.31640625,
"learning_rate": 1.5692842627021973e-06,
"loss": 0.1632,
"step": 4456
},
{
"epoch": 6.686656671664168,
"grad_norm": 0.306640625,
"learning_rate": 1.510995664916881e-06,
"loss": 0.1701,
"step": 4460
},
{
"epoch": 6.692653673163418,
"grad_norm": 0.2890625,
"learning_rate": 1.4538046918365076e-06,
"loss": 0.1586,
"step": 4464
},
{
"epoch": 6.698650674662669,
"grad_norm": 0.33984375,
"learning_rate": 1.39771176621441e-06,
"loss": 0.2057,
"step": 4468
},
{
"epoch": 6.704647676161919,
"grad_norm": 0.31640625,
"learning_rate": 1.3427173026872295e-06,
"loss": 0.1734,
"step": 4472
},
{
"epoch": 6.710644677661169,
"grad_norm": 0.3203125,
"learning_rate": 1.2888217077718367e-06,
"loss": 0.1619,
"step": 4476
},
{
"epoch": 6.7166416791604195,
"grad_norm": 0.326171875,
"learning_rate": 1.2360253798622488e-06,
"loss": 0.1809,
"step": 4480
},
{
"epoch": 6.72263868065967,
"grad_norm": 0.27734375,
"learning_rate": 1.1843287092268173e-06,
"loss": 0.1672,
"step": 4484
},
{
"epoch": 6.72863568215892,
"grad_norm": 0.337890625,
"learning_rate": 1.1337320780052117e-06,
"loss": 0.2092,
"step": 4488
},
{
"epoch": 6.734632683658171,
"grad_norm": 0.287109375,
"learning_rate": 1.0842358602056899e-06,
"loss": 0.1593,
"step": 4492
},
{
"epoch": 6.740629685157422,
"grad_norm": 0.30859375,
"learning_rate": 1.0358404217022997e-06,
"loss": 0.1937,
"step": 4496
},
{
"epoch": 6.746626686656672,
"grad_norm": 0.3359375,
"learning_rate": 9.885461202321475e-07,
"loss": 0.1879,
"step": 4500
}
],
"logging_steps": 4,
"max_steps": 4669,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.965429329913184e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}