{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0021770682148041,
"eval_steps": 500,
"global_step": 345,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002902757619738752,
"grad_norm": 0.5301488637924194,
"learning_rate": 0.00019999585400705652,
"loss": 5.4533,
"step": 1
},
{
"epoch": 0.005805515239477504,
"grad_norm": 0.5975003838539124,
"learning_rate": 0.00019998341637201124,
"loss": 4.7975,
"step": 2
},
{
"epoch": 0.008708272859216255,
"grad_norm": 0.8240943551063538,
"learning_rate": 0.00019996268812619107,
"loss": 4.9359,
"step": 3
},
{
"epoch": 0.011611030478955007,
"grad_norm": 0.8740971684455872,
"learning_rate": 0.00019993367098837926,
"loss": 4.4682,
"step": 4
},
{
"epoch": 0.01451378809869376,
"grad_norm": 1.309985637664795,
"learning_rate": 0.00019989636736467278,
"loss": 5.2548,
"step": 5
},
{
"epoch": 0.01741654571843251,
"grad_norm": 1.2016607522964478,
"learning_rate": 0.0001998507803482828,
"loss": 4.8472,
"step": 6
},
{
"epoch": 0.020319303338171262,
"grad_norm": 1.1827248334884644,
"learning_rate": 0.00019979691371927832,
"loss": 5.2928,
"step": 7
},
{
"epoch": 0.023222060957910014,
"grad_norm": 1.7223974466323853,
"learning_rate": 0.00019973477194427266,
"loss": 4.7192,
"step": 8
},
{
"epoch": 0.026124818577648767,
"grad_norm": 1.4475376605987549,
"learning_rate": 0.00019966436017605297,
"loss": 4.7133,
"step": 9
},
{
"epoch": 0.02902757619738752,
"grad_norm": 2.1703498363494873,
"learning_rate": 0.00019958568425315314,
"loss": 4.4146,
"step": 10
},
{
"epoch": 0.03193033381712627,
"grad_norm": 1.7314109802246094,
"learning_rate": 0.0001994987506993696,
"loss": 4.2274,
"step": 11
},
{
"epoch": 0.03483309143686502,
"grad_norm": 2.2317986488342285,
"learning_rate": 0.00019940356672322037,
"loss": 4.4908,
"step": 12
},
{
"epoch": 0.03773584905660377,
"grad_norm": 2.3612048625946045,
"learning_rate": 0.00019930014021734733,
"loss": 4.2928,
"step": 13
},
{
"epoch": 0.040638606676342524,
"grad_norm": null,
"learning_rate": 0.00019930014021734733,
"loss": 4.3084,
"step": 14
},
{
"epoch": 0.04354136429608128,
"grad_norm": 2.0613327026367188,
"learning_rate": 0.0001991884797578617,
"loss": 3.9954,
"step": 15
},
{
"epoch": 0.04644412191582003,
"grad_norm": 2.3426692485809326,
"learning_rate": 0.00019906859460363307,
"loss": 4.5533,
"step": 16
},
{
"epoch": 0.04934687953555878,
"grad_norm": 2.8758199214935303,
"learning_rate": 0.00019894049469552152,
"loss": 3.4729,
"step": 17
},
{
"epoch": 0.05224963715529753,
"grad_norm": 2.3996334075927734,
"learning_rate": 0.0001988041906555533,
"loss": 4.2112,
"step": 18
},
{
"epoch": 0.055152394775036286,
"grad_norm": 1.8049657344818115,
"learning_rate": 0.0001986596937860402,
"loss": 3.4162,
"step": 19
},
{
"epoch": 0.05805515239477504,
"grad_norm": 2.456997871398926,
"learning_rate": 0.00019850701606864224,
"loss": 4.234,
"step": 20
},
{
"epoch": 0.06095791001451379,
"grad_norm": 2.200556755065918,
"learning_rate": 0.0001983461701633742,
"loss": 3.79,
"step": 21
},
{
"epoch": 0.06386066763425254,
"grad_norm": 2.045299768447876,
"learning_rate": 0.00019817716940755586,
"loss": 4.2698,
"step": 22
},
{
"epoch": 0.06676342525399129,
"grad_norm": 1.7035149335861206,
"learning_rate": 0.000198000027814706,
"loss": 3.8499,
"step": 23
},
{
"epoch": 0.06966618287373004,
"grad_norm": 2.4540529251098633,
"learning_rate": 0.00019781476007338058,
"loss": 4.9429,
"step": 24
},
{
"epoch": 0.07256894049346879,
"grad_norm": 1.9538823366165161,
"learning_rate": 0.00019762138154595446,
"loss": 4.2875,
"step": 25
},
{
"epoch": 0.07547169811320754,
"grad_norm": 2.1658666133880615,
"learning_rate": 0.00019741990826734794,
"loss": 4.0588,
"step": 26
},
{
"epoch": 0.0783744557329463,
"grad_norm": 1.6644055843353271,
"learning_rate": 0.00019721035694369673,
"loss": 3.7266,
"step": 27
},
{
"epoch": 0.08127721335268505,
"grad_norm": 2.193331480026245,
"learning_rate": 0.00019699274495096712,
"loss": 3.9445,
"step": 28
},
{
"epoch": 0.0841799709724238,
"grad_norm": 2.3478739261627197,
"learning_rate": 0.00019676709033351482,
"loss": 3.5157,
"step": 29
},
{
"epoch": 0.08708272859216255,
"grad_norm": 2.0770201683044434,
"learning_rate": 0.0001965334118025888,
"loss": 3.5606,
"step": 30
},
{
"epoch": 0.0899854862119013,
"grad_norm": 2.276620864868164,
"learning_rate": 0.00019629172873477995,
"loss": 3.7209,
"step": 31
},
{
"epoch": 0.09288824383164006,
"grad_norm": 2.3815758228302,
"learning_rate": 0.0001960420611704141,
"loss": 4.2123,
"step": 32
},
{
"epoch": 0.09579100145137881,
"grad_norm": 1.987587809562683,
"learning_rate": 0.0001957844298118904,
"loss": 3.7037,
"step": 33
},
{
"epoch": 0.09869375907111756,
"grad_norm": 1.8462159633636475,
"learning_rate": 0.0001955188560219648,
"loss": 3.0063,
"step": 34
},
{
"epoch": 0.10159651669085631,
"grad_norm": 1.7328358888626099,
"learning_rate": 0.0001952453618219785,
"loss": 4.1731,
"step": 35
},
{
"epoch": 0.10449927431059507,
"grad_norm": 2.9112722873687744,
"learning_rate": 0.00019496396989003193,
"loss": 4.0481,
"step": 36
},
{
"epoch": 0.10740203193033382,
"grad_norm": 2.2112295627593994,
"learning_rate": 0.00019467470355910438,
"loss": 4.5963,
"step": 37
},
{
"epoch": 0.11030478955007257,
"grad_norm": 2.1279897689819336,
"learning_rate": 0.0001943775868151192,
"loss": 3.4653,
"step": 38
},
{
"epoch": 0.11320754716981132,
"grad_norm": 2.1699769496917725,
"learning_rate": 0.00019407264429495484,
"loss": 4.4511,
"step": 39
},
{
"epoch": 0.11611030478955008,
"grad_norm": 1.7325927019119263,
"learning_rate": 0.00019375990128440204,
"loss": 4.1323,
"step": 40
},
{
"epoch": 0.11901306240928883,
"grad_norm": 1.8565714359283447,
"learning_rate": 0.00019343938371606712,
"loss": 4.0433,
"step": 41
},
{
"epoch": 0.12191582002902758,
"grad_norm": 1.9784877300262451,
"learning_rate": 0.0001931111181672216,
"loss": 3.3724,
"step": 42
},
{
"epoch": 0.12481857764876633,
"grad_norm": 1.8009449243545532,
"learning_rate": 0.00019277513185759844,
"loss": 4.197,
"step": 43
},
{
"epoch": 0.12772133526850507,
"grad_norm": 3.194614887237549,
"learning_rate": 0.0001924314526471351,
"loss": 4.0794,
"step": 44
},
{
"epoch": 0.13062409288824384,
"grad_norm": 3.4294867515563965,
"learning_rate": 0.00019208010903366306,
"loss": 4.0895,
"step": 45
},
{
"epoch": 0.13352685050798258,
"grad_norm": 2.3046109676361084,
"learning_rate": 0.00019172113015054532,
"loss": 4.2159,
"step": 46
},
{
"epoch": 0.13642960812772134,
"grad_norm": 3.2261159420013428,
"learning_rate": 0.0001913545457642601,
"loss": 3.2197,
"step": 47
},
{
"epoch": 0.13933236574746008,
"grad_norm": 1.6862419843673706,
"learning_rate": 0.00019098038627193302,
"loss": 3.4144,
"step": 48
},
{
"epoch": 0.14223512336719885,
"grad_norm": 2.0345373153686523,
"learning_rate": 0.0001905986826988164,
"loss": 3.106,
"step": 49
},
{
"epoch": 0.14513788098693758,
"grad_norm": 2.1441516876220703,
"learning_rate": 0.00019020946669571654,
"loss": 3.979,
"step": 50
},
{
"epoch": 0.14804063860667635,
"grad_norm": 2.6867835521698,
"learning_rate": 0.0001898127705363696,
"loss": 4.0657,
"step": 51
},
{
"epoch": 0.1509433962264151,
"grad_norm": 2.0316073894500732,
"learning_rate": 0.00018940862711476513,
"loss": 3.9072,
"step": 52
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.004814863204956,
"learning_rate": 0.00018899706994241858,
"loss": 4.1832,
"step": 53
},
{
"epoch": 0.1567489114658926,
"grad_norm": 1.810863971710205,
"learning_rate": 0.00018857813314559257,
"loss": 3.3366,
"step": 54
},
{
"epoch": 0.15965166908563136,
"grad_norm": 2.068857192993164,
"learning_rate": 0.00018815185146246716,
"loss": 4.1484,
"step": 55
},
{
"epoch": 0.1625544267053701,
"grad_norm": 1.900846242904663,
"learning_rate": 0.00018771826024025946,
"loss": 3.1681,
"step": 56
},
{
"epoch": 0.16545718432510886,
"grad_norm": 2.1605849266052246,
"learning_rate": 0.00018727739543229231,
"loss": 3.1671,
"step": 57
},
{
"epoch": 0.1683599419448476,
"grad_norm": 1.944718360900879,
"learning_rate": 0.00018682929359501338,
"loss": 4.5958,
"step": 58
},
{
"epoch": 0.17126269956458637,
"grad_norm": 2.9172914028167725,
"learning_rate": 0.00018637399188496382,
"loss": 4.122,
"step": 59
},
{
"epoch": 0.1741654571843251,
"grad_norm": 2.346954822540283,
"learning_rate": 0.00018591152805569715,
"loss": 4.1201,
"step": 60
},
{
"epoch": 0.17706821480406387,
"grad_norm": 2.2824630737304688,
"learning_rate": 0.00018544194045464886,
"loss": 4.2025,
"step": 61
},
{
"epoch": 0.1799709724238026,
"grad_norm": 1.8054004907608032,
"learning_rate": 0.0001849652680199565,
"loss": 3.6063,
"step": 62
},
{
"epoch": 0.18287373004354138,
"grad_norm": 2.1201300621032715,
"learning_rate": 0.0001844815502772311,
"loss": 3.5376,
"step": 63
},
{
"epoch": 0.18577648766328012,
"grad_norm": 1.7177382707595825,
"learning_rate": 0.00018399082733627965,
"loss": 3.7342,
"step": 64
},
{
"epoch": 0.18867924528301888,
"grad_norm": 2.9193296432495117,
"learning_rate": 0.00018349313988777914,
"loss": 2.5638,
"step": 65
},
{
"epoch": 0.19158200290275762,
"grad_norm": 1.9819329977035522,
"learning_rate": 0.00018298852919990252,
"loss": 4.2545,
"step": 66
},
{
"epoch": 0.19448476052249636,
"grad_norm": 1.8844672441482544,
"learning_rate": 0.00018247703711489686,
"loss": 3.6233,
"step": 67
},
{
"epoch": 0.19738751814223512,
"grad_norm": 1.8098646402359009,
"learning_rate": 0.00018195870604561365,
"loss": 3.9222,
"step": 68
},
{
"epoch": 0.20029027576197386,
"grad_norm": 2.1591079235076904,
"learning_rate": 0.000181433578971992,
"loss": 3.7097,
"step": 69
},
{
"epoch": 0.20319303338171263,
"grad_norm": 2.3508942127227783,
"learning_rate": 0.00018090169943749476,
"loss": 3.7016,
"step": 70
},
{
"epoch": 0.20609579100145137,
"grad_norm": 1.943665623664856,
"learning_rate": 0.00018036311154549784,
"loss": 3.5324,
"step": 71
},
{
"epoch": 0.20899854862119013,
"grad_norm": 1.8940976858139038,
"learning_rate": 0.00017981785995563324,
"loss": 3.9551,
"step": 72
},
{
"epoch": 0.21190130624092887,
"grad_norm": 2.0404138565063477,
"learning_rate": 0.00017926598988008582,
"loss": 3.3151,
"step": 73
},
{
"epoch": 0.21480406386066764,
"grad_norm": 2.0190603733062744,
"learning_rate": 0.00017870754707984443,
"loss": 4.3073,
"step": 74
},
{
"epoch": 0.21770682148040638,
"grad_norm": 1.989651083946228,
"learning_rate": 0.00017814257786090719,
"loss": 3.1581,
"step": 75
},
{
"epoch": 0.22060957910014514,
"grad_norm": 3.1509041786193848,
"learning_rate": 0.000177571129070442,
"loss": 3.8772,
"step": 76
},
{
"epoch": 0.22351233671988388,
"grad_norm": 1.903363585472107,
"learning_rate": 0.00017699324809290193,
"loss": 4.1305,
"step": 77
},
{
"epoch": 0.22641509433962265,
"grad_norm": 2.1415135860443115,
"learning_rate": 0.00017640898284609612,
"loss": 4.2865,
"step": 78
},
{
"epoch": 0.22931785195936139,
"grad_norm": 1.6867640018463135,
"learning_rate": 0.0001758183817772163,
"loss": 2.6165,
"step": 79
},
{
"epoch": 0.23222060957910015,
"grad_norm": 1.9801138639450073,
"learning_rate": 0.0001752214938588198,
"loss": 4.0186,
"step": 80
},
{
"epoch": 0.2351233671988389,
"grad_norm": 2.25994610786438,
"learning_rate": 0.00017461836858476856,
"loss": 3.8012,
"step": 81
},
{
"epoch": 0.23802612481857766,
"grad_norm": 3.3158185482025146,
"learning_rate": 0.0001740090559661252,
"loss": 3.2479,
"step": 82
},
{
"epoch": 0.2409288824383164,
"grad_norm": 2.139110803604126,
"learning_rate": 0.00017339360652700604,
"loss": 2.6925,
"step": 83
},
{
"epoch": 0.24383164005805516,
"grad_norm": 1.8995939493179321,
"learning_rate": 0.00017277207130039174,
"loss": 4.1114,
"step": 84
},
{
"epoch": 0.2467343976777939,
"grad_norm": 2.1001484394073486,
"learning_rate": 0.00017214450182389559,
"loss": 4.0802,
"step": 85
},
{
"epoch": 0.24963715529753266,
"grad_norm": 1.6680461168289185,
"learning_rate": 0.00017151095013548994,
"loss": 3.1914,
"step": 86
},
{
"epoch": 0.2525399129172714,
"grad_norm": 1.978389859199524,
"learning_rate": 0.00017087146876919144,
"loss": 3.858,
"step": 87
},
{
"epoch": 0.25544267053701014,
"grad_norm": 1.8887652158737183,
"learning_rate": 0.00017022611075070474,
"loss": 3.5546,
"step": 88
},
{
"epoch": 0.25834542815674894,
"grad_norm": 2.8925201892852783,
"learning_rate": 0.00016957492959302558,
"loss": 4.478,
"step": 89
},
{
"epoch": 0.2612481857764877,
"grad_norm": 1.920861005783081,
"learning_rate": 0.00016891797929200375,
"loss": 4.2126,
"step": 90
},
{
"epoch": 0.2641509433962264,
"grad_norm": 1.6321172714233398,
"learning_rate": 0.00016825531432186543,
"loss": 3.0669,
"step": 91
},
{
"epoch": 0.26705370101596515,
"grad_norm": 2.127535343170166,
"learning_rate": 0.00016758698963069643,
"loss": 3.0706,
"step": 92
},
{
"epoch": 0.26995645863570394,
"grad_norm": 2.0623557567596436,
"learning_rate": 0.00016691306063588583,
"loss": 4.0167,
"step": 93
},
{
"epoch": 0.2728592162554427,
"grad_norm": 1.740623950958252,
"learning_rate": 0.00016623358321953078,
"loss": 3.4032,
"step": 94
},
{
"epoch": 0.2757619738751814,
"grad_norm": 2.192186117172241,
"learning_rate": 0.00016554861372380272,
"loss": 3.6432,
"step": 95
},
{
"epoch": 0.27866473149492016,
"grad_norm": 3.0152950286865234,
"learning_rate": 0.0001648582089462756,
"loss": 3.1592,
"step": 96
},
{
"epoch": 0.28156748911465895,
"grad_norm": 1.8867627382278442,
"learning_rate": 0.0001641624261352161,
"loss": 3.3498,
"step": 97
},
{
"epoch": 0.2844702467343977,
"grad_norm": 1.9052848815917969,
"learning_rate": 0.00016346132298483676,
"loss": 3.1272,
"step": 98
},
{
"epoch": 0.28737300435413643,
"grad_norm": 1.7073307037353516,
"learning_rate": 0.00016275495763051184,
"loss": 3.206,
"step": 99
},
{
"epoch": 0.29027576197387517,
"grad_norm": 2.7498321533203125,
"learning_rate": 0.00016204338864395684,
"loss": 3.2865,
"step": 100
},
{
"epoch": 0.2931785195936139,
"grad_norm": 1.8562026023864746,
"learning_rate": 0.00016132667502837165,
"loss": 3.2549,
"step": 101
},
{
"epoch": 0.2960812772133527,
"grad_norm": 1.724124789237976,
"learning_rate": 0.00016060487621354815,
"loss": 3.6638,
"step": 102
},
{
"epoch": 0.29898403483309144,
"grad_norm": 1.7688038349151611,
"learning_rate": 0.00015987805205094227,
"loss": 2.7772,
"step": 103
},
{
"epoch": 0.3018867924528302,
"grad_norm": 2.1941487789154053,
"learning_rate": 0.0001591462628087109,
"loss": 2.8096,
"step": 104
},
{
"epoch": 0.3047895500725689,
"grad_norm": 1.7136414051055908,
"learning_rate": 0.00015840956916671477,
"loss": 3.4411,
"step": 105
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.4751169681549072,
"learning_rate": 0.00015766803221148673,
"loss": 2.9833,
"step": 106
},
{
"epoch": 0.31059506531204645,
"grad_norm": 2.0611205101013184,
"learning_rate": 0.00015692171343116638,
"loss": 2.6663,
"step": 107
},
{
"epoch": 0.3134978229317852,
"grad_norm": 1.6866419315338135,
"learning_rate": 0.00015617067471040174,
"loss": 3.2627,
"step": 108
},
{
"epoch": 0.3164005805515239,
"grad_norm": 1.548632025718689,
"learning_rate": 0.0001554149783252175,
"loss": 2.8767,
"step": 109
},
{
"epoch": 0.3193033381712627,
"grad_norm": 1.8421952724456787,
"learning_rate": 0.00015465468693785125,
"loss": 3.7856,
"step": 110
},
{
"epoch": 0.32220609579100146,
"grad_norm": 1.7316609621047974,
"learning_rate": 0.00015388986359155758,
"loss": 4.3054,
"step": 111
},
{
"epoch": 0.3251088534107402,
"grad_norm": 2.4119129180908203,
"learning_rate": 0.00015312057170538035,
"loss": 3.9081,
"step": 112
},
{
"epoch": 0.32801161103047893,
"grad_norm": 1.9937965869903564,
"learning_rate": 0.00015234687506889428,
"loss": 4.6076,
"step": 113
},
{
"epoch": 0.3309143686502177,
"grad_norm": 1.777130126953125,
"learning_rate": 0.0001515688378369152,
"loss": 2.5866,
"step": 114
},
{
"epoch": 0.33381712626995647,
"grad_norm": 2.239431142807007,
"learning_rate": 0.00015078652452418063,
"loss": 3.2308,
"step": 115
},
{
"epoch": 0.3367198838896952,
"grad_norm": null,
"learning_rate": 0.00015078652452418063,
"loss": 3.0439,
"step": 116
},
{
"epoch": 0.33962264150943394,
"grad_norm": 2.8517825603485107,
"learning_rate": 0.00015000000000000001,
"loss": 3.546,
"step": 117
},
{
"epoch": 0.34252539912917274,
"grad_norm": 2.105649948120117,
"learning_rate": 0.00014920932948287593,
"loss": 3.2135,
"step": 118
},
{
"epoch": 0.3454281567489115,
"grad_norm": null,
"learning_rate": 0.00014920932948287593,
"loss": 3.8332,
"step": 119
},
{
"epoch": 0.3483309143686502,
"grad_norm": null,
"learning_rate": 0.00014920932948287593,
"loss": 3.4586,
"step": 120
},
{
"epoch": 0.35123367198838895,
"grad_norm": 5.404326915740967,
"learning_rate": 0.00014841457853509606,
"loss": 3.331,
"step": 121
},
{
"epoch": 0.35413642960812775,
"grad_norm": 2.6620254516601562,
"learning_rate": 0.00014761581305729684,
"loss": 3.9836,
"step": 122
},
{
"epoch": 0.3570391872278665,
"grad_norm": 2.549010753631592,
"learning_rate": 0.00014681309928299893,
"loss": 3.7899,
"step": 123
},
{
"epoch": 0.3599419448476052,
"grad_norm": 3.8975048065185547,
"learning_rate": 0.00014600650377311522,
"loss": 3.5173,
"step": 124
},
{
"epoch": 0.36284470246734396,
"grad_norm": 5.37324857711792,
"learning_rate": 0.00014519609341043157,
"loss": 3.0372,
"step": 125
},
{
"epoch": 0.36574746008708275,
"grad_norm": 1.9681342840194702,
"learning_rate": 0.00014438193539406089,
"loss": 3.5476,
"step": 126
},
{
"epoch": 0.3686502177068215,
"grad_norm": 1.8248546123504639,
"learning_rate": 0.0001435640972338709,
"loss": 3.7966,
"step": 127
},
{
"epoch": 0.37155297532656023,
"grad_norm": 1.9447401762008667,
"learning_rate": 0.00014274264674488658,
"loss": 3.7259,
"step": 128
},
{
"epoch": 0.37445573294629897,
"grad_norm": 1.9753526449203491,
"learning_rate": 0.00014191765204166643,
"loss": 3.6636,
"step": 129
},
{
"epoch": 0.37735849056603776,
"grad_norm": 1.8528705835342407,
"learning_rate": 0.00014108918153265485,
"loss": 3.8717,
"step": 130
},
{
"epoch": 0.3802612481857765,
"grad_norm": 1.6633983850479126,
"learning_rate": 0.00014025730391450947,
"loss": 3.6534,
"step": 131
},
{
"epoch": 0.38316400580551524,
"grad_norm": 2.0460166931152344,
"learning_rate": 0.00013942208816640505,
"loss": 4.3184,
"step": 132
},
{
"epoch": 0.386066763425254,
"grad_norm": 1.5878854990005493,
"learning_rate": 0.00013858360354431355,
"loss": 3.1587,
"step": 133
},
{
"epoch": 0.3889695210449927,
"grad_norm": 2.3371992111206055,
"learning_rate": 0.00013774191957526143,
"loss": 2.9895,
"step": 134
},
{
"epoch": 0.3918722786647315,
"grad_norm": 1.7218937873840332,
"learning_rate": 0.00013689710605156472,
"loss": 3.9084,
"step": 135
},
{
"epoch": 0.39477503628447025,
"grad_norm": 2.266514539718628,
"learning_rate": 0.00013604923302504147,
"loss": 3.7989,
"step": 136
},
{
"epoch": 0.397677793904209,
"grad_norm": 1.6445748805999756,
"learning_rate": 0.00013519837080120346,
"loss": 3.4014,
"step": 137
},
{
"epoch": 0.4005805515239477,
"grad_norm": 1.972373127937317,
"learning_rate": 0.00013434458993342614,
"loss": 3.2058,
"step": 138
},
{
"epoch": 0.4034833091436865,
"grad_norm": 2.3418309688568115,
"learning_rate": 0.00013348796121709862,
"loss": 4.0443,
"step": 139
},
{
"epoch": 0.40638606676342526,
"grad_norm": 1.811594843864441,
"learning_rate": 0.00013262855568375317,
"loss": 3.5496,
"step": 140
},
{
"epoch": 0.409288824383164,
"grad_norm": 1.8474693298339844,
"learning_rate": 0.00013176644459517528,
"loss": 3.6035,
"step": 141
},
{
"epoch": 0.41219158200290273,
"grad_norm": 1.9336134195327759,
"learning_rate": 0.00013090169943749476,
"loss": 2.7298,
"step": 142
},
{
"epoch": 0.41509433962264153,
"grad_norm": 1.8413362503051758,
"learning_rate": 0.00013003439191525807,
"loss": 2.8708,
"step": 143
},
{
"epoch": 0.41799709724238027,
"grad_norm": 2.0277211666107178,
"learning_rate": 0.0001291645939454825,
"loss": 3.8391,
"step": 144
},
{
"epoch": 0.420899854862119,
"grad_norm": 1.8813992738723755,
"learning_rate": 0.000128292377651693,
"loss": 3.4416,
"step": 145
},
{
"epoch": 0.42380261248185774,
"grad_norm": 2.2389297485351562,
"learning_rate": 0.00012741781535794154,
"loss": 3.3343,
"step": 146
},
{
"epoch": 0.42670537010159654,
"grad_norm": 2.1361331939697266,
"learning_rate": 0.0001265409795828101,
"loss": 3.6481,
"step": 147
},
{
"epoch": 0.4296081277213353,
"grad_norm": 1.7442470788955688,
"learning_rate": 0.00012566194303339739,
"loss": 2.8798,
"step": 148
},
{
"epoch": 0.432510885341074,
"grad_norm": 1.9861546754837036,
"learning_rate": 0.00012478077859929,
"loss": 2.6437,
"step": 149
},
{
"epoch": 0.43541364296081275,
"grad_norm": 1.9143513441085815,
"learning_rate": 0.0001238975593465185,
"loss": 3.0054,
"step": 150
},
{
"epoch": 0.43831640058055155,
"grad_norm": 1.910510778427124,
"learning_rate": 0.00012301235851149865,
"loss": 3.073,
"step": 151
},
{
"epoch": 0.4412191582002903,
"grad_norm": 2.015235424041748,
"learning_rate": 0.0001221252494949588,
"loss": 3.3852,
"step": 152
},
{
"epoch": 0.444121915820029,
"grad_norm": 2.3909735679626465,
"learning_rate": 0.00012123630585585333,
"loss": 3.6159,
"step": 153
},
{
"epoch": 0.44702467343976776,
"grad_norm": 4.09874963760376,
"learning_rate": 0.0001203456013052634,
"loss": 3.8377,
"step": 154
},
{
"epoch": 0.44992743105950656,
"grad_norm": 2.008082151412964,
"learning_rate": 0.00011945320970028461,
"loss": 3.3051,
"step": 155
},
{
"epoch": 0.4528301886792453,
"grad_norm": 1.7395459413528442,
"learning_rate": 0.00011855920503790292,
"loss": 2.8138,
"step": 156
},
{
"epoch": 0.45573294629898403,
"grad_norm": 3.456113815307617,
"learning_rate": 0.00011766366144885877,
"loss": 3.8382,
"step": 157
},
{
"epoch": 0.45863570391872277,
"grad_norm": 1.6849101781845093,
"learning_rate": 0.0001167666531915001,
"loss": 3.2607,
"step": 158
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.4074480533599854,
"learning_rate": 0.00011586825464562514,
"loss": 3.1549,
"step": 159
},
{
"epoch": 0.4644412191582003,
"grad_norm": 1.906053900718689,
"learning_rate": 0.00011496854030631443,
"loss": 3.0266,
"step": 160
},
{
"epoch": 0.46734397677793904,
"grad_norm": 3.594622850418091,
"learning_rate": 0.00011406758477775406,
"loss": 2.9502,
"step": 161
},
{
"epoch": 0.4702467343976778,
"grad_norm": 1.7513110637664795,
"learning_rate": 0.00011316546276704924,
"loss": 3.0875,
"step": 162
},
{
"epoch": 0.4731494920174166,
"grad_norm": 1.782333254814148,
"learning_rate": 0.00011226224907802985,
"loss": 3.1332,
"step": 163
},
{
"epoch": 0.4760522496371553,
"grad_norm": 1.809478759765625,
"learning_rate": 0.00011135801860504749,
"loss": 3.6647,
"step": 164
},
{
"epoch": 0.47895500725689405,
"grad_norm": 1.8948771953582764,
"learning_rate": 0.00011045284632676536,
"loss": 4.1531,
"step": 165
},
{
"epoch": 0.4818577648766328,
"grad_norm": 2.1463427543640137,
"learning_rate": 0.00010954680729994102,
"loss": 3.9761,
"step": 166
},
{
"epoch": 0.4847605224963715,
"grad_norm": 3.1157124042510986,
"learning_rate": 0.00010863997665320272,
"loss": 3.3557,
"step": 167
},
{
"epoch": 0.4876632801161103,
"grad_norm": 1.641317367553711,
"learning_rate": 0.0001077324295808197,
"loss": 2.8117,
"step": 168
},
{
"epoch": 0.49056603773584906,
"grad_norm": 2.0440993309020996,
"learning_rate": 0.0001068242413364671,
"loss": 3.7747,
"step": 169
},
{
"epoch": 0.4934687953555878,
"grad_norm": 1.8725652694702148,
"learning_rate": 0.00010591548722698599,
"loss": 3.5484,
"step": 170
},
{
"epoch": 0.49637155297532654,
"grad_norm": 2.0633366107940674,
"learning_rate": 0.00010500624260613892,
"loss": 3.1863,
"step": 171
},
{
"epoch": 0.49927431059506533,
"grad_norm": 1.8762496709823608,
"learning_rate": 0.00010409658286836143,
"loss": 3.2581,
"step": 172
},
{
"epoch": 0.502177068214804,
"grad_norm": 2.147141695022583,
"learning_rate": 0.00010318658344251066,
"loss": 3.5548,
"step": 173
},
{
"epoch": 0.5050798258345428,
"grad_norm": 1.9856010675430298,
"learning_rate": 0.00010227631978561056,
"loss": 3.4,
"step": 174
},
{
"epoch": 0.5079825834542816,
"grad_norm": 4.999744892120361,
"learning_rate": 0.0001013658673765951,
"loss": 3.1381,
"step": 175
},
{
"epoch": 0.5108853410740203,
"grad_norm": 1.9928354024887085,
"learning_rate": 0.00010045530171004955,
"loss": 2.8732,
"step": 176
},
{
"epoch": 0.5137880986937591,
"grad_norm": 1.749778389930725,
"learning_rate": 9.954469828995045e-05,
"loss": 3.6324,
"step": 177
},
{
"epoch": 0.5166908563134979,
"grad_norm": 2.014143943786621,
"learning_rate": 9.863413262340491e-05,
"loss": 3.073,
"step": 178
},
{
"epoch": 0.5195936139332366,
"grad_norm": 2.1828532218933105,
"learning_rate": 9.772368021438943e-05,
"loss": 3.5193,
"step": 179
},
{
"epoch": 0.5224963715529753,
"grad_norm": 1.9171918630599976,
"learning_rate": 9.681341655748934e-05,
"loss": 3.6872,
"step": 180
},
{
"epoch": 0.525399129172714,
"grad_norm": 2.2952675819396973,
"learning_rate": 9.590341713163858e-05,
"loss": 3.7747,
"step": 181
},
{
"epoch": 0.5283018867924528,
"grad_norm": 2.325395345687866,
"learning_rate": 9.499375739386112e-05,
"loss": 3.6792,
"step": 182
},
{
"epoch": 0.5312046444121916,
"grad_norm": 1.756514072418213,
"learning_rate": 9.4084512773014e-05,
"loss": 2.9106,
"step": 183
},
{
"epoch": 0.5341074020319303,
"grad_norm": 1.7968791723251343,
"learning_rate": 9.317575866353292e-05,
"loss": 3.4895,
"step": 184
},
{
"epoch": 0.5370101596516691,
"grad_norm": 2.017638921737671,
"learning_rate": 9.226757041918033e-05,
"loss": 3.3524,
"step": 185
},
{
"epoch": 0.5399129172714079,
"grad_norm": 1.6511162519454956,
"learning_rate": 9.136002334679731e-05,
"loss": 2.5666,
"step": 186
},
{
"epoch": 0.5428156748911466,
"grad_norm": 1.884466290473938,
"learning_rate": 9.0453192700059e-05,
"loss": 3.2128,
"step": 187
},
{
"epoch": 0.5457184325108854,
"grad_norm": 2.771385669708252,
"learning_rate": 8.954715367323468e-05,
"loss": 4.2103,
"step": 188
},
{
"epoch": 0.548621190130624,
"grad_norm": 1.8222163915634155,
"learning_rate": 8.86419813949525e-05,
"loss": 4.1274,
"step": 189
},
{
"epoch": 0.5515239477503628,
"grad_norm": 1.7211194038391113,
"learning_rate": 8.773775092197017e-05,
"loss": 3.0317,
"step": 190
},
{
"epoch": 0.5544267053701016,
"grad_norm": 2.3453516960144043,
"learning_rate": 8.683453723295074e-05,
"loss": 3.9362,
"step": 191
},
{
"epoch": 0.5573294629898403,
"grad_norm": 1.9646939039230347,
"learning_rate": 8.593241522224597e-05,
"loss": 3.1403,
"step": 192
},
{
"epoch": 0.5602322206095791,
"grad_norm": 1.6175512075424194,
"learning_rate": 8.503145969368562e-05,
"loss": 3.0328,
"step": 193
},
{
"epoch": 0.5631349782293179,
"grad_norm": 1.8903875350952148,
"learning_rate": 8.413174535437487e-05,
"loss": 3.1679,
"step": 194
},
{
"epoch": 0.5660377358490566,
"grad_norm": 1.789034128189087,
"learning_rate": 8.323334680849992e-05,
"loss": 2.8819,
"step": 195
},
{
"epoch": 0.5689404934687954,
"grad_norm": 2.002990245819092,
"learning_rate": 8.233633855114127e-05,
"loss": 3.258,
"step": 196
},
{
"epoch": 0.5718432510885341,
"grad_norm": 2.053255796432495,
"learning_rate": 8.14407949620971e-05,
"loss": 3.7645,
"step": 197
},
{
"epoch": 0.5747460087082729,
"grad_norm": 2.13325834274292,
"learning_rate": 8.054679029971541e-05,
"loss": 2.9198,
"step": 198
},
{
"epoch": 0.5776487663280117,
"grad_norm": 2.154493808746338,
"learning_rate": 7.965439869473664e-05,
"loss": 2.9222,
"step": 199
},
{
"epoch": 0.5805515239477503,
"grad_norm": 1.912862777709961,
"learning_rate": 7.87636941441467e-05,
"loss": 4.0231,
"step": 200
},
{
"epoch": 0.5834542815674891,
"grad_norm": 1.8815771341323853,
"learning_rate": 7.787475050504125e-05,
"loss": 2.7792,
"step": 201
},
{
"epoch": 0.5863570391872278,
"grad_norm": 2.248081922531128,
"learning_rate": 7.698764148850137e-05,
"loss": 2.6916,
"step": 202
},
{
"epoch": 0.5892597968069666,
"grad_norm": 1.8417608737945557,
"learning_rate": 7.610244065348153e-05,
"loss": 2.9318,
"step": 203
},
{
"epoch": 0.5921625544267054,
"grad_norm": 2.505697250366211,
"learning_rate": 7.521922140071002e-05,
"loss": 3.8375,
"step": 204
},
{
"epoch": 0.5950653120464441,
"grad_norm": 2.0701253414154053,
"learning_rate": 7.433805696660266e-05,
"loss": 3.4407,
"step": 205
},
{
"epoch": 0.5979680696661829,
"grad_norm": 2.3337976932525635,
"learning_rate": 7.34590204171899e-05,
"loss": 3.9581,
"step": 206
},
{
"epoch": 0.6008708272859217,
"grad_norm": 2.4665446281433105,
"learning_rate": 7.258218464205848e-05,
"loss": 3.5468,
"step": 207
},
{
"epoch": 0.6037735849056604,
"grad_norm": 1.7483268976211548,
"learning_rate": 7.170762234830699e-05,
"loss": 2.8491,
"step": 208
},
{
"epoch": 0.6066763425253991,
"grad_norm": 1.9214202165603638,
"learning_rate": 7.08354060545175e-05,
"loss": 3.1274,
"step": 209
},
{
"epoch": 0.6095791001451378,
"grad_norm": 2.279972553253174,
"learning_rate": 6.996560808474195e-05,
"loss": 3.6062,
"step": 210
},
{
"epoch": 0.6124818577648766,
"grad_norm": 2.0444631576538086,
"learning_rate": 6.909830056250527e-05,
"loss": 3.5751,
"step": 211
},
{
"epoch": 0.6153846153846154,
"grad_norm": 2.0278406143188477,
"learning_rate": 6.823355540482475e-05,
"loss": 2.8403,
"step": 212
},
{
"epoch": 0.6182873730043541,
"grad_norm": 5.514923095703125,
"learning_rate": 6.737144431624687e-05,
"loss": 3.4911,
"step": 213
},
{
"epoch": 0.6211901306240929,
"grad_norm": 3.965879201889038,
"learning_rate": 6.651203878290139e-05,
"loss": 3.5421,
"step": 214
},
{
"epoch": 0.6240928882438317,
"grad_norm": 1.7389204502105713,
"learning_rate": 6.565541006657387e-05,
"loss": 2.7665,
"step": 215
},
{
"epoch": 0.6269956458635704,
"grad_norm": 1.7644435167312622,
"learning_rate": 6.480162919879657e-05,
"loss": 2.3306,
"step": 216
},
{
"epoch": 0.6298984034833092,
"grad_norm": 1.799849033355713,
"learning_rate": 6.395076697495854e-05,
"loss": 3.06,
"step": 217
},
{
"epoch": 0.6328011611030478,
"grad_norm": 1.7353590726852417,
"learning_rate": 6.310289394843528e-05,
"loss": 3.0691,
"step": 218
},
{
"epoch": 0.6357039187227866,
"grad_norm": 1.8332058191299438,
"learning_rate": 6.225808042473858e-05,
"loss": 3.4982,
"step": 219
},
{
"epoch": 0.6386066763425254,
"grad_norm": 2.136359691619873,
"learning_rate": 6.141639645568646e-05,
"loss": 3.3539,
"step": 220
},
{
"epoch": 0.6415094339622641,
"grad_norm": 2.038928508758545,
"learning_rate": 6.057791183359496e-05,
"loss": 2.6658,
"step": 221
},
{
"epoch": 0.6444121915820029,
"grad_norm": 2.400620222091675,
"learning_rate": 5.974269608549052e-05,
"loss": 3.4144,
"step": 222
},
{
"epoch": 0.6473149492017417,
"grad_norm": 1.9838178157806396,
"learning_rate": 5.8910818467345185e-05,
"loss": 3.2745,
"step": 223
},
{
"epoch": 0.6502177068214804,
"grad_norm": 1.9232710599899292,
"learning_rate": 5.8082347958333625e-05,
"loss": 3.5748,
"step": 224
},
{
"epoch": 0.6531204644412192,
"grad_norm": 2.4304771423339844,
"learning_rate": 5.725735325511343e-05,
"loss": 3.3168,
"step": 225
},
{
"epoch": 0.6560232220609579,
"grad_norm": 1.825479507446289,
"learning_rate": 5.643590276612909e-05,
"loss": 2.7848,
"step": 226
},
{
"epoch": 0.6589259796806967,
"grad_norm": 2.0149223804473877,
"learning_rate": 5.561806460593917e-05,
"loss": 3.2352,
"step": 227
},
{
"epoch": 0.6618287373004355,
"grad_norm": 2.0452849864959717,
"learning_rate": 5.4803906589568476e-05,
"loss": 3.2581,
"step": 228
},
{
"epoch": 0.6647314949201741,
"grad_norm": 1.8912854194641113,
"learning_rate": 5.399349622688479e-05,
"loss": 3.1843,
"step": 229
},
{
"epoch": 0.6676342525399129,
"grad_norm": 1.9609266519546509,
"learning_rate": 5.3186900717001095e-05,
"loss": 2.4325,
"step": 230
},
{
"epoch": 0.6705370101596516,
"grad_norm": 2.2313523292541504,
"learning_rate": 5.238418694270317e-05,
"loss": 3.4058,
"step": 231
},
{
"epoch": 0.6734397677793904,
"grad_norm": 2.4402058124542236,
"learning_rate": 5.1585421464903994e-05,
"loss": 3.9064,
"step": 232
},
{
"epoch": 0.6763425253991292,
"grad_norm": 2.100404977798462,
"learning_rate": 5.0790670517124097e-05,
"loss": 3.6432,
"step": 233
},
{
"epoch": 0.6792452830188679,
"grad_norm": 2.1355984210968018,
"learning_rate": 5.000000000000002e-05,
"loss": 3.58,
"step": 234
},
{
"epoch": 0.6821480406386067,
"grad_norm": 2.045910596847534,
"learning_rate": 4.921347547581939e-05,
"loss": 2.9068,
"step": 235
},
{
"epoch": 0.6850507982583455,
"grad_norm": 2.569124460220337,
"learning_rate": 4.843116216308483e-05,
"loss": 2.9852,
"step": 236
},
{
"epoch": 0.6879535558780842,
"grad_norm": 1.493397831916809,
"learning_rate": 4.765312493110578e-05,
"loss": 1.991,
"step": 237
},
{
"epoch": 0.690856313497823,
"grad_norm": 1.9058390855789185,
"learning_rate": 4.687942829461969e-05,
"loss": 3.2437,
"step": 238
},
{
"epoch": 0.6937590711175616,
"grad_norm": 2.2078254222869873,
"learning_rate": 4.611013640844245e-05,
"loss": 3.319,
"step": 239
},
{
"epoch": 0.6966618287373004,
"grad_norm": 4.914015769958496,
"learning_rate": 4.5345313062148776e-05,
"loss": 3.2462,
"step": 240
},
{
"epoch": 0.6995645863570392,
"grad_norm": 2.244297742843628,
"learning_rate": 4.4585021674782534e-05,
"loss": 3.66,
"step": 241
},
{
"epoch": 0.7024673439767779,
"grad_norm": 2.599207639694214,
"learning_rate": 4.38293252895983e-05,
"loss": 2.5752,
"step": 242
},
{
"epoch": 0.7053701015965167,
"grad_norm": 1.918351411819458,
"learning_rate": 4.3078286568833614e-05,
"loss": 3.905,
"step": 243
},
{
"epoch": 0.7082728592162555,
"grad_norm": 1.9592002630233765,
"learning_rate": 4.2331967788513295e-05,
"loss": 2.9771,
"step": 244
},
{
"epoch": 0.7111756168359942,
"grad_norm": 1.787062168121338,
"learning_rate": 4.159043083328521e-05,
"loss": 2.4677,
"step": 245
},
{
"epoch": 0.714078374455733,
"grad_norm": 1.8812865018844604,
"learning_rate": 4.0853737191289096e-05,
"loss": 3.6701,
"step": 246
},
{
"epoch": 0.7169811320754716,
"grad_norm": 1.9155601263046265,
"learning_rate": 4.012194794905775e-05,
"loss": 3.6807,
"step": 247
},
{
"epoch": 0.7198838896952104,
"grad_norm": 2.015004873275757,
"learning_rate": 3.939512378645185e-05,
"loss": 2.9783,
"step": 248
},
{
"epoch": 0.7227866473149492,
"grad_norm": 2.053408622741699,
"learning_rate": 3.8673324971628357e-05,
"loss": 2.8061,
"step": 249
},
{
"epoch": 0.7256894049346879,
"grad_norm": 1.8491019010543823,
"learning_rate": 3.795661135604319e-05,
"loss": 3.2741,
"step": 250
},
{
"epoch": 0.7285921625544267,
"grad_norm": 2.372168779373169,
"learning_rate": 3.724504236948818e-05,
"loss": 3.3095,
"step": 251
},
{
"epoch": 0.7314949201741655,
"grad_norm": 2.0113255977630615,
"learning_rate": 3.653867701516326e-05,
"loss": 3.5256,
"step": 252
},
{
"epoch": 0.7343976777939042,
"grad_norm": 1.9517208337783813,
"learning_rate": 3.583757386478389e-05,
"loss": 2.9625,
"step": 253
},
{
"epoch": 0.737300435413643,
"grad_norm": 2.208834171295166,
"learning_rate": 3.5141791053724405e-05,
"loss": 3.0578,
"step": 254
},
{
"epoch": 0.7402031930333817,
"grad_norm": 2.307220458984375,
"learning_rate": 3.4451386276197293e-05,
"loss": 2.9855,
"step": 255
},
{
"epoch": 0.7431059506531205,
"grad_norm": 2.1939680576324463,
"learning_rate": 3.3766416780469256e-05,
"loss": 3.673,
"step": 256
},
{
"epoch": 0.7460087082728593,
"grad_norm": 1.9280527830123901,
"learning_rate": 3.308693936411421e-05,
"loss": 3.0642,
"step": 257
},
{
"epoch": 0.7489114658925979,
"grad_norm": 2.047974109649658,
"learning_rate": 3.2413010369303584e-05,
"loss": 3.1728,
"step": 258
},
{
"epoch": 0.7518142235123367,
"grad_norm": 2.1966168880462646,
"learning_rate": 3.174468567813461e-05,
"loss": 3.2277,
"step": 259
},
{
"epoch": 0.7547169811320755,
"grad_norm": 2.072453022003174,
"learning_rate": 3.108202070799626e-05,
"loss": 3.3533,
"step": 260
},
{
"epoch": 0.7576197387518142,
"grad_norm": 1.9733140468597412,
"learning_rate": 3.0425070406974455e-05,
"loss": 2.9843,
"step": 261
},
{
"epoch": 0.760522496371553,
"grad_norm": 2.302907943725586,
"learning_rate": 2.9773889249295294e-05,
"loss": 3.1157,
"step": 262
},
{
"epoch": 0.7634252539912917,
"grad_norm": 1.9516576528549194,
"learning_rate": 2.9128531230808576e-05,
"loss": 3.4501,
"step": 263
},
{
"epoch": 0.7663280116110305,
"grad_norm": 1.9993865489959717,
"learning_rate": 2.8489049864510054e-05,
"loss": 3.5931,
"step": 264
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.091517686843872,
"learning_rate": 2.7855498176104434e-05,
"loss": 2.202,
"step": 265
},
{
"epoch": 0.772133526850508,
"grad_norm": 2.672689199447632,
"learning_rate": 2.7227928699608263e-05,
"loss": 3.4568,
"step": 266
},
{
"epoch": 0.7750362844702468,
"grad_norm": 2.0529282093048096,
"learning_rate": 2.6606393472993973e-05,
"loss": 3.4287,
"step": 267
},
{
"epoch": 0.7779390420899854,
"grad_norm": 1.8243032693862915,
"learning_rate": 2.599094403387481e-05,
"loss": 2.9586,
"step": 268
},
{
"epoch": 0.7808417997097242,
"grad_norm": 2.381425619125366,
"learning_rate": 2.5381631415231454e-05,
"loss": 3.6723,
"step": 269
},
{
"epoch": 0.783744557329463,
"grad_norm": 3.504389524459839,
"learning_rate": 2.4778506141180236e-05,
"loss": 4.3296,
"step": 270
},
{
"epoch": 0.7866473149492017,
"grad_norm": 1.7428265810012817,
"learning_rate": 2.418161822278374e-05,
"loss": 3.037,
"step": 271
},
{
"epoch": 0.7895500725689405,
"grad_norm": 2.81032133102417,
"learning_rate": 2.3591017153903916e-05,
"loss": 3.1645,
"step": 272
},
{
"epoch": 0.7924528301886793,
"grad_norm": 2.1162257194519043,
"learning_rate": 2.300675190709809e-05,
"loss": 3.2709,
"step": 273
},
{
"epoch": 0.795355587808418,
"grad_norm": 1.9196466207504272,
"learning_rate": 2.242887092955801e-05,
"loss": 3.6456,
"step": 274
},
{
"epoch": 0.7982583454281568,
"grad_norm": 2.2060110569000244,
"learning_rate": 2.1857422139092865e-05,
"loss": 3.068,
"step": 275
},
{
"epoch": 0.8011611030478955,
"grad_norm": 1.857069492340088,
"learning_rate": 2.1292452920155592e-05,
"loss": 3.251,
"step": 276
},
{
"epoch": 0.8040638606676342,
"grad_norm": 3.120304584503174,
"learning_rate": 2.0734010119914192e-05,
"loss": 3.1381,
"step": 277
},
{
"epoch": 0.806966618287373,
"grad_norm": 2.208164930343628,
"learning_rate": 2.018214004436677e-05,
"loss": 3.0816,
"step": 278
},
{
"epoch": 0.8098693759071117,
"grad_norm": 1.976894736289978,
"learning_rate": 1.9636888454502178e-05,
"loss": 2.719,
"step": 279
},
{
"epoch": 0.8127721335268505,
"grad_norm": 2.5784201622009277,
"learning_rate": 1.9098300562505266e-05,
"loss": 3.1057,
"step": 280
},
{
"epoch": 0.8156748911465893,
"grad_norm": 2.8327383995056152,
"learning_rate": 1.8566421028008018e-05,
"loss": 3.8255,
"step": 281
},
{
"epoch": 0.818577648766328,
"grad_norm": 2.2047767639160156,
"learning_rate": 1.804129395438635e-05,
"loss": 3.244,
"step": 282
},
{
"epoch": 0.8214804063860668,
"grad_norm": 2.8230714797973633,
"learning_rate": 1.7522962885103145e-05,
"loss": 3.0961,
"step": 283
},
{
"epoch": 0.8243831640058055,
"grad_norm": 2.201507091522217,
"learning_rate": 1.7011470800097496e-05,
"loss": 2.6894,
"step": 284
},
{
"epoch": 0.8272859216255443,
"grad_norm": 1.9765757322311401,
"learning_rate": 1.65068601122209e-05,
"loss": 3.136,
"step": 285
},
{
"epoch": 0.8301886792452831,
"grad_norm": 2.06833815574646,
"learning_rate": 1.600917266372035e-05,
"loss": 3.6098,
"step": 286
},
{
"epoch": 0.8330914368650217,
"grad_norm": 2.757883310317993,
"learning_rate": 1.5518449722768892e-05,
"loss": 4.0251,
"step": 287
},
{
"epoch": 0.8359941944847605,
"grad_norm": 2.0423471927642822,
"learning_rate": 1.5034731980043515e-05,
"loss": 3.1681,
"step": 288
},
{
"epoch": 0.8388969521044993,
"grad_norm": 2.4651999473571777,
"learning_rate": 1.4558059545351143e-05,
"loss": 3.2775,
"step": 289
},
{
"epoch": 0.841799709724238,
"grad_norm": 1.7521671056747437,
"learning_rate": 1.4088471944302861e-05,
"loss": 2.355,
"step": 290
},
{
"epoch": 0.8447024673439768,
"grad_norm": 1.9442704916000366,
"learning_rate": 1.3626008115036181e-05,
"loss": 3.1105,
"step": 291
},
{
"epoch": 0.8476052249637155,
"grad_norm": 2.1002039909362793,
"learning_rate": 1.3170706404986644e-05,
"loss": 3.6593,
"step": 292
},
{
"epoch": 0.8505079825834543,
"grad_norm": 1.7552965879440308,
"learning_rate": 1.2722604567707719e-05,
"loss": 2.6157,
"step": 293
},
{
"epoch": 0.8534107402031931,
"grad_norm": 1.8941353559494019,
"learning_rate": 1.2281739759740574e-05,
"loss": 3.2914,
"step": 294
},
{
"epoch": 0.8563134978229318,
"grad_norm": 2.11254620552063,
"learning_rate": 1.1848148537532843e-05,
"loss": 3.2055,
"step": 295
},
{
"epoch": 0.8592162554426706,
"grad_norm": 2.3030812740325928,
"learning_rate": 1.142186685440747e-05,
"loss": 2.8077,
"step": 296
},
{
"epoch": 0.8621190130624092,
"grad_norm": 1.9629480838775635,
"learning_rate": 1.100293005758145e-05,
"loss": 2.3917,
"step": 297
},
{
"epoch": 0.865021770682148,
"grad_norm": 1.9289971590042114,
"learning_rate": 1.0591372885234885e-05,
"loss": 3.2658,
"step": 298
},
{
"epoch": 0.8679245283018868,
"grad_norm": 1.8033056259155273,
"learning_rate": 1.01872294636304e-05,
"loss": 3.2022,
"step": 299
},
{
"epoch": 0.8708272859216255,
"grad_norm": 1.87389075756073,
"learning_rate": 9.790533304283478e-06,
"loss": 2.6739,
"step": 300
},
{
"epoch": 0.8737300435413643,
"grad_norm": 2.6886935234069824,
"learning_rate": 9.401317301183655e-06,
"loss": 3.1875,
"step": 301
},
{
"epoch": 0.8766328011611031,
"grad_norm": 2.1857502460479736,
"learning_rate": 9.019613728067e-06,
"loss": 2.8756,
"step": 302
},
{
"epoch": 0.8795355587808418,
"grad_norm": 2.1285061836242676,
"learning_rate": 8.645454235739903e-06,
"loss": 3.2116,
"step": 303
},
{
"epoch": 0.8824383164005806,
"grad_norm": 2.7644810676574707,
"learning_rate": 8.278869849454718e-06,
"loss": 3.0152,
"step": 304
},
{
"epoch": 0.8853410740203193,
"grad_norm": 1.9984538555145264,
"learning_rate": 7.91989096633693e-06,
"loss": 2.7286,
"step": 305
},
{
"epoch": 0.888243831640058,
"grad_norm": 1.859739899635315,
"learning_rate": 7.568547352864941e-06,
"loss": 2.9108,
"step": 306
},
{
"epoch": 0.8911465892597968,
"grad_norm": 1.783887505531311,
"learning_rate": 7.224868142401542e-06,
"loss": 2.7539,
"step": 307
},
{
"epoch": 0.8940493468795355,
"grad_norm": 2.297299385070801,
"learning_rate": 6.888881832778415e-06,
"loss": 2.8574,
"step": 308
},
{
"epoch": 0.8969521044992743,
"grad_norm": 2.203857898712158,
"learning_rate": 6.560616283932897e-06,
"loss": 3.6275,
"step": 309
},
{
"epoch": 0.8998548621190131,
"grad_norm": 2.2782490253448486,
"learning_rate": 6.240098715597975e-06,
"loss": 3.1797,
"step": 310
},
{
"epoch": 0.9027576197387518,
"grad_norm": 2.0081512928009033,
"learning_rate": 5.927355705045179e-06,
"loss": 3.09,
"step": 311
},
{
"epoch": 0.9056603773584906,
"grad_norm": 2.6315252780914307,
"learning_rate": 5.6224131848808144e-06,
"loss": 2.8839,
"step": 312
},
{
"epoch": 0.9085631349782293,
"grad_norm": 2.094134569168091,
"learning_rate": 5.325296440895622e-06,
"loss": 2.9956,
"step": 313
},
{
"epoch": 0.9114658925979681,
"grad_norm": 2.017035484313965,
"learning_rate": 5.036030109968082e-06,
"loss": 2.6596,
"step": 314
},
{
"epoch": 0.9143686502177069,
"grad_norm": 2.2012784481048584,
"learning_rate": 4.754638178021498e-06,
"loss": 3.1305,
"step": 315
},
{
"epoch": 0.9172714078374455,
"grad_norm": 1.8841356039047241,
"learning_rate": 4.481143978035196e-06,
"loss": 3.0464,
"step": 316
},
{
"epoch": 0.9201741654571843,
"grad_norm": 2.4728565216064453,
"learning_rate": 4.2155701881096075e-06,
"loss": 2.7735,
"step": 317
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.1314468383789062,
"learning_rate": 3.95793882958595e-06,
"loss": 3.3511,
"step": 318
},
{
"epoch": 0.9259796806966618,
"grad_norm": 1.9269267320632935,
"learning_rate": 3.7082712652200867e-06,
"loss": 3.282,
"step": 319
},
{
"epoch": 0.9288824383164006,
"grad_norm": 1.958406925201416,
"learning_rate": 3.4665881974112026e-06,
"loss": 2.8489,
"step": 320
},
{
"epoch": 0.9317851959361393,
"grad_norm": 2.2147128582000732,
"learning_rate": 3.2329096664852064e-06,
"loss": 3.6156,
"step": 321
},
{
"epoch": 0.9346879535558781,
"grad_norm": 1.944659948348999,
"learning_rate": 3.0072550490328753e-06,
"loss": 3.2088,
"step": 322
},
{
"epoch": 0.9375907111756169,
"grad_norm": 2.1794497966766357,
"learning_rate": 2.7896430563032707e-06,
"loss": 3.0827,
"step": 323
},
{
"epoch": 0.9404934687953556,
"grad_norm": 2.2770931720733643,
"learning_rate": 2.580091732652101e-06,
"loss": 3.3405,
"step": 324
},
{
"epoch": 0.9433962264150944,
"grad_norm": 2.1666173934936523,
"learning_rate": 2.3786184540455448e-06,
"loss": 2.7803,
"step": 325
},
{
"epoch": 0.9462989840348331,
"grad_norm": 2.107891321182251,
"learning_rate": 2.1852399266194314e-06,
"loss": 2.7457,
"step": 326
},
{
"epoch": 0.9492017416545718,
"grad_norm": 2.0576820373535156,
"learning_rate": 1.9999721852939858e-06,
"loss": 2.9182,
"step": 327
},
{
"epoch": 0.9521044992743106,
"grad_norm": 3.305752992630005,
"learning_rate": 1.822830592444147e-06,
"loss": 3.8223,
"step": 328
},
{
"epoch": 0.9550072568940493,
"grad_norm": 2.0414235591888428,
"learning_rate": 1.6538298366257976e-06,
"loss": 3.0047,
"step": 329
},
{
"epoch": 0.9579100145137881,
"grad_norm": 2.361135721206665,
"learning_rate": 1.4929839313577609e-06,
"loss": 3.9007,
"step": 330
},
{
"epoch": 0.9608127721335269,
"grad_norm": 1.976693034172058,
"learning_rate": 1.3403062139598076e-06,
"loss": 3.2631,
"step": 331
},
{
"epoch": 0.9637155297532656,
"grad_norm": 1.9887497425079346,
"learning_rate": 1.1958093444467079e-06,
"loss": 3.5457,
"step": 332
},
{
"epoch": 0.9666182873730044,
"grad_norm": 2.3717265129089355,
"learning_rate": 1.059505304478503e-06,
"loss": 3.2485,
"step": 333
},
{
"epoch": 0.969521044992743,
"grad_norm": 1.9998297691345215,
"learning_rate": 9.314053963669245e-07,
"loss": 3.5795,
"step": 334
},
{
"epoch": 0.9724238026124818,
"grad_norm": 2.2316505908966064,
"learning_rate": 8.115202421383083e-07,
"loss": 3.2281,
"step": 335
},
{
"epoch": 0.9753265602322206,
"grad_norm": 1.9784519672393799,
"learning_rate": 6.998597826526898e-07,
"loss": 3.3497,
"step": 336
},
{
"epoch": 0.9782293178519593,
"grad_norm": 1.9381024837493896,
"learning_rate": 5.964332767796399e-07,
"loss": 2.771,
"step": 337
},
{
"epoch": 0.9811320754716981,
"grad_norm": 2.6159842014312744,
"learning_rate": 5.012493006304131e-07,
"loss": 3.2164,
"step": 338
},
{
"epoch": 0.9840348330914369,
"grad_norm": 2.405776262283325,
"learning_rate": 4.143157468468717e-07,
"loss": 2.6788,
"step": 339
},
{
"epoch": 0.9869375907111756,
"grad_norm": 2.2898902893066406,
"learning_rate": 3.3563982394704266e-07,
"loss": 4.1156,
"step": 340
},
{
"epoch": 0.9898403483309144,
"grad_norm": 2.2498302459716797,
"learning_rate": 2.652280557273512e-07,
"loss": 3.3784,
"step": 341
},
{
"epoch": 0.9927431059506531,
"grad_norm": 2.19677996635437,
"learning_rate": 2.030862807216649e-07,
"loss": 3.2635,
"step": 342
},
{
"epoch": 0.9956458635703919,
"grad_norm": 2.383686065673828,
"learning_rate": 1.4921965171720287e-07,
"loss": 3.2177,
"step": 343
},
{
"epoch": 0.9985486211901307,
"grad_norm": 1.5856789350509644,
"learning_rate": 1.0363263532724432e-07,
"loss": 2.6107,
"step": 344
},
{
"epoch": 0.9985486211901307,
"eval_loss": 0.8033239841461182,
"eval_runtime": 13.4743,
"eval_samples_per_second": 21.522,
"eval_steps_per_second": 5.418,
"step": 344
},
{
"epoch": 1.0021770682148041,
"grad_norm": 1.5614508390426636,
"learning_rate": 6.632901162074711e-08,
"loss": 2.3665,
"step": 345
},
{
"epoch": 1.0021770682148041,
"eval_loss": 0.8033127188682556,
"eval_runtime": 13.2595,
"eval_samples_per_second": 21.871,
"eval_steps_per_second": 5.506,
"step": 345
}
],
"logging_steps": 1,
"max_steps": 345,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.435086402578022e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}