PaliGemma-2-MMDD-3B / trainer_state.json
devichand's picture
Upload folder using huggingface_hub
65902cb verified
{
"best_metric": 1.86671543,
"best_model_checkpoint": "/home/anubhab-pg/sm745052/swift/exp_output_paligemma/v1-20250508-175335/checkpoint-3500",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4944,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006067961165048543,
"grad_norm": 27.07073974609375,
"learning_rate": 9.999998990554643e-05,
"loss": 4.470662593841553,
"memory(GiB)": 29.74,
"step": 1,
"token_acc": 0.2638888888888889,
"train_speed(iter/s)": 0.176069
},
{
"epoch": 0.003033980582524272,
"grad_norm": 11.808771133422852,
"learning_rate": 9.999974763886429e-05,
"loss": 3.900416851043701,
"memory(GiB)": 29.74,
"step": 5,
"token_acc": 0.2831050228310502,
"train_speed(iter/s)": 0.398445
},
{
"epoch": 0.006067961165048544,
"grad_norm": 12.929791450500488,
"learning_rate": 9.999899055800455e-05,
"loss": 2.7944046020507813,
"memory(GiB)": 29.74,
"step": 10,
"token_acc": 0.4377358490566038,
"train_speed(iter/s)": 0.483776
},
{
"epoch": 0.009101941747572815,
"grad_norm": 8.702505111694336,
"learning_rate": 9.99977287650631e-05,
"loss": 2.662510871887207,
"memory(GiB)": 38.25,
"step": 15,
"token_acc": 0.44649446494464945,
"train_speed(iter/s)": 0.514334
},
{
"epoch": 0.012135922330097087,
"grad_norm": 8.632209777832031,
"learning_rate": 9.999596227277707e-05,
"loss": 2.5844635009765624,
"memory(GiB)": 38.86,
"step": 20,
"token_acc": 0.4867549668874172,
"train_speed(iter/s)": 0.516824
},
{
"epoch": 0.01516990291262136,
"grad_norm": 7.144984245300293,
"learning_rate": 9.999369109897819e-05,
"loss": 2.760052490234375,
"memory(GiB)": 38.86,
"step": 25,
"token_acc": 0.42524916943521596,
"train_speed(iter/s)": 0.531359
},
{
"epoch": 0.01820388349514563,
"grad_norm": 13.147393226623535,
"learning_rate": 9.999091526659272e-05,
"loss": 2.7114631652832033,
"memory(GiB)": 38.86,
"step": 30,
"token_acc": 0.42613636363636365,
"train_speed(iter/s)": 0.54308
},
{
"epoch": 0.021237864077669904,
"grad_norm": 10.967350959777832,
"learning_rate": 9.998763480364113e-05,
"loss": 2.8917694091796875,
"memory(GiB)": 38.86,
"step": 35,
"token_acc": 0.3957703927492447,
"train_speed(iter/s)": 0.55201
},
{
"epoch": 0.024271844660194174,
"grad_norm": 6.367667198181152,
"learning_rate": 9.99838497432379e-05,
"loss": 2.7323539733886717,
"memory(GiB)": 38.86,
"step": 40,
"token_acc": 0.4843205574912892,
"train_speed(iter/s)": 0.554469
},
{
"epoch": 0.027305825242718445,
"grad_norm": 7.910516262054443,
"learning_rate": 9.997956012359109e-05,
"loss": 2.541508674621582,
"memory(GiB)": 38.86,
"step": 45,
"token_acc": 0.4327485380116959,
"train_speed(iter/s)": 0.558152
},
{
"epoch": 0.03033980582524272,
"grad_norm": 6.039285182952881,
"learning_rate": 9.997476598800203e-05,
"loss": 2.543034553527832,
"memory(GiB)": 38.86,
"step": 50,
"token_acc": 0.45058139534883723,
"train_speed(iter/s)": 0.56043
},
{
"epoch": 0.03337378640776699,
"grad_norm": 8.210753440856934,
"learning_rate": 9.99694673848649e-05,
"loss": 2.3589075088500975,
"memory(GiB)": 38.86,
"step": 55,
"token_acc": 0.4642857142857143,
"train_speed(iter/s)": 0.562231
},
{
"epoch": 0.03640776699029126,
"grad_norm": 9.309414863586426,
"learning_rate": 9.996366436766611e-05,
"loss": 2.3582067489624023,
"memory(GiB)": 38.86,
"step": 60,
"token_acc": 0.4788273615635179,
"train_speed(iter/s)": 0.567887
},
{
"epoch": 0.03944174757281554,
"grad_norm": 6.9387993812561035,
"learning_rate": 9.995735699498394e-05,
"loss": 2.5982736587524413,
"memory(GiB)": 38.86,
"step": 65,
"token_acc": 0.436046511627907,
"train_speed(iter/s)": 0.573648
},
{
"epoch": 0.04247572815533981,
"grad_norm": 7.010188579559326,
"learning_rate": 9.995054533048777e-05,
"loss": 2.508279228210449,
"memory(GiB)": 38.86,
"step": 70,
"token_acc": 0.4281609195402299,
"train_speed(iter/s)": 0.579165
},
{
"epoch": 0.04550970873786408,
"grad_norm": 8.828622817993164,
"learning_rate": 9.994322944293763e-05,
"loss": 2.588084411621094,
"memory(GiB)": 38.86,
"step": 75,
"token_acc": 0.4558303886925795,
"train_speed(iter/s)": 0.583997
},
{
"epoch": 0.04854368932038835,
"grad_norm": 8.112404823303223,
"learning_rate": 9.993540940618334e-05,
"loss": 2.316554832458496,
"memory(GiB)": 38.86,
"step": 80,
"token_acc": 0.4552238805970149,
"train_speed(iter/s)": 0.582967
},
{
"epoch": 0.05157766990291262,
"grad_norm": 8.621855735778809,
"learning_rate": 9.992708529916379e-05,
"loss": 2.366764259338379,
"memory(GiB)": 38.86,
"step": 85,
"token_acc": 0.44025157232704404,
"train_speed(iter/s)": 0.585803
},
{
"epoch": 0.05461165048543689,
"grad_norm": 8.71721076965332,
"learning_rate": 9.991825720590626e-05,
"loss": 2.346388244628906,
"memory(GiB)": 38.86,
"step": 90,
"token_acc": 0.4457831325301205,
"train_speed(iter/s)": 0.588259
},
{
"epoch": 0.05764563106796117,
"grad_norm": 6.412728786468506,
"learning_rate": 9.990892521552546e-05,
"loss": 2.4675243377685545,
"memory(GiB)": 38.86,
"step": 95,
"token_acc": 0.4896755162241888,
"train_speed(iter/s)": 0.587726
},
{
"epoch": 0.06067961165048544,
"grad_norm": 10.379164695739746,
"learning_rate": 9.989908942222264e-05,
"loss": 2.24587345123291,
"memory(GiB)": 38.86,
"step": 100,
"token_acc": 0.5168067226890757,
"train_speed(iter/s)": 0.588352
},
{
"epoch": 0.06371359223300971,
"grad_norm": 6.4799370765686035,
"learning_rate": 9.988874992528468e-05,
"loss": 2.652623748779297,
"memory(GiB)": 38.86,
"step": 105,
"token_acc": 0.4127906976744186,
"train_speed(iter/s)": 0.589898
},
{
"epoch": 0.06674757281553398,
"grad_norm": 6.027382850646973,
"learning_rate": 9.987790682908306e-05,
"loss": 2.2998146057128905,
"memory(GiB)": 38.86,
"step": 110,
"token_acc": 0.4807121661721068,
"train_speed(iter/s)": 0.589495
},
{
"epoch": 0.06978155339805825,
"grad_norm": 6.517679214477539,
"learning_rate": 9.986656024307286e-05,
"loss": 2.5867145538330076,
"memory(GiB)": 38.86,
"step": 115,
"token_acc": 0.455026455026455,
"train_speed(iter/s)": 0.589597
},
{
"epoch": 0.07281553398058252,
"grad_norm": 7.561508655548096,
"learning_rate": 9.985471028179154e-05,
"loss": 2.4384201049804686,
"memory(GiB)": 38.86,
"step": 120,
"token_acc": 0.47368421052631576,
"train_speed(iter/s)": 0.589834
},
{
"epoch": 0.07584951456310679,
"grad_norm": 7.263455867767334,
"learning_rate": 9.984235706485789e-05,
"loss": 2.373090362548828,
"memory(GiB)": 38.86,
"step": 125,
"token_acc": 0.4657039711191336,
"train_speed(iter/s)": 0.590074
},
{
"epoch": 0.07888349514563107,
"grad_norm": 6.084628582000732,
"learning_rate": 9.98295007169708e-05,
"loss": 2.5036380767822264,
"memory(GiB)": 38.86,
"step": 130,
"token_acc": 0.47941176470588237,
"train_speed(iter/s)": 0.592166
},
{
"epoch": 0.08191747572815535,
"grad_norm": 8.00130844116211,
"learning_rate": 9.981614136790796e-05,
"loss": 2.153367614746094,
"memory(GiB)": 39.22,
"step": 135,
"token_acc": 0.5410764872521246,
"train_speed(iter/s)": 0.591399
},
{
"epoch": 0.08495145631067962,
"grad_norm": 6.030653953552246,
"learning_rate": 9.980227915252459e-05,
"loss": 2.2291128158569338,
"memory(GiB)": 39.22,
"step": 140,
"token_acc": 0.4910394265232975,
"train_speed(iter/s)": 0.591432
},
{
"epoch": 0.08798543689320389,
"grad_norm": 6.891486167907715,
"learning_rate": 9.978791421075206e-05,
"loss": 2.5422630310058594,
"memory(GiB)": 39.22,
"step": 145,
"token_acc": 0.4812286689419795,
"train_speed(iter/s)": 0.588377
},
{
"epoch": 0.09101941747572816,
"grad_norm": 7.838645935058594,
"learning_rate": 9.97730466875965e-05,
"loss": 2.476850128173828,
"memory(GiB)": 39.22,
"step": 150,
"token_acc": 0.4542372881355932,
"train_speed(iter/s)": 0.590615
},
{
"epoch": 0.09405339805825243,
"grad_norm": 7.769046306610107,
"learning_rate": 9.975767673313734e-05,
"loss": 2.592838096618652,
"memory(GiB)": 39.22,
"step": 155,
"token_acc": 0.4678362573099415,
"train_speed(iter/s)": 0.591754
},
{
"epoch": 0.0970873786407767,
"grad_norm": 5.977383136749268,
"learning_rate": 9.974180450252569e-05,
"loss": 2.345209503173828,
"memory(GiB)": 39.22,
"step": 160,
"token_acc": 0.4849624060150376,
"train_speed(iter/s)": 0.592414
},
{
"epoch": 0.10012135922330097,
"grad_norm": 6.340784549713135,
"learning_rate": 9.972543015598295e-05,
"loss": 2.4988531112670898,
"memory(GiB)": 39.22,
"step": 165,
"token_acc": 0.4491525423728814,
"train_speed(iter/s)": 0.592598
},
{
"epoch": 0.10315533980582524,
"grad_norm": 6.322139263153076,
"learning_rate": 9.970855385879908e-05,
"loss": 2.7641939163208007,
"memory(GiB)": 39.22,
"step": 170,
"token_acc": 0.42450142450142453,
"train_speed(iter/s)": 0.593328
},
{
"epoch": 0.10618932038834951,
"grad_norm": 8.460200309753418,
"learning_rate": 9.969117578133089e-05,
"loss": 2.4497074127197265,
"memory(GiB)": 39.22,
"step": 175,
"token_acc": 0.4819672131147541,
"train_speed(iter/s)": 0.592932
},
{
"epoch": 0.10922330097087378,
"grad_norm": 6.508354663848877,
"learning_rate": 9.96732960990005e-05,
"loss": 2.3542524337768556,
"memory(GiB)": 39.22,
"step": 180,
"token_acc": 0.4444444444444444,
"train_speed(iter/s)": 0.594793
},
{
"epoch": 0.11225728155339806,
"grad_norm": 6.436831474304199,
"learning_rate": 9.965491499229332e-05,
"loss": 2.355543518066406,
"memory(GiB)": 39.22,
"step": 185,
"token_acc": 0.48639455782312924,
"train_speed(iter/s)": 0.595112
},
{
"epoch": 0.11529126213592233,
"grad_norm": 5.326399803161621,
"learning_rate": 9.963603264675648e-05,
"loss": 2.626679611206055,
"memory(GiB)": 39.22,
"step": 190,
"token_acc": 0.45058139534883723,
"train_speed(iter/s)": 0.59601
},
{
"epoch": 0.1183252427184466,
"grad_norm": 6.522929668426514,
"learning_rate": 9.961664925299677e-05,
"loss": 2.417061424255371,
"memory(GiB)": 39.22,
"step": 195,
"token_acc": 0.49050632911392406,
"train_speed(iter/s)": 0.595902
},
{
"epoch": 0.12135922330097088,
"grad_norm": 5.905120849609375,
"learning_rate": 9.95967650066788e-05,
"loss": 2.5360954284667967,
"memory(GiB)": 39.22,
"step": 200,
"token_acc": 0.4444444444444444,
"train_speed(iter/s)": 0.59608
},
{
"epoch": 0.12439320388349515,
"grad_norm": 7.083728790283203,
"learning_rate": 9.957638010852301e-05,
"loss": 2.5276988983154296,
"memory(GiB)": 39.22,
"step": 205,
"token_acc": 0.43333333333333335,
"train_speed(iter/s)": 0.597234
},
{
"epoch": 0.12742718446601942,
"grad_norm": 6.791469573974609,
"learning_rate": 9.955549476430364e-05,
"loss": 2.6791542053222654,
"memory(GiB)": 39.22,
"step": 210,
"token_acc": 0.44481605351170567,
"train_speed(iter/s)": 0.597785
},
{
"epoch": 0.1304611650485437,
"grad_norm": 8.691610336303711,
"learning_rate": 9.953410918484667e-05,
"loss": 2.5277048110961915,
"memory(GiB)": 39.22,
"step": 215,
"token_acc": 0.4937888198757764,
"train_speed(iter/s)": 0.598803
},
{
"epoch": 0.13349514563106796,
"grad_norm": 5.966423988342285,
"learning_rate": 9.951222358602763e-05,
"loss": 2.5550731658935546,
"memory(GiB)": 39.22,
"step": 220,
"token_acc": 0.4676470588235294,
"train_speed(iter/s)": 0.599676
},
{
"epoch": 0.13652912621359223,
"grad_norm": 8.491061210632324,
"learning_rate": 9.948983818876954e-05,
"loss": 2.433759880065918,
"memory(GiB)": 39.22,
"step": 225,
"token_acc": 0.4908424908424908,
"train_speed(iter/s)": 0.598817
},
{
"epoch": 0.1395631067961165,
"grad_norm": 4.885462760925293,
"learning_rate": 9.946695321904056e-05,
"loss": 2.5523433685302734,
"memory(GiB)": 39.22,
"step": 230,
"token_acc": 0.45478723404255317,
"train_speed(iter/s)": 0.597615
},
{
"epoch": 0.14259708737864077,
"grad_norm": 6.235279083251953,
"learning_rate": 9.944356890785177e-05,
"loss": 2.3788055419921874,
"memory(GiB)": 39.22,
"step": 235,
"token_acc": 0.4809384164222874,
"train_speed(iter/s)": 0.598117
},
{
"epoch": 0.14563106796116504,
"grad_norm": 5.3688130378723145,
"learning_rate": 9.941968549125481e-05,
"loss": 2.4541061401367186,
"memory(GiB)": 39.22,
"step": 240,
"token_acc": 0.46153846153846156,
"train_speed(iter/s)": 0.596655
},
{
"epoch": 0.1486650485436893,
"grad_norm": 5.759191036224365,
"learning_rate": 9.939530321033955e-05,
"loss": 2.168326568603516,
"memory(GiB)": 39.22,
"step": 245,
"token_acc": 0.501628664495114,
"train_speed(iter/s)": 0.5967
},
{
"epoch": 0.15169902912621358,
"grad_norm": 8.470988273620605,
"learning_rate": 9.937042231123155e-05,
"loss": 2.5771547317504884,
"memory(GiB)": 39.22,
"step": 250,
"token_acc": 0.501628664495114,
"train_speed(iter/s)": 0.596782
},
{
"epoch": 0.15473300970873785,
"grad_norm": 6.000228404998779,
"learning_rate": 9.934504304508974e-05,
"loss": 2.5160358428955076,
"memory(GiB)": 39.22,
"step": 255,
"token_acc": 0.4469914040114613,
"train_speed(iter/s)": 0.596955
},
{
"epoch": 0.15776699029126215,
"grad_norm": 7.762350082397461,
"learning_rate": 9.931916566810371e-05,
"loss": 2.245794677734375,
"memory(GiB)": 39.22,
"step": 260,
"token_acc": 0.521594684385382,
"train_speed(iter/s)": 0.596759
},
{
"epoch": 0.16080097087378642,
"grad_norm": 7.007081031799316,
"learning_rate": 9.929279044149123e-05,
"loss": 2.3080322265625,
"memory(GiB)": 39.22,
"step": 265,
"token_acc": 0.4964788732394366,
"train_speed(iter/s)": 0.595878
},
{
"epoch": 0.1638349514563107,
"grad_norm": 5.466193675994873,
"learning_rate": 9.926591763149559e-05,
"loss": 2.1369998931884764,
"memory(GiB)": 39.22,
"step": 270,
"token_acc": 0.5296052631578947,
"train_speed(iter/s)": 0.596845
},
{
"epoch": 0.16686893203883496,
"grad_norm": 7.380741596221924,
"learning_rate": 9.923854750938291e-05,
"loss": 2.2451313018798826,
"memory(GiB)": 39.22,
"step": 275,
"token_acc": 0.5,
"train_speed(iter/s)": 0.595956
},
{
"epoch": 0.16990291262135923,
"grad_norm": 6.371977806091309,
"learning_rate": 9.921068035143936e-05,
"loss": 2.408839797973633,
"memory(GiB)": 39.22,
"step": 280,
"token_acc": 0.46296296296296297,
"train_speed(iter/s)": 0.596145
},
{
"epoch": 0.1729368932038835,
"grad_norm": 7.335880279541016,
"learning_rate": 9.918231643896852e-05,
"loss": 2.199435234069824,
"memory(GiB)": 39.22,
"step": 285,
"token_acc": 0.5050167224080268,
"train_speed(iter/s)": 0.595974
},
{
"epoch": 0.17597087378640777,
"grad_norm": 7.418302536010742,
"learning_rate": 9.915345605828828e-05,
"loss": 2.3224533081054686,
"memory(GiB)": 39.22,
"step": 290,
"token_acc": 0.5035971223021583,
"train_speed(iter/s)": 0.596204
},
{
"epoch": 0.17900485436893204,
"grad_norm": 6.758571624755859,
"learning_rate": 9.912409950072821e-05,
"loss": 2.4346172332763674,
"memory(GiB)": 39.22,
"step": 295,
"token_acc": 0.48429319371727747,
"train_speed(iter/s)": 0.597578
},
{
"epoch": 0.1820388349514563,
"grad_norm": 10.919794082641602,
"learning_rate": 9.909424706262647e-05,
"loss": 2.4341407775878907,
"memory(GiB)": 39.22,
"step": 300,
"token_acc": 0.48857142857142855,
"train_speed(iter/s)": 0.598081
},
{
"epoch": 0.18507281553398058,
"grad_norm": 5.728007793426514,
"learning_rate": 9.906389904532688e-05,
"loss": 2.120174026489258,
"memory(GiB)": 39.22,
"step": 305,
"token_acc": 0.5371024734982333,
"train_speed(iter/s)": 0.597458
},
{
"epoch": 0.18810679611650485,
"grad_norm": 6.871456623077393,
"learning_rate": 9.903305575517584e-05,
"loss": 2.342795181274414,
"memory(GiB)": 39.22,
"step": 310,
"token_acc": 0.498567335243553,
"train_speed(iter/s)": 0.596534
},
{
"epoch": 0.19114077669902912,
"grad_norm": 8.57703971862793,
"learning_rate": 9.900171750351925e-05,
"loss": 2.6183086395263673,
"memory(GiB)": 39.22,
"step": 315,
"token_acc": 0.4625,
"train_speed(iter/s)": 0.597484
},
{
"epoch": 0.1941747572815534,
"grad_norm": 7.768932342529297,
"learning_rate": 9.89698846066994e-05,
"loss": 2.291164207458496,
"memory(GiB)": 39.22,
"step": 320,
"token_acc": 0.5154320987654321,
"train_speed(iter/s)": 0.596998
},
{
"epoch": 0.19720873786407767,
"grad_norm": 6.957777976989746,
"learning_rate": 9.893755738605171e-05,
"loss": 2.211928367614746,
"memory(GiB)": 39.22,
"step": 325,
"token_acc": 0.5045871559633027,
"train_speed(iter/s)": 0.597803
},
{
"epoch": 0.20024271844660194,
"grad_norm": 6.228968143463135,
"learning_rate": 9.890473616790154e-05,
"loss": 2.4344671249389647,
"memory(GiB)": 39.22,
"step": 330,
"token_acc": 0.47413793103448276,
"train_speed(iter/s)": 0.597936
},
{
"epoch": 0.2032766990291262,
"grad_norm": 5.3689422607421875,
"learning_rate": 9.887142128356092e-05,
"loss": 2.6146148681640624,
"memory(GiB)": 39.22,
"step": 335,
"token_acc": 0.45222929936305734,
"train_speed(iter/s)": 0.59885
},
{
"epoch": 0.20631067961165048,
"grad_norm": 5.215574264526367,
"learning_rate": 9.88376130693251e-05,
"loss": 2.0827293395996094,
"memory(GiB)": 39.22,
"step": 340,
"token_acc": 0.5172413793103449,
"train_speed(iter/s)": 0.599013
},
{
"epoch": 0.20934466019417475,
"grad_norm": 5.897531509399414,
"learning_rate": 9.880331186646925e-05,
"loss": 2.232925796508789,
"memory(GiB)": 39.22,
"step": 345,
"token_acc": 0.4750830564784053,
"train_speed(iter/s)": 0.599063
},
{
"epoch": 0.21237864077669902,
"grad_norm": 5.433231830596924,
"learning_rate": 9.876851802124503e-05,
"loss": 2.4904659271240233,
"memory(GiB)": 39.22,
"step": 350,
"token_acc": 0.47770700636942676,
"train_speed(iter/s)": 0.598348
},
{
"epoch": 0.2154126213592233,
"grad_norm": 7.521780967712402,
"learning_rate": 9.873323188487697e-05,
"loss": 2.5035079956054687,
"memory(GiB)": 39.22,
"step": 355,
"token_acc": 0.4612903225806452,
"train_speed(iter/s)": 0.597584
},
{
"epoch": 0.21844660194174756,
"grad_norm": 7.7608256340026855,
"learning_rate": 9.869745381355906e-05,
"loss": 2.2622493743896483,
"memory(GiB)": 39.22,
"step": 360,
"token_acc": 0.5156695156695157,
"train_speed(iter/s)": 0.59833
},
{
"epoch": 0.22148058252427186,
"grad_norm": 9.321803092956543,
"learning_rate": 9.86611841684511e-05,
"loss": 2.4146696090698243,
"memory(GiB)": 39.22,
"step": 365,
"token_acc": 0.47335423197492166,
"train_speed(iter/s)": 0.599026
},
{
"epoch": 0.22451456310679613,
"grad_norm": 9.292396545410156,
"learning_rate": 9.862442331567503e-05,
"loss": 2.3599546432495115,
"memory(GiB)": 39.22,
"step": 370,
"token_acc": 0.4956268221574344,
"train_speed(iter/s)": 0.5998
},
{
"epoch": 0.2275485436893204,
"grad_norm": 8.419163703918457,
"learning_rate": 9.858717162631128e-05,
"loss": 2.6148075103759765,
"memory(GiB)": 39.22,
"step": 375,
"token_acc": 0.46048109965635736,
"train_speed(iter/s)": 0.599846
},
{
"epoch": 0.23058252427184467,
"grad_norm": 6.27116584777832,
"learning_rate": 9.854942947639501e-05,
"loss": 2.4621152877807617,
"memory(GiB)": 39.22,
"step": 380,
"token_acc": 0.505524861878453,
"train_speed(iter/s)": 0.600276
},
{
"epoch": 0.23361650485436894,
"grad_norm": 7.211396217346191,
"learning_rate": 9.851119724691225e-05,
"loss": 2.5144262313842773,
"memory(GiB)": 39.22,
"step": 385,
"token_acc": 0.4525993883792049,
"train_speed(iter/s)": 0.600896
},
{
"epoch": 0.2366504854368932,
"grad_norm": 6.34926700592041,
"learning_rate": 9.84724753237962e-05,
"loss": 2.4521541595458984,
"memory(GiB)": 39.22,
"step": 390,
"token_acc": 0.5,
"train_speed(iter/s)": 0.600572
},
{
"epoch": 0.23968446601941748,
"grad_norm": 6.972572326660156,
"learning_rate": 9.843326409792317e-05,
"loss": 2.6046756744384765,
"memory(GiB)": 39.22,
"step": 395,
"token_acc": 0.44884488448844884,
"train_speed(iter/s)": 0.600491
},
{
"epoch": 0.24271844660194175,
"grad_norm": 11.898480415344238,
"learning_rate": 9.839356396510875e-05,
"loss": 2.3576316833496094,
"memory(GiB)": 39.22,
"step": 400,
"token_acc": 0.4472843450479233,
"train_speed(iter/s)": 0.601068
},
{
"epoch": 0.24575242718446602,
"grad_norm": 5.818270683288574,
"learning_rate": 9.835337532610376e-05,
"loss": 2.0870508193969726,
"memory(GiB)": 39.22,
"step": 405,
"token_acc": 0.526813880126183,
"train_speed(iter/s)": 0.601138
},
{
"epoch": 0.2487864077669903,
"grad_norm": 8.206275939941406,
"learning_rate": 9.831269858659023e-05,
"loss": 2.1485408782958983,
"memory(GiB)": 39.61,
"step": 410,
"token_acc": 0.5371900826446281,
"train_speed(iter/s)": 0.598807
},
{
"epoch": 0.2518203883495146,
"grad_norm": 7.233333587646484,
"learning_rate": 9.827153415717729e-05,
"loss": 2.37838191986084,
"memory(GiB)": 39.61,
"step": 415,
"token_acc": 0.5067114093959731,
"train_speed(iter/s)": 0.598786
},
{
"epoch": 0.25485436893203883,
"grad_norm": 6.615445613861084,
"learning_rate": 9.822988245339701e-05,
"loss": 2.3126983642578125,
"memory(GiB)": 39.61,
"step": 420,
"token_acc": 0.514018691588785,
"train_speed(iter/s)": 0.599013
},
{
"epoch": 0.25788834951456313,
"grad_norm": 7.5856523513793945,
"learning_rate": 9.818774389570027e-05,
"loss": 2.4124004364013674,
"memory(GiB)": 39.61,
"step": 425,
"token_acc": 0.511864406779661,
"train_speed(iter/s)": 0.598978
},
{
"epoch": 0.2609223300970874,
"grad_norm": 4.8371381759643555,
"learning_rate": 9.814511890945241e-05,
"loss": 2.2959733963012696,
"memory(GiB)": 39.61,
"step": 430,
"token_acc": 0.5327380952380952,
"train_speed(iter/s)": 0.59816
},
{
"epoch": 0.26395631067961167,
"grad_norm": 6.623883247375488,
"learning_rate": 9.810200792492904e-05,
"loss": 2.1788196563720703,
"memory(GiB)": 39.61,
"step": 435,
"token_acc": 0.5016611295681063,
"train_speed(iter/s)": 0.597541
},
{
"epoch": 0.2669902912621359,
"grad_norm": 6.926652431488037,
"learning_rate": 9.805841137731164e-05,
"loss": 2.1499845504760744,
"memory(GiB)": 39.61,
"step": 440,
"token_acc": 0.5192307692307693,
"train_speed(iter/s)": 0.597116
},
{
"epoch": 0.2700242718446602,
"grad_norm": 9.03418254852295,
"learning_rate": 9.801432970668318e-05,
"loss": 2.1190351486206054,
"memory(GiB)": 39.61,
"step": 445,
"token_acc": 0.5272727272727272,
"train_speed(iter/s)": 0.597517
},
{
"epoch": 0.27305825242718446,
"grad_norm": 8.781913757324219,
"learning_rate": 9.79697633580237e-05,
"loss": 2.4038110733032227,
"memory(GiB)": 39.61,
"step": 450,
"token_acc": 0.48179271708683474,
"train_speed(iter/s)": 0.59777
},
{
"epoch": 0.27609223300970875,
"grad_norm": 5.531435489654541,
"learning_rate": 9.792471278120573e-05,
"loss": 2.3716163635253906,
"memory(GiB)": 39.61,
"step": 455,
"token_acc": 0.4847560975609756,
"train_speed(iter/s)": 0.597608
},
{
"epoch": 0.279126213592233,
"grad_norm": 5.956150054931641,
"learning_rate": 9.787917843098989e-05,
"loss": 2.181165313720703,
"memory(GiB)": 39.61,
"step": 460,
"token_acc": 0.5051903114186851,
"train_speed(iter/s)": 0.597412
},
{
"epoch": 0.2821601941747573,
"grad_norm": 7.345389366149902,
"learning_rate": 9.783316076702019e-05,
"loss": 2.4305038452148438,
"memory(GiB)": 39.61,
"step": 465,
"token_acc": 0.47802197802197804,
"train_speed(iter/s)": 0.597506
},
{
"epoch": 0.28519417475728154,
"grad_norm": 5.4440388679504395,
"learning_rate": 9.778666025381943e-05,
"loss": 2.178025245666504,
"memory(GiB)": 39.61,
"step": 470,
"token_acc": 0.5167785234899329,
"train_speed(iter/s)": 0.597433
},
{
"epoch": 0.28822815533980584,
"grad_norm": 6.164299011230469,
"learning_rate": 9.77396773607845e-05,
"loss": 2.1623489379882814,
"memory(GiB)": 39.61,
"step": 475,
"token_acc": 0.4915254237288136,
"train_speed(iter/s)": 0.597134
},
{
"epoch": 0.2912621359223301,
"grad_norm": 6.166046619415283,
"learning_rate": 9.769221256218164e-05,
"loss": 2.3753950119018556,
"memory(GiB)": 39.61,
"step": 480,
"token_acc": 0.4879518072289157,
"train_speed(iter/s)": 0.597494
},
{
"epoch": 0.2942961165048544,
"grad_norm": 6.958017826080322,
"learning_rate": 9.764426633714167e-05,
"loss": 2.21927547454834,
"memory(GiB)": 39.61,
"step": 485,
"token_acc": 0.5050847457627119,
"train_speed(iter/s)": 0.597922
},
{
"epoch": 0.2973300970873786,
"grad_norm": 6.639190673828125,
"learning_rate": 9.759583916965517e-05,
"loss": 2.4649885177612303,
"memory(GiB)": 39.61,
"step": 490,
"token_acc": 0.4845360824742268,
"train_speed(iter/s)": 0.597703
},
{
"epoch": 0.3003640776699029,
"grad_norm": 5.950069904327393,
"learning_rate": 9.754693154856751e-05,
"loss": 2.612634468078613,
"memory(GiB)": 39.61,
"step": 495,
"token_acc": 0.45478723404255317,
"train_speed(iter/s)": 0.597528
},
{
"epoch": 0.30339805825242716,
"grad_norm": 6.54391622543335,
"learning_rate": 9.7497543967574e-05,
"loss": 2.3269075393676757,
"memory(GiB)": 39.61,
"step": 500,
"token_acc": 0.47770700636942676,
"train_speed(iter/s)": 0.597896
},
{
"epoch": 0.30339805825242716,
"eval_loss": 1.981583833694458,
"eval_runtime": 12.577,
"eval_samples_per_second": 7.951,
"eval_steps_per_second": 7.951,
"eval_token_acc": 0.48756906077348067,
"step": 500
},
{
"epoch": 0.30643203883495146,
"grad_norm": 10.183039665222168,
"learning_rate": 9.74476769252149e-05,
"loss": 2.1522619247436525,
"memory(GiB)": 39.61,
"step": 505,
"token_acc": 0.49651741293532337,
"train_speed(iter/s)": 0.587357
},
{
"epoch": 0.3094660194174757,
"grad_norm": 7.112940788269043,
"learning_rate": 9.739733092487035e-05,
"loss": 2.388911247253418,
"memory(GiB)": 39.61,
"step": 510,
"token_acc": 0.501577287066246,
"train_speed(iter/s)": 0.587206
},
{
"epoch": 0.3125,
"grad_norm": 6.210618495941162,
"learning_rate": 9.73465064747553e-05,
"loss": 2.59771614074707,
"memory(GiB)": 39.61,
"step": 515,
"token_acc": 0.4624624624624625,
"train_speed(iter/s)": 0.587584
},
{
"epoch": 0.3155339805825243,
"grad_norm": 6.931279182434082,
"learning_rate": 9.729520408791434e-05,
"loss": 2.512074279785156,
"memory(GiB)": 39.61,
"step": 520,
"token_acc": 0.4910394265232975,
"train_speed(iter/s)": 0.587423
},
{
"epoch": 0.31856796116504854,
"grad_norm": 6.450678825378418,
"learning_rate": 9.72434242822167e-05,
"loss": 2.1714031219482424,
"memory(GiB)": 39.61,
"step": 525,
"token_acc": 0.5186721991701245,
"train_speed(iter/s)": 0.586863
},
{
"epoch": 0.32160194174757284,
"grad_norm": 6.14349365234375,
"learning_rate": 9.719116758035074e-05,
"loss": 2.5791160583496096,
"memory(GiB)": 39.61,
"step": 530,
"token_acc": 0.46153846153846156,
"train_speed(iter/s)": 0.585986
},
{
"epoch": 0.3246359223300971,
"grad_norm": 6.583944797515869,
"learning_rate": 9.71384345098189e-05,
"loss": 2.3987077713012694,
"memory(GiB)": 39.61,
"step": 535,
"token_acc": 0.4925373134328358,
"train_speed(iter/s)": 0.58625
},
{
"epoch": 0.3276699029126214,
"grad_norm": 5.9314422607421875,
"learning_rate": 9.70852256029323e-05,
"loss": 2.235941505432129,
"memory(GiB)": 39.61,
"step": 540,
"token_acc": 0.5250737463126843,
"train_speed(iter/s)": 0.586408
},
{
"epoch": 0.3307038834951456,
"grad_norm": 7.305792331695557,
"learning_rate": 9.703154139680533e-05,
"loss": 2.417573928833008,
"memory(GiB)": 39.61,
"step": 545,
"token_acc": 0.5104477611940299,
"train_speed(iter/s)": 0.586501
},
{
"epoch": 0.3337378640776699,
"grad_norm": 5.719043731689453,
"learning_rate": 9.697738243335028e-05,
"loss": 2.2177127838134765,
"memory(GiB)": 39.61,
"step": 550,
"token_acc": 0.5016181229773463,
"train_speed(iter/s)": 0.586526
},
{
"epoch": 0.33677184466019416,
"grad_norm": 6.281179428100586,
"learning_rate": 9.692274925927185e-05,
"loss": 2.1101545333862304,
"memory(GiB)": 39.61,
"step": 555,
"token_acc": 0.5151515151515151,
"train_speed(iter/s)": 0.586182
},
{
"epoch": 0.33980582524271846,
"grad_norm": 5.763940811157227,
"learning_rate": 9.686764242606163e-05,
"loss": 2.2045364379882812,
"memory(GiB)": 39.61,
"step": 560,
"token_acc": 0.5030864197530864,
"train_speed(iter/s)": 0.585339
},
{
"epoch": 0.3428398058252427,
"grad_norm": 8.95957088470459,
"learning_rate": 9.681206248999257e-05,
"loss": 2.5135177612304687,
"memory(GiB)": 39.61,
"step": 565,
"token_acc": 0.48986486486486486,
"train_speed(iter/s)": 0.584751
},
{
"epoch": 0.345873786407767,
"grad_norm": 5.587778568267822,
"learning_rate": 9.675601001211326e-05,
"loss": 2.392421340942383,
"memory(GiB)": 39.61,
"step": 570,
"token_acc": 0.4461538461538462,
"train_speed(iter/s)": 0.585118
},
{
"epoch": 0.34890776699029125,
"grad_norm": 7.836484432220459,
"learning_rate": 9.669948555824242e-05,
"loss": 2.324014663696289,
"memory(GiB)": 39.61,
"step": 575,
"token_acc": 0.4639175257731959,
"train_speed(iter/s)": 0.584897
},
{
"epoch": 0.35194174757281554,
"grad_norm": 5.96414041519165,
"learning_rate": 9.664248969896303e-05,
"loss": 2.302785301208496,
"memory(GiB)": 39.61,
"step": 580,
"token_acc": 0.4904109589041096,
"train_speed(iter/s)": 0.585033
},
{
"epoch": 0.3549757281553398,
"grad_norm": 7.691707611083984,
"learning_rate": 9.65850230096167e-05,
"loss": 2.4697898864746093,
"memory(GiB)": 39.61,
"step": 585,
"token_acc": 0.4444444444444444,
"train_speed(iter/s)": 0.584897
},
{
"epoch": 0.3580097087378641,
"grad_norm": 8.556262016296387,
"learning_rate": 9.652708607029779e-05,
"loss": 2.2903860092163084,
"memory(GiB)": 40.86,
"step": 590,
"token_acc": 0.47115384615384615,
"train_speed(iter/s)": 0.584145
},
{
"epoch": 0.36104368932038833,
"grad_norm": 6.732985496520996,
"learning_rate": 9.646867946584757e-05,
"loss": 2.1200277328491213,
"memory(GiB)": 40.86,
"step": 595,
"token_acc": 0.532608695652174,
"train_speed(iter/s)": 0.584433
},
{
"epoch": 0.3640776699029126,
"grad_norm": 6.632906913757324,
"learning_rate": 9.64098037858483e-05,
"loss": 2.4770671844482424,
"memory(GiB)": 40.86,
"step": 600,
"token_acc": 0.4965753424657534,
"train_speed(iter/s)": 0.584177
},
{
"epoch": 0.36711165048543687,
"grad_norm": 8.074189186096191,
"learning_rate": 9.635045962461735e-05,
"loss": 2.0175329208374024,
"memory(GiB)": 40.86,
"step": 605,
"token_acc": 0.5444444444444444,
"train_speed(iter/s)": 0.584218
},
{
"epoch": 0.37014563106796117,
"grad_norm": 10.57684326171875,
"learning_rate": 9.62906475812011e-05,
"loss": 2.471089172363281,
"memory(GiB)": 40.86,
"step": 610,
"token_acc": 0.47604790419161674,
"train_speed(iter/s)": 0.584641
},
{
"epoch": 0.3731796116504854,
"grad_norm": 9.030044555664062,
"learning_rate": 9.623036825936898e-05,
"loss": 2.4689071655273436,
"memory(GiB)": 40.86,
"step": 615,
"token_acc": 0.4551282051282051,
"train_speed(iter/s)": 0.58472
},
{
"epoch": 0.3762135922330097,
"grad_norm": 12.650615692138672,
"learning_rate": 9.616962226760728e-05,
"loss": 2.4379999160766603,
"memory(GiB)": 40.86,
"step": 620,
"token_acc": 0.4965753424657534,
"train_speed(iter/s)": 0.583902
},
{
"epoch": 0.379247572815534,
"grad_norm": 6.823087692260742,
"learning_rate": 9.610841021911312e-05,
"loss": 2.2892841339111327,
"memory(GiB)": 40.86,
"step": 625,
"token_acc": 0.4925373134328358,
"train_speed(iter/s)": 0.584344
},
{
"epoch": 0.38228155339805825,
"grad_norm": 6.585781097412109,
"learning_rate": 9.604673273178819e-05,
"loss": 2.1564374923706056,
"memory(GiB)": 40.86,
"step": 630,
"token_acc": 0.5151515151515151,
"train_speed(iter/s)": 0.584389
},
{
"epoch": 0.38531553398058255,
"grad_norm": 7.104307174682617,
"learning_rate": 9.59845904282325e-05,
"loss": 2.1816734313964843,
"memory(GiB)": 40.86,
"step": 635,
"token_acc": 0.5308219178082192,
"train_speed(iter/s)": 0.585012
},
{
"epoch": 0.3883495145631068,
"grad_norm": 7.516766548156738,
"learning_rate": 9.592198393573816e-05,
"loss": 2.276702308654785,
"memory(GiB)": 40.86,
"step": 640,
"token_acc": 0.5102739726027398,
"train_speed(iter/s)": 0.585101
},
{
"epoch": 0.3913834951456311,
"grad_norm": 8.942841529846191,
"learning_rate": 9.585891388628298e-05,
"loss": 2.3461095809936525,
"memory(GiB)": 40.86,
"step": 645,
"token_acc": 0.527972027972028,
"train_speed(iter/s)": 0.585177
},
{
"epoch": 0.39441747572815533,
"grad_norm": 7.309288024902344,
"learning_rate": 9.579538091652414e-05,
"loss": 2.3102886199951174,
"memory(GiB)": 40.86,
"step": 650,
"token_acc": 0.5029585798816568,
"train_speed(iter/s)": 0.584698
},
{
"epoch": 0.39745145631067963,
"grad_norm": 8.047052383422852,
"learning_rate": 9.573138566779171e-05,
"loss": 2.2706655502319335,
"memory(GiB)": 40.86,
"step": 655,
"token_acc": 0.48942598187311176,
"train_speed(iter/s)": 0.584864
},
{
"epoch": 0.40048543689320387,
"grad_norm": 5.1699442863464355,
"learning_rate": 9.566692878608229e-05,
"loss": 2.3724884033203124,
"memory(GiB)": 40.86,
"step": 660,
"token_acc": 0.5276872964169381,
"train_speed(iter/s)": 0.584586
},
{
"epoch": 0.40351941747572817,
"grad_norm": 7.834784030914307,
"learning_rate": 9.560201092205231e-05,
"loss": 2.149821090698242,
"memory(GiB)": 40.86,
"step": 665,
"token_acc": 0.5422535211267606,
"train_speed(iter/s)": 0.584347
},
{
"epoch": 0.4065533980582524,
"grad_norm": 5.076271057128906,
"learning_rate": 9.553663273101162e-05,
"loss": 2.2725826263427735,
"memory(GiB)": 40.86,
"step": 670,
"token_acc": 0.48223350253807107,
"train_speed(iter/s)": 0.584476
},
{
"epoch": 0.4095873786407767,
"grad_norm": 5.5801005363464355,
"learning_rate": 9.54707948729168e-05,
"loss": 2.556637001037598,
"memory(GiB)": 40.86,
"step": 675,
"token_acc": 0.49714285714285716,
"train_speed(iter/s)": 0.583956
},
{
"epoch": 0.41262135922330095,
"grad_norm": 7.065471649169922,
"learning_rate": 9.540449801236451e-05,
"loss": 2.326729393005371,
"memory(GiB)": 40.86,
"step": 680,
"token_acc": 0.5097402597402597,
"train_speed(iter/s)": 0.583876
},
{
"epoch": 0.41565533980582525,
"grad_norm": 5.337322235107422,
"learning_rate": 9.533774281858481e-05,
"loss": 2.34055118560791,
"memory(GiB)": 40.86,
"step": 685,
"token_acc": 0.45645645645645644,
"train_speed(iter/s)": 0.584332
},
{
"epoch": 0.4186893203883495,
"grad_norm": 6.009404182434082,
"learning_rate": 9.527052996543436e-05,
"loss": 2.368490791320801,
"memory(GiB)": 40.86,
"step": 690,
"token_acc": 0.49240121580547114,
"train_speed(iter/s)": 0.584492
},
{
"epoch": 0.4217233009708738,
"grad_norm": 7.1615495681762695,
"learning_rate": 9.520286013138959e-05,
"loss": 2.2751487731933593,
"memory(GiB)": 40.86,
"step": 695,
"token_acc": 0.51875,
"train_speed(iter/s)": 0.584253
},
{
"epoch": 0.42475728155339804,
"grad_norm": 6.305184841156006,
"learning_rate": 9.513473399954001e-05,
"loss": 2.2249755859375,
"memory(GiB)": 40.86,
"step": 700,
"token_acc": 0.5359477124183006,
"train_speed(iter/s)": 0.583644
},
{
"epoch": 0.42779126213592233,
"grad_norm": 6.879371166229248,
"learning_rate": 9.506615225758111e-05,
"loss": 2.1284107208251952,
"memory(GiB)": 40.86,
"step": 705,
"token_acc": 0.46647230320699706,
"train_speed(iter/s)": 0.583054
},
{
"epoch": 0.4308252427184466,
"grad_norm": 5.7029523849487305,
"learning_rate": 9.499711559780756e-05,
"loss": 2.3587778091430662,
"memory(GiB)": 40.86,
"step": 710,
"token_acc": 0.4859154929577465,
"train_speed(iter/s)": 0.583094
},
{
"epoch": 0.4338592233009709,
"grad_norm": 7.390230178833008,
"learning_rate": 9.492762471710612e-05,
"loss": 2.6136167526245115,
"memory(GiB)": 40.86,
"step": 715,
"token_acc": 0.46646341463414637,
"train_speed(iter/s)": 0.582932
},
{
"epoch": 0.4368932038834951,
"grad_norm": 5.883137226104736,
"learning_rate": 9.485768031694872e-05,
"loss": 2.2231393814086915,
"memory(GiB)": 40.86,
"step": 720,
"token_acc": 0.49818181818181817,
"train_speed(iter/s)": 0.582775
},
{
"epoch": 0.4399271844660194,
"grad_norm": 6.680229663848877,
"learning_rate": 9.478728310338527e-05,
"loss": 2.1992170333862306,
"memory(GiB)": 40.86,
"step": 725,
"token_acc": 0.5133531157270029,
"train_speed(iter/s)": 0.582635
},
{
"epoch": 0.4429611650485437,
"grad_norm": 8.902689933776855,
"learning_rate": 9.471643378703662e-05,
"loss": 2.0395624160766603,
"memory(GiB)": 40.86,
"step": 730,
"token_acc": 0.5493421052631579,
"train_speed(iter/s)": 0.582703
},
{
"epoch": 0.44599514563106796,
"grad_norm": 5.443286895751953,
"learning_rate": 9.464513308308734e-05,
"loss": 2.506935882568359,
"memory(GiB)": 40.86,
"step": 735,
"token_acc": 0.47368421052631576,
"train_speed(iter/s)": 0.583215
},
{
"epoch": 0.44902912621359226,
"grad_norm": 6.487564563751221,
"learning_rate": 9.457338171127847e-05,
"loss": 2.2692995071411133,
"memory(GiB)": 40.86,
"step": 740,
"token_acc": 0.5179153094462541,
"train_speed(iter/s)": 0.583207
},
{
"epoch": 0.4520631067961165,
"grad_norm": 7.125478267669678,
"learning_rate": 9.450118039590032e-05,
"loss": 2.1293052673339843,
"memory(GiB)": 40.86,
"step": 745,
"token_acc": 0.5464285714285714,
"train_speed(iter/s)": 0.583504
},
{
"epoch": 0.4550970873786408,
"grad_norm": 7.087446212768555,
"learning_rate": 9.442852986578514e-05,
"loss": 2.4458339691162108,
"memory(GiB)": 40.86,
"step": 750,
"token_acc": 0.49736842105263157,
"train_speed(iter/s)": 0.583179
},
{
"epoch": 0.45813106796116504,
"grad_norm": 7.162069320678711,
"learning_rate": 9.435543085429972e-05,
"loss": 2.3158668518066405,
"memory(GiB)": 40.86,
"step": 755,
"token_acc": 0.4744744744744745,
"train_speed(iter/s)": 0.5828
},
{
"epoch": 0.46116504854368934,
"grad_norm": 5.414243698120117,
"learning_rate": 9.428188409933806e-05,
"loss": 2.16876335144043,
"memory(GiB)": 40.86,
"step": 760,
"token_acc": 0.5087719298245614,
"train_speed(iter/s)": 0.58285
},
{
"epoch": 0.4641990291262136,
"grad_norm": 6.282864570617676,
"learning_rate": 9.420789034331387e-05,
"loss": 2.289217948913574,
"memory(GiB)": 40.86,
"step": 765,
"token_acc": 0.512396694214876,
"train_speed(iter/s)": 0.582631
},
{
"epoch": 0.4672330097087379,
"grad_norm": 8.376455307006836,
"learning_rate": 9.413345033315307e-05,
"loss": 2.428557777404785,
"memory(GiB)": 40.86,
"step": 770,
"token_acc": 0.49038461538461536,
"train_speed(iter/s)": 0.582831
},
{
"epoch": 0.4702669902912621,
"grad_norm": 6.952515602111816,
"learning_rate": 9.405856482028627e-05,
"loss": 2.5767995834350588,
"memory(GiB)": 40.86,
"step": 775,
"token_acc": 0.43425076452599387,
"train_speed(iter/s)": 0.583315
},
{
"epoch": 0.4733009708737864,
"grad_norm": 9.879197120666504,
"learning_rate": 9.398323456064123e-05,
"loss": 2.218907356262207,
"memory(GiB)": 40.86,
"step": 780,
"token_acc": 0.4844961240310077,
"train_speed(iter/s)": 0.583721
},
{
"epoch": 0.47633495145631066,
"grad_norm": 7.553537845611572,
"learning_rate": 9.39074603146351e-05,
"loss": 2.3447980880737305,
"memory(GiB)": 40.86,
"step": 785,
"token_acc": 0.4934640522875817,
"train_speed(iter/s)": 0.583893
},
{
"epoch": 0.47936893203883496,
"grad_norm": 6.071379661560059,
"learning_rate": 9.383124284716691e-05,
"loss": 2.241764450073242,
"memory(GiB)": 40.86,
"step": 790,
"token_acc": 0.46984126984126984,
"train_speed(iter/s)": 0.584261
},
{
"epoch": 0.4824029126213592,
"grad_norm": 5.881275653839111,
"learning_rate": 9.37545829276097e-05,
"loss": 2.361056900024414,
"memory(GiB)": 40.86,
"step": 795,
"token_acc": 0.5249169435215947,
"train_speed(iter/s)": 0.584506
},
{
"epoch": 0.4854368932038835,
"grad_norm": 8.599099159240723,
"learning_rate": 9.367748132980287e-05,
"loss": 2.1997608184814452,
"memory(GiB)": 40.86,
"step": 800,
"token_acc": 0.48518518518518516,
"train_speed(iter/s)": 0.584425
},
{
"epoch": 0.48847087378640774,
"grad_norm": 5.97467565536499,
"learning_rate": 9.359993883204425e-05,
"loss": 2.2297504425048826,
"memory(GiB)": 40.86,
"step": 805,
"token_acc": 0.5,
"train_speed(iter/s)": 0.583965
},
{
"epoch": 0.49150485436893204,
"grad_norm": 6.91083288192749,
"learning_rate": 9.352195621708239e-05,
"loss": 1.9850988388061523,
"memory(GiB)": 40.86,
"step": 810,
"token_acc": 0.5147540983606558,
"train_speed(iter/s)": 0.584245
},
{
"epoch": 0.4945388349514563,
"grad_norm": 8.59461784362793,
"learning_rate": 9.344353427210852e-05,
"loss": 2.421934127807617,
"memory(GiB)": 40.86,
"step": 815,
"token_acc": 0.5176056338028169,
"train_speed(iter/s)": 0.5847
},
{
"epoch": 0.4975728155339806,
"grad_norm": 6.944448947906494,
"learning_rate": 9.336467378874871e-05,
"loss": 2.4557096481323244,
"memory(GiB)": 40.86,
"step": 820,
"token_acc": 0.47419354838709676,
"train_speed(iter/s)": 0.584583
},
{
"epoch": 0.5006067961165048,
"grad_norm": 5.55971622467041,
"learning_rate": 9.328537556305578e-05,
"loss": 2.2306629180908204,
"memory(GiB)": 40.86,
"step": 825,
"token_acc": 0.4852459016393443,
"train_speed(iter/s)": 0.585006
},
{
"epoch": 0.5036407766990292,
"grad_norm": 7.160358905792236,
"learning_rate": 9.320564039550134e-05,
"loss": 2.429665374755859,
"memory(GiB)": 40.86,
"step": 830,
"token_acc": 0.47262247838616717,
"train_speed(iter/s)": 0.585348
},
{
"epoch": 0.5066747572815534,
"grad_norm": 6.570638656616211,
"learning_rate": 9.31254690909677e-05,
"loss": 2.442539596557617,
"memory(GiB)": 40.86,
"step": 835,
"token_acc": 0.476027397260274,
"train_speed(iter/s)": 0.585696
},
{
"epoch": 0.5097087378640777,
"grad_norm": 6.370124340057373,
"learning_rate": 9.304486245873972e-05,
"loss": 2.287601089477539,
"memory(GiB)": 40.86,
"step": 840,
"token_acc": 0.4982456140350877,
"train_speed(iter/s)": 0.585975
},
{
"epoch": 0.5127427184466019,
"grad_norm": 6.999332904815674,
"learning_rate": 9.296382131249666e-05,
"loss": 2.317913818359375,
"memory(GiB)": 40.86,
"step": 845,
"token_acc": 0.5041782729805014,
"train_speed(iter/s)": 0.585835
},
{
"epoch": 0.5157766990291263,
"grad_norm": 5.257606506347656,
"learning_rate": 9.288234647030391e-05,
"loss": 2.18968505859375,
"memory(GiB)": 40.86,
"step": 850,
"token_acc": 0.5102040816326531,
"train_speed(iter/s)": 0.585922
},
{
"epoch": 0.5188106796116505,
"grad_norm": 5.611077785491943,
"learning_rate": 9.280043875460485e-05,
"loss": 2.0620901107788088,
"memory(GiB)": 40.86,
"step": 855,
"token_acc": 0.5365079365079365,
"train_speed(iter/s)": 0.585966
},
{
"epoch": 0.5218446601941747,
"grad_norm": 5.108231067657471,
"learning_rate": 9.271809899221246e-05,
"loss": 2.4372896194458007,
"memory(GiB)": 40.86,
"step": 860,
"token_acc": 0.4479768786127168,
"train_speed(iter/s)": 0.585988
},
{
"epoch": 0.524878640776699,
"grad_norm": 6.733373641967773,
"learning_rate": 9.263532801430094e-05,
"loss": 2.1579952239990234,
"memory(GiB)": 40.86,
"step": 865,
"token_acc": 0.4807121661721068,
"train_speed(iter/s)": 0.585966
},
{
"epoch": 0.5279126213592233,
"grad_norm": 5.073429107666016,
"learning_rate": 9.255212665639744e-05,
"loss": 2.1149240493774415,
"memory(GiB)": 40.86,
"step": 870,
"token_acc": 0.5171232876712328,
"train_speed(iter/s)": 0.585975
},
{
"epoch": 0.5309466019417476,
"grad_norm": 6.175984859466553,
"learning_rate": 9.246849575837349e-05,
"loss": 1.9833623886108398,
"memory(GiB)": 40.86,
"step": 875,
"token_acc": 0.526813880126183,
"train_speed(iter/s)": 0.585474
},
{
"epoch": 0.5339805825242718,
"grad_norm": 8.124624252319336,
"learning_rate": 9.238443616443666e-05,
"loss": 2.4017959594726563,
"memory(GiB)": 40.86,
"step": 880,
"token_acc": 0.49032258064516127,
"train_speed(iter/s)": 0.585597
},
{
"epoch": 0.5370145631067961,
"grad_norm": 7.752699375152588,
"learning_rate": 9.229994872312193e-05,
"loss": 2.387744331359863,
"memory(GiB)": 40.86,
"step": 885,
"token_acc": 0.5106382978723404,
"train_speed(iter/s)": 0.585668
},
{
"epoch": 0.5400485436893204,
"grad_norm": 8.323918342590332,
"learning_rate": 9.221503428728316e-05,
"loss": 2.1385421752929688,
"memory(GiB)": 40.86,
"step": 890,
"token_acc": 0.5551020408163265,
"train_speed(iter/s)": 0.586131
},
{
"epoch": 0.5430825242718447,
"grad_norm": 6.044275760650635,
"learning_rate": 9.212969371408449e-05,
"loss": 1.9817846298217774,
"memory(GiB)": 40.86,
"step": 895,
"token_acc": 0.5494880546075085,
"train_speed(iter/s)": 0.586393
},
{
"epoch": 0.5461165048543689,
"grad_norm": 6.398566246032715,
"learning_rate": 9.204392786499168e-05,
"loss": 2.3052085876464843,
"memory(GiB)": 40.86,
"step": 900,
"token_acc": 0.4840764331210191,
"train_speed(iter/s)": 0.586567
},
{
"epoch": 0.5491504854368932,
"grad_norm": 9.17261028289795,
"learning_rate": 9.19577376057634e-05,
"loss": 2.37634391784668,
"memory(GiB)": 40.86,
"step": 905,
"token_acc": 0.5249169435215947,
"train_speed(iter/s)": 0.586787
},
{
"epoch": 0.5521844660194175,
"grad_norm": 5.751415729522705,
"learning_rate": 9.187112380644254e-05,
"loss": 2.2847476959228517,
"memory(GiB)": 40.86,
"step": 910,
"token_acc": 0.51338199513382,
"train_speed(iter/s)": 0.586408
},
{
"epoch": 0.5552184466019418,
"grad_norm": 12.116822242736816,
"learning_rate": 9.178408734134736e-05,
"loss": 2.5225976943969726,
"memory(GiB)": 40.86,
"step": 915,
"token_acc": 0.48253968253968255,
"train_speed(iter/s)": 0.586571
},
{
"epoch": 0.558252427184466,
"grad_norm": 8.28947925567627,
"learning_rate": 9.16966290890627e-05,
"loss": 2.215795135498047,
"memory(GiB)": 40.86,
"step": 920,
"token_acc": 0.5543071161048689,
"train_speed(iter/s)": 0.586881
},
{
"epoch": 0.5612864077669902,
"grad_norm": 9.582908630371094,
"learning_rate": 9.160874993243113e-05,
"loss": 2.299172019958496,
"memory(GiB)": 40.86,
"step": 925,
"token_acc": 0.4763636363636364,
"train_speed(iter/s)": 0.587069
},
{
"epoch": 0.5643203883495146,
"grad_norm": 8.669927597045898,
"learning_rate": 9.152045075854398e-05,
"loss": 2.457051086425781,
"memory(GiB)": 40.86,
"step": 930,
"token_acc": 0.49828178694158076,
"train_speed(iter/s)": 0.587349
},
{
"epoch": 0.5673543689320388,
"grad_norm": 6.801449298858643,
"learning_rate": 9.143173245873247e-05,
"loss": 2.1124551773071287,
"memory(GiB)": 40.86,
"step": 935,
"token_acc": 0.5018315018315018,
"train_speed(iter/s)": 0.58766
},
{
"epoch": 0.5703883495145631,
"grad_norm": 7.988888263702393,
"learning_rate": 9.134259592855861e-05,
"loss": 2.3452516555786134,
"memory(GiB)": 40.86,
"step": 940,
"token_acc": 0.49666666666666665,
"train_speed(iter/s)": 0.58765
},
{
"epoch": 0.5734223300970874,
"grad_norm": 7.102814674377441,
"learning_rate": 9.125304206780627e-05,
"loss": 2.3180185317993165,
"memory(GiB)": 40.86,
"step": 945,
"token_acc": 0.5050847457627119,
"train_speed(iter/s)": 0.587449
},
{
"epoch": 0.5764563106796117,
"grad_norm": 7.604477405548096,
"learning_rate": 9.116307178047198e-05,
"loss": 2.3972042083740233,
"memory(GiB)": 40.86,
"step": 950,
"token_acc": 0.46283783783783783,
"train_speed(iter/s)": 0.587729
},
{
"epoch": 0.5794902912621359,
"grad_norm": 6.319246292114258,
"learning_rate": 9.10726859747559e-05,
"loss": 2.103443908691406,
"memory(GiB)": 40.86,
"step": 955,
"token_acc": 0.5191256830601093,
"train_speed(iter/s)": 0.588189
},
{
"epoch": 0.5825242718446602,
"grad_norm": 8.772871017456055,
"learning_rate": 9.098188556305263e-05,
"loss": 2.073552703857422,
"memory(GiB)": 40.86,
"step": 960,
"token_acc": 0.552901023890785,
"train_speed(iter/s)": 0.588218
},
{
"epoch": 0.5855582524271845,
"grad_norm": 8.01586627960205,
"learning_rate": 9.089067146194196e-05,
"loss": 1.8984146118164062,
"memory(GiB)": 40.86,
"step": 965,
"token_acc": 0.616504854368932,
"train_speed(iter/s)": 0.58821
},
{
"epoch": 0.5885922330097088,
"grad_norm": 6.168645858764648,
"learning_rate": 9.079904459217966e-05,
"loss": 2.379282760620117,
"memory(GiB)": 40.86,
"step": 970,
"token_acc": 0.4649122807017544,
"train_speed(iter/s)": 0.588446
},
{
"epoch": 0.591626213592233,
"grad_norm": 6.704972743988037,
"learning_rate": 9.070700587868817e-05,
"loss": 2.1655595779418944,
"memory(GiB)": 40.86,
"step": 975,
"token_acc": 0.5521885521885522,
"train_speed(iter/s)": 0.588386
},
{
"epoch": 0.5946601941747572,
"grad_norm": 7.025293827056885,
"learning_rate": 9.061455625054725e-05,
"loss": 2.193133735656738,
"memory(GiB)": 40.86,
"step": 980,
"token_acc": 0.5197368421052632,
"train_speed(iter/s)": 0.588278
},
{
"epoch": 0.5976941747572816,
"grad_norm": 6.618514537811279,
"learning_rate": 9.052169664098461e-05,
"loss": 2.0073310852050783,
"memory(GiB)": 40.86,
"step": 985,
"token_acc": 0.55893536121673,
"train_speed(iter/s)": 0.588288
},
{
"epoch": 0.6007281553398058,
"grad_norm": 5.154722690582275,
"learning_rate": 9.042842798736654e-05,
"loss": 2.2399974822998048,
"memory(GiB)": 40.86,
"step": 990,
"token_acc": 0.5195530726256983,
"train_speed(iter/s)": 0.588303
},
{
"epoch": 0.6037621359223301,
"grad_norm": 6.787222862243652,
"learning_rate": 9.03347512311884e-05,
"loss": 2.3585285186767577,
"memory(GiB)": 40.86,
"step": 995,
"token_acc": 0.46075085324232085,
"train_speed(iter/s)": 0.588665
},
{
"epoch": 0.6067961165048543,
"grad_norm": 4.932912826538086,
"learning_rate": 9.024066731806501e-05,
"loss": 2.276376724243164,
"memory(GiB)": 40.86,
"step": 1000,
"token_acc": 0.4921135646687697,
"train_speed(iter/s)": 0.58881
},
{
"epoch": 0.6067961165048543,
"eval_loss": 2.31942081451416,
"eval_runtime": 12.0489,
"eval_samples_per_second": 8.3,
"eval_steps_per_second": 8.3,
"eval_token_acc": 0.48575305291723203,
"step": 1000
},
{
"epoch": 0.6098300970873787,
"grad_norm": 9.08281421661377,
"learning_rate": 9.01461771977214e-05,
"loss": 2.333499717712402,
"memory(GiB)": 40.86,
"step": 1005,
"token_acc": 0.4905838041431262,
"train_speed(iter/s)": 0.584203
},
{
"epoch": 0.6128640776699029,
"grad_norm": 7.2298359870910645,
"learning_rate": 9.005128182398283e-05,
"loss": 2.4393625259399414,
"memory(GiB)": 40.86,
"step": 1010,
"token_acc": 0.48732394366197185,
"train_speed(iter/s)": 0.584412
},
{
"epoch": 0.6158980582524272,
"grad_norm": 5.784246444702148,
"learning_rate": 8.995598215476555e-05,
"loss": 2.171500587463379,
"memory(GiB)": 40.86,
"step": 1015,
"token_acc": 0.5384615384615384,
"train_speed(iter/s)": 0.58417
},
{
"epoch": 0.6189320388349514,
"grad_norm": 8.403388977050781,
"learning_rate": 8.986027915206686e-05,
"loss": 2.1093074798583986,
"memory(GiB)": 40.86,
"step": 1020,
"token_acc": 0.5201342281879194,
"train_speed(iter/s)": 0.584014
},
{
"epoch": 0.6219660194174758,
"grad_norm": 7.646571636199951,
"learning_rate": 8.976417378195544e-05,
"loss": 2.1439834594726563,
"memory(GiB)": 40.86,
"step": 1025,
"token_acc": 0.5295857988165681,
"train_speed(iter/s)": 0.583981
},
{
"epoch": 0.625,
"grad_norm": 6.978275299072266,
"learning_rate": 8.966766701456177e-05,
"loss": 2.288041687011719,
"memory(GiB)": 40.86,
"step": 1030,
"token_acc": 0.513126491646778,
"train_speed(iter/s)": 0.584314
},
{
"epoch": 0.6280339805825242,
"grad_norm": 6.3236541748046875,
"learning_rate": 8.957075982406811e-05,
"loss": 2.250352668762207,
"memory(GiB)": 40.86,
"step": 1035,
"token_acc": 0.531986531986532,
"train_speed(iter/s)": 0.584575
},
{
"epoch": 0.6310679611650486,
"grad_norm": 6.21897554397583,
"learning_rate": 8.947345318869882e-05,
"loss": 2.425637054443359,
"memory(GiB)": 40.86,
"step": 1040,
"token_acc": 0.46859903381642515,
"train_speed(iter/s)": 0.584674
},
{
"epoch": 0.6341019417475728,
"grad_norm": 7.0973358154296875,
"learning_rate": 8.937574809071041e-05,
"loss": 1.9796913146972657,
"memory(GiB)": 40.86,
"step": 1045,
"token_acc": 0.5555555555555556,
"train_speed(iter/s)": 0.584622
},
{
"epoch": 0.6371359223300971,
"grad_norm": 6.9171833992004395,
"learning_rate": 8.927764551638169e-05,
"loss": 2.153505325317383,
"memory(GiB)": 40.86,
"step": 1050,
"token_acc": 0.5481727574750831,
"train_speed(iter/s)": 0.584749
},
{
"epoch": 0.6401699029126213,
"grad_norm": 6.10349178314209,
"learning_rate": 8.917914645600369e-05,
"loss": 2.2978469848632814,
"memory(GiB)": 40.86,
"step": 1055,
"token_acc": 0.5279503105590062,
"train_speed(iter/s)": 0.584579
},
{
"epoch": 0.6432038834951457,
"grad_norm": 8.071660995483398,
"learning_rate": 8.908025190386985e-05,
"loss": 1.8877496719360352,
"memory(GiB)": 40.86,
"step": 1060,
"token_acc": 0.582089552238806,
"train_speed(iter/s)": 0.584349
},
{
"epoch": 0.6462378640776699,
"grad_norm": 5.858845233917236,
"learning_rate": 8.898096285826582e-05,
"loss": 2.2511001586914063,
"memory(GiB)": 40.86,
"step": 1065,
"token_acc": 0.4642857142857143,
"train_speed(iter/s)": 0.584502
},
{
"epoch": 0.6492718446601942,
"grad_norm": 7.347677230834961,
"learning_rate": 8.888128032145941e-05,
"loss": 2.173788833618164,
"memory(GiB)": 40.86,
"step": 1070,
"token_acc": 0.5113636363636364,
"train_speed(iter/s)": 0.584862
},
{
"epoch": 0.6523058252427184,
"grad_norm": 6.6078596115112305,
"learning_rate": 8.878120529969061e-05,
"loss": 2.1907543182373046,
"memory(GiB)": 40.86,
"step": 1075,
"token_acc": 0.5047318611987381,
"train_speed(iter/s)": 0.585062
},
{
"epoch": 0.6553398058252428,
"grad_norm": 6.744375228881836,
"learning_rate": 8.868073880316124e-05,
"loss": 2.5921836853027345,
"memory(GiB)": 40.86,
"step": 1080,
"token_acc": 0.4777777777777778,
"train_speed(iter/s)": 0.585281
},
{
"epoch": 0.658373786407767,
"grad_norm": 6.0812153816223145,
"learning_rate": 8.857988184602484e-05,
"loss": 2.076370620727539,
"memory(GiB)": 40.86,
"step": 1085,
"token_acc": 0.5142118863049095,
"train_speed(iter/s)": 0.585629
},
{
"epoch": 0.6614077669902912,
"grad_norm": 7.485403060913086,
"learning_rate": 8.84786354463765e-05,
"loss": 2.553455352783203,
"memory(GiB)": 40.86,
"step": 1090,
"token_acc": 0.47151898734177217,
"train_speed(iter/s)": 0.58598
},
{
"epoch": 0.6644417475728155,
"grad_norm": 6.709589958190918,
"learning_rate": 8.837700062624245e-05,
"loss": 2.1033605575561523,
"memory(GiB)": 40.86,
"step": 1095,
"token_acc": 0.525,
"train_speed(iter/s)": 0.586173
},
{
"epoch": 0.6674757281553398,
"grad_norm": 7.800707817077637,
"learning_rate": 8.827497841156986e-05,
"loss": 2.4268184661865235,
"memory(GiB)": 40.86,
"step": 1100,
"token_acc": 0.5115511551155115,
"train_speed(iter/s)": 0.586144
},
{
"epoch": 0.6705097087378641,
"grad_norm": 6.759317874908447,
"learning_rate": 8.817256983221637e-05,
"loss": 2.4730669021606446,
"memory(GiB)": 40.86,
"step": 1105,
"token_acc": 0.45609065155807366,
"train_speed(iter/s)": 0.586261
},
{
"epoch": 0.6735436893203883,
"grad_norm": 7.099054336547852,
"learning_rate": 8.806977592193985e-05,
"loss": 2.596049118041992,
"memory(GiB)": 40.86,
"step": 1110,
"token_acc": 0.44542772861356933,
"train_speed(iter/s)": 0.586514
},
{
"epoch": 0.6765776699029126,
"grad_norm": 6.84334135055542,
"learning_rate": 8.796659771838777e-05,
"loss": 2.2642656326293946,
"memory(GiB)": 40.86,
"step": 1115,
"token_acc": 0.5156794425087108,
"train_speed(iter/s)": 0.586643
},
{
"epoch": 0.6796116504854369,
"grad_norm": 6.421635627746582,
"learning_rate": 8.786303626308689e-05,
"loss": 2.1252628326416017,
"memory(GiB)": 40.86,
"step": 1120,
"token_acc": 0.5263157894736842,
"train_speed(iter/s)": 0.586755
},
{
"epoch": 0.6826456310679612,
"grad_norm": 4.622204303741455,
"learning_rate": 8.775909260143266e-05,
"loss": 2.2372303009033203,
"memory(GiB)": 40.86,
"step": 1125,
"token_acc": 0.5159574468085106,
"train_speed(iter/s)": 0.586869
},
{
"epoch": 0.6856796116504854,
"grad_norm": 7.592894554138184,
"learning_rate": 8.765476778267874e-05,
"loss": 2.1323163986206053,
"memory(GiB)": 40.86,
"step": 1130,
"token_acc": 0.4909090909090909,
"train_speed(iter/s)": 0.586899
},
{
"epoch": 0.6887135922330098,
"grad_norm": 7.193599700927734,
"learning_rate": 8.755006285992629e-05,
"loss": 2.1902294158935547,
"memory(GiB)": 40.86,
"step": 1135,
"token_acc": 0.5234899328859061,
"train_speed(iter/s)": 0.586763
},
{
"epoch": 0.691747572815534,
"grad_norm": 4.904916286468506,
"learning_rate": 8.744497889011343e-05,
"loss": 2.2312740325927733,
"memory(GiB)": 40.86,
"step": 1140,
"token_acc": 0.48404255319148937,
"train_speed(iter/s)": 0.586675
},
{
"epoch": 0.6947815533980582,
"grad_norm": 8.201228141784668,
"learning_rate": 8.733951693400458e-05,
"loss": 2.166943168640137,
"memory(GiB)": 40.86,
"step": 1145,
"token_acc": 0.5105633802816901,
"train_speed(iter/s)": 0.58693
},
{
"epoch": 0.6978155339805825,
"grad_norm": 5.049937725067139,
"learning_rate": 8.723367805617965e-05,
"loss": 2.254404067993164,
"memory(GiB)": 40.86,
"step": 1150,
"token_acc": 0.478125,
"train_speed(iter/s)": 0.587058
},
{
"epoch": 0.7008495145631068,
"grad_norm": 6.745171546936035,
"learning_rate": 8.712746332502351e-05,
"loss": 2.1543249130249023,
"memory(GiB)": 40.86,
"step": 1155,
"token_acc": 0.5327380952380952,
"train_speed(iter/s)": 0.587131
},
{
"epoch": 0.7038834951456311,
"grad_norm": 10.320196151733398,
"learning_rate": 8.702087381271488e-05,
"loss": 2.4464441299438477,
"memory(GiB)": 40.86,
"step": 1160,
"token_acc": 0.4897959183673469,
"train_speed(iter/s)": 0.587013
},
{
"epoch": 0.7069174757281553,
"grad_norm": 6.8334503173828125,
"learning_rate": 8.691391059521583e-05,
"loss": 2.1910587310791017,
"memory(GiB)": 40.86,
"step": 1165,
"token_acc": 0.527972027972028,
"train_speed(iter/s)": 0.586856
},
{
"epoch": 0.7099514563106796,
"grad_norm": 6.28577184677124,
"learning_rate": 8.680657475226069e-05,
"loss": 1.9499628067016601,
"memory(GiB)": 40.86,
"step": 1170,
"token_acc": 0.6007751937984496,
"train_speed(iter/s)": 0.586444
},
{
"epoch": 0.7129854368932039,
"grad_norm": 6.818657398223877,
"learning_rate": 8.669886736734527e-05,
"loss": 2.151942825317383,
"memory(GiB)": 40.86,
"step": 1175,
"token_acc": 0.5254777070063694,
"train_speed(iter/s)": 0.58655
},
{
"epoch": 0.7160194174757282,
"grad_norm": 5.253009796142578,
"learning_rate": 8.659078952771592e-05,
"loss": 2.54516487121582,
"memory(GiB)": 40.86,
"step": 1180,
"token_acc": 0.4984894259818731,
"train_speed(iter/s)": 0.586775
},
{
"epoch": 0.7190533980582524,
"grad_norm": 8.068851470947266,
"learning_rate": 8.648234232435845e-05,
"loss": 2.3182897567749023,
"memory(GiB)": 40.86,
"step": 1185,
"token_acc": 0.4734982332155477,
"train_speed(iter/s)": 0.586773
},
{
"epoch": 0.7220873786407767,
"grad_norm": 6.965189456939697,
"learning_rate": 8.63735268519873e-05,
"loss": 2.1850954055786134,
"memory(GiB)": 40.86,
"step": 1190,
"token_acc": 0.5196374622356495,
"train_speed(iter/s)": 0.586751
},
{
"epoch": 0.725121359223301,
"grad_norm": 6.5986409187316895,
"learning_rate": 8.626434420903424e-05,
"loss": 2.5639453887939454,
"memory(GiB)": 40.86,
"step": 1195,
"token_acc": 0.4631268436578171,
"train_speed(iter/s)": 0.586943
},
{
"epoch": 0.7281553398058253,
"grad_norm": 4.670579433441162,
"learning_rate": 8.615479549763756e-05,
"loss": 2.406460189819336,
"memory(GiB)": 40.86,
"step": 1200,
"token_acc": 0.48223350253807107,
"train_speed(iter/s)": 0.587084
},
{
"epoch": 0.7311893203883495,
"grad_norm": 6.295917510986328,
"learning_rate": 8.604488182363074e-05,
"loss": 2.536121940612793,
"memory(GiB)": 40.86,
"step": 1205,
"token_acc": 0.4873417721518987,
"train_speed(iter/s)": 0.587357
},
{
"epoch": 0.7342233009708737,
"grad_norm": 6.125625133514404,
"learning_rate": 8.593460429653133e-05,
"loss": 2.4063135147094727,
"memory(GiB)": 40.86,
"step": 1210,
"token_acc": 0.48128342245989303,
"train_speed(iter/s)": 0.587204
},
{
"epoch": 0.7372572815533981,
"grad_norm": 6.775357723236084,
"learning_rate": 8.582396402952984e-05,
"loss": 2.082032585144043,
"memory(GiB)": 40.86,
"step": 1215,
"token_acc": 0.5273311897106109,
"train_speed(iter/s)": 0.587489
},
{
"epoch": 0.7402912621359223,
"grad_norm": 8.1486177444458,
"learning_rate": 8.571296213947838e-05,
"loss": 1.675777053833008,
"memory(GiB)": 40.86,
"step": 1220,
"token_acc": 0.6021897810218978,
"train_speed(iter/s)": 0.587737
},
{
"epoch": 0.7433252427184466,
"grad_norm": 5.302309036254883,
"learning_rate": 8.560159974687952e-05,
"loss": 2.1232393264770506,
"memory(GiB)": 40.86,
"step": 1225,
"token_acc": 0.5156695156695157,
"train_speed(iter/s)": 0.587809
},
{
"epoch": 0.7463592233009708,
"grad_norm": 9.10730266571045,
"learning_rate": 8.54898779758748e-05,
"loss": 2.1305063247680662,
"memory(GiB)": 40.86,
"step": 1230,
"token_acc": 0.53515625,
"train_speed(iter/s)": 0.587588
},
{
"epoch": 0.7493932038834952,
"grad_norm": 6.489813327789307,
"learning_rate": 8.537779795423359e-05,
"loss": 2.2934566497802735,
"memory(GiB)": 40.86,
"step": 1235,
"token_acc": 0.5161290322580645,
"train_speed(iter/s)": 0.587435
},
{
"epoch": 0.7524271844660194,
"grad_norm": 6.982603549957275,
"learning_rate": 8.526536081334152e-05,
"loss": 2.2987644195556642,
"memory(GiB)": 40.86,
"step": 1240,
"token_acc": 0.47924528301886793,
"train_speed(iter/s)": 0.587522
},
{
"epoch": 0.7554611650485437,
"grad_norm": 7.774171352386475,
"learning_rate": 8.515256768818918e-05,
"loss": 2.5817737579345703,
"memory(GiB)": 40.86,
"step": 1245,
"token_acc": 0.5040431266846361,
"train_speed(iter/s)": 0.587668
},
{
"epoch": 0.758495145631068,
"grad_norm": 5.695102691650391,
"learning_rate": 8.503941971736062e-05,
"loss": 2.298574447631836,
"memory(GiB)": 40.86,
"step": 1250,
"token_acc": 0.5070821529745042,
"train_speed(iter/s)": 0.587481
},
{
"epoch": 0.7615291262135923,
"grad_norm": 5.622751235961914,
"learning_rate": 8.492591804302186e-05,
"loss": 2.149024772644043,
"memory(GiB)": 40.86,
"step": 1255,
"token_acc": 0.5030674846625767,
"train_speed(iter/s)": 0.58761
},
{
"epoch": 0.7645631067961165,
"grad_norm": 9.05585765838623,
"learning_rate": 8.481206381090934e-05,
"loss": 2.464432716369629,
"memory(GiB)": 40.86,
"step": 1260,
"token_acc": 0.504950495049505,
"train_speed(iter/s)": 0.587385
},
{
"epoch": 0.7675970873786407,
"grad_norm": 6.5983428955078125,
"learning_rate": 8.469785817031841e-05,
"loss": 2.203810119628906,
"memory(GiB)": 40.86,
"step": 1265,
"token_acc": 0.5412186379928315,
"train_speed(iter/s)": 0.587789
},
{
"epoch": 0.7706310679611651,
"grad_norm": 4.769191265106201,
"learning_rate": 8.458330227409168e-05,
"loss": 2.432425308227539,
"memory(GiB)": 40.86,
"step": 1270,
"token_acc": 0.4608433734939759,
"train_speed(iter/s)": 0.588056
},
{
"epoch": 0.7736650485436893,
"grad_norm": 8.539231300354004,
"learning_rate": 8.446839727860738e-05,
"loss": 2.354892539978027,
"memory(GiB)": 40.86,
"step": 1275,
"token_acc": 0.5155555555555555,
"train_speed(iter/s)": 0.588053
},
{
"epoch": 0.7766990291262136,
"grad_norm": 5.209239959716797,
"learning_rate": 8.435314434376773e-05,
"loss": 2.296826934814453,
"memory(GiB)": 40.86,
"step": 1280,
"token_acc": 0.5085714285714286,
"train_speed(iter/s)": 0.587863
},
{
"epoch": 0.7797330097087378,
"grad_norm": 7.653853893280029,
"learning_rate": 8.423754463298717e-05,
"loss": 2.117538261413574,
"memory(GiB)": 40.86,
"step": 1285,
"token_acc": 0.5723076923076923,
"train_speed(iter/s)": 0.587922
},
{
"epoch": 0.7827669902912622,
"grad_norm": 7.506109237670898,
"learning_rate": 8.412159931318068e-05,
"loss": 2.4975624084472656,
"memory(GiB)": 40.86,
"step": 1290,
"token_acc": 0.46710526315789475,
"train_speed(iter/s)": 0.587905
},
{
"epoch": 0.7858009708737864,
"grad_norm": 5.187159538269043,
"learning_rate": 8.400530955475198e-05,
"loss": 2.2532814025878904,
"memory(GiB)": 40.86,
"step": 1295,
"token_acc": 0.46987951807228917,
"train_speed(iter/s)": 0.587699
},
{
"epoch": 0.7888349514563107,
"grad_norm": 8.4281587600708,
"learning_rate": 8.38886765315817e-05,
"loss": 2.3919906616210938,
"memory(GiB)": 40.86,
"step": 1300,
"token_acc": 0.5017182130584192,
"train_speed(iter/s)": 0.587556
},
{
"epoch": 0.7918689320388349,
"grad_norm": 6.116622447967529,
"learning_rate": 8.377170142101548e-05,
"loss": 2.3181718826293944,
"memory(GiB)": 40.86,
"step": 1305,
"token_acc": 0.5061349693251533,
"train_speed(iter/s)": 0.58774
},
{
"epoch": 0.7949029126213593,
"grad_norm": 8.740164756774902,
"learning_rate": 8.365438540385223e-05,
"loss": 2.1749797821044923,
"memory(GiB)": 40.86,
"step": 1310,
"token_acc": 0.5187713310580204,
"train_speed(iter/s)": 0.587607
},
{
"epoch": 0.7979368932038835,
"grad_norm": 6.935183048248291,
"learning_rate": 8.353672966433206e-05,
"loss": 2.314193534851074,
"memory(GiB)": 40.86,
"step": 1315,
"token_acc": 0.47039473684210525,
"train_speed(iter/s)": 0.587722
},
{
"epoch": 0.8009708737864077,
"grad_norm": 7.3493475914001465,
"learning_rate": 8.341873539012444e-05,
"loss": 2.2399951934814455,
"memory(GiB)": 40.86,
"step": 1320,
"token_acc": 0.5111821086261981,
"train_speed(iter/s)": 0.587965
},
{
"epoch": 0.804004854368932,
"grad_norm": 6.552261829376221,
"learning_rate": 8.33004037723161e-05,
"loss": 2.223754119873047,
"memory(GiB)": 40.86,
"step": 1325,
"token_acc": 0.5283018867924528,
"train_speed(iter/s)": 0.588183
},
{
"epoch": 0.8070388349514563,
"grad_norm": 6.420342445373535,
"learning_rate": 8.318173600539911e-05,
"loss": 1.9445220947265625,
"memory(GiB)": 40.86,
"step": 1330,
"token_acc": 0.5394736842105263,
"train_speed(iter/s)": 0.588461
},
{
"epoch": 0.8100728155339806,
"grad_norm": 5.923401355743408,
"learning_rate": 8.306273328725878e-05,
"loss": 2.1622385025024413,
"memory(GiB)": 40.86,
"step": 1335,
"token_acc": 0.5357142857142857,
"train_speed(iter/s)": 0.588601
},
{
"epoch": 0.8131067961165048,
"grad_norm": 7.1788506507873535,
"learning_rate": 8.294339681916154e-05,
"loss": 2.1121898651123048,
"memory(GiB)": 40.86,
"step": 1340,
"token_acc": 0.496875,
"train_speed(iter/s)": 0.588559
},
{
"epoch": 0.8161407766990292,
"grad_norm": 6.46894645690918,
"learning_rate": 8.282372780574285e-05,
"loss": 2.207390022277832,
"memory(GiB)": 40.86,
"step": 1345,
"token_acc": 0.5216049382716049,
"train_speed(iter/s)": 0.588706
},
{
"epoch": 0.8191747572815534,
"grad_norm": 7.959349632263184,
"learning_rate": 8.270372745499506e-05,
"loss": 2.2782615661621093,
"memory(GiB)": 40.86,
"step": 1350,
"token_acc": 0.5174603174603175,
"train_speed(iter/s)": 0.588601
},
{
"epoch": 0.8222087378640777,
"grad_norm": 7.4319939613342285,
"learning_rate": 8.258339697825515e-05,
"loss": 1.8879600524902345,
"memory(GiB)": 40.86,
"step": 1355,
"token_acc": 0.5580357142857143,
"train_speed(iter/s)": 0.588875
},
{
"epoch": 0.8252427184466019,
"grad_norm": 7.50739860534668,
"learning_rate": 8.246273759019252e-05,
"loss": 2.3653688430786133,
"memory(GiB)": 40.86,
"step": 1360,
"token_acc": 0.5179856115107914,
"train_speed(iter/s)": 0.588976
},
{
"epoch": 0.8282766990291263,
"grad_norm": 8.38315486907959,
"learning_rate": 8.234175050879684e-05,
"loss": 2.0219940185546874,
"memory(GiB)": 40.86,
"step": 1365,
"token_acc": 0.5266903914590747,
"train_speed(iter/s)": 0.589206
},
{
"epoch": 0.8313106796116505,
"grad_norm": 5.579223155975342,
"learning_rate": 8.222043695536555e-05,
"loss": 2.0323202133178713,
"memory(GiB)": 41.25,
"step": 1370,
"token_acc": 0.5419847328244275,
"train_speed(iter/s)": 0.588666
},
{
"epoch": 0.8343446601941747,
"grad_norm": 7.079959392547607,
"learning_rate": 8.20987981544917e-05,
"loss": 2.245712661743164,
"memory(GiB)": 41.25,
"step": 1375,
"token_acc": 0.5054545454545455,
"train_speed(iter/s)": 0.58865
},
{
"epoch": 0.837378640776699,
"grad_norm": 5.938848972320557,
"learning_rate": 8.197683533405157e-05,
"loss": 1.959267807006836,
"memory(GiB)": 41.25,
"step": 1380,
"token_acc": 0.5316901408450704,
"train_speed(iter/s)": 0.58891
},
{
"epoch": 0.8404126213592233,
"grad_norm": 8.333083152770996,
"learning_rate": 8.185454972519213e-05,
"loss": 2.2188604354858397,
"memory(GiB)": 41.25,
"step": 1385,
"token_acc": 0.5415282392026578,
"train_speed(iter/s)": 0.589226
},
{
"epoch": 0.8434466019417476,
"grad_norm": 5.235838413238525,
"learning_rate": 8.173194256231884e-05,
"loss": 2.312948226928711,
"memory(GiB)": 41.25,
"step": 1390,
"token_acc": 0.48546511627906974,
"train_speed(iter/s)": 0.589378
},
{
"epoch": 0.8464805825242718,
"grad_norm": 9.581235885620117,
"learning_rate": 8.1609015083083e-05,
"loss": 2.3604787826538085,
"memory(GiB)": 41.25,
"step": 1395,
"token_acc": 0.4927007299270073,
"train_speed(iter/s)": 0.589384
},
{
"epoch": 0.8495145631067961,
"grad_norm": 6.8221611976623535,
"learning_rate": 8.148576852836933e-05,
"loss": 2.0327474594116213,
"memory(GiB)": 41.25,
"step": 1400,
"token_acc": 0.569620253164557,
"train_speed(iter/s)": 0.589095
},
{
"epoch": 0.8525485436893204,
"grad_norm": 7.140889644622803,
"learning_rate": 8.136220414228347e-05,
"loss": 2.5129384994506836,
"memory(GiB)": 41.25,
"step": 1405,
"token_acc": 0.4952076677316294,
"train_speed(iter/s)": 0.589242
},
{
"epoch": 0.8555825242718447,
"grad_norm": 5.594088077545166,
"learning_rate": 8.123832317213933e-05,
"loss": 2.288181686401367,
"memory(GiB)": 41.25,
"step": 1410,
"token_acc": 0.5228758169934641,
"train_speed(iter/s)": 0.589415
},
{
"epoch": 0.8586165048543689,
"grad_norm": 5.7525811195373535,
"learning_rate": 8.111412686844664e-05,
"loss": 2.288965606689453,
"memory(GiB)": 41.25,
"step": 1415,
"token_acc": 0.5157068062827225,
"train_speed(iter/s)": 0.589323
},
{
"epoch": 0.8616504854368932,
"grad_norm": 9.362752914428711,
"learning_rate": 8.098961648489821e-05,
"loss": 1.9993032455444335,
"memory(GiB)": 41.25,
"step": 1420,
"token_acc": 0.5448275862068965,
"train_speed(iter/s)": 0.589162
},
{
"epoch": 0.8646844660194175,
"grad_norm": 6.3312764167785645,
"learning_rate": 8.08647932783573e-05,
"loss": 2.4338268280029296,
"memory(GiB)": 41.25,
"step": 1425,
"token_acc": 0.4863013698630137,
"train_speed(iter/s)": 0.589226
},
{
"epoch": 0.8677184466019418,
"grad_norm": 5.9260172843933105,
"learning_rate": 8.073965850884496e-05,
"loss": 2.2075326919555662,
"memory(GiB)": 41.25,
"step": 1430,
"token_acc": 0.5230769230769231,
"train_speed(iter/s)": 0.589205
},
{
"epoch": 0.870752427184466,
"grad_norm": 4.957935810089111,
"learning_rate": 8.061421343952731e-05,
"loss": 2.123280334472656,
"memory(GiB)": 41.25,
"step": 1435,
"token_acc": 0.5446927374301676,
"train_speed(iter/s)": 0.589414
},
{
"epoch": 0.8737864077669902,
"grad_norm": 5.678249359130859,
"learning_rate": 8.048845933670273e-05,
"loss": 1.9449834823608398,
"memory(GiB)": 41.25,
"step": 1440,
"token_acc": 0.5700934579439252,
"train_speed(iter/s)": 0.589581
},
{
"epoch": 0.8768203883495146,
"grad_norm": 7.655086040496826,
"learning_rate": 8.036239746978914e-05,
"loss": 2.4002641677856444,
"memory(GiB)": 41.25,
"step": 1445,
"token_acc": 0.4842105263157895,
"train_speed(iter/s)": 0.58949
},
{
"epoch": 0.8798543689320388,
"grad_norm": 6.851123332977295,
"learning_rate": 8.02360291113112e-05,
"loss": 2.103730392456055,
"memory(GiB)": 41.25,
"step": 1450,
"token_acc": 0.5627009646302251,
"train_speed(iter/s)": 0.589624
},
{
"epoch": 0.8828883495145631,
"grad_norm": 7.437098979949951,
"learning_rate": 8.010935553688741e-05,
"loss": 2.1862071990966796,
"memory(GiB)": 41.25,
"step": 1455,
"token_acc": 0.5364238410596026,
"train_speed(iter/s)": 0.58987
},
{
"epoch": 0.8859223300970874,
"grad_norm": 7.451559066772461,
"learning_rate": 7.998237802521726e-05,
"loss": 2.167529296875,
"memory(GiB)": 41.25,
"step": 1460,
"token_acc": 0.5220338983050847,
"train_speed(iter/s)": 0.589733
},
{
"epoch": 0.8889563106796117,
"grad_norm": 5.745720863342285,
"learning_rate": 7.985509785806827e-05,
"loss": 1.7958356857299804,
"memory(GiB)": 41.25,
"step": 1465,
"token_acc": 0.6163793103448276,
"train_speed(iter/s)": 0.589826
},
{
"epoch": 0.8919902912621359,
"grad_norm": 5.106093406677246,
"learning_rate": 7.97275163202632e-05,
"loss": 1.7782585144042968,
"memory(GiB)": 41.25,
"step": 1470,
"token_acc": 0.5852842809364549,
"train_speed(iter/s)": 0.589876
},
{
"epoch": 0.8950242718446602,
"grad_norm": 7.241384506225586,
"learning_rate": 7.959963469966687e-05,
"loss": 2.27147216796875,
"memory(GiB)": 41.25,
"step": 1475,
"token_acc": 0.52,
"train_speed(iter/s)": 0.590043
},
{
"epoch": 0.8980582524271845,
"grad_norm": 7.773332595825195,
"learning_rate": 7.947145428717335e-05,
"loss": 2.3339469909667967,
"memory(GiB)": 41.25,
"step": 1480,
"token_acc": 0.4868035190615836,
"train_speed(iter/s)": 0.59013
},
{
"epoch": 0.9010922330097088,
"grad_norm": 6.2095866203308105,
"learning_rate": 7.934297637669281e-05,
"loss": 2.15749568939209,
"memory(GiB)": 41.25,
"step": 1485,
"token_acc": 0.5335120643431636,
"train_speed(iter/s)": 0.590232
},
{
"epoch": 0.904126213592233,
"grad_norm": 9.049623489379883,
"learning_rate": 7.921420226513852e-05,
"loss": 2.2805938720703125,
"memory(GiB)": 41.25,
"step": 1490,
"token_acc": 0.48771929824561405,
"train_speed(iter/s)": 0.590181
},
{
"epoch": 0.9071601941747572,
"grad_norm": 5.86360502243042,
"learning_rate": 7.90851332524137e-05,
"loss": 2.204097557067871,
"memory(GiB)": 41.25,
"step": 1495,
"token_acc": 0.5291970802919708,
"train_speed(iter/s)": 0.589939
},
{
"epoch": 0.9101941747572816,
"grad_norm": 6.702127456665039,
"learning_rate": 7.895577064139848e-05,
"loss": 2.099565124511719,
"memory(GiB)": 41.25,
"step": 1500,
"token_acc": 0.5468164794007491,
"train_speed(iter/s)": 0.590107
},
{
"epoch": 0.9101941747572816,
"eval_loss": 1.9851206541061401,
"eval_runtime": 12.4849,
"eval_samples_per_second": 8.01,
"eval_steps_per_second": 8.01,
"eval_token_acc": 0.5260196905766527,
"step": 1500
},
{
"epoch": 0.9132281553398058,
"grad_norm": 7.588992118835449,
"learning_rate": 7.882611573793663e-05,
"loss": 2.118764877319336,
"memory(GiB)": 41.25,
"step": 1505,
"token_acc": 0.5204795204795205,
"train_speed(iter/s)": 0.586984
},
{
"epoch": 0.9162621359223301,
"grad_norm": 5.986236572265625,
"learning_rate": 7.869616985082255e-05,
"loss": 2.0279298782348634,
"memory(GiB)": 41.25,
"step": 1510,
"token_acc": 0.5660377358490566,
"train_speed(iter/s)": 0.586663
},
{
"epoch": 0.9192961165048543,
"grad_norm": 7.583939075469971,
"learning_rate": 7.856593429178789e-05,
"loss": 2.0275857925415037,
"memory(GiB)": 41.25,
"step": 1515,
"token_acc": 0.5351170568561873,
"train_speed(iter/s)": 0.586556
},
{
"epoch": 0.9223300970873787,
"grad_norm": 7.145445823669434,
"learning_rate": 7.843541037548838e-05,
"loss": 2.181304168701172,
"memory(GiB)": 41.25,
"step": 1520,
"token_acc": 0.5451612903225806,
"train_speed(iter/s)": 0.58615
},
{
"epoch": 0.9253640776699029,
"grad_norm": 9.427350997924805,
"learning_rate": 7.830459941949058e-05,
"loss": 1.9623226165771483,
"memory(GiB)": 41.25,
"step": 1525,
"token_acc": 0.5575539568345323,
"train_speed(iter/s)": 0.585779
},
{
"epoch": 0.9283980582524272,
"grad_norm": 10.541104316711426,
"learning_rate": 7.817350274425856e-05,
"loss": 2.2855878829956056,
"memory(GiB)": 41.25,
"step": 1530,
"token_acc": 0.518796992481203,
"train_speed(iter/s)": 0.585757
},
{
"epoch": 0.9314320388349514,
"grad_norm": 8.254549980163574,
"learning_rate": 7.804212167314054e-05,
"loss": 2.3625198364257813,
"memory(GiB)": 41.25,
"step": 1535,
"token_acc": 0.45938375350140054,
"train_speed(iter/s)": 0.585525
},
{
"epoch": 0.9344660194174758,
"grad_norm": 5.327072620391846,
"learning_rate": 7.791045753235555e-05,
"loss": 2.1574447631835936,
"memory(GiB)": 41.25,
"step": 1540,
"token_acc": 0.5401234567901234,
"train_speed(iter/s)": 0.585161
},
{
"epoch": 0.9375,
"grad_norm": 6.956089496612549,
"learning_rate": 7.777851165098012e-05,
"loss": 2.220409965515137,
"memory(GiB)": 41.25,
"step": 1545,
"token_acc": 0.5068027210884354,
"train_speed(iter/s)": 0.58532
},
{
"epoch": 0.9405339805825242,
"grad_norm": 8.668743133544922,
"learning_rate": 7.76462853609347e-05,
"loss": 2.2191883087158204,
"memory(GiB)": 41.25,
"step": 1550,
"token_acc": 0.5181159420289855,
"train_speed(iter/s)": 0.58508
},
{
"epoch": 0.9435679611650486,
"grad_norm": 6.6736063957214355,
"learning_rate": 7.751377999697043e-05,
"loss": 2.111481857299805,
"memory(GiB)": 41.25,
"step": 1555,
"token_acc": 0.5555555555555556,
"train_speed(iter/s)": 0.585013
},
{
"epoch": 0.9466019417475728,
"grad_norm": 8.943532943725586,
"learning_rate": 7.73809968966554e-05,
"loss": 2.2334514617919923,
"memory(GiB)": 41.25,
"step": 1560,
"token_acc": 0.5,
"train_speed(iter/s)": 0.585004
},
{
"epoch": 0.9496359223300971,
"grad_norm": 6.392602443695068,
"learning_rate": 7.724793740036142e-05,
"loss": 2.3538848876953127,
"memory(GiB)": 41.25,
"step": 1565,
"token_acc": 0.5333333333333333,
"train_speed(iter/s)": 0.585018
},
{
"epoch": 0.9526699029126213,
"grad_norm": 8.349867820739746,
"learning_rate": 7.711460285125028e-05,
"loss": 1.9792165756225586,
"memory(GiB)": 41.25,
"step": 1570,
"token_acc": 0.5506756756756757,
"train_speed(iter/s)": 0.584841
},
{
"epoch": 0.9557038834951457,
"grad_norm": 6.740106582641602,
"learning_rate": 7.698099459526034e-05,
"loss": 2.2277217864990235,
"memory(GiB)": 41.25,
"step": 1575,
"token_acc": 0.5065359477124183,
"train_speed(iter/s)": 0.584644
},
{
"epoch": 0.9587378640776699,
"grad_norm": 7.618457317352295,
"learning_rate": 7.684711398109284e-05,
"loss": 2.152913284301758,
"memory(GiB)": 41.25,
"step": 1580,
"token_acc": 0.5343283582089552,
"train_speed(iter/s)": 0.584502
},
{
"epoch": 0.9617718446601942,
"grad_norm": 5.828027248382568,
"learning_rate": 7.67129623601983e-05,
"loss": 2.1841548919677733,
"memory(GiB)": 41.25,
"step": 1585,
"token_acc": 0.509493670886076,
"train_speed(iter/s)": 0.584655
},
{
"epoch": 0.9648058252427184,
"grad_norm": 8.393068313598633,
"learning_rate": 7.657854108676299e-05,
"loss": 2.4885177612304688,
"memory(GiB)": 41.25,
"step": 1590,
"token_acc": 0.48773006134969327,
"train_speed(iter/s)": 0.584201
},
{
"epoch": 0.9678398058252428,
"grad_norm": 6.520992755889893,
"learning_rate": 7.644385151769509e-05,
"loss": 2.489660453796387,
"memory(GiB)": 41.25,
"step": 1595,
"token_acc": 0.49107142857142855,
"train_speed(iter/s)": 0.584289
},
{
"epoch": 0.970873786407767,
"grad_norm": 5.243824481964111,
"learning_rate": 7.630889501261109e-05,
"loss": 2.0495643615722656,
"memory(GiB)": 41.25,
"step": 1600,
"token_acc": 0.5572289156626506,
"train_speed(iter/s)": 0.584189
},
{
"epoch": 0.9739077669902912,
"grad_norm": 8.216861724853516,
"learning_rate": 7.617367293382211e-05,
"loss": 2.7457176208496095,
"memory(GiB)": 41.25,
"step": 1605,
"token_acc": 0.4244791666666667,
"train_speed(iter/s)": 0.584051
},
{
"epoch": 0.9769417475728155,
"grad_norm": 6.738630771636963,
"learning_rate": 7.603818664632001e-05,
"loss": 2.252565383911133,
"memory(GiB)": 41.25,
"step": 1610,
"token_acc": 0.48986486486486486,
"train_speed(iter/s)": 0.584192
},
{
"epoch": 0.9799757281553398,
"grad_norm": 6.404202938079834,
"learning_rate": 7.590243751776374e-05,
"loss": 2.2700517654418944,
"memory(GiB)": 41.25,
"step": 1615,
"token_acc": 0.4858757062146893,
"train_speed(iter/s)": 0.584132
},
{
"epoch": 0.9830097087378641,
"grad_norm": 6.124429702758789,
"learning_rate": 7.576642691846546e-05,
"loss": 2.3936836242675783,
"memory(GiB)": 41.25,
"step": 1620,
"token_acc": 0.5235294117647059,
"train_speed(iter/s)": 0.58398
},
{
"epoch": 0.9860436893203883,
"grad_norm": 7.0240044593811035,
"learning_rate": 7.563015622137674e-05,
"loss": 2.3892589569091798,
"memory(GiB)": 41.25,
"step": 1625,
"token_acc": 0.46688741721854304,
"train_speed(iter/s)": 0.583754
},
{
"epoch": 0.9890776699029126,
"grad_norm": 6.437112331390381,
"learning_rate": 7.549362680207472e-05,
"loss": 2.232225036621094,
"memory(GiB)": 41.25,
"step": 1630,
"token_acc": 0.4984025559105431,
"train_speed(iter/s)": 0.58366
},
{
"epoch": 0.9921116504854369,
"grad_norm": 6.010834217071533,
"learning_rate": 7.535684003874816e-05,
"loss": 2.146392822265625,
"memory(GiB)": 41.25,
"step": 1635,
"token_acc": 0.5261538461538462,
"train_speed(iter/s)": 0.58354
},
{
"epoch": 0.9951456310679612,
"grad_norm": 6.317235946655273,
"learning_rate": 7.521979731218356e-05,
"loss": 2.3056121826171876,
"memory(GiB)": 41.25,
"step": 1640,
"token_acc": 0.48264984227129337,
"train_speed(iter/s)": 0.583292
},
{
"epoch": 0.9981796116504854,
"grad_norm": 7.453293800354004,
"learning_rate": 7.508250000575125e-05,
"loss": 2.188218688964844,
"memory(GiB)": 41.25,
"step": 1645,
"token_acc": 0.5512367491166078,
"train_speed(iter/s)": 0.583245
},
{
"epoch": 1.0012135922330097,
"grad_norm": 8.073345184326172,
"learning_rate": 7.494494950539143e-05,
"loss": 1.7200986862182617,
"memory(GiB)": 41.25,
"step": 1650,
"token_acc": 0.6014760147601476,
"train_speed(iter/s)": 0.583281
},
{
"epoch": 1.004247572815534,
"grad_norm": 6.676420211791992,
"learning_rate": 7.480714719960007e-05,
"loss": 2.1127391815185548,
"memory(GiB)": 41.25,
"step": 1655,
"token_acc": 0.5288461538461539,
"train_speed(iter/s)": 0.583366
},
{
"epoch": 1.0072815533980584,
"grad_norm": 6.307994842529297,
"learning_rate": 7.466909447941508e-05,
"loss": 1.8806413650512694,
"memory(GiB)": 41.25,
"step": 1660,
"token_acc": 0.5547703180212014,
"train_speed(iter/s)": 0.583445
},
{
"epoch": 1.0103155339805825,
"grad_norm": 6.221712589263916,
"learning_rate": 7.453079273840207e-05,
"loss": 2.276551055908203,
"memory(GiB)": 41.25,
"step": 1665,
"token_acc": 0.5133333333333333,
"train_speed(iter/s)": 0.583162
},
{
"epoch": 1.0133495145631068,
"grad_norm": 5.912354469299316,
"learning_rate": 7.439224337264043e-05,
"loss": 1.9514554977416991,
"memory(GiB)": 41.25,
"step": 1670,
"token_acc": 0.5527950310559007,
"train_speed(iter/s)": 0.583074
},
{
"epoch": 1.016383495145631,
"grad_norm": 7.461360931396484,
"learning_rate": 7.425344778070917e-05,
"loss": 2.087990951538086,
"memory(GiB)": 41.25,
"step": 1675,
"token_acc": 0.5451713395638629,
"train_speed(iter/s)": 0.583019
},
{
"epoch": 1.0194174757281553,
"grad_norm": 6.206568241119385,
"learning_rate": 7.411440736367281e-05,
"loss": 2.088376045227051,
"memory(GiB)": 41.25,
"step": 1680,
"token_acc": 0.5496688741721855,
"train_speed(iter/s)": 0.58291
},
{
"epoch": 1.0224514563106797,
"grad_norm": 6.608606338500977,
"learning_rate": 7.397512352506727e-05,
"loss": 1.6116622924804687,
"memory(GiB)": 41.25,
"step": 1685,
"token_acc": 0.5833333333333334,
"train_speed(iter/s)": 0.582982
},
{
"epoch": 1.0254854368932038,
"grad_norm": 7.508535385131836,
"learning_rate": 7.383559767088566e-05,
"loss": 1.8518999099731446,
"memory(GiB)": 41.25,
"step": 1690,
"token_acc": 0.5867158671586716,
"train_speed(iter/s)": 0.583052
},
{
"epoch": 1.0285194174757282,
"grad_norm": 6.2956318855285645,
"learning_rate": 7.369583120956407e-05,
"loss": 2.077930450439453,
"memory(GiB)": 41.25,
"step": 1695,
"token_acc": 0.5295950155763239,
"train_speed(iter/s)": 0.583022
},
{
"epoch": 1.0315533980582525,
"grad_norm": 6.229779243469238,
"learning_rate": 7.355582555196745e-05,
"loss": 1.6506580352783202,
"memory(GiB)": 41.25,
"step": 1700,
"token_acc": 0.6342182890855457,
"train_speed(iter/s)": 0.582892
},
{
"epoch": 1.0345873786407767,
"grad_norm": 7.167182445526123,
"learning_rate": 7.341558211137526e-05,
"loss": 2.1481195449829102,
"memory(GiB)": 41.25,
"step": 1705,
"token_acc": 0.49226006191950467,
"train_speed(iter/s)": 0.582852
},
{
"epoch": 1.037621359223301,
"grad_norm": 7.526867866516113,
"learning_rate": 7.327510230346726e-05,
"loss": 2.0346538543701174,
"memory(GiB)": 41.25,
"step": 1710,
"token_acc": 0.5077399380804953,
"train_speed(iter/s)": 0.582708
},
{
"epoch": 1.0406553398058251,
"grad_norm": 6.285158634185791,
"learning_rate": 7.313438754630918e-05,
"loss": 2.084914779663086,
"memory(GiB)": 41.25,
"step": 1715,
"token_acc": 0.5326797385620915,
"train_speed(iter/s)": 0.58263
},
{
"epoch": 1.0436893203883495,
"grad_norm": 5.3016252517700195,
"learning_rate": 7.299343926033851e-05,
"loss": 1.8931154251098632,
"memory(GiB)": 41.25,
"step": 1720,
"token_acc": 0.5520504731861199,
"train_speed(iter/s)": 0.582235
},
{
"epoch": 1.0467233009708738,
"grad_norm": 6.363744258880615,
"learning_rate": 7.285225886834997e-05,
"loss": 2.1936279296875,
"memory(GiB)": 41.25,
"step": 1725,
"token_acc": 0.49683544303797467,
"train_speed(iter/s)": 0.582259
},
{
"epoch": 1.049757281553398,
"grad_norm": 6.571318626403809,
"learning_rate": 7.271084779548136e-05,
"loss": 2.0733669281005858,
"memory(GiB)": 41.25,
"step": 1730,
"token_acc": 0.5579937304075235,
"train_speed(iter/s)": 0.582202
},
{
"epoch": 1.0527912621359223,
"grad_norm": 7.151698589324951,
"learning_rate": 7.256920746919904e-05,
"loss": 2.2026699066162108,
"memory(GiB)": 41.25,
"step": 1735,
"token_acc": 0.5150375939849624,
"train_speed(iter/s)": 0.58212
},
{
"epoch": 1.0558252427184467,
"grad_norm": 6.636294364929199,
"learning_rate": 7.242733931928352e-05,
"loss": 2.145404052734375,
"memory(GiB)": 41.25,
"step": 1740,
"token_acc": 0.49221183800623053,
"train_speed(iter/s)": 0.582037
},
{
"epoch": 1.0588592233009708,
"grad_norm": 6.21516227722168,
"learning_rate": 7.228524477781514e-05,
"loss": 1.6696731567382812,
"memory(GiB)": 41.25,
"step": 1745,
"token_acc": 0.6295081967213115,
"train_speed(iter/s)": 0.581842
},
{
"epoch": 1.0618932038834952,
"grad_norm": 6.904699802398682,
"learning_rate": 7.214292527915949e-05,
"loss": 1.995549201965332,
"memory(GiB)": 41.25,
"step": 1750,
"token_acc": 0.5806451612903226,
"train_speed(iter/s)": 0.581695
},
{
"epoch": 1.0649271844660193,
"grad_norm": 4.713315963745117,
"learning_rate": 7.200038225995294e-05,
"loss": 2.3474475860595705,
"memory(GiB)": 41.25,
"step": 1755,
"token_acc": 0.4887005649717514,
"train_speed(iter/s)": 0.581706
},
{
"epoch": 1.0679611650485437,
"grad_norm": 8.901693344116211,
"learning_rate": 7.185761715908825e-05,
"loss": 2.004246139526367,
"memory(GiB)": 41.25,
"step": 1760,
"token_acc": 0.5867158671586716,
"train_speed(iter/s)": 0.581409
},
{
"epoch": 1.070995145631068,
"grad_norm": 6.650726318359375,
"learning_rate": 7.171463141769994e-05,
"loss": 2.21859130859375,
"memory(GiB)": 41.25,
"step": 1765,
"token_acc": 0.5466666666666666,
"train_speed(iter/s)": 0.581411
},
{
"epoch": 1.0740291262135921,
"grad_norm": 7.826591968536377,
"learning_rate": 7.157142647914979e-05,
"loss": 2.0319658279418946,
"memory(GiB)": 41.25,
"step": 1770,
"token_acc": 0.5594202898550724,
"train_speed(iter/s)": 0.581305
},
{
"epoch": 1.0770631067961165,
"grad_norm": 6.98701286315918,
"learning_rate": 7.14280037890122e-05,
"loss": 1.9901140213012696,
"memory(GiB)": 41.25,
"step": 1775,
"token_acc": 0.551829268292683,
"train_speed(iter/s)": 0.581264
},
{
"epoch": 1.0800970873786409,
"grad_norm": 6.480953693389893,
"learning_rate": 7.128436479505971e-05,
"loss": 2.1239852905273438,
"memory(GiB)": 41.25,
"step": 1780,
"token_acc": 0.5121359223300971,
"train_speed(iter/s)": 0.581217
},
{
"epoch": 1.083131067961165,
"grad_norm": 5.683126449584961,
"learning_rate": 7.114051094724831e-05,
"loss": 2.0841569900512695,
"memory(GiB)": 41.25,
"step": 1785,
"token_acc": 0.5318352059925093,
"train_speed(iter/s)": 0.581099
},
{
"epoch": 1.0861650485436893,
"grad_norm": 5.394412517547607,
"learning_rate": 7.09964436977028e-05,
"loss": 1.9973236083984376,
"memory(GiB)": 41.25,
"step": 1790,
"token_acc": 0.541095890410959,
"train_speed(iter/s)": 0.580947
},
{
"epoch": 1.0891990291262137,
"grad_norm": 5.046519756317139,
"learning_rate": 7.085216450070218e-05,
"loss": 2.029042053222656,
"memory(GiB)": 41.25,
"step": 1795,
"token_acc": 0.5591054313099042,
"train_speed(iter/s)": 0.580934
},
{
"epoch": 1.0922330097087378,
"grad_norm": 6.593071460723877,
"learning_rate": 7.070767481266492e-05,
"loss": 1.8102890014648438,
"memory(GiB)": 41.25,
"step": 1800,
"token_acc": 0.5718654434250765,
"train_speed(iter/s)": 0.581028
},
{
"epoch": 1.0952669902912622,
"grad_norm": 7.305717945098877,
"learning_rate": 7.056297609213432e-05,
"loss": 1.9902324676513672,
"memory(GiB)": 41.25,
"step": 1805,
"token_acc": 0.5699300699300699,
"train_speed(iter/s)": 0.581249
},
{
"epoch": 1.0983009708737863,
"grad_norm": 7.886199474334717,
"learning_rate": 7.041806979976368e-05,
"loss": 2.2953224182128906,
"memory(GiB)": 41.25,
"step": 1810,
"token_acc": 0.5,
"train_speed(iter/s)": 0.581276
},
{
"epoch": 1.1013349514563107,
"grad_norm": 10.443878173828125,
"learning_rate": 7.027295739830169e-05,
"loss": 2.220531463623047,
"memory(GiB)": 41.25,
"step": 1815,
"token_acc": 0.5054945054945055,
"train_speed(iter/s)": 0.581467
},
{
"epoch": 1.104368932038835,
"grad_norm": 8.019064903259277,
"learning_rate": 7.012764035257756e-05,
"loss": 2.4718793869018554,
"memory(GiB)": 41.25,
"step": 1820,
"token_acc": 0.47619047619047616,
"train_speed(iter/s)": 0.581659
},
{
"epoch": 1.1074029126213591,
"grad_norm": 7.334555625915527,
"learning_rate": 6.998212012948626e-05,
"loss": 1.9244306564331055,
"memory(GiB)": 41.25,
"step": 1825,
"token_acc": 0.5625,
"train_speed(iter/s)": 0.581498
},
{
"epoch": 1.1104368932038835,
"grad_norm": 10.03096866607666,
"learning_rate": 6.983639819797377e-05,
"loss": 2.2340341567993165,
"memory(GiB)": 41.25,
"step": 1830,
"token_acc": 0.5136986301369864,
"train_speed(iter/s)": 0.581297
},
{
"epoch": 1.1134708737864079,
"grad_norm": 8.886280059814453,
"learning_rate": 6.969047602902213e-05,
"loss": 2.0593013763427734,
"memory(GiB)": 41.25,
"step": 1835,
"token_acc": 0.5460526315789473,
"train_speed(iter/s)": 0.581181
},
{
"epoch": 1.116504854368932,
"grad_norm": 7.363580703735352,
"learning_rate": 6.954435509563478e-05,
"loss": 1.8324342727661134,
"memory(GiB)": 41.25,
"step": 1840,
"token_acc": 0.5607142857142857,
"train_speed(iter/s)": 0.581259
},
{
"epoch": 1.1195388349514563,
"grad_norm": 8.011999130249023,
"learning_rate": 6.939803687282146e-05,
"loss": 2.3135982513427735,
"memory(GiB)": 41.25,
"step": 1845,
"token_acc": 0.4666666666666667,
"train_speed(iter/s)": 0.581014
},
{
"epoch": 1.1225728155339807,
"grad_norm": 5.767248630523682,
"learning_rate": 6.925152283758348e-05,
"loss": 1.8407760620117188,
"memory(GiB)": 41.25,
"step": 1850,
"token_acc": 0.5792880258899676,
"train_speed(iter/s)": 0.58124
},
{
"epoch": 1.1256067961165048,
"grad_norm": 6.498402118682861,
"learning_rate": 6.91048144688988e-05,
"loss": 2.21679573059082,
"memory(GiB)": 41.25,
"step": 1855,
"token_acc": 0.5419354838709678,
"train_speed(iter/s)": 0.581462
},
{
"epoch": 1.1286407766990292,
"grad_norm": 6.394837379455566,
"learning_rate": 6.895791324770701e-05,
"loss": 1.947611427307129,
"memory(GiB)": 41.25,
"step": 1860,
"token_acc": 0.534375,
"train_speed(iter/s)": 0.581612
},
{
"epoch": 1.1316747572815533,
"grad_norm": 11.657426834106445,
"learning_rate": 6.881082065689453e-05,
"loss": 2.234457015991211,
"memory(GiB)": 41.25,
"step": 1865,
"token_acc": 0.543859649122807,
"train_speed(iter/s)": 0.581654
},
{
"epoch": 1.1347087378640777,
"grad_norm": 7.383030414581299,
"learning_rate": 6.866353818127942e-05,
"loss": 2.1067886352539062,
"memory(GiB)": 41.25,
"step": 1870,
"token_acc": 0.5800711743772242,
"train_speed(iter/s)": 0.581704
},
{
"epoch": 1.137742718446602,
"grad_norm": 6.467532157897949,
"learning_rate": 6.851606730759664e-05,
"loss": 2.12357234954834,
"memory(GiB)": 41.25,
"step": 1875,
"token_acc": 0.5160256410256411,
"train_speed(iter/s)": 0.581655
},
{
"epoch": 1.1407766990291262,
"grad_norm": 7.949151992797852,
"learning_rate": 6.836840952448285e-05,
"loss": 2.1536586761474608,
"memory(GiB)": 41.25,
"step": 1880,
"token_acc": 0.5520833333333334,
"train_speed(iter/s)": 0.581744
},
{
"epoch": 1.1438106796116505,
"grad_norm": 7.234400749206543,
"learning_rate": 6.82205663224615e-05,
"loss": 2.2570121765136717,
"memory(GiB)": 41.25,
"step": 1885,
"token_acc": 0.5168195718654435,
"train_speed(iter/s)": 0.581831
},
{
"epoch": 1.1468446601941746,
"grad_norm": 6.600982189178467,
"learning_rate": 6.807253919392773e-05,
"loss": 1.9843761444091796,
"memory(GiB)": 41.25,
"step": 1890,
"token_acc": 0.5444839857651246,
"train_speed(iter/s)": 0.581694
},
{
"epoch": 1.149878640776699,
"grad_norm": 7.820127010345459,
"learning_rate": 6.792432963313328e-05,
"loss": 2.096297836303711,
"memory(GiB)": 41.25,
"step": 1895,
"token_acc": 0.5566037735849056,
"train_speed(iter/s)": 0.581753
},
{
"epoch": 1.1529126213592233,
"grad_norm": 6.915624618530273,
"learning_rate": 6.777593913617152e-05,
"loss": 2.108437156677246,
"memory(GiB)": 41.25,
"step": 1900,
"token_acc": 0.5073313782991202,
"train_speed(iter/s)": 0.581827
},
{
"epoch": 1.1559466019417475,
"grad_norm": 7.475584030151367,
"learning_rate": 6.762736920096218e-05,
"loss": 2.277429389953613,
"memory(GiB)": 41.25,
"step": 1905,
"token_acc": 0.5014577259475219,
"train_speed(iter/s)": 0.581845
},
{
"epoch": 1.1589805825242718,
"grad_norm": 7.104306221008301,
"learning_rate": 6.747862132723641e-05,
"loss": 2.067903518676758,
"memory(GiB)": 41.25,
"step": 1910,
"token_acc": 0.5628930817610063,
"train_speed(iter/s)": 0.581719
},
{
"epoch": 1.1620145631067962,
"grad_norm": 8.869878768920898,
"learning_rate": 6.732969701652145e-05,
"loss": 2.2940914154052736,
"memory(GiB)": 41.25,
"step": 1915,
"token_acc": 0.5511551155115512,
"train_speed(iter/s)": 0.58164
},
{
"epoch": 1.1650485436893203,
"grad_norm": 7.5197248458862305,
"learning_rate": 6.718059777212567e-05,
"loss": 2.0857444763183595,
"memory(GiB)": 41.25,
"step": 1920,
"token_acc": 0.5338645418326693,
"train_speed(iter/s)": 0.581495
},
{
"epoch": 1.1680825242718447,
"grad_norm": 6.92659854888916,
"learning_rate": 6.703132509912322e-05,
"loss": 1.807958221435547,
"memory(GiB)": 41.25,
"step": 1925,
"token_acc": 0.5786350148367952,
"train_speed(iter/s)": 0.581481
},
{
"epoch": 1.171116504854369,
"grad_norm": 7.253981113433838,
"learning_rate": 6.688188050433897e-05,
"loss": 1.9212162017822265,
"memory(GiB)": 41.25,
"step": 1930,
"token_acc": 0.5470383275261324,
"train_speed(iter/s)": 0.581673
},
{
"epoch": 1.1741504854368932,
"grad_norm": 7.32392692565918,
"learning_rate": 6.673226549633325e-05,
"loss": 2.0752506256103516,
"memory(GiB)": 41.25,
"step": 1935,
"token_acc": 0.5434782608695652,
"train_speed(iter/s)": 0.581879
},
{
"epoch": 1.1771844660194175,
"grad_norm": 6.774953842163086,
"learning_rate": 6.658248158538655e-05,
"loss": 2.022067832946777,
"memory(GiB)": 41.25,
"step": 1940,
"token_acc": 0.5303430079155673,
"train_speed(iter/s)": 0.582004
},
{
"epoch": 1.1802184466019416,
"grad_norm": 8.567710876464844,
"learning_rate": 6.643253028348443e-05,
"loss": 1.9163774490356444,
"memory(GiB)": 41.25,
"step": 1945,
"token_acc": 0.5769230769230769,
"train_speed(iter/s)": 0.581884
},
{
"epoch": 1.183252427184466,
"grad_norm": 7.197096347808838,
"learning_rate": 6.628241310430208e-05,
"loss": 1.9915233612060548,
"memory(GiB)": 41.25,
"step": 1950,
"token_acc": 0.5397923875432526,
"train_speed(iter/s)": 0.58194
},
{
"epoch": 1.1862864077669903,
"grad_norm": 7.874612808227539,
"learning_rate": 6.613213156318921e-05,
"loss": 2.039535331726074,
"memory(GiB)": 41.25,
"step": 1955,
"token_acc": 0.5563380281690141,
"train_speed(iter/s)": 0.58199
},
{
"epoch": 1.1893203883495145,
"grad_norm": 6.794829368591309,
"learning_rate": 6.598168717715462e-05,
"loss": 2.182103729248047,
"memory(GiB)": 41.25,
"step": 1960,
"token_acc": 0.48936170212765956,
"train_speed(iter/s)": 0.581769
},
{
"epoch": 1.1923543689320388,
"grad_norm": 8.138648986816406,
"learning_rate": 6.583108146485092e-05,
"loss": 2.205635833740234,
"memory(GiB)": 41.25,
"step": 1965,
"token_acc": 0.5301507537688442,
"train_speed(iter/s)": 0.581594
},
{
"epoch": 1.1953883495145632,
"grad_norm": 5.8334197998046875,
"learning_rate": 6.568031594655933e-05,
"loss": 2.1141899108886717,
"memory(GiB)": 41.25,
"step": 1970,
"token_acc": 0.5442622950819672,
"train_speed(iter/s)": 0.581474
},
{
"epoch": 1.1984223300970873,
"grad_norm": 6.450995922088623,
"learning_rate": 6.552939214417411e-05,
"loss": 2.0908193588256836,
"memory(GiB)": 41.25,
"step": 1975,
"token_acc": 0.5270758122743683,
"train_speed(iter/s)": 0.581538
},
{
"epoch": 1.2014563106796117,
"grad_norm": 5.936161041259766,
"learning_rate": 6.537831158118732e-05,
"loss": 2.2035654067993162,
"memory(GiB)": 41.25,
"step": 1980,
"token_acc": 0.5281899109792285,
"train_speed(iter/s)": 0.581735
},
{
"epoch": 1.204490291262136,
"grad_norm": 6.0731282234191895,
"learning_rate": 6.522707578267349e-05,
"loss": 2.015408515930176,
"memory(GiB)": 41.25,
"step": 1985,
"token_acc": 0.5625,
"train_speed(iter/s)": 0.581728
},
{
"epoch": 1.2075242718446602,
"grad_norm": 8.126087188720703,
"learning_rate": 6.507568627527411e-05,
"loss": 2.233916091918945,
"memory(GiB)": 41.25,
"step": 1990,
"token_acc": 0.5214899713467048,
"train_speed(iter/s)": 0.581678
},
{
"epoch": 1.2105582524271845,
"grad_norm": 5.780952453613281,
"learning_rate": 6.492414458718235e-05,
"loss": 2.153764533996582,
"memory(GiB)": 41.25,
"step": 1995,
"token_acc": 0.5223880597014925,
"train_speed(iter/s)": 0.581876
},
{
"epoch": 1.2135922330097086,
"grad_norm": 6.646781921386719,
"learning_rate": 6.477245224812745e-05,
"loss": 2.137297439575195,
"memory(GiB)": 41.25,
"step": 2000,
"token_acc": 0.5310880829015544,
"train_speed(iter/s)": 0.581927
},
{
"epoch": 1.2135922330097086,
"eval_loss": 2.180868148803711,
"eval_runtime": 12.0025,
"eval_samples_per_second": 8.332,
"eval_steps_per_second": 8.332,
"eval_token_acc": 0.5036284470246735,
"step": 2000
},
{
"epoch": 1.216626213592233,
"grad_norm": 6.332268238067627,
"learning_rate": 6.462061078935951e-05,
"loss": 2.0248859405517576,
"memory(GiB)": 41.25,
"step": 2005,
"token_acc": 0.5138888888888888,
"train_speed(iter/s)": 0.579819
},
{
"epoch": 1.2196601941747574,
"grad_norm": 9.277915954589844,
"learning_rate": 6.446862174363378e-05,
"loss": 2.223433494567871,
"memory(GiB)": 41.25,
"step": 2010,
"token_acc": 0.5347985347985348,
"train_speed(iter/s)": 0.579723
},
{
"epoch": 1.2226941747572815,
"grad_norm": 6.857091903686523,
"learning_rate": 6.431648664519544e-05,
"loss": 2.093130111694336,
"memory(GiB)": 41.25,
"step": 2015,
"token_acc": 0.5523809523809524,
"train_speed(iter/s)": 0.579662
},
{
"epoch": 1.2257281553398058,
"grad_norm": 7.251791000366211,
"learning_rate": 6.416420702976393e-05,
"loss": 2.4163230895996093,
"memory(GiB)": 41.25,
"step": 2020,
"token_acc": 0.5063291139240507,
"train_speed(iter/s)": 0.579631
},
{
"epoch": 1.2287621359223302,
"grad_norm": 6.369975566864014,
"learning_rate": 6.401178443451751e-05,
"loss": 1.8332990646362304,
"memory(GiB)": 41.25,
"step": 2025,
"token_acc": 0.5765124555160143,
"train_speed(iter/s)": 0.579685
},
{
"epoch": 1.2317961165048543,
"grad_norm": 12.884454727172852,
"learning_rate": 6.385922039807773e-05,
"loss": 1.9554672241210938,
"memory(GiB)": 41.25,
"step": 2030,
"token_acc": 0.5648148148148148,
"train_speed(iter/s)": 0.579632
},
{
"epoch": 1.2348300970873787,
"grad_norm": 9.875422477722168,
"learning_rate": 6.370651646049398e-05,
"loss": 2.229812812805176,
"memory(GiB)": 41.25,
"step": 2035,
"token_acc": 0.49691358024691357,
"train_speed(iter/s)": 0.579766
},
{
"epoch": 1.237864077669903,
"grad_norm": 5.669778823852539,
"learning_rate": 6.355367416322779e-05,
"loss": 1.7003231048583984,
"memory(GiB)": 41.25,
"step": 2040,
"token_acc": 0.5830508474576271,
"train_speed(iter/s)": 0.579765
},
{
"epoch": 1.2408980582524272,
"grad_norm": 6.894186019897461,
"learning_rate": 6.340069504913737e-05,
"loss": 2.091649627685547,
"memory(GiB)": 41.25,
"step": 2045,
"token_acc": 0.5504885993485342,
"train_speed(iter/s)": 0.579827
},
{
"epoch": 1.2439320388349515,
"grad_norm": 8.025986671447754,
"learning_rate": 6.324758066246211e-05,
"loss": 2.0427883148193358,
"memory(GiB)": 41.25,
"step": 2050,
"token_acc": 0.5252225519287834,
"train_speed(iter/s)": 0.579952
},
{
"epoch": 1.2469660194174756,
"grad_norm": 6.996369361877441,
"learning_rate": 6.309433254880675e-05,
"loss": 2.1355659484863283,
"memory(GiB)": 41.25,
"step": 2055,
"token_acc": 0.49606299212598426,
"train_speed(iter/s)": 0.580029
},
{
"epoch": 1.25,
"grad_norm": 9.216190338134766,
"learning_rate": 6.294095225512603e-05,
"loss": 2.045370101928711,
"memory(GiB)": 41.25,
"step": 2060,
"token_acc": 0.5151515151515151,
"train_speed(iter/s)": 0.579931
},
{
"epoch": 1.2530339805825244,
"grad_norm": 8.278094291687012,
"learning_rate": 6.278744132970899e-05,
"loss": 1.7628780364990235,
"memory(GiB)": 41.25,
"step": 2065,
"token_acc": 0.6045016077170418,
"train_speed(iter/s)": 0.57996
},
{
"epoch": 1.2560679611650485,
"grad_norm": 6.4922027587890625,
"learning_rate": 6.263380132216328e-05,
"loss": 2.0872188568115235,
"memory(GiB)": 41.25,
"step": 2070,
"token_acc": 0.5318471337579618,
"train_speed(iter/s)": 0.580006
},
{
"epoch": 1.2591019417475728,
"grad_norm": 9.755973815917969,
"learning_rate": 6.248003378339958e-05,
"loss": 2.043658638000488,
"memory(GiB)": 41.25,
"step": 2075,
"token_acc": 0.521594684385382,
"train_speed(iter/s)": 0.579989
},
{
"epoch": 1.262135922330097,
"grad_norm": 9.78760051727295,
"learning_rate": 6.232614026561587e-05,
"loss": 2.1496110916137696,
"memory(GiB)": 41.25,
"step": 2080,
"token_acc": 0.5136054421768708,
"train_speed(iter/s)": 0.579983
},
{
"epoch": 1.2651699029126213,
"grad_norm": 6.134158611297607,
"learning_rate": 6.217212232228189e-05,
"loss": 1.965431022644043,
"memory(GiB)": 41.25,
"step": 2085,
"token_acc": 0.5578231292517006,
"train_speed(iter/s)": 0.579816
},
{
"epoch": 1.2682038834951457,
"grad_norm": 6.624486446380615,
"learning_rate": 6.201798150812338e-05,
"loss": 2.282021713256836,
"memory(GiB)": 41.25,
"step": 2090,
"token_acc": 0.48556430446194226,
"train_speed(iter/s)": 0.579749
},
{
"epoch": 1.27123786407767,
"grad_norm": 7.1900739669799805,
"learning_rate": 6.186371937910637e-05,
"loss": 2.047537994384766,
"memory(GiB)": 41.25,
"step": 2095,
"token_acc": 0.4966442953020134,
"train_speed(iter/s)": 0.579939
},
{
"epoch": 1.2742718446601942,
"grad_norm": 6.147539138793945,
"learning_rate": 6.170933749242152e-05,
"loss": 2.319692039489746,
"memory(GiB)": 41.25,
"step": 2100,
"token_acc": 0.5370370370370371,
"train_speed(iter/s)": 0.580086
},
{
"epoch": 1.2773058252427185,
"grad_norm": 7.209454536437988,
"learning_rate": 6.155483740646832e-05,
"loss": 2.322870445251465,
"memory(GiB)": 41.25,
"step": 2105,
"token_acc": 0.521865889212828,
"train_speed(iter/s)": 0.580276
},
{
"epoch": 1.2803398058252426,
"grad_norm": 6.588000297546387,
"learning_rate": 6.140022068083948e-05,
"loss": 2.015561103820801,
"memory(GiB)": 41.25,
"step": 2110,
"token_acc": 0.5344129554655871,
"train_speed(iter/s)": 0.5802
},
{
"epoch": 1.283373786407767,
"grad_norm": 9.121885299682617,
"learning_rate": 6.124548887630508e-05,
"loss": 2.019037628173828,
"memory(GiB)": 41.25,
"step": 2115,
"token_acc": 0.5336927223719676,
"train_speed(iter/s)": 0.580255
},
{
"epoch": 1.2864077669902914,
"grad_norm": 10.898550987243652,
"learning_rate": 6.109064355479692e-05,
"loss": 1.740947151184082,
"memory(GiB)": 41.25,
"step": 2120,
"token_acc": 0.5368421052631579,
"train_speed(iter/s)": 0.580136
},
{
"epoch": 1.2894417475728155,
"grad_norm": 6.376506328582764,
"learning_rate": 6.093568627939261e-05,
"loss": 1.9328853607177734,
"memory(GiB)": 41.34,
"step": 2125,
"token_acc": 0.55,
"train_speed(iter/s)": 0.579925
},
{
"epoch": 1.2924757281553398,
"grad_norm": 7.9046525955200195,
"learning_rate": 6.078061861429995e-05,
"loss": 2.187295913696289,
"memory(GiB)": 41.34,
"step": 2130,
"token_acc": 0.5116279069767442,
"train_speed(iter/s)": 0.580068
},
{
"epoch": 1.295509708737864,
"grad_norm": 6.604916095733643,
"learning_rate": 6.062544212484096e-05,
"loss": 2.0762821197509767,
"memory(GiB)": 41.34,
"step": 2135,
"token_acc": 0.5333333333333333,
"train_speed(iter/s)": 0.58009
},
{
"epoch": 1.2985436893203883,
"grad_norm": 7.367359638214111,
"learning_rate": 6.047015837743629e-05,
"loss": 2.126904106140137,
"memory(GiB)": 41.34,
"step": 2140,
"token_acc": 0.5333333333333333,
"train_speed(iter/s)": 0.580247
},
{
"epoch": 1.3015776699029127,
"grad_norm": 5.810800552368164,
"learning_rate": 6.031476893958926e-05,
"loss": 1.7963878631591796,
"memory(GiB)": 41.34,
"step": 2145,
"token_acc": 0.5631399317406144,
"train_speed(iter/s)": 0.580352
},
{
"epoch": 1.3046116504854368,
"grad_norm": 6.407706260681152,
"learning_rate": 6.015927537987004e-05,
"loss": 2.1866846084594727,
"memory(GiB)": 41.34,
"step": 2150,
"token_acc": 0.5258855585831063,
"train_speed(iter/s)": 0.5802
},
{
"epoch": 1.3076456310679612,
"grad_norm": 7.020833969116211,
"learning_rate": 6.0003679267899904e-05,
"loss": 1.8915981292724608,
"memory(GiB)": 41.34,
"step": 2155,
"token_acc": 0.5571428571428572,
"train_speed(iter/s)": 0.580155
},
{
"epoch": 1.3106796116504853,
"grad_norm": 8.229516983032227,
"learning_rate": 5.9847982174335316e-05,
"loss": 1.890799331665039,
"memory(GiB)": 41.34,
"step": 2160,
"token_acc": 0.5424836601307189,
"train_speed(iter/s)": 0.579946
},
{
"epoch": 1.3137135922330097,
"grad_norm": 6.056339263916016,
"learning_rate": 5.969218567085206e-05,
"loss": 2.39956111907959,
"memory(GiB)": 41.34,
"step": 2165,
"token_acc": 0.49453551912568305,
"train_speed(iter/s)": 0.580006
},
{
"epoch": 1.316747572815534,
"grad_norm": 7.4000468254089355,
"learning_rate": 5.953629133012949e-05,
"loss": 2.256308937072754,
"memory(GiB)": 41.34,
"step": 2170,
"token_acc": 0.5133689839572193,
"train_speed(iter/s)": 0.579824
},
{
"epoch": 1.3197815533980584,
"grad_norm": 6.835947513580322,
"learning_rate": 5.938030072583447e-05,
"loss": 1.8971139907836914,
"memory(GiB)": 41.34,
"step": 2175,
"token_acc": 0.542319749216301,
"train_speed(iter/s)": 0.57985
},
{
"epoch": 1.3228155339805825,
"grad_norm": 8.275431632995605,
"learning_rate": 5.922421543260567e-05,
"loss": 1.7686073303222656,
"memory(GiB)": 41.34,
"step": 2180,
"token_acc": 0.5703971119133574,
"train_speed(iter/s)": 0.579752
},
{
"epoch": 1.3258495145631068,
"grad_norm": 7.795175552368164,
"learning_rate": 5.906803702603755e-05,
"loss": 1.9470417022705078,
"memory(GiB)": 41.34,
"step": 2185,
"token_acc": 0.5381944444444444,
"train_speed(iter/s)": 0.57989
},
{
"epoch": 1.328883495145631,
"grad_norm": 5.923962593078613,
"learning_rate": 5.891176708266454e-05,
"loss": 2.17016716003418,
"memory(GiB)": 41.34,
"step": 2190,
"token_acc": 0.5444126074498568,
"train_speed(iter/s)": 0.579998
},
{
"epoch": 1.3319174757281553,
"grad_norm": 7.121251106262207,
"learning_rate": 5.875540717994503e-05,
"loss": 1.586796760559082,
"memory(GiB)": 41.34,
"step": 2195,
"token_acc": 0.6234817813765182,
"train_speed(iter/s)": 0.579935
},
{
"epoch": 1.3349514563106797,
"grad_norm": 7.5099921226501465,
"learning_rate": 5.859895889624554e-05,
"loss": 1.777475357055664,
"memory(GiB)": 41.34,
"step": 2200,
"token_acc": 0.5938566552901023,
"train_speed(iter/s)": 0.580098
},
{
"epoch": 1.3379854368932038,
"grad_norm": 8.970749855041504,
"learning_rate": 5.84424238108247e-05,
"loss": 1.670484733581543,
"memory(GiB)": 41.34,
"step": 2205,
"token_acc": 0.5873015873015873,
"train_speed(iter/s)": 0.580043
},
{
"epoch": 1.3410194174757282,
"grad_norm": 6.932069778442383,
"learning_rate": 5.8285803503817425e-05,
"loss": 2.056923675537109,
"memory(GiB)": 41.34,
"step": 2210,
"token_acc": 0.5124653739612188,
"train_speed(iter/s)": 0.580004
},
{
"epoch": 1.3440533980582523,
"grad_norm": 7.549715518951416,
"learning_rate": 5.812909955621886e-05,
"loss": 1.9996042251586914,
"memory(GiB)": 41.34,
"step": 2215,
"token_acc": 0.5565749235474006,
"train_speed(iter/s)": 0.579986
},
{
"epoch": 1.3470873786407767,
"grad_norm": 8.340503692626953,
"learning_rate": 5.7972313549868415e-05,
"loss": 2.207027816772461,
"memory(GiB)": 41.34,
"step": 2220,
"token_acc": 0.4897959183673469,
"train_speed(iter/s)": 0.579916
},
{
"epoch": 1.350121359223301,
"grad_norm": 6.941786766052246,
"learning_rate": 5.7815447067433917e-05,
"loss": 1.7856271743774415,
"memory(GiB)": 41.34,
"step": 2225,
"token_acc": 0.5862068965517241,
"train_speed(iter/s)": 0.579928
},
{
"epoch": 1.3531553398058254,
"grad_norm": 5.413527488708496,
"learning_rate": 5.7658501692395475e-05,
"loss": 1.8429689407348633,
"memory(GiB)": 41.34,
"step": 2230,
"token_acc": 0.6061643835616438,
"train_speed(iter/s)": 0.579742
},
{
"epoch": 1.3561893203883495,
"grad_norm": 6.279661655426025,
"learning_rate": 5.7501479009029636e-05,
"loss": 1.8153335571289062,
"memory(GiB)": 41.34,
"step": 2235,
"token_acc": 0.5572289156626506,
"train_speed(iter/s)": 0.57984
},
{
"epoch": 1.3592233009708738,
"grad_norm": 7.204460620880127,
"learning_rate": 5.734438060239331e-05,
"loss": 2.255967712402344,
"memory(GiB)": 41.34,
"step": 2240,
"token_acc": 0.513595166163142,
"train_speed(iter/s)": 0.579902
},
{
"epoch": 1.362257281553398,
"grad_norm": 7.191935062408447,
"learning_rate": 5.718720805830777e-05,
"loss": 2.1052494049072266,
"memory(GiB)": 41.34,
"step": 2245,
"token_acc": 0.533724340175953,
"train_speed(iter/s)": 0.579865
},
{
"epoch": 1.3652912621359223,
"grad_norm": 9.75123119354248,
"learning_rate": 5.70299629633427e-05,
"loss": 2.176554489135742,
"memory(GiB)": 41.34,
"step": 2250,
"token_acc": 0.5303514376996805,
"train_speed(iter/s)": 0.579795
},
{
"epoch": 1.3683252427184467,
"grad_norm": 8.081015586853027,
"learning_rate": 5.687264690480014e-05,
"loss": 2.253178024291992,
"memory(GiB)": 41.34,
"step": 2255,
"token_acc": 0.5040431266846361,
"train_speed(iter/s)": 0.579802
},
{
"epoch": 1.3713592233009708,
"grad_norm": 5.86273193359375,
"learning_rate": 5.6715261470698434e-05,
"loss": 2.2541793823242187,
"memory(GiB)": 41.34,
"step": 2260,
"token_acc": 0.5361842105263158,
"train_speed(iter/s)": 0.57974
},
{
"epoch": 1.3743932038834952,
"grad_norm": 6.653288841247559,
"learning_rate": 5.655780824975628e-05,
"loss": 2.219985008239746,
"memory(GiB)": 41.34,
"step": 2265,
"token_acc": 0.5471014492753623,
"train_speed(iter/s)": 0.579644
},
{
"epoch": 1.3774271844660193,
"grad_norm": 9.517049789428711,
"learning_rate": 5.6400288831376604e-05,
"loss": 2.2441757202148436,
"memory(GiB)": 41.34,
"step": 2270,
"token_acc": 0.5112359550561798,
"train_speed(iter/s)": 0.579692
},
{
"epoch": 1.3804611650485437,
"grad_norm": 6.048003673553467,
"learning_rate": 5.624270480563059e-05,
"loss": 2.186481475830078,
"memory(GiB)": 41.34,
"step": 2275,
"token_acc": 0.5155875299760192,
"train_speed(iter/s)": 0.57971
},
{
"epoch": 1.383495145631068,
"grad_norm": 7.275609970092773,
"learning_rate": 5.608505776324158e-05,
"loss": 2.20775146484375,
"memory(GiB)": 41.34,
"step": 2280,
"token_acc": 0.5342465753424658,
"train_speed(iter/s)": 0.57976
},
{
"epoch": 1.3865291262135924,
"grad_norm": 7.088268280029297,
"learning_rate": 5.592734929556907e-05,
"loss": 1.7822921752929688,
"memory(GiB)": 41.34,
"step": 2285,
"token_acc": 0.610223642172524,
"train_speed(iter/s)": 0.57981
},
{
"epoch": 1.3895631067961165,
"grad_norm": 6.6104207038879395,
"learning_rate": 5.576958099459254e-05,
"loss": 2.022065353393555,
"memory(GiB)": 41.34,
"step": 2290,
"token_acc": 0.5352941176470588,
"train_speed(iter/s)": 0.579726
},
{
"epoch": 1.3925970873786409,
"grad_norm": 7.773556709289551,
"learning_rate": 5.5611754452895516e-05,
"loss": 1.8300546646118163,
"memory(GiB)": 41.34,
"step": 2295,
"token_acc": 0.577922077922078,
"train_speed(iter/s)": 0.579707
},
{
"epoch": 1.395631067961165,
"grad_norm": 7.439202785491943,
"learning_rate": 5.5453871263649395e-05,
"loss": 1.970297622680664,
"memory(GiB)": 41.84,
"step": 2300,
"token_acc": 0.6112852664576802,
"train_speed(iter/s)": 0.579403
},
{
"epoch": 1.3986650485436893,
"grad_norm": 9.190638542175293,
"learning_rate": 5.5295933020597426e-05,
"loss": 2.140420913696289,
"memory(GiB)": 41.84,
"step": 2305,
"token_acc": 0.5065359477124183,
"train_speed(iter/s)": 0.579349
},
{
"epoch": 1.4016990291262137,
"grad_norm": 5.690435409545898,
"learning_rate": 5.5137941318038596e-05,
"loss": 1.893089485168457,
"memory(GiB)": 41.84,
"step": 2310,
"token_acc": 0.6,
"train_speed(iter/s)": 0.579324
},
{
"epoch": 1.4047330097087378,
"grad_norm": 7.719916343688965,
"learning_rate": 5.4979897750811506e-05,
"loss": 2.3775409698486327,
"memory(GiB)": 41.84,
"step": 2315,
"token_acc": 0.5171102661596958,
"train_speed(iter/s)": 0.579192
},
{
"epoch": 1.4077669902912622,
"grad_norm": 7.299395561218262,
"learning_rate": 5.4821803914278336e-05,
"loss": 1.9694931030273437,
"memory(GiB)": 41.84,
"step": 2320,
"token_acc": 0.5427728613569321,
"train_speed(iter/s)": 0.579086
},
{
"epoch": 1.4108009708737863,
"grad_norm": 6.726255893707275,
"learning_rate": 5.4663661404308677e-05,
"loss": 2.0492481231689452,
"memory(GiB)": 41.84,
"step": 2325,
"token_acc": 0.5476923076923077,
"train_speed(iter/s)": 0.579024
},
{
"epoch": 1.4138349514563107,
"grad_norm": 9.350031852722168,
"learning_rate": 5.4505471817263475e-05,
"loss": 2.0813602447509765,
"memory(GiB)": 41.84,
"step": 2330,
"token_acc": 0.5481727574750831,
"train_speed(iter/s)": 0.578975
},
{
"epoch": 1.416868932038835,
"grad_norm": 6.127203464508057,
"learning_rate": 5.434723674997888e-05,
"loss": 1.884780502319336,
"memory(GiB)": 41.84,
"step": 2335,
"token_acc": 0.5686900958466453,
"train_speed(iter/s)": 0.579032
},
{
"epoch": 1.4199029126213591,
"grad_norm": 6.9619646072387695,
"learning_rate": 5.418895779975014e-05,
"loss": 1.7420536041259767,
"memory(GiB)": 41.84,
"step": 2340,
"token_acc": 0.552901023890785,
"train_speed(iter/s)": 0.578878
},
{
"epoch": 1.4229368932038835,
"grad_norm": 8.211845397949219,
"learning_rate": 5.403063656431548e-05,
"loss": 1.926046371459961,
"memory(GiB)": 41.84,
"step": 2345,
"token_acc": 0.5566666666666666,
"train_speed(iter/s)": 0.578768
},
{
"epoch": 1.4259708737864076,
"grad_norm": 8.615828514099121,
"learning_rate": 5.387227464183999e-05,
"loss": 1.8713953018188476,
"memory(GiB)": 41.84,
"step": 2350,
"token_acc": 0.5667870036101083,
"train_speed(iter/s)": 0.578908
},
{
"epoch": 1.429004854368932,
"grad_norm": 8.677647590637207,
"learning_rate": 5.371387363089945e-05,
"loss": 2.0104761123657227,
"memory(GiB)": 41.84,
"step": 2355,
"token_acc": 0.5653710247349824,
"train_speed(iter/s)": 0.578973
},
{
"epoch": 1.4320388349514563,
"grad_norm": 8.752043724060059,
"learning_rate": 5.355543513046419e-05,
"loss": 2.0104990005493164,
"memory(GiB)": 41.84,
"step": 2360,
"token_acc": 0.5486111111111112,
"train_speed(iter/s)": 0.579051
},
{
"epoch": 1.4350728155339807,
"grad_norm": 6.938195705413818,
"learning_rate": 5.3396960739883037e-05,
"loss": 1.974110984802246,
"memory(GiB)": 41.84,
"step": 2365,
"token_acc": 0.5476190476190477,
"train_speed(iter/s)": 0.579115
},
{
"epoch": 1.4381067961165048,
"grad_norm": 6.470673561096191,
"learning_rate": 5.323845205886707e-05,
"loss": 2.092882537841797,
"memory(GiB)": 41.84,
"step": 2370,
"token_acc": 0.5299684542586751,
"train_speed(iter/s)": 0.57924
},
{
"epoch": 1.4411407766990292,
"grad_norm": 6.7543206214904785,
"learning_rate": 5.307991068747353e-05,
"loss": 2.317662811279297,
"memory(GiB)": 41.84,
"step": 2375,
"token_acc": 0.5239616613418531,
"train_speed(iter/s)": 0.579126
},
{
"epoch": 1.4441747572815533,
"grad_norm": 7.441592216491699,
"learning_rate": 5.292133822608961e-05,
"loss": 2.0434192657470702,
"memory(GiB)": 41.84,
"step": 2380,
"token_acc": 0.547945205479452,
"train_speed(iter/s)": 0.579147
},
{
"epoch": 1.4472087378640777,
"grad_norm": 7.122344970703125,
"learning_rate": 5.2762736275416416e-05,
"loss": 2.2737056732177736,
"memory(GiB)": 41.84,
"step": 2385,
"token_acc": 0.540785498489426,
"train_speed(iter/s)": 0.579209
},
{
"epoch": 1.450242718446602,
"grad_norm": 6.282622337341309,
"learning_rate": 5.260410643645263e-05,
"loss": 2.0695510864257813,
"memory(GiB)": 41.84,
"step": 2390,
"token_acc": 0.5391849529780565,
"train_speed(iter/s)": 0.579338
},
{
"epoch": 1.4532766990291262,
"grad_norm": 6.010311603546143,
"learning_rate": 5.2445450310478525e-05,
"loss": 1.819678497314453,
"memory(GiB)": 41.84,
"step": 2395,
"token_acc": 0.5876288659793815,
"train_speed(iter/s)": 0.579301
},
{
"epoch": 1.4563106796116505,
"grad_norm": 8.786865234375,
"learning_rate": 5.228676949903973e-05,
"loss": 1.9962085723876952,
"memory(GiB)": 41.84,
"step": 2400,
"token_acc": 0.543046357615894,
"train_speed(iter/s)": 0.579291
},
{
"epoch": 1.4593446601941746,
"grad_norm": 6.772591590881348,
"learning_rate": 5.2128065603931006e-05,
"loss": 1.931478500366211,
"memory(GiB)": 41.84,
"step": 2405,
"token_acc": 0.584717607973422,
"train_speed(iter/s)": 0.579183
},
{
"epoch": 1.462378640776699,
"grad_norm": 7.0186357498168945,
"learning_rate": 5.196934022718017e-05,
"loss": 1.8834335327148437,
"memory(GiB)": 41.84,
"step": 2410,
"token_acc": 0.5857142857142857,
"train_speed(iter/s)": 0.579263
},
{
"epoch": 1.4654126213592233,
"grad_norm": 7.649616241455078,
"learning_rate": 5.18105949710319e-05,
"loss": 2.1677167892456053,
"memory(GiB)": 41.84,
"step": 2415,
"token_acc": 0.5331010452961672,
"train_speed(iter/s)": 0.579501
},
{
"epoch": 1.4684466019417477,
"grad_norm": 7.913327693939209,
"learning_rate": 5.165183143793149e-05,
"loss": 2.4113887786865233,
"memory(GiB)": 41.84,
"step": 2420,
"token_acc": 0.47790055248618785,
"train_speed(iter/s)": 0.579573
},
{
"epoch": 1.4714805825242718,
"grad_norm": 8.196721076965332,
"learning_rate": 5.149305123050877e-05,
"loss": 1.6590158462524414,
"memory(GiB)": 41.84,
"step": 2425,
"token_acc": 0.5425531914893617,
"train_speed(iter/s)": 0.579678
},
{
"epoch": 1.4745145631067962,
"grad_norm": 5.6772637367248535,
"learning_rate": 5.133425595156187e-05,
"loss": 2.0934783935546877,
"memory(GiB)": 41.84,
"step": 2430,
"token_acc": 0.49586776859504134,
"train_speed(iter/s)": 0.579607
},
{
"epoch": 1.4775485436893203,
"grad_norm": 9.212677955627441,
"learning_rate": 5.1175447204041096e-05,
"loss": 2.0111692428588865,
"memory(GiB)": 41.84,
"step": 2435,
"token_acc": 0.5536912751677853,
"train_speed(iter/s)": 0.579662
},
{
"epoch": 1.4805825242718447,
"grad_norm": 6.798145771026611,
"learning_rate": 5.101662659103265e-05,
"loss": 1.8395654678344726,
"memory(GiB)": 41.84,
"step": 2440,
"token_acc": 0.597972972972973,
"train_speed(iter/s)": 0.579718
},
{
"epoch": 1.483616504854369,
"grad_norm": 5.608346462249756,
"learning_rate": 5.0857795715742575e-05,
"loss": 2.0497175216674806,
"memory(GiB)": 41.84,
"step": 2445,
"token_acc": 0.5542168674698795,
"train_speed(iter/s)": 0.579756
},
{
"epoch": 1.4866504854368932,
"grad_norm": 7.392420291900635,
"learning_rate": 5.0698956181480465e-05,
"loss": 2.040939521789551,
"memory(GiB)": 41.84,
"step": 2450,
"token_acc": 0.516728624535316,
"train_speed(iter/s)": 0.57972
},
{
"epoch": 1.4896844660194175,
"grad_norm": 5.091887474060059,
"learning_rate": 5.054010959164329e-05,
"loss": 2.256111907958984,
"memory(GiB)": 41.84,
"step": 2455,
"token_acc": 0.5181347150259067,
"train_speed(iter/s)": 0.57971
},
{
"epoch": 1.4927184466019416,
"grad_norm": 8.56528091430664,
"learning_rate": 5.038125754969933e-05,
"loss": 2.1345645904541017,
"memory(GiB)": 41.84,
"step": 2460,
"token_acc": 0.524390243902439,
"train_speed(iter/s)": 0.579785
},
{
"epoch": 1.495752427184466,
"grad_norm": 8.425841331481934,
"learning_rate": 5.0222401659171846e-05,
"loss": 1.8225021362304688,
"memory(GiB)": 41.84,
"step": 2465,
"token_acc": 0.6041666666666666,
"train_speed(iter/s)": 0.579851
},
{
"epoch": 1.4987864077669903,
"grad_norm": 7.502073287963867,
"learning_rate": 5.006354352362296e-05,
"loss": 2.2287876129150392,
"memory(GiB)": 41.84,
"step": 2470,
"token_acc": 0.5451505016722408,
"train_speed(iter/s)": 0.579885
},
{
"epoch": 1.5018203883495147,
"grad_norm": 14.120893478393555,
"learning_rate": 4.9904684746637445e-05,
"loss": 2.1780731201171877,
"memory(GiB)": 41.84,
"step": 2475,
"token_acc": 0.5900621118012422,
"train_speed(iter/s)": 0.580025
},
{
"epoch": 1.5048543689320388,
"grad_norm": 6.581485271453857,
"learning_rate": 4.9745826931806524e-05,
"loss": 2.466159439086914,
"memory(GiB)": 41.84,
"step": 2480,
"token_acc": 0.4410958904109589,
"train_speed(iter/s)": 0.580064
},
{
"epoch": 1.507888349514563,
"grad_norm": 6.508731365203857,
"learning_rate": 4.958697168271179e-05,
"loss": 1.8887645721435546,
"memory(GiB)": 41.84,
"step": 2485,
"token_acc": 0.5559440559440559,
"train_speed(iter/s)": 0.58009
},
{
"epoch": 1.5109223300970873,
"grad_norm": 5.886694431304932,
"learning_rate": 4.942812060290886e-05,
"loss": 2.1457874298095705,
"memory(GiB)": 41.84,
"step": 2490,
"token_acc": 0.5476190476190477,
"train_speed(iter/s)": 0.580188
},
{
"epoch": 1.5139563106796117,
"grad_norm": 5.6448655128479,
"learning_rate": 4.92692752959113e-05,
"loss": 1.9578502655029297,
"memory(GiB)": 41.84,
"step": 2495,
"token_acc": 0.5710227272727273,
"train_speed(iter/s)": 0.58031
},
{
"epoch": 1.516990291262136,
"grad_norm": 9.438764572143555,
"learning_rate": 4.91104373651744e-05,
"loss": 2.124725341796875,
"memory(GiB)": 41.84,
"step": 2500,
"token_acc": 0.5164179104477612,
"train_speed(iter/s)": 0.580368
},
{
"epoch": 1.516990291262136,
"eval_loss": 2.0256900787353516,
"eval_runtime": 12.7025,
"eval_samples_per_second": 7.872,
"eval_steps_per_second": 7.872,
"eval_token_acc": 0.5185185185185185,
"step": 2500
},
{
"epoch": 1.5200242718446602,
"grad_norm": 7.8130106925964355,
"learning_rate": 4.8951608414078944e-05,
"loss": 2.377336311340332,
"memory(GiB)": 41.84,
"step": 2505,
"token_acc": 0.509090909090909,
"train_speed(iter/s)": 0.578557
},
{
"epoch": 1.5230582524271845,
"grad_norm": 7.16809606552124,
"learning_rate": 4.8792790045915167e-05,
"loss": 1.6067583084106445,
"memory(GiB)": 41.84,
"step": 2510,
"token_acc": 0.6186770428015564,
"train_speed(iter/s)": 0.578509
},
{
"epoch": 1.5260922330097086,
"grad_norm": 6.225858688354492,
"learning_rate": 4.863398386386638e-05,
"loss": 1.8492023468017578,
"memory(GiB)": 41.84,
"step": 2515,
"token_acc": 0.5787965616045845,
"train_speed(iter/s)": 0.578454
},
{
"epoch": 1.529126213592233,
"grad_norm": 8.595073699951172,
"learning_rate": 4.847519147099294e-05,
"loss": 1.9532032012939453,
"memory(GiB)": 41.84,
"step": 2520,
"token_acc": 0.535593220338983,
"train_speed(iter/s)": 0.578603
},
{
"epoch": 1.5321601941747574,
"grad_norm": 7.294178009033203,
"learning_rate": 4.831641447021599e-05,
"loss": 1.7893003463745116,
"memory(GiB)": 41.84,
"step": 2525,
"token_acc": 0.6137184115523465,
"train_speed(iter/s)": 0.57858
},
{
"epoch": 1.5351941747572817,
"grad_norm": 7.821887969970703,
"learning_rate": 4.8157654464301275e-05,
"loss": 2.2367401123046875,
"memory(GiB)": 41.84,
"step": 2530,
"token_acc": 0.5182072829131653,
"train_speed(iter/s)": 0.578758
},
{
"epoch": 1.5382281553398058,
"grad_norm": 7.00529670715332,
"learning_rate": 4.7998913055843054e-05,
"loss": 2.1124399185180662,
"memory(GiB)": 41.84,
"step": 2535,
"token_acc": 0.5432835820895522,
"train_speed(iter/s)": 0.578828
},
{
"epoch": 1.54126213592233,
"grad_norm": 5.952232837677002,
"learning_rate": 4.7840191847247774e-05,
"loss": 2.0016332626342774,
"memory(GiB)": 41.84,
"step": 2540,
"token_acc": 0.5930232558139535,
"train_speed(iter/s)": 0.578959
},
{
"epoch": 1.5442961165048543,
"grad_norm": 7.779722213745117,
"learning_rate": 4.7681492440718045e-05,
"loss": 1.982724952697754,
"memory(GiB)": 41.84,
"step": 2545,
"token_acc": 0.5338345864661654,
"train_speed(iter/s)": 0.579046
},
{
"epoch": 1.5473300970873787,
"grad_norm": 7.770874977111816,
"learning_rate": 4.752281643823633e-05,
"loss": 2.032842254638672,
"memory(GiB)": 41.84,
"step": 2550,
"token_acc": 0.5749235474006116,
"train_speed(iter/s)": 0.579014
},
{
"epoch": 1.550364077669903,
"grad_norm": 6.972710609436035,
"learning_rate": 4.736416544154891e-05,
"loss": 1.9030048370361328,
"memory(GiB)": 41.84,
"step": 2555,
"token_acc": 0.5656565656565656,
"train_speed(iter/s)": 0.579128
},
{
"epoch": 1.5533980582524272,
"grad_norm": 7.349340915679932,
"learning_rate": 4.720554105214961e-05,
"loss": 1.903385543823242,
"memory(GiB)": 41.84,
"step": 2560,
"token_acc": 0.5469798657718121,
"train_speed(iter/s)": 0.579119
},
{
"epoch": 1.5564320388349513,
"grad_norm": 7.2185444831848145,
"learning_rate": 4.704694487126365e-05,
"loss": 1.8204626083374023,
"memory(GiB)": 41.84,
"step": 2565,
"token_acc": 0.5747126436781609,
"train_speed(iter/s)": 0.579258
},
{
"epoch": 1.5594660194174756,
"grad_norm": 7.047289848327637,
"learning_rate": 4.688837849983154e-05,
"loss": 2.169702339172363,
"memory(GiB)": 41.84,
"step": 2570,
"token_acc": 0.509493670886076,
"train_speed(iter/s)": 0.579358
},
{
"epoch": 1.5625,
"grad_norm": 10.583885192871094,
"learning_rate": 4.6729843538492847e-05,
"loss": 1.8666536331176757,
"memory(GiB)": 41.84,
"step": 2575,
"token_acc": 0.5373134328358209,
"train_speed(iter/s)": 0.579491
},
{
"epoch": 1.5655339805825244,
"grad_norm": 7.884814262390137,
"learning_rate": 4.657134158757012e-05,
"loss": 2.1705270767211915,
"memory(GiB)": 41.84,
"step": 2580,
"token_acc": 0.5142857142857142,
"train_speed(iter/s)": 0.579527
},
{
"epoch": 1.5685679611650487,
"grad_norm": 7.872768402099609,
"learning_rate": 4.6412874247052615e-05,
"loss": 2.2928442001342773,
"memory(GiB)": 41.84,
"step": 2585,
"token_acc": 0.5105105105105106,
"train_speed(iter/s)": 0.579593
},
{
"epoch": 1.5716019417475728,
"grad_norm": 9.023248672485352,
"learning_rate": 4.625444311658028e-05,
"loss": 1.8835826873779298,
"memory(GiB)": 41.84,
"step": 2590,
"token_acc": 0.5552147239263804,
"train_speed(iter/s)": 0.579652
},
{
"epoch": 1.574635922330097,
"grad_norm": 7.943882942199707,
"learning_rate": 4.6096049795427514e-05,
"loss": 2.0815145492553713,
"memory(GiB)": 41.84,
"step": 2595,
"token_acc": 0.5218855218855218,
"train_speed(iter/s)": 0.579716
},
{
"epoch": 1.5776699029126213,
"grad_norm": 7.587296009063721,
"learning_rate": 4.593769588248702e-05,
"loss": 1.6165863037109376,
"memory(GiB)": 41.84,
"step": 2600,
"token_acc": 0.6129032258064516,
"train_speed(iter/s)": 0.579694
},
{
"epoch": 1.5807038834951457,
"grad_norm": 8.291844367980957,
"learning_rate": 4.577938297625378e-05,
"loss": 2.093304443359375,
"memory(GiB)": 41.84,
"step": 2605,
"token_acc": 0.5827814569536424,
"train_speed(iter/s)": 0.579729
},
{
"epoch": 1.58373786407767,
"grad_norm": 6.745671272277832,
"learning_rate": 4.5621112674808756e-05,
"loss": 1.9251686096191407,
"memory(GiB)": 41.84,
"step": 2610,
"token_acc": 0.5833333333333334,
"train_speed(iter/s)": 0.579877
},
{
"epoch": 1.5867718446601942,
"grad_norm": 8.493294715881348,
"learning_rate": 4.5462886575802884e-05,
"loss": 1.971460723876953,
"memory(GiB)": 41.84,
"step": 2615,
"token_acc": 0.5821428571428572,
"train_speed(iter/s)": 0.579847
},
{
"epoch": 1.5898058252427183,
"grad_norm": 13.71259593963623,
"learning_rate": 4.530470627644088e-05,
"loss": 2.0272783279418944,
"memory(GiB)": 41.84,
"step": 2620,
"token_acc": 0.5578231292517006,
"train_speed(iter/s)": 0.579923
},
{
"epoch": 1.5928398058252426,
"grad_norm": 6.396689414978027,
"learning_rate": 4.514657337346512e-05,
"loss": 1.958717155456543,
"memory(GiB)": 41.84,
"step": 2625,
"token_acc": 0.5413333333333333,
"train_speed(iter/s)": 0.579933
},
{
"epoch": 1.595873786407767,
"grad_norm": 8.41101360321045,
"learning_rate": 4.4988489463139605e-05,
"loss": 1.8024402618408204,
"memory(GiB)": 41.84,
"step": 2630,
"token_acc": 0.574468085106383,
"train_speed(iter/s)": 0.580054
},
{
"epoch": 1.5989077669902914,
"grad_norm": 6.545622825622559,
"learning_rate": 4.483045614123371e-05,
"loss": 2.081429862976074,
"memory(GiB)": 41.84,
"step": 2635,
"token_acc": 0.5523809523809524,
"train_speed(iter/s)": 0.580079
},
{
"epoch": 1.6019417475728155,
"grad_norm": 7.194870471954346,
"learning_rate": 4.46724750030062e-05,
"loss": 1.9362052917480468,
"memory(GiB)": 41.84,
"step": 2640,
"token_acc": 0.5756578947368421,
"train_speed(iter/s)": 0.580221
},
{
"epoch": 1.6049757281553398,
"grad_norm": 6.871307849884033,
"learning_rate": 4.451454764318903e-05,
"loss": 2.0093603134155273,
"memory(GiB)": 41.84,
"step": 2645,
"token_acc": 0.5370370370370371,
"train_speed(iter/s)": 0.580085
},
{
"epoch": 1.608009708737864,
"grad_norm": 6.45038366317749,
"learning_rate": 4.4356675655971344e-05,
"loss": 1.9990568161010742,
"memory(GiB)": 41.84,
"step": 2650,
"token_acc": 0.5369774919614148,
"train_speed(iter/s)": 0.580256
},
{
"epoch": 1.6110436893203883,
"grad_norm": 10.047187805175781,
"learning_rate": 4.419886063498329e-05,
"loss": 2.281326103210449,
"memory(GiB)": 41.84,
"step": 2655,
"token_acc": 0.4965034965034965,
"train_speed(iter/s)": 0.580351
},
{
"epoch": 1.6140776699029127,
"grad_norm": 8.295970916748047,
"learning_rate": 4.404110417327998e-05,
"loss": 2.0824228286743165,
"memory(GiB)": 41.84,
"step": 2660,
"token_acc": 0.519434628975265,
"train_speed(iter/s)": 0.580351
},
{
"epoch": 1.617111650485437,
"grad_norm": 8.373644828796387,
"learning_rate": 4.388340786332541e-05,
"loss": 1.9413429260253907,
"memory(GiB)": 41.84,
"step": 2665,
"token_acc": 0.580110497237569,
"train_speed(iter/s)": 0.58041
},
{
"epoch": 1.6201456310679612,
"grad_norm": 6.771739482879639,
"learning_rate": 4.372577329697636e-05,
"loss": 2.1314056396484373,
"memory(GiB)": 41.84,
"step": 2670,
"token_acc": 0.5014005602240896,
"train_speed(iter/s)": 0.580318
},
{
"epoch": 1.6231796116504853,
"grad_norm": 6.547637462615967,
"learning_rate": 4.35682020654664e-05,
"loss": 1.8196992874145508,
"memory(GiB)": 41.84,
"step": 2675,
"token_acc": 0.5973154362416108,
"train_speed(iter/s)": 0.580398
},
{
"epoch": 1.6262135922330097,
"grad_norm": 7.0243449211120605,
"learning_rate": 4.341069575938968e-05,
"loss": 2.0443634033203124,
"memory(GiB)": 41.84,
"step": 2680,
"token_acc": 0.5777027027027027,
"train_speed(iter/s)": 0.580433
},
{
"epoch": 1.629247572815534,
"grad_norm": 7.968044281005859,
"learning_rate": 4.3253255968685044e-05,
"loss": 2.372605323791504,
"memory(GiB)": 41.84,
"step": 2685,
"token_acc": 0.5537459283387622,
"train_speed(iter/s)": 0.580421
},
{
"epoch": 1.6322815533980584,
"grad_norm": 7.074746608734131,
"learning_rate": 4.3095884282619866e-05,
"loss": 1.9867733001708985,
"memory(GiB)": 41.84,
"step": 2690,
"token_acc": 0.5676691729323309,
"train_speed(iter/s)": 0.580481
},
{
"epoch": 1.6353155339805825,
"grad_norm": 6.959107398986816,
"learning_rate": 4.2938582289774e-05,
"loss": 1.9854732513427735,
"memory(GiB)": 41.84,
"step": 2695,
"token_acc": 0.5686813186813187,
"train_speed(iter/s)": 0.58059
},
{
"epoch": 1.6383495145631068,
"grad_norm": 6.535874843597412,
"learning_rate": 4.278135157802389e-05,
"loss": 2.186625289916992,
"memory(GiB)": 41.84,
"step": 2700,
"token_acc": 0.5300859598853869,
"train_speed(iter/s)": 0.5806
},
{
"epoch": 1.641383495145631,
"grad_norm": 6.670753002166748,
"learning_rate": 4.262419373452634e-05,
"loss": 2.415786361694336,
"memory(GiB)": 41.84,
"step": 2705,
"token_acc": 0.4827586206896552,
"train_speed(iter/s)": 0.580602
},
{
"epoch": 1.6444174757281553,
"grad_norm": 11.83166790008545,
"learning_rate": 4.246711034570264e-05,
"loss": 2.008403015136719,
"memory(GiB)": 41.84,
"step": 2710,
"token_acc": 0.5294117647058824,
"train_speed(iter/s)": 0.580751
},
{
"epoch": 1.6474514563106797,
"grad_norm": 7.605556964874268,
"learning_rate": 4.231010299722248e-05,
"loss": 2.3934700012207033,
"memory(GiB)": 41.84,
"step": 2715,
"token_acc": 0.4915254237288136,
"train_speed(iter/s)": 0.580846
},
{
"epoch": 1.650485436893204,
"grad_norm": 6.8486504554748535,
"learning_rate": 4.2153173273987946e-05,
"loss": 1.9181827545166015,
"memory(GiB)": 41.84,
"step": 2720,
"token_acc": 0.5562913907284768,
"train_speed(iter/s)": 0.580869
},
{
"epoch": 1.6535194174757282,
"grad_norm": 8.30029296875,
"learning_rate": 4.199632276011761e-05,
"loss": 2.099735641479492,
"memory(GiB)": 41.84,
"step": 2725,
"token_acc": 0.5529100529100529,
"train_speed(iter/s)": 0.580925
},
{
"epoch": 1.6565533980582523,
"grad_norm": 6.734464168548584,
"learning_rate": 4.1839553038930396e-05,
"loss": 1.9709980010986328,
"memory(GiB)": 41.84,
"step": 2730,
"token_acc": 0.5331230283911672,
"train_speed(iter/s)": 0.580952
},
{
"epoch": 1.6595873786407767,
"grad_norm": 6.3508710861206055,
"learning_rate": 4.168286569292972e-05,
"loss": 2.039066123962402,
"memory(GiB)": 41.84,
"step": 2735,
"token_acc": 0.5649717514124294,
"train_speed(iter/s)": 0.581109
},
{
"epoch": 1.662621359223301,
"grad_norm": 6.782240867614746,
"learning_rate": 4.152626230378741e-05,
"loss": 1.832118606567383,
"memory(GiB)": 41.84,
"step": 2740,
"token_acc": 0.6,
"train_speed(iter/s)": 0.581153
},
{
"epoch": 1.6656553398058254,
"grad_norm": 8.437490463256836,
"learning_rate": 4.136974445232788e-05,
"loss": 1.9984106063842773,
"memory(GiB)": 41.84,
"step": 2745,
"token_acc": 0.5113636363636364,
"train_speed(iter/s)": 0.581248
},
{
"epoch": 1.6686893203883495,
"grad_norm": 8.64138126373291,
"learning_rate": 4.121331371851201e-05,
"loss": 1.9429035186767578,
"memory(GiB)": 41.84,
"step": 2750,
"token_acc": 0.574468085106383,
"train_speed(iter/s)": 0.581216
},
{
"epoch": 1.6717233009708736,
"grad_norm": 7.808033466339111,
"learning_rate": 4.10569716814213e-05,
"loss": 2.069664192199707,
"memory(GiB)": 41.84,
"step": 2755,
"token_acc": 0.546583850931677,
"train_speed(iter/s)": 0.581204
},
{
"epoch": 1.674757281553398,
"grad_norm": 7.158506393432617,
"learning_rate": 4.0900719919241935e-05,
"loss": 2.2129743576049803,
"memory(GiB)": 41.84,
"step": 2760,
"token_acc": 0.5330882352941176,
"train_speed(iter/s)": 0.581324
},
{
"epoch": 1.6777912621359223,
"grad_norm": 6.141445636749268,
"learning_rate": 4.0744560009248766e-05,
"loss": 2.1222957611083983,
"memory(GiB)": 41.84,
"step": 2765,
"token_acc": 0.5301204819277109,
"train_speed(iter/s)": 0.581344
},
{
"epoch": 1.6808252427184467,
"grad_norm": 9.04359245300293,
"learning_rate": 4.0588493527789537e-05,
"loss": 2.0622652053833006,
"memory(GiB)": 41.84,
"step": 2770,
"token_acc": 0.5793103448275863,
"train_speed(iter/s)": 0.581484
},
{
"epoch": 1.6838592233009708,
"grad_norm": 7.4207892417907715,
"learning_rate": 4.043252205026879e-05,
"loss": 1.9703941345214844,
"memory(GiB)": 41.84,
"step": 2775,
"token_acc": 0.5451807228915663,
"train_speed(iter/s)": 0.581551
},
{
"epoch": 1.6868932038834952,
"grad_norm": 6.962371826171875,
"learning_rate": 4.027664715113209e-05,
"loss": 2.0751678466796877,
"memory(GiB)": 41.84,
"step": 2780,
"token_acc": 0.533724340175953,
"train_speed(iter/s)": 0.58165
},
{
"epoch": 1.6899271844660193,
"grad_norm": 6.551590919494629,
"learning_rate": 4.012087040385012e-05,
"loss": 1.9780982971191405,
"memory(GiB)": 41.84,
"step": 2785,
"token_acc": 0.564625850340136,
"train_speed(iter/s)": 0.581595
},
{
"epoch": 1.6929611650485437,
"grad_norm": 8.19705867767334,
"learning_rate": 3.996519338090273e-05,
"loss": 1.9075267791748047,
"memory(GiB)": 41.84,
"step": 2790,
"token_acc": 0.5729537366548043,
"train_speed(iter/s)": 0.58155
},
{
"epoch": 1.695995145631068,
"grad_norm": 6.0668206214904785,
"learning_rate": 3.980961765376316e-05,
"loss": 2.269983100891113,
"memory(GiB)": 41.84,
"step": 2795,
"token_acc": 0.5031446540880503,
"train_speed(iter/s)": 0.581377
},
{
"epoch": 1.6990291262135924,
"grad_norm": 7.507983684539795,
"learning_rate": 3.965414479288209e-05,
"loss": 2.1596681594848635,
"memory(GiB)": 41.84,
"step": 2800,
"token_acc": 0.5704225352112676,
"train_speed(iter/s)": 0.581409
},
{
"epoch": 1.7020631067961165,
"grad_norm": 9.827066421508789,
"learning_rate": 3.9498776367671825e-05,
"loss": 2.028460884094238,
"memory(GiB)": 41.84,
"step": 2805,
"token_acc": 0.5544871794871795,
"train_speed(iter/s)": 0.581541
},
{
"epoch": 1.7050970873786406,
"grad_norm": 7.970204830169678,
"learning_rate": 3.9343513946490454e-05,
"loss": 2.2608503341674804,
"memory(GiB)": 41.84,
"step": 2810,
"token_acc": 0.532871972318339,
"train_speed(iter/s)": 0.5816
},
{
"epoch": 1.708131067961165,
"grad_norm": 8.01364517211914,
"learning_rate": 3.9188359096626e-05,
"loss": 1.965842056274414,
"memory(GiB)": 41.84,
"step": 2815,
"token_acc": 0.5447154471544715,
"train_speed(iter/s)": 0.581736
},
{
"epoch": 1.7111650485436893,
"grad_norm": 7.19758939743042,
"learning_rate": 3.903331338428067e-05,
"loss": 2.0728851318359376,
"memory(GiB)": 41.84,
"step": 2820,
"token_acc": 0.5568862275449101,
"train_speed(iter/s)": 0.581956
},
{
"epoch": 1.7141990291262137,
"grad_norm": 6.977797508239746,
"learning_rate": 3.88783783745549e-05,
"loss": 1.7800270080566407,
"memory(GiB)": 41.84,
"step": 2825,
"token_acc": 0.565359477124183,
"train_speed(iter/s)": 0.581974
},
{
"epoch": 1.7172330097087378,
"grad_norm": 8.389069557189941,
"learning_rate": 3.872355563143173e-05,
"loss": 1.479856300354004,
"memory(GiB)": 41.84,
"step": 2830,
"token_acc": 0.6463878326996197,
"train_speed(iter/s)": 0.582021
},
{
"epoch": 1.7202669902912622,
"grad_norm": 8.598016738891602,
"learning_rate": 3.856884671776085e-05,
"loss": 1.9001766204833985,
"memory(GiB)": 41.84,
"step": 2835,
"token_acc": 0.5427509293680297,
"train_speed(iter/s)": 0.582021
},
{
"epoch": 1.7233009708737863,
"grad_norm": 7.339463233947754,
"learning_rate": 3.8414253195242986e-05,
"loss": 2.0311508178710938,
"memory(GiB)": 41.84,
"step": 2840,
"token_acc": 0.5960912052117264,
"train_speed(iter/s)": 0.582075
},
{
"epoch": 1.7263349514563107,
"grad_norm": 6.700257778167725,
"learning_rate": 3.8259776624414e-05,
"loss": 1.824915313720703,
"memory(GiB)": 41.84,
"step": 2845,
"token_acc": 0.5838709677419355,
"train_speed(iter/s)": 0.582141
},
{
"epoch": 1.729368932038835,
"grad_norm": 7.298790454864502,
"learning_rate": 3.81054185646292e-05,
"loss": 2.0110477447509765,
"memory(GiB)": 41.84,
"step": 2850,
"token_acc": 0.5802047781569966,
"train_speed(iter/s)": 0.581997
},
{
"epoch": 1.7324029126213594,
"grad_norm": 7.2910332679748535,
"learning_rate": 3.795118057404761e-05,
"loss": 1.9101539611816407,
"memory(GiB)": 41.84,
"step": 2855,
"token_acc": 0.5787545787545788,
"train_speed(iter/s)": 0.582142
},
{
"epoch": 1.7354368932038835,
"grad_norm": 5.262487411499023,
"learning_rate": 3.779706420961617e-05,
"loss": 1.8585384368896485,
"memory(GiB)": 41.84,
"step": 2860,
"token_acc": 0.5941176470588235,
"train_speed(iter/s)": 0.5821
},
{
"epoch": 1.7384708737864076,
"grad_norm": 10.52902603149414,
"learning_rate": 3.764307102705417e-05,
"loss": 2.2284523010253907,
"memory(GiB)": 41.84,
"step": 2865,
"token_acc": 0.5323076923076923,
"train_speed(iter/s)": 0.582044
},
{
"epoch": 1.741504854368932,
"grad_norm": 7.36726188659668,
"learning_rate": 3.748920258083736e-05,
"loss": 2.3935964584350584,
"memory(GiB)": 41.84,
"step": 2870,
"token_acc": 0.5157593123209169,
"train_speed(iter/s)": 0.582023
},
{
"epoch": 1.7445388349514563,
"grad_norm": 9.515303611755371,
"learning_rate": 3.7335460424182356e-05,
"loss": 2.0206344604492186,
"memory(GiB)": 41.84,
"step": 2875,
"token_acc": 0.5436241610738255,
"train_speed(iter/s)": 0.582136
},
{
"epoch": 1.7475728155339807,
"grad_norm": 7.746051788330078,
"learning_rate": 3.7181846109031005e-05,
"loss": 1.9893791198730468,
"memory(GiB)": 41.84,
"step": 2880,
"token_acc": 0.5664335664335665,
"train_speed(iter/s)": 0.582034
},
{
"epoch": 1.7506067961165048,
"grad_norm": 7.868143081665039,
"learning_rate": 3.702836118603458e-05,
"loss": 2.369589614868164,
"memory(GiB)": 41.84,
"step": 2885,
"token_acc": 0.5084745762711864,
"train_speed(iter/s)": 0.581894
},
{
"epoch": 1.7536407766990292,
"grad_norm": 6.672244071960449,
"learning_rate": 3.687500720453831e-05,
"loss": 1.9809467315673828,
"memory(GiB)": 41.84,
"step": 2890,
"token_acc": 0.5498489425981873,
"train_speed(iter/s)": 0.58182
},
{
"epoch": 1.7566747572815533,
"grad_norm": 5.8379011154174805,
"learning_rate": 3.672178571256556e-05,
"loss": 2.137996864318848,
"memory(GiB)": 41.84,
"step": 2895,
"token_acc": 0.5470588235294118,
"train_speed(iter/s)": 0.581917
},
{
"epoch": 1.7597087378640777,
"grad_norm": 5.696329593658447,
"learning_rate": 3.656869825680234e-05,
"loss": 1.7796316146850586,
"memory(GiB)": 41.84,
"step": 2900,
"token_acc": 0.6054421768707483,
"train_speed(iter/s)": 0.581974
},
{
"epoch": 1.762742718446602,
"grad_norm": 7.160623550415039,
"learning_rate": 3.641574638258162e-05,
"loss": 2.0094619750976563,
"memory(GiB)": 41.84,
"step": 2905,
"token_acc": 0.5428571428571428,
"train_speed(iter/s)": 0.58194
},
{
"epoch": 1.7657766990291264,
"grad_norm": 5.733323097229004,
"learning_rate": 3.62629316338677e-05,
"loss": 2.0144931793212892,
"memory(GiB)": 41.84,
"step": 2910,
"token_acc": 0.5308988764044944,
"train_speed(iter/s)": 0.581796
},
{
"epoch": 1.7688106796116505,
"grad_norm": 6.644180774688721,
"learning_rate": 3.611025555324079e-05,
"loss": 1.9589729309082031,
"memory(GiB)": 41.84,
"step": 2915,
"token_acc": 0.5672727272727273,
"train_speed(iter/s)": 0.581878
},
{
"epoch": 1.7718446601941746,
"grad_norm": 13.900938034057617,
"learning_rate": 3.595771968188121e-05,
"loss": 1.9292577743530273,
"memory(GiB)": 41.84,
"step": 2920,
"token_acc": 0.59,
"train_speed(iter/s)": 0.582002
},
{
"epoch": 1.774878640776699,
"grad_norm": 9.342930793762207,
"learning_rate": 3.5805325559554006e-05,
"loss": 1.8789905548095702,
"memory(GiB)": 41.84,
"step": 2925,
"token_acc": 0.556,
"train_speed(iter/s)": 0.582068
},
{
"epoch": 1.7779126213592233,
"grad_norm": 10.121810913085938,
"learning_rate": 3.5653074724593306e-05,
"loss": 2.171294593811035,
"memory(GiB)": 41.84,
"step": 2930,
"token_acc": 0.5441176470588235,
"train_speed(iter/s)": 0.582172
},
{
"epoch": 1.7809466019417477,
"grad_norm": 8.192787170410156,
"learning_rate": 3.550096871388689e-05,
"loss": 1.9008895874023437,
"memory(GiB)": 41.84,
"step": 2935,
"token_acc": 0.5387205387205387,
"train_speed(iter/s)": 0.582287
},
{
"epoch": 1.7839805825242718,
"grad_norm": 9.528207778930664,
"learning_rate": 3.5349009062860586e-05,
"loss": 2.1617660522460938,
"memory(GiB)": 41.84,
"step": 2940,
"token_acc": 0.5551601423487544,
"train_speed(iter/s)": 0.582322
},
{
"epoch": 1.787014563106796,
"grad_norm": 11.588967323303223,
"learning_rate": 3.519719730546275e-05,
"loss": 1.679486083984375,
"memory(GiB)": 41.84,
"step": 2945,
"token_acc": 0.6188679245283019,
"train_speed(iter/s)": 0.582412
},
{
"epoch": 1.7900485436893203,
"grad_norm": 8.055891990661621,
"learning_rate": 3.504553497414893e-05,
"loss": 1.960872268676758,
"memory(GiB)": 41.84,
"step": 2950,
"token_acc": 0.6287878787878788,
"train_speed(iter/s)": 0.582489
},
{
"epoch": 1.7930825242718447,
"grad_norm": 6.05890417098999,
"learning_rate": 3.489402359986621e-05,
"loss": 1.9190954208374023,
"memory(GiB)": 41.84,
"step": 2955,
"token_acc": 0.5660377358490566,
"train_speed(iter/s)": 0.582598
},
{
"epoch": 1.796116504854369,
"grad_norm": 10.20227336883545,
"learning_rate": 3.474266471203794e-05,
"loss": 1.7310752868652344,
"memory(GiB)": 41.84,
"step": 2960,
"token_acc": 0.6046511627906976,
"train_speed(iter/s)": 0.582734
},
{
"epoch": 1.7991504854368932,
"grad_norm": 8.176021575927734,
"learning_rate": 3.459145983854813e-05,
"loss": 1.9539764404296875,
"memory(GiB)": 41.84,
"step": 2965,
"token_acc": 0.5962962962962963,
"train_speed(iter/s)": 0.582861
},
{
"epoch": 1.8021844660194175,
"grad_norm": 7.691636085510254,
"learning_rate": 3.444041050572611e-05,
"loss": 2.0364006042480467,
"memory(GiB)": 41.84,
"step": 2970,
"token_acc": 0.5605536332179931,
"train_speed(iter/s)": 0.582943
},
{
"epoch": 1.8052184466019416,
"grad_norm": 8.828807830810547,
"learning_rate": 3.4289518238331145e-05,
"loss": 1.7169891357421876,
"memory(GiB)": 41.84,
"step": 2975,
"token_acc": 0.654275092936803,
"train_speed(iter/s)": 0.583015
},
{
"epoch": 1.808252427184466,
"grad_norm": 6.20446252822876,
"learning_rate": 3.413878455953698e-05,
"loss": 2.094204902648926,
"memory(GiB)": 41.84,
"step": 2980,
"token_acc": 0.5351170568561873,
"train_speed(iter/s)": 0.583072
},
{
"epoch": 1.8112864077669903,
"grad_norm": 7.542689800262451,
"learning_rate": 3.398821099091652e-05,
"loss": 1.8194765090942382,
"memory(GiB)": 41.84,
"step": 2985,
"token_acc": 0.5900621118012422,
"train_speed(iter/s)": 0.583267
},
{
"epoch": 1.8143203883495147,
"grad_norm": 5.989041328430176,
"learning_rate": 3.3837799052426434e-05,
"loss": 2.085628128051758,
"memory(GiB)": 41.84,
"step": 2990,
"token_acc": 0.5573770491803278,
"train_speed(iter/s)": 0.58343
},
{
"epoch": 1.8173543689320388,
"grad_norm": 8.956052780151367,
"learning_rate": 3.3687550262391836e-05,
"loss": 2.0220142364501954,
"memory(GiB)": 41.84,
"step": 2995,
"token_acc": 0.563076923076923,
"train_speed(iter/s)": 0.583454
},
{
"epoch": 1.820388349514563,
"grad_norm": 9.703901290893555,
"learning_rate": 3.353746613749094e-05,
"loss": 1.7758405685424805,
"memory(GiB)": 41.84,
"step": 3000,
"token_acc": 0.5978260869565217,
"train_speed(iter/s)": 0.583443
},
{
"epoch": 1.820388349514563,
"eval_loss": 2.155855655670166,
"eval_runtime": 12.3446,
"eval_samples_per_second": 8.101,
"eval_steps_per_second": 8.101,
"eval_token_acc": 0.5071335927367056,
"step": 3000
},
{
"epoch": 1.8234223300970873,
"grad_norm": 10.289823532104492,
"learning_rate": 3.33875481927397e-05,
"loss": 1.9597461700439454,
"memory(GiB)": 41.84,
"step": 3005,
"token_acc": 0.5239887111947319,
"train_speed(iter/s)": 0.581978
},
{
"epoch": 1.8264563106796117,
"grad_norm": 8.281176567077637,
"learning_rate": 3.3237797941476715e-05,
"loss": 1.7820388793945312,
"memory(GiB)": 41.84,
"step": 3010,
"token_acc": 0.5830258302583026,
"train_speed(iter/s)": 0.582021
},
{
"epoch": 1.829490291262136,
"grad_norm": 6.512312889099121,
"learning_rate": 3.308821689534766e-05,
"loss": 1.9633775711059571,
"memory(GiB)": 41.84,
"step": 3015,
"token_acc": 0.526984126984127,
"train_speed(iter/s)": 0.582048
},
{
"epoch": 1.8325242718446602,
"grad_norm": 6.695690631866455,
"learning_rate": 3.293880656429028e-05,
"loss": 1.9059555053710937,
"memory(GiB)": 41.84,
"step": 3020,
"token_acc": 0.558282208588957,
"train_speed(iter/s)": 0.582014
},
{
"epoch": 1.8355582524271845,
"grad_norm": 8.261147499084473,
"learning_rate": 3.278956845651897e-05,
"loss": 1.9743257522583009,
"memory(GiB)": 41.84,
"step": 3025,
"token_acc": 0.5537974683544303,
"train_speed(iter/s)": 0.582071
},
{
"epoch": 1.8385922330097086,
"grad_norm": 8.489652633666992,
"learning_rate": 3.2640504078509706e-05,
"loss": 2.0056623458862304,
"memory(GiB)": 41.84,
"step": 3030,
"token_acc": 0.5425219941348973,
"train_speed(iter/s)": 0.582011
},
{
"epoch": 1.841626213592233,
"grad_norm": 5.517820835113525,
"learning_rate": 3.2491614934984706e-05,
"loss": 2.0196483612060545,
"memory(GiB)": 41.84,
"step": 3035,
"token_acc": 0.5681159420289855,
"train_speed(iter/s)": 0.58204
},
{
"epoch": 1.8446601941747574,
"grad_norm": 11.377049446105957,
"learning_rate": 3.2342902528897276e-05,
"loss": 2.4981143951416014,
"memory(GiB)": 41.84,
"step": 3040,
"token_acc": 0.4857142857142857,
"train_speed(iter/s)": 0.582056
},
{
"epoch": 1.8476941747572817,
"grad_norm": 9.072402954101562,
"learning_rate": 3.219436836141672e-05,
"loss": 1.7939895629882812,
"memory(GiB)": 41.84,
"step": 3045,
"token_acc": 0.5458015267175572,
"train_speed(iter/s)": 0.582115
},
{
"epoch": 1.8507281553398058,
"grad_norm": 8.273455619812012,
"learning_rate": 3.204601393191305e-05,
"loss": 2.0849941253662108,
"memory(GiB)": 41.84,
"step": 3050,
"token_acc": 0.5551948051948052,
"train_speed(iter/s)": 0.582202
},
{
"epoch": 1.85376213592233,
"grad_norm": 6.509883880615234,
"learning_rate": 3.1897840737941996e-05,
"loss": 1.894825553894043,
"memory(GiB)": 41.84,
"step": 3055,
"token_acc": 0.5211726384364821,
"train_speed(iter/s)": 0.582142
},
{
"epoch": 1.8567961165048543,
"grad_norm": 8.81839370727539,
"learning_rate": 3.174985027522978e-05,
"loss": 1.9194953918457032,
"memory(GiB)": 41.84,
"step": 3060,
"token_acc": 0.5727554179566563,
"train_speed(iter/s)": 0.582189
},
{
"epoch": 1.8598300970873787,
"grad_norm": 7.000573635101318,
"learning_rate": 3.1602044037657994e-05,
"loss": 1.977131462097168,
"memory(GiB)": 41.84,
"step": 3065,
"token_acc": 0.543046357615894,
"train_speed(iter/s)": 0.582179
},
{
"epoch": 1.862864077669903,
"grad_norm": 8.45114803314209,
"learning_rate": 3.1454423517248704e-05,
"loss": 2.187137985229492,
"memory(GiB)": 41.84,
"step": 3070,
"token_acc": 0.5319767441860465,
"train_speed(iter/s)": 0.582204
},
{
"epoch": 1.8658980582524272,
"grad_norm": 11.056445121765137,
"learning_rate": 3.1306990204149146e-05,
"loss": 1.8925033569335938,
"memory(GiB)": 41.84,
"step": 3075,
"token_acc": 0.568,
"train_speed(iter/s)": 0.582108
},
{
"epoch": 1.8689320388349513,
"grad_norm": 7.232324123382568,
"learning_rate": 3.115974558661691e-05,
"loss": 2.050203323364258,
"memory(GiB)": 41.84,
"step": 3080,
"token_acc": 0.5853658536585366,
"train_speed(iter/s)": 0.582179
},
{
"epoch": 1.8719660194174756,
"grad_norm": 6.1433024406433105,
"learning_rate": 3.1012691151004694e-05,
"loss": 1.7500345230102539,
"memory(GiB)": 41.84,
"step": 3085,
"token_acc": 0.5871886120996441,
"train_speed(iter/s)": 0.582301
},
{
"epoch": 1.875,
"grad_norm": 5.706048488616943,
"learning_rate": 3.086582838174551e-05,
"loss": 1.8604692459106444,
"memory(GiB)": 41.84,
"step": 3090,
"token_acc": 0.5847953216374269,
"train_speed(iter/s)": 0.582285
},
{
"epoch": 1.8780339805825244,
"grad_norm": 7.604012489318848,
"learning_rate": 3.0719158761337574e-05,
"loss": 1.8550039291381837,
"memory(GiB)": 41.84,
"step": 3095,
"token_acc": 0.558641975308642,
"train_speed(iter/s)": 0.582099
},
{
"epoch": 1.8810679611650487,
"grad_norm": 7.333124160766602,
"learning_rate": 3.0572683770329316e-05,
"loss": 2.143758010864258,
"memory(GiB)": 41.84,
"step": 3100,
"token_acc": 0.5300353356890459,
"train_speed(iter/s)": 0.581945
},
{
"epoch": 1.8841019417475728,
"grad_norm": 6.552914142608643,
"learning_rate": 3.0426404887304605e-05,
"loss": 1.7599102020263673,
"memory(GiB)": 41.84,
"step": 3105,
"token_acc": 0.5617283950617284,
"train_speed(iter/s)": 0.58193
},
{
"epoch": 1.887135922330097,
"grad_norm": 7.146379470825195,
"learning_rate": 3.0280323588867586e-05,
"loss": 1.814478302001953,
"memory(GiB)": 41.84,
"step": 3110,
"token_acc": 0.5836177474402731,
"train_speed(iter/s)": 0.582031
},
{
"epoch": 1.8901699029126213,
"grad_norm": 4.842132568359375,
"learning_rate": 3.0134441349627997e-05,
"loss": 2.0125823974609376,
"memory(GiB)": 41.84,
"step": 3115,
"token_acc": 0.575,
"train_speed(iter/s)": 0.581976
},
{
"epoch": 1.8932038834951457,
"grad_norm": 7.434795379638672,
"learning_rate": 2.9988759642186097e-05,
"loss": 2.0929500579833986,
"memory(GiB)": 41.84,
"step": 3120,
"token_acc": 0.5663956639566395,
"train_speed(iter/s)": 0.582017
},
{
"epoch": 1.89623786407767,
"grad_norm": 15.827396392822266,
"learning_rate": 2.9843279937117997e-05,
"loss": 2.314325141906738,
"memory(GiB)": 41.84,
"step": 3125,
"token_acc": 0.5579399141630901,
"train_speed(iter/s)": 0.582137
},
{
"epoch": 1.8992718446601942,
"grad_norm": 7.544915199279785,
"learning_rate": 2.9698003702960586e-05,
"loss": 2.055324745178223,
"memory(GiB)": 41.84,
"step": 3130,
"token_acc": 0.5213903743315508,
"train_speed(iter/s)": 0.5821
},
{
"epoch": 1.9023058252427183,
"grad_norm": 6.31001091003418,
"learning_rate": 2.9552932406196876e-05,
"loss": 1.8344003677368164,
"memory(GiB)": 41.84,
"step": 3135,
"token_acc": 0.5980707395498392,
"train_speed(iter/s)": 0.582162
},
{
"epoch": 1.9053398058252426,
"grad_norm": 9.230671882629395,
"learning_rate": 2.94080675112412e-05,
"loss": 1.9021150588989257,
"memory(GiB)": 41.84,
"step": 3140,
"token_acc": 0.5845070422535211,
"train_speed(iter/s)": 0.582193
},
{
"epoch": 1.908373786407767,
"grad_norm": 7.505317211151123,
"learning_rate": 2.9263410480424303e-05,
"loss": 2.2937973022460936,
"memory(GiB)": 41.84,
"step": 3145,
"token_acc": 0.5370370370370371,
"train_speed(iter/s)": 0.582135
},
{
"epoch": 1.9114077669902914,
"grad_norm": 11.365267753601074,
"learning_rate": 2.9118962773978693e-05,
"loss": 2.124867057800293,
"memory(GiB)": 41.84,
"step": 3150,
"token_acc": 0.5379310344827586,
"train_speed(iter/s)": 0.582048
},
{
"epoch": 1.9144417475728155,
"grad_norm": 6.946807861328125,
"learning_rate": 2.8974725850023886e-05,
"loss": 1.7865402221679687,
"memory(GiB)": 41.84,
"step": 3155,
"token_acc": 0.6114649681528662,
"train_speed(iter/s)": 0.582042
},
{
"epoch": 1.9174757281553398,
"grad_norm": 7.430286884307861,
"learning_rate": 2.8830701164551598e-05,
"loss": 2.096043014526367,
"memory(GiB)": 41.84,
"step": 3160,
"token_acc": 0.5474006116207951,
"train_speed(iter/s)": 0.581966
},
{
"epoch": 1.920509708737864,
"grad_norm": 5.716464996337891,
"learning_rate": 2.8686890171411175e-05,
"loss": 1.8883914947509766,
"memory(GiB)": 41.84,
"step": 3165,
"token_acc": 0.55,
"train_speed(iter/s)": 0.582002
},
{
"epoch": 1.9235436893203883,
"grad_norm": 6.345276832580566,
"learning_rate": 2.8543294322294846e-05,
"loss": 1.888068962097168,
"memory(GiB)": 41.84,
"step": 3170,
"token_acc": 0.5714285714285714,
"train_speed(iter/s)": 0.581965
},
{
"epoch": 1.9265776699029127,
"grad_norm": 8.231746673583984,
"learning_rate": 2.8399915066723072e-05,
"loss": 2.047636795043945,
"memory(GiB)": 41.84,
"step": 3175,
"token_acc": 0.5,
"train_speed(iter/s)": 0.581961
},
{
"epoch": 1.929611650485437,
"grad_norm": 7.52333927154541,
"learning_rate": 2.8256753852029915e-05,
"loss": 1.964263916015625,
"memory(GiB)": 41.84,
"step": 3180,
"token_acc": 0.5566666666666666,
"train_speed(iter/s)": 0.581957
},
{
"epoch": 1.9326456310679612,
"grad_norm": 8.115636825561523,
"learning_rate": 2.811381212334847e-05,
"loss": 1.974155807495117,
"memory(GiB)": 41.84,
"step": 3185,
"token_acc": 0.5273775216138329,
"train_speed(iter/s)": 0.58201
},
{
"epoch": 1.9356796116504853,
"grad_norm": 10.928778648376465,
"learning_rate": 2.7971091323596177e-05,
"loss": 1.7765790939331054,
"memory(GiB)": 41.84,
"step": 3190,
"token_acc": 0.5967078189300411,
"train_speed(iter/s)": 0.581997
},
{
"epoch": 1.9387135922330097,
"grad_norm": 10.940017700195312,
"learning_rate": 2.782859289346038e-05,
"loss": 2.00123291015625,
"memory(GiB)": 41.84,
"step": 3195,
"token_acc": 0.5628930817610063,
"train_speed(iter/s)": 0.58186
},
{
"epoch": 1.941747572815534,
"grad_norm": 5.479226112365723,
"learning_rate": 2.7686318271383714e-05,
"loss": 1.7830612182617187,
"memory(GiB)": 41.84,
"step": 3200,
"token_acc": 0.5899705014749262,
"train_speed(iter/s)": 0.581899
},
{
"epoch": 1.9447815533980584,
"grad_norm": 9.235628128051758,
"learning_rate": 2.7544268893549573e-05,
"loss": 2.1630695343017576,
"memory(GiB)": 41.84,
"step": 3205,
"token_acc": 0.5141843971631206,
"train_speed(iter/s)": 0.581774
},
{
"epoch": 1.9478155339805825,
"grad_norm": 9.116209030151367,
"learning_rate": 2.740244619386768e-05,
"loss": 1.9152229309082032,
"memory(GiB)": 41.84,
"step": 3210,
"token_acc": 0.5373134328358209,
"train_speed(iter/s)": 0.581685
},
{
"epoch": 1.9508495145631068,
"grad_norm": 8.476284980773926,
"learning_rate": 2.726085160395948e-05,
"loss": 1.9020435333251953,
"memory(GiB)": 41.84,
"step": 3215,
"token_acc": 0.6095238095238096,
"train_speed(iter/s)": 0.581688
},
{
"epoch": 1.953883495145631,
"grad_norm": 6.1975226402282715,
"learning_rate": 2.7119486553143904e-05,
"loss": 1.6950944900512694,
"memory(GiB)": 41.84,
"step": 3220,
"token_acc": 0.5774193548387097,
"train_speed(iter/s)": 0.58168
},
{
"epoch": 1.9569174757281553,
"grad_norm": 8.92437744140625,
"learning_rate": 2.6978352468422685e-05,
"loss": 1.9295099258422852,
"memory(GiB)": 41.84,
"step": 3225,
"token_acc": 0.5393586005830904,
"train_speed(iter/s)": 0.581539
},
{
"epoch": 1.9599514563106797,
"grad_norm": 7.443687438964844,
"learning_rate": 2.683745077446616e-05,
"loss": 1.8496671676635743,
"memory(GiB)": 41.84,
"step": 3230,
"token_acc": 0.6013289036544851,
"train_speed(iter/s)": 0.581521
},
{
"epoch": 1.962985436893204,
"grad_norm": 8.71033763885498,
"learning_rate": 2.6696782893598816e-05,
"loss": 1.8758098602294921,
"memory(GiB)": 41.84,
"step": 3235,
"token_acc": 0.5804195804195804,
"train_speed(iter/s)": 0.581603
},
{
"epoch": 1.9660194174757282,
"grad_norm": 9.311905860900879,
"learning_rate": 2.6556350245784833e-05,
"loss": 2.088191795349121,
"memory(GiB)": 41.84,
"step": 3240,
"token_acc": 0.5805626598465473,
"train_speed(iter/s)": 0.581562
},
{
"epoch": 1.9690533980582523,
"grad_norm": 7.559510707855225,
"learning_rate": 2.641615424861399e-05,
"loss": 2.090311050415039,
"memory(GiB)": 41.84,
"step": 3245,
"token_acc": 0.5533980582524272,
"train_speed(iter/s)": 0.58146
},
{
"epoch": 1.9720873786407767,
"grad_norm": 9.421564102172852,
"learning_rate": 2.6276196317287083e-05,
"loss": 2.2272558212280273,
"memory(GiB)": 41.84,
"step": 3250,
"token_acc": 0.5223463687150838,
"train_speed(iter/s)": 0.581307
},
{
"epoch": 1.975121359223301,
"grad_norm": 6.799111843109131,
"learning_rate": 2.6136477864601817e-05,
"loss": 2.049495887756348,
"memory(GiB)": 41.84,
"step": 3255,
"token_acc": 0.5488215488215489,
"train_speed(iter/s)": 0.581288
},
{
"epoch": 1.9781553398058254,
"grad_norm": 6.001493453979492,
"learning_rate": 2.5997000300938506e-05,
"loss": 1.8592962265014648,
"memory(GiB)": 41.84,
"step": 3260,
"token_acc": 0.5870206489675516,
"train_speed(iter/s)": 0.581194
},
{
"epoch": 1.9811893203883495,
"grad_norm": 8.738608360290527,
"learning_rate": 2.585776503424576e-05,
"loss": 2.017384719848633,
"memory(GiB)": 41.84,
"step": 3265,
"token_acc": 0.5529801324503312,
"train_speed(iter/s)": 0.581142
},
{
"epoch": 1.9842233009708736,
"grad_norm": 9.666224479675293,
"learning_rate": 2.5718773470026448e-05,
"loss": 1.999835205078125,
"memory(GiB)": 41.84,
"step": 3270,
"token_acc": 0.5418060200668896,
"train_speed(iter/s)": 0.581015
},
{
"epoch": 1.987257281553398,
"grad_norm": 10.135787963867188,
"learning_rate": 2.5580027011323282e-05,
"loss": 1.6806678771972656,
"memory(GiB)": 41.84,
"step": 3275,
"token_acc": 0.5893536121673004,
"train_speed(iter/s)": 0.580994
},
{
"epoch": 1.9902912621359223,
"grad_norm": 7.922843933105469,
"learning_rate": 2.544152705870483e-05,
"loss": 2.177354431152344,
"memory(GiB)": 41.84,
"step": 3280,
"token_acc": 0.5117056856187291,
"train_speed(iter/s)": 0.581002
},
{
"epoch": 1.9933252427184467,
"grad_norm": 6.94931697845459,
"learning_rate": 2.5303275010251315e-05,
"loss": 2.506937026977539,
"memory(GiB)": 41.84,
"step": 3285,
"token_acc": 0.4811594202898551,
"train_speed(iter/s)": 0.581043
},
{
"epoch": 1.9963592233009708,
"grad_norm": 10.3767728805542,
"learning_rate": 2.5165272261540458e-05,
"loss": 2.0383968353271484,
"memory(GiB)": 41.84,
"step": 3290,
"token_acc": 0.5487364620938628,
"train_speed(iter/s)": 0.581
},
{
"epoch": 1.9993932038834952,
"grad_norm": 9.176785469055176,
"learning_rate": 2.5027520205633537e-05,
"loss": 2.0018213272094725,
"memory(GiB)": 41.84,
"step": 3295,
"token_acc": 0.5522875816993464,
"train_speed(iter/s)": 0.580883
},
{
"epoch": 2.0024271844660193,
"grad_norm": 6.717225551605225,
"learning_rate": 2.4890020233061117e-05,
"loss": 1.7098587036132813,
"memory(GiB)": 41.84,
"step": 3300,
"token_acc": 0.5948905109489051,
"train_speed(iter/s)": 0.580883
},
{
"epoch": 2.0054611650485437,
"grad_norm": 5.7973246574401855,
"learning_rate": 2.4752773731809176e-05,
"loss": 2.0262834548950197,
"memory(GiB)": 41.84,
"step": 3305,
"token_acc": 0.558641975308642,
"train_speed(iter/s)": 0.580856
},
{
"epoch": 2.008495145631068,
"grad_norm": 7.4671831130981445,
"learning_rate": 2.461578208730504e-05,
"loss": 1.7233488082885742,
"memory(GiB)": 41.84,
"step": 3310,
"token_acc": 0.6162790697674418,
"train_speed(iter/s)": 0.580794
},
{
"epoch": 2.0115291262135924,
"grad_norm": 12.061534881591797,
"learning_rate": 2.447904668240338e-05,
"loss": 1.8241962432861327,
"memory(GiB)": 41.84,
"step": 3315,
"token_acc": 0.6076923076923076,
"train_speed(iter/s)": 0.580734
},
{
"epoch": 2.0145631067961167,
"grad_norm": 8.090734481811523,
"learning_rate": 2.4342568897372304e-05,
"loss": 1.7618919372558595,
"memory(GiB)": 41.84,
"step": 3320,
"token_acc": 0.5701219512195121,
"train_speed(iter/s)": 0.58063
},
{
"epoch": 2.0175970873786406,
"grad_norm": 9.886768341064453,
"learning_rate": 2.4206350109879322e-05,
"loss": 2.333799362182617,
"memory(GiB)": 41.84,
"step": 3325,
"token_acc": 0.5303514376996805,
"train_speed(iter/s)": 0.58057
},
{
"epoch": 2.020631067961165,
"grad_norm": 9.405782699584961,
"learning_rate": 2.4070391694977578e-05,
"loss": 1.9533946990966797,
"memory(GiB)": 41.84,
"step": 3330,
"token_acc": 0.5647840531561462,
"train_speed(iter/s)": 0.580546
},
{
"epoch": 2.0236650485436893,
"grad_norm": 8.449411392211914,
"learning_rate": 2.3934695025091863e-05,
"loss": 1.9143606185913087,
"memory(GiB)": 41.84,
"step": 3335,
"token_acc": 0.5501618122977346,
"train_speed(iter/s)": 0.580463
},
{
"epoch": 2.0266990291262137,
"grad_norm": 9.61319351196289,
"learning_rate": 2.3799261470004817e-05,
"loss": 1.825465202331543,
"memory(GiB)": 41.84,
"step": 3340,
"token_acc": 0.5772357723577236,
"train_speed(iter/s)": 0.580476
},
{
"epoch": 2.029733009708738,
"grad_norm": 10.004016876220703,
"learning_rate": 2.3664092396843078e-05,
"loss": 2.128991889953613,
"memory(GiB)": 41.84,
"step": 3345,
"token_acc": 0.5173501577287066,
"train_speed(iter/s)": 0.58049
},
{
"epoch": 2.032766990291262,
"grad_norm": 8.138049125671387,
"learning_rate": 2.3529189170063448e-05,
"loss": 2.3146188735961912,
"memory(GiB)": 41.84,
"step": 3350,
"token_acc": 0.5306122448979592,
"train_speed(iter/s)": 0.580442
},
{
"epoch": 2.0358009708737863,
"grad_norm": 8.229063987731934,
"learning_rate": 2.3394553151439207e-05,
"loss": 1.8358327865600585,
"memory(GiB)": 41.84,
"step": 3355,
"token_acc": 0.583941605839416,
"train_speed(iter/s)": 0.580463
},
{
"epoch": 2.0388349514563107,
"grad_norm": 7.304425239562988,
"learning_rate": 2.3260185700046294e-05,
"loss": 1.8064495086669923,
"memory(GiB)": 41.84,
"step": 3360,
"token_acc": 0.5791245791245792,
"train_speed(iter/s)": 0.58043
},
{
"epoch": 2.041868932038835,
"grad_norm": 9.741589546203613,
"learning_rate": 2.3126088172249617e-05,
"loss": 1.8935234069824218,
"memory(GiB)": 41.84,
"step": 3365,
"token_acc": 0.5535055350553506,
"train_speed(iter/s)": 0.580243
},
{
"epoch": 2.0449029126213594,
"grad_norm": 11.936101913452148,
"learning_rate": 2.299226192168935e-05,
"loss": 1.8312896728515624,
"memory(GiB)": 41.84,
"step": 3370,
"token_acc": 0.5755627009646302,
"train_speed(iter/s)": 0.580241
},
{
"epoch": 2.0479368932038833,
"grad_norm": 8.954520225524902,
"learning_rate": 2.28587082992673e-05,
"loss": 1.9918130874633788,
"memory(GiB)": 41.84,
"step": 3375,
"token_acc": 0.5692307692307692,
"train_speed(iter/s)": 0.580236
},
{
"epoch": 2.0509708737864076,
"grad_norm": 7.279824256896973,
"learning_rate": 2.2725428653133178e-05,
"loss": 2.056449317932129,
"memory(GiB)": 41.84,
"step": 3380,
"token_acc": 0.5582655826558266,
"train_speed(iter/s)": 0.580216
},
{
"epoch": 2.054004854368932,
"grad_norm": 8.318132400512695,
"learning_rate": 2.2592424328671125e-05,
"loss": 1.845474624633789,
"memory(GiB)": 41.84,
"step": 3385,
"token_acc": 0.5753846153846154,
"train_speed(iter/s)": 0.580119
},
{
"epoch": 2.0570388349514563,
"grad_norm": 8.473575592041016,
"learning_rate": 2.2459696668486025e-05,
"loss": 2.0317916870117188,
"memory(GiB)": 41.84,
"step": 3390,
"token_acc": 0.5693950177935944,
"train_speed(iter/s)": 0.580048
},
{
"epoch": 2.0600728155339807,
"grad_norm": 6.581578254699707,
"learning_rate": 2.2327247012390005e-05,
"loss": 1.8874988555908203,
"memory(GiB)": 41.84,
"step": 3395,
"token_acc": 0.5551839464882943,
"train_speed(iter/s)": 0.579895
},
{
"epoch": 2.063106796116505,
"grad_norm": 9.253079414367676,
"learning_rate": 2.2195076697388915e-05,
"loss": 1.6856924057006837,
"memory(GiB)": 41.84,
"step": 3400,
"token_acc": 0.6493506493506493,
"train_speed(iter/s)": 0.57988
},
{
"epoch": 2.066140776699029,
"grad_norm": 8.945847511291504,
"learning_rate": 2.2063187057668727e-05,
"loss": 1.6917535781860351,
"memory(GiB)": 41.84,
"step": 3405,
"token_acc": 0.5947712418300654,
"train_speed(iter/s)": 0.57992
},
{
"epoch": 2.0691747572815533,
"grad_norm": 9.185718536376953,
"learning_rate": 2.1931579424582283e-05,
"loss": 1.7603189468383789,
"memory(GiB)": 41.84,
"step": 3410,
"token_acc": 0.5736434108527132,
"train_speed(iter/s)": 0.579988
},
{
"epoch": 2.0722087378640777,
"grad_norm": 6.9922332763671875,
"learning_rate": 2.18002551266356e-05,
"loss": 2.1215755462646486,
"memory(GiB)": 41.84,
"step": 3415,
"token_acc": 0.5202312138728323,
"train_speed(iter/s)": 0.579988
},
{
"epoch": 2.075242718446602,
"grad_norm": 8.512064933776855,
"learning_rate": 2.166921548947466e-05,
"loss": 1.720651626586914,
"memory(GiB)": 41.84,
"step": 3420,
"token_acc": 0.5985915492957746,
"train_speed(iter/s)": 0.579814
},
{
"epoch": 2.0782766990291264,
"grad_norm": 8.933260917663574,
"learning_rate": 2.1538461835871937e-05,
"loss": 1.8302701950073241,
"memory(GiB)": 41.84,
"step": 3425,
"token_acc": 0.5993975903614458,
"train_speed(iter/s)": 0.579754
},
{
"epoch": 2.0813106796116503,
"grad_norm": 7.324397087097168,
"learning_rate": 2.1407995485713007e-05,
"loss": 1.9634611129760742,
"memory(GiB)": 41.84,
"step": 3430,
"token_acc": 0.5775075987841946,
"train_speed(iter/s)": 0.579633
},
{
"epoch": 2.0843446601941746,
"grad_norm": 6.617276191711426,
"learning_rate": 2.127781775598339e-05,
"loss": 1.535646343231201,
"memory(GiB)": 41.84,
"step": 3435,
"token_acc": 0.62,
"train_speed(iter/s)": 0.579668
},
{
"epoch": 2.087378640776699,
"grad_norm": 8.722604751586914,
"learning_rate": 2.1147929960755032e-05,
"loss": 1.8054920196533204,
"memory(GiB)": 41.84,
"step": 3440,
"token_acc": 0.5772058823529411,
"train_speed(iter/s)": 0.579792
},
{
"epoch": 2.0904126213592233,
"grad_norm": 7.9137043952941895,
"learning_rate": 2.101833341117319e-05,
"loss": 1.9117881774902343,
"memory(GiB)": 41.84,
"step": 3445,
"token_acc": 0.5891238670694864,
"train_speed(iter/s)": 0.579887
},
{
"epoch": 2.0934466019417477,
"grad_norm": 8.221436500549316,
"learning_rate": 2.08890294154432e-05,
"loss": 2.002272033691406,
"memory(GiB)": 41.84,
"step": 3450,
"token_acc": 0.5616883116883117,
"train_speed(iter/s)": 0.580016
},
{
"epoch": 2.096480582524272,
"grad_norm": 8.50936222076416,
"learning_rate": 2.0760019278817123e-05,
"loss": 1.9437885284423828,
"memory(GiB)": 44.28,
"step": 3455,
"token_acc": 0.6167247386759582,
"train_speed(iter/s)": 0.580014
},
{
"epoch": 2.099514563106796,
"grad_norm": 8.858839988708496,
"learning_rate": 2.0631304303580824e-05,
"loss": 1.8394168853759765,
"memory(GiB)": 44.28,
"step": 3460,
"token_acc": 0.5693430656934306,
"train_speed(iter/s)": 0.580039
},
{
"epoch": 2.1025485436893203,
"grad_norm": 7.461985111236572,
"learning_rate": 2.0502885789040537e-05,
"loss": 2.222452163696289,
"memory(GiB)": 44.28,
"step": 3465,
"token_acc": 0.5231607629427792,
"train_speed(iter/s)": 0.580161
},
{
"epoch": 2.1055825242718447,
"grad_norm": 6.139802932739258,
"learning_rate": 2.037476503150997e-05,
"loss": 1.6303333282470702,
"memory(GiB)": 44.28,
"step": 3470,
"token_acc": 0.6225165562913907,
"train_speed(iter/s)": 0.580274
},
{
"epoch": 2.108616504854369,
"grad_norm": 9.019342422485352,
"learning_rate": 2.024694332429713e-05,
"loss": 2.1092754364013673,
"memory(GiB)": 44.28,
"step": 3475,
"token_acc": 0.5068493150684932,
"train_speed(iter/s)": 0.580406
},
{
"epoch": 2.1116504854368934,
"grad_norm": 10.167961120605469,
"learning_rate": 2.011942195769122e-05,
"loss": 1.965473747253418,
"memory(GiB)": 44.28,
"step": 3480,
"token_acc": 0.5733788395904437,
"train_speed(iter/s)": 0.580421
},
{
"epoch": 2.1146844660194173,
"grad_norm": 11.388608932495117,
"learning_rate": 1.9992202218949784e-05,
"loss": 1.9142690658569337,
"memory(GiB)": 44.28,
"step": 3485,
"token_acc": 0.5517241379310345,
"train_speed(iter/s)": 0.58051
},
{
"epoch": 2.1177184466019416,
"grad_norm": 6.913421154022217,
"learning_rate": 1.986528539228548e-05,
"loss": 1.9621810913085938,
"memory(GiB)": 44.28,
"step": 3490,
"token_acc": 0.55,
"train_speed(iter/s)": 0.580592
},
{
"epoch": 2.120752427184466,
"grad_norm": 7.60167121887207,
"learning_rate": 1.9738672758853305e-05,
"loss": 1.8437973022460938,
"memory(GiB)": 44.28,
"step": 3495,
"token_acc": 0.5822368421052632,
"train_speed(iter/s)": 0.580641
},
{
"epoch": 2.1237864077669903,
"grad_norm": 7.163271427154541,
"learning_rate": 1.9612365596737598e-05,
"loss": 1.6543169021606445,
"memory(GiB)": 44.28,
"step": 3500,
"token_acc": 0.6292134831460674,
"train_speed(iter/s)": 0.58071
},
{
"epoch": 2.1237864077669903,
"eval_loss": 1.866715431213379,
"eval_runtime": 11.5427,
"eval_samples_per_second": 8.663,
"eval_steps_per_second": 8.663,
"eval_token_acc": 0.5221745350500715,
"step": 3500
},
{
"epoch": 2.1268203883495147,
"grad_norm": 8.629687309265137,
"learning_rate": 1.948636518093906e-05,
"loss": 2.077587127685547,
"memory(GiB)": 44.29,
"step": 3505,
"token_acc": 0.5247895229186156,
"train_speed(iter/s)": 0.579649
},
{
"epoch": 2.1298543689320386,
"grad_norm": 9.276599884033203,
"learning_rate": 1.9360672783362076e-05,
"loss": 1.759820556640625,
"memory(GiB)": 44.29,
"step": 3510,
"token_acc": 0.5964912280701754,
"train_speed(iter/s)": 0.579656
},
{
"epoch": 2.132888349514563,
"grad_norm": 7.651179790496826,
"learning_rate": 1.9235289672801653e-05,
"loss": 2.0451793670654297,
"memory(GiB)": 44.29,
"step": 3515,
"token_acc": 0.4962025316455696,
"train_speed(iter/s)": 0.579702
},
{
"epoch": 2.1359223300970873,
"grad_norm": 8.811480522155762,
"learning_rate": 1.911021711493077e-05,
"loss": 2.1489105224609375,
"memory(GiB)": 44.29,
"step": 3520,
"token_acc": 0.5483870967741935,
"train_speed(iter/s)": 0.579789
},
{
"epoch": 2.1389563106796117,
"grad_norm": 7.404130935668945,
"learning_rate": 1.8985456372287534e-05,
"loss": 1.7454706192016602,
"memory(GiB)": 44.29,
"step": 3525,
"token_acc": 0.6334405144694534,
"train_speed(iter/s)": 0.579816
},
{
"epoch": 2.141990291262136,
"grad_norm": 10.67794132232666,
"learning_rate": 1.8861008704262457e-05,
"loss": 1.8724552154541017,
"memory(GiB)": 44.29,
"step": 3530,
"token_acc": 0.5796610169491525,
"train_speed(iter/s)": 0.579807
},
{
"epoch": 2.1450242718446604,
"grad_norm": 8.964271545410156,
"learning_rate": 1.8736875367085755e-05,
"loss": 1.8260086059570313,
"memory(GiB)": 44.29,
"step": 3535,
"token_acc": 0.5640138408304498,
"train_speed(iter/s)": 0.579826
},
{
"epoch": 2.1480582524271843,
"grad_norm": 8.768270492553711,
"learning_rate": 1.8613057613814584e-05,
"loss": 1.9611518859863282,
"memory(GiB)": 44.29,
"step": 3540,
"token_acc": 0.552901023890785,
"train_speed(iter/s)": 0.579777
},
{
"epoch": 2.1510922330097086,
"grad_norm": 9.284390449523926,
"learning_rate": 1.8489556694320513e-05,
"loss": 2.0381515502929686,
"memory(GiB)": 44.29,
"step": 3545,
"token_acc": 0.5503875968992248,
"train_speed(iter/s)": 0.579814
},
{
"epoch": 2.154126213592233,
"grad_norm": 11.225659370422363,
"learning_rate": 1.836637385527684e-05,
"loss": 2.1475587844848634,
"memory(GiB)": 44.29,
"step": 3550,
"token_acc": 0.5677233429394812,
"train_speed(iter/s)": 0.579745
},
{
"epoch": 2.1571601941747574,
"grad_norm": 7.625835418701172,
"learning_rate": 1.8243510340146015e-05,
"loss": 1.9312858581542969,
"memory(GiB)": 44.29,
"step": 3555,
"token_acc": 0.5634328358208955,
"train_speed(iter/s)": 0.579783
},
{
"epoch": 2.1601941747572817,
"grad_norm": 9.190287590026855,
"learning_rate": 1.8120967389167076e-05,
"loss": 1.5170929908752442,
"memory(GiB)": 44.29,
"step": 3560,
"token_acc": 0.6521739130434783,
"train_speed(iter/s)": 0.579754
},
{
"epoch": 2.163228155339806,
"grad_norm": 11.153077125549316,
"learning_rate": 1.799874623934318e-05,
"loss": 1.931208610534668,
"memory(GiB)": 44.29,
"step": 3565,
"token_acc": 0.5909090909090909,
"train_speed(iter/s)": 0.579843
},
{
"epoch": 2.16626213592233,
"grad_norm": 6.920065879821777,
"learning_rate": 1.7876848124429014e-05,
"loss": 1.7487638473510743,
"memory(GiB)": 44.29,
"step": 3570,
"token_acc": 0.5811209439528023,
"train_speed(iter/s)": 0.579783
},
{
"epoch": 2.1692961165048543,
"grad_norm": 9.68315601348877,
"learning_rate": 1.775527427491847e-05,
"loss": 1.9304796218872071,
"memory(GiB)": 44.29,
"step": 3575,
"token_acc": 0.5424354243542435,
"train_speed(iter/s)": 0.579873
},
{
"epoch": 2.1723300970873787,
"grad_norm": 8.648472785949707,
"learning_rate": 1.7634025918032132e-05,
"loss": 1.822089385986328,
"memory(GiB)": 44.29,
"step": 3580,
"token_acc": 0.615625,
"train_speed(iter/s)": 0.579913
},
{
"epoch": 2.175364077669903,
"grad_norm": 5.941222190856934,
"learning_rate": 1.7513104277704926e-05,
"loss": 1.5487011909484862,
"memory(GiB)": 44.29,
"step": 3585,
"token_acc": 0.6267123287671232,
"train_speed(iter/s)": 0.579944
},
{
"epoch": 2.1783980582524274,
"grad_norm": 8.311307907104492,
"learning_rate": 1.739251057457377e-05,
"loss": 1.876582145690918,
"memory(GiB)": 44.29,
"step": 3590,
"token_acc": 0.5734463276836158,
"train_speed(iter/s)": 0.579958
},
{
"epoch": 2.1814320388349513,
"grad_norm": 9.144810676574707,
"learning_rate": 1.7272246025965178e-05,
"loss": 2.155200386047363,
"memory(GiB)": 44.29,
"step": 3595,
"token_acc": 0.5140845070422535,
"train_speed(iter/s)": 0.579869
},
{
"epoch": 2.1844660194174756,
"grad_norm": 7.681180953979492,
"learning_rate": 1.7152311845883095e-05,
"loss": 1.7877147674560547,
"memory(GiB)": 44.29,
"step": 3600,
"token_acc": 0.5666666666666667,
"train_speed(iter/s)": 0.57981
},
{
"epoch": 2.1875,
"grad_norm": 8.98862361907959,
"learning_rate": 1.703270924499656e-05,
"loss": 1.7724479675292968,
"memory(GiB)": 44.29,
"step": 3605,
"token_acc": 0.5951557093425606,
"train_speed(iter/s)": 0.57974
},
{
"epoch": 2.1905339805825244,
"grad_norm": 6.949456214904785,
"learning_rate": 1.691343943062749e-05,
"loss": 1.7420495986938476,
"memory(GiB)": 44.29,
"step": 3610,
"token_acc": 0.5741935483870968,
"train_speed(iter/s)": 0.579755
},
{
"epoch": 2.1935679611650487,
"grad_norm": 7.481090545654297,
"learning_rate": 1.6794503606738548e-05,
"loss": 2.0047124862670898,
"memory(GiB)": 44.29,
"step": 3615,
"token_acc": 0.5398230088495575,
"train_speed(iter/s)": 0.57981
},
{
"epoch": 2.1966019417475726,
"grad_norm": 7.942904472351074,
"learning_rate": 1.667590297392086e-05,
"loss": 2.1652708053588867,
"memory(GiB)": 44.29,
"step": 3620,
"token_acc": 0.5389048991354467,
"train_speed(iter/s)": 0.57981
},
{
"epoch": 2.199635922330097,
"grad_norm": 7.470623016357422,
"learning_rate": 1.6557638729382107e-05,
"loss": 1.7064685821533203,
"memory(GiB)": 44.29,
"step": 3625,
"token_acc": 0.6104651162790697,
"train_speed(iter/s)": 0.57981
},
{
"epoch": 2.2026699029126213,
"grad_norm": 6.908362865447998,
"learning_rate": 1.6439712066934204e-05,
"loss": 1.8296821594238282,
"memory(GiB)": 44.29,
"step": 3630,
"token_acc": 0.5864022662889519,
"train_speed(iter/s)": 0.579781
},
{
"epoch": 2.2057038834951457,
"grad_norm": 7.870819568634033,
"learning_rate": 1.632212417698143e-05,
"loss": 1.9550270080566405,
"memory(GiB)": 44.29,
"step": 3635,
"token_acc": 0.5835777126099707,
"train_speed(iter/s)": 0.579726
},
{
"epoch": 2.20873786407767,
"grad_norm": 8.01059627532959,
"learning_rate": 1.620487624650834e-05,
"loss": 1.8678318023681642,
"memory(GiB)": 44.29,
"step": 3640,
"token_acc": 0.61875,
"train_speed(iter/s)": 0.579706
},
{
"epoch": 2.211771844660194,
"grad_norm": 7.753682613372803,
"learning_rate": 1.6087969459067708e-05,
"loss": 1.5739126205444336,
"memory(GiB)": 44.29,
"step": 3645,
"token_acc": 0.6141479099678456,
"train_speed(iter/s)": 0.57974
},
{
"epoch": 2.2148058252427183,
"grad_norm": 8.250489234924316,
"learning_rate": 1.5971404994768797e-05,
"loss": 1.9059646606445313,
"memory(GiB)": 44.29,
"step": 3650,
"token_acc": 0.5551470588235294,
"train_speed(iter/s)": 0.579782
},
{
"epoch": 2.2178398058252426,
"grad_norm": 8.499149322509766,
"learning_rate": 1.585518403026518e-05,
"loss": 2.0898170471191406,
"memory(GiB)": 44.29,
"step": 3655,
"token_acc": 0.59,
"train_speed(iter/s)": 0.579868
},
{
"epoch": 2.220873786407767,
"grad_norm": 9.44747543334961,
"learning_rate": 1.5739307738743057e-05,
"loss": 1.9359277725219726,
"memory(GiB)": 44.29,
"step": 3660,
"token_acc": 0.5628930817610063,
"train_speed(iter/s)": 0.579972
},
{
"epoch": 2.2239077669902914,
"grad_norm": 6.627506256103516,
"learning_rate": 1.5623777289909347e-05,
"loss": 1.749598503112793,
"memory(GiB)": 44.29,
"step": 3665,
"token_acc": 0.621160409556314,
"train_speed(iter/s)": 0.579956
},
{
"epoch": 2.2269417475728157,
"grad_norm": 9.652698516845703,
"learning_rate": 1.5508593849979812e-05,
"loss": 1.946786117553711,
"memory(GiB)": 44.29,
"step": 3670,
"token_acc": 0.5962732919254659,
"train_speed(iter/s)": 0.580051
},
{
"epoch": 2.2299757281553396,
"grad_norm": 11.728522300720215,
"learning_rate": 1.5393758581667462e-05,
"loss": 1.8440595626831056,
"memory(GiB)": 44.29,
"step": 3675,
"token_acc": 0.5598455598455598,
"train_speed(iter/s)": 0.580023
},
{
"epoch": 2.233009708737864,
"grad_norm": 9.42689323425293,
"learning_rate": 1.52792726441706e-05,
"loss": 2.040317916870117,
"memory(GiB)": 44.29,
"step": 3680,
"token_acc": 0.5699658703071673,
"train_speed(iter/s)": 0.580045
},
{
"epoch": 2.2360436893203883,
"grad_norm": 9.370969772338867,
"learning_rate": 1.5165137193161289e-05,
"loss": 1.9046701431274413,
"memory(GiB)": 44.29,
"step": 3685,
"token_acc": 0.5689149560117303,
"train_speed(iter/s)": 0.580056
},
{
"epoch": 2.2390776699029127,
"grad_norm": 9.691226959228516,
"learning_rate": 1.505135338077363e-05,
"loss": 2.0255931854248046,
"memory(GiB)": 44.29,
"step": 3690,
"token_acc": 0.5156695156695157,
"train_speed(iter/s)": 0.579994
},
{
"epoch": 2.242111650485437,
"grad_norm": 7.089369773864746,
"learning_rate": 1.4937922355592054e-05,
"loss": 1.856874656677246,
"memory(GiB)": 44.29,
"step": 3695,
"token_acc": 0.5548961424332344,
"train_speed(iter/s)": 0.580088
},
{
"epoch": 2.2451456310679614,
"grad_norm": 8.272523880004883,
"learning_rate": 1.482484526263993e-05,
"loss": 1.9418399810791016,
"memory(GiB)": 44.29,
"step": 3700,
"token_acc": 0.5479041916167665,
"train_speed(iter/s)": 0.580039
},
{
"epoch": 2.2481796116504853,
"grad_norm": 19.949644088745117,
"learning_rate": 1.4712123243367742e-05,
"loss": 2.0299962997436523,
"memory(GiB)": 44.29,
"step": 3705,
"token_acc": 0.5658362989323843,
"train_speed(iter/s)": 0.580004
},
{
"epoch": 2.2512135922330097,
"grad_norm": 12.743327140808105,
"learning_rate": 1.459975743564178e-05,
"loss": 1.9635414123535155,
"memory(GiB)": 44.29,
"step": 3710,
"token_acc": 0.5590277777777778,
"train_speed(iter/s)": 0.579959
},
{
"epoch": 2.254247572815534,
"grad_norm": 6.324910640716553,
"learning_rate": 1.4487748973732567e-05,
"loss": 2.068693733215332,
"memory(GiB)": 44.29,
"step": 3715,
"token_acc": 0.5710382513661202,
"train_speed(iter/s)": 0.579955
},
{
"epoch": 2.2572815533980584,
"grad_norm": 8.611750602722168,
"learning_rate": 1.4376098988303405e-05,
"loss": 1.7477828979492187,
"memory(GiB)": 44.29,
"step": 3720,
"token_acc": 0.5627118644067797,
"train_speed(iter/s)": 0.579921
},
{
"epoch": 2.2603155339805827,
"grad_norm": 8.731199264526367,
"learning_rate": 1.4264808606398988e-05,
"loss": 1.9445646286010743,
"memory(GiB)": 44.29,
"step": 3725,
"token_acc": 0.5650969529085873,
"train_speed(iter/s)": 0.579908
},
{
"epoch": 2.2633495145631066,
"grad_norm": 8.617072105407715,
"learning_rate": 1.4153878951433985e-05,
"loss": 1.764409065246582,
"memory(GiB)": 44.29,
"step": 3730,
"token_acc": 0.6271186440677966,
"train_speed(iter/s)": 0.579952
},
{
"epoch": 2.266383495145631,
"grad_norm": 6.622957706451416,
"learning_rate": 1.4043311143181743e-05,
"loss": 1.8772661209106445,
"memory(GiB)": 44.29,
"step": 3735,
"token_acc": 0.5902578796561605,
"train_speed(iter/s)": 0.579925
},
{
"epoch": 2.2694174757281553,
"grad_norm": 7.272273063659668,
"learning_rate": 1.3933106297762983e-05,
"loss": 1.6700300216674804,
"memory(GiB)": 44.29,
"step": 3740,
"token_acc": 0.6431095406360424,
"train_speed(iter/s)": 0.579928
},
{
"epoch": 2.2724514563106797,
"grad_norm": 8.500160217285156,
"learning_rate": 1.38232655276345e-05,
"loss": 1.9523941040039063,
"memory(GiB)": 44.29,
"step": 3745,
"token_acc": 0.5574324324324325,
"train_speed(iter/s)": 0.579866
},
{
"epoch": 2.275485436893204,
"grad_norm": 10.481255531311035,
"learning_rate": 1.3713789941577947e-05,
"loss": 1.935152816772461,
"memory(GiB)": 44.29,
"step": 3750,
"token_acc": 0.5851851851851851,
"train_speed(iter/s)": 0.579728
},
{
"epoch": 2.278519417475728,
"grad_norm": 8.817157745361328,
"learning_rate": 1.3604680644688673e-05,
"loss": 2.029979705810547,
"memory(GiB)": 44.29,
"step": 3755,
"token_acc": 0.5822784810126582,
"train_speed(iter/s)": 0.579834
},
{
"epoch": 2.2815533980582523,
"grad_norm": 8.694374084472656,
"learning_rate": 1.3495938738364495e-05,
"loss": 1.8262203216552735,
"memory(GiB)": 44.29,
"step": 3760,
"token_acc": 0.6044776119402985,
"train_speed(iter/s)": 0.57994
},
{
"epoch": 2.2845873786407767,
"grad_norm": 8.665304183959961,
"learning_rate": 1.338756532029466e-05,
"loss": 1.8623455047607422,
"memory(GiB)": 44.29,
"step": 3765,
"token_acc": 0.5573122529644269,
"train_speed(iter/s)": 0.579929
},
{
"epoch": 2.287621359223301,
"grad_norm": 8.049120903015137,
"learning_rate": 1.3279561484448726e-05,
"loss": 1.8126539230346679,
"memory(GiB)": 44.29,
"step": 3770,
"token_acc": 0.6254416961130742,
"train_speed(iter/s)": 0.579935
},
{
"epoch": 2.2906553398058254,
"grad_norm": 5.988779544830322,
"learning_rate": 1.3171928321065525e-05,
"loss": 1.5385218620300294,
"memory(GiB)": 44.29,
"step": 3775,
"token_acc": 0.657243816254417,
"train_speed(iter/s)": 0.580025
},
{
"epoch": 2.2936893203883493,
"grad_norm": 5.922063827514648,
"learning_rate": 1.306466691664216e-05,
"loss": 1.7553050994873047,
"memory(GiB)": 44.29,
"step": 3780,
"token_acc": 0.5925925925925926,
"train_speed(iter/s)": 0.579998
},
{
"epoch": 2.2967233009708736,
"grad_norm": 6.754926681518555,
"learning_rate": 1.2957778353922994e-05,
"loss": 1.6977853775024414,
"memory(GiB)": 44.29,
"step": 3785,
"token_acc": 0.5875912408759124,
"train_speed(iter/s)": 0.580116
},
{
"epoch": 2.299757281553398,
"grad_norm": 6.817199230194092,
"learning_rate": 1.285126371188881e-05,
"loss": 1.9571613311767577,
"memory(GiB)": 44.29,
"step": 3790,
"token_acc": 0.5661971830985916,
"train_speed(iter/s)": 0.580194
},
{
"epoch": 2.3027912621359223,
"grad_norm": 10.896566390991211,
"learning_rate": 1.2745124065745845e-05,
"loss": 1.7496770858764648,
"memory(GiB)": 44.29,
"step": 3795,
"token_acc": 0.6095238095238096,
"train_speed(iter/s)": 0.580115
},
{
"epoch": 2.3058252427184467,
"grad_norm": 7.805569171905518,
"learning_rate": 1.2639360486914964e-05,
"loss": 2.1383758544921876,
"memory(GiB)": 44.29,
"step": 3800,
"token_acc": 0.5325779036827195,
"train_speed(iter/s)": 0.580145
},
{
"epoch": 2.308859223300971,
"grad_norm": 8.069032669067383,
"learning_rate": 1.2533974043020862e-05,
"loss": 1.7861778259277343,
"memory(GiB)": 44.29,
"step": 3805,
"token_acc": 0.594855305466238,
"train_speed(iter/s)": 0.580121
},
{
"epoch": 2.311893203883495,
"grad_norm": 10.002004623413086,
"learning_rate": 1.2428965797881204e-05,
"loss": 1.8549165725708008,
"memory(GiB)": 44.29,
"step": 3810,
"token_acc": 0.559375,
"train_speed(iter/s)": 0.580167
},
{
"epoch": 2.3149271844660193,
"grad_norm": 10.041362762451172,
"learning_rate": 1.232433681149604e-05,
"loss": 1.9269153594970703,
"memory(GiB)": 44.29,
"step": 3815,
"token_acc": 0.5762195121951219,
"train_speed(iter/s)": 0.580098
},
{
"epoch": 2.3179611650485437,
"grad_norm": 7.903229236602783,
"learning_rate": 1.2220088140036934e-05,
"loss": 1.8197761535644532,
"memory(GiB)": 44.29,
"step": 3820,
"token_acc": 0.6220735785953178,
"train_speed(iter/s)": 0.580156
},
{
"epoch": 2.320995145631068,
"grad_norm": 7.331014156341553,
"learning_rate": 1.2116220835836389e-05,
"loss": 2.0878772735595703,
"memory(GiB)": 44.29,
"step": 3825,
"token_acc": 0.5467128027681661,
"train_speed(iter/s)": 0.580167
},
{
"epoch": 2.3240291262135924,
"grad_norm": 11.29516315460205,
"learning_rate": 1.2012735947377297e-05,
"loss": 1.9641210556030273,
"memory(GiB)": 44.29,
"step": 3830,
"token_acc": 0.5759493670886076,
"train_speed(iter/s)": 0.580195
},
{
"epoch": 2.3270631067961167,
"grad_norm": 9.621826171875,
"learning_rate": 1.1909634519282154e-05,
"loss": 1.9087528228759765,
"memory(GiB)": 44.29,
"step": 3835,
"token_acc": 0.5802047781569966,
"train_speed(iter/s)": 0.580189
},
{
"epoch": 2.3300970873786406,
"grad_norm": 7.312023162841797,
"learning_rate": 1.1806917592302762e-05,
"loss": 1.5428638458251953,
"memory(GiB)": 44.29,
"step": 3840,
"token_acc": 0.6550522648083623,
"train_speed(iter/s)": 0.580258
},
{
"epoch": 2.333131067961165,
"grad_norm": 12.322574615478516,
"learning_rate": 1.1704586203309486e-05,
"loss": 2.2512718200683595,
"memory(GiB)": 44.29,
"step": 3845,
"token_acc": 0.5261627906976745,
"train_speed(iter/s)": 0.580196
},
{
"epoch": 2.3361650485436893,
"grad_norm": 7.682923316955566,
"learning_rate": 1.1602641385280971e-05,
"loss": 2.14353084564209,
"memory(GiB)": 44.29,
"step": 3850,
"token_acc": 0.5100502512562815,
"train_speed(iter/s)": 0.580288
},
{
"epoch": 2.3391990291262137,
"grad_norm": 8.914677619934082,
"learning_rate": 1.1501084167293624e-05,
"loss": 1.8753440856933594,
"memory(GiB)": 44.29,
"step": 3855,
"token_acc": 0.587248322147651,
"train_speed(iter/s)": 0.58029
},
{
"epoch": 2.342233009708738,
"grad_norm": 8.7797212600708,
"learning_rate": 1.1399915574511205e-05,
"loss": 1.93109130859375,
"memory(GiB)": 44.29,
"step": 3860,
"token_acc": 0.5684931506849316,
"train_speed(iter/s)": 0.58018
},
{
"epoch": 2.345266990291262,
"grad_norm": 6.757023334503174,
"learning_rate": 1.1299136628174606e-05,
"loss": 1.959303855895996,
"memory(GiB)": 44.29,
"step": 3865,
"token_acc": 0.573134328358209,
"train_speed(iter/s)": 0.580123
},
{
"epoch": 2.3483009708737863,
"grad_norm": 7.724388599395752,
"learning_rate": 1.1198748345591358e-05,
"loss": 1.923073959350586,
"memory(GiB)": 44.29,
"step": 3870,
"token_acc": 0.5460122699386503,
"train_speed(iter/s)": 0.580153
},
{
"epoch": 2.3513349514563107,
"grad_norm": 8.733378410339355,
"learning_rate": 1.1098751740125518e-05,
"loss": 1.9303054809570312,
"memory(GiB)": 44.29,
"step": 3875,
"token_acc": 0.5620437956204379,
"train_speed(iter/s)": 0.580197
},
{
"epoch": 2.354368932038835,
"grad_norm": 7.248959541320801,
"learning_rate": 1.0999147821187378e-05,
"loss": 1.9763971328735352,
"memory(GiB)": 44.29,
"step": 3880,
"token_acc": 0.528052805280528,
"train_speed(iter/s)": 0.580278
},
{
"epoch": 2.3574029126213594,
"grad_norm": 7.560742378234863,
"learning_rate": 1.0899937594223225e-05,
"loss": 2.138459014892578,
"memory(GiB)": 44.29,
"step": 3885,
"token_acc": 0.5240793201133145,
"train_speed(iter/s)": 0.580203
},
{
"epoch": 2.3604368932038833,
"grad_norm": 7.769505023956299,
"learning_rate": 1.080112206070531e-05,
"loss": 1.8142425537109375,
"memory(GiB)": 44.29,
"step": 3890,
"token_acc": 0.5935483870967742,
"train_speed(iter/s)": 0.580071
},
{
"epoch": 2.3634708737864076,
"grad_norm": 8.150938987731934,
"learning_rate": 1.070270221812163e-05,
"loss": 2.216781234741211,
"memory(GiB)": 44.29,
"step": 3895,
"token_acc": 0.49122807017543857,
"train_speed(iter/s)": 0.580092
},
{
"epoch": 2.366504854368932,
"grad_norm": 6.342752456665039,
"learning_rate": 1.0604679059965922e-05,
"loss": 1.5916692733764648,
"memory(GiB)": 44.29,
"step": 3900,
"token_acc": 0.6594982078853047,
"train_speed(iter/s)": 0.580113
},
{
"epoch": 2.3695388349514563,
"grad_norm": 8.572466850280762,
"learning_rate": 1.050705357572761e-05,
"loss": 1.6800006866455077,
"memory(GiB)": 44.29,
"step": 3905,
"token_acc": 0.5981873111782477,
"train_speed(iter/s)": 0.580211
},
{
"epoch": 2.3725728155339807,
"grad_norm": 10.74704360961914,
"learning_rate": 1.0409826750881824e-05,
"loss": 2.0315380096435547,
"memory(GiB)": 44.29,
"step": 3910,
"token_acc": 0.5259067357512953,
"train_speed(iter/s)": 0.580153
},
{
"epoch": 2.375606796116505,
"grad_norm": 10.060522079467773,
"learning_rate": 1.031299956687941e-05,
"loss": 1.925653839111328,
"memory(GiB)": 44.29,
"step": 3915,
"token_acc": 0.5745454545454546,
"train_speed(iter/s)": 0.580105
},
{
"epoch": 2.378640776699029,
"grad_norm": 6.9279704093933105,
"learning_rate": 1.0216573001137126e-05,
"loss": 1.7791040420532227,
"memory(GiB)": 44.29,
"step": 3920,
"token_acc": 0.6091954022988506,
"train_speed(iter/s)": 0.580094
},
{
"epoch": 2.3816747572815533,
"grad_norm": 8.384385108947754,
"learning_rate": 1.0120548027027655e-05,
"loss": 1.839115524291992,
"memory(GiB)": 44.29,
"step": 3925,
"token_acc": 0.5759493670886076,
"train_speed(iter/s)": 0.580033
},
{
"epoch": 2.3847087378640777,
"grad_norm": 6.519843578338623,
"learning_rate": 1.0024925613869874e-05,
"loss": 2.303724670410156,
"memory(GiB)": 44.29,
"step": 3930,
"token_acc": 0.5181818181818182,
"train_speed(iter/s)": 0.580098
},
{
"epoch": 2.387742718446602,
"grad_norm": 6.988163948059082,
"learning_rate": 9.929706726919019e-06,
"loss": 2.0136226654052733,
"memory(GiB)": 44.29,
"step": 3935,
"token_acc": 0.5746031746031746,
"train_speed(iter/s)": 0.580148
},
{
"epoch": 2.3907766990291264,
"grad_norm": 8.978435516357422,
"learning_rate": 9.834892327356909e-06,
"loss": 2.091661262512207,
"memory(GiB)": 44.29,
"step": 3940,
"token_acc": 0.5614035087719298,
"train_speed(iter/s)": 0.580167
},
{
"epoch": 2.3938106796116507,
"grad_norm": 8.229738235473633,
"learning_rate": 9.740483372282383e-06,
"loss": 1.8495658874511718,
"memory(GiB)": 44.29,
"step": 3945,
"token_acc": 0.6066176470588235,
"train_speed(iter/s)": 0.580188
},
{
"epoch": 2.3968446601941746,
"grad_norm": 9.419842720031738,
"learning_rate": 9.646480814701447e-06,
"loss": 1.9571540832519532,
"memory(GiB)": 44.29,
"step": 3950,
"token_acc": 0.5364431486880467,
"train_speed(iter/s)": 0.580215
},
{
"epoch": 2.399878640776699,
"grad_norm": 9.840128898620605,
"learning_rate": 9.552885603517797e-06,
"loss": 1.9348846435546876,
"memory(GiB)": 44.29,
"step": 3955,
"token_acc": 0.570446735395189,
"train_speed(iter/s)": 0.580228
},
{
"epoch": 2.4029126213592233,
"grad_norm": 11.587018013000488,
"learning_rate": 9.459698683523204e-06,
"loss": 2.1948358535766603,
"memory(GiB)": 44.29,
"step": 3960,
"token_acc": 0.5457413249211357,
"train_speed(iter/s)": 0.580218
},
{
"epoch": 2.4059466019417477,
"grad_norm": 7.861437797546387,
"learning_rate": 9.366920995387901e-06,
"loss": 2.0211660385131838,
"memory(GiB)": 44.29,
"step": 3965,
"token_acc": 0.5498489425981873,
"train_speed(iter/s)": 0.580257
},
{
"epoch": 2.408980582524272,
"grad_norm": 10.794283866882324,
"learning_rate": 9.274553475651254e-06,
"loss": 1.9600090026855468,
"memory(GiB)": 44.29,
"step": 3970,
"token_acc": 0.5950413223140496,
"train_speed(iter/s)": 0.580279
},
{
"epoch": 2.412014563106796,
"grad_norm": 7.574179649353027,
"learning_rate": 9.182597056712111e-06,
"loss": 1.962773895263672,
"memory(GiB)": 44.29,
"step": 3975,
"token_acc": 0.5454545454545454,
"train_speed(iter/s)": 0.580177
},
{
"epoch": 2.4150485436893203,
"grad_norm": 8.261923789978027,
"learning_rate": 9.09105266681954e-06,
"loss": 2.138422393798828,
"memory(GiB)": 44.29,
"step": 3980,
"token_acc": 0.518796992481203,
"train_speed(iter/s)": 0.580227
},
{
"epoch": 2.4180825242718447,
"grad_norm": 9.629799842834473,
"learning_rate": 8.99992123006339e-06,
"loss": 1.972011184692383,
"memory(GiB)": 44.29,
"step": 3985,
"token_acc": 0.5466237942122186,
"train_speed(iter/s)": 0.580297
},
{
"epoch": 2.421116504854369,
"grad_norm": 6.383166313171387,
"learning_rate": 8.909203666364957e-06,
"loss": 1.874557113647461,
"memory(GiB)": 44.29,
"step": 3990,
"token_acc": 0.583941605839416,
"train_speed(iter/s)": 0.580373
},
{
"epoch": 2.4241504854368934,
"grad_norm": 7.899206161499023,
"learning_rate": 8.818900891467773e-06,
"loss": 2.0880853652954103,
"memory(GiB)": 44.29,
"step": 3995,
"token_acc": 0.54,
"train_speed(iter/s)": 0.580411
},
{
"epoch": 2.4271844660194173,
"grad_norm": 8.335851669311523,
"learning_rate": 8.729013816928239e-06,
"loss": 1.8050338745117187,
"memory(GiB)": 44.29,
"step": 4000,
"token_acc": 0.6138613861386139,
"train_speed(iter/s)": 0.580506
},
{
"epoch": 2.4271844660194173,
"eval_loss": 1.988856554031372,
"eval_runtime": 12.22,
"eval_samples_per_second": 8.183,
"eval_steps_per_second": 8.183,
"eval_token_acc": 0.5174337517433751,
"step": 4000
},
{
"epoch": 2.4302184466019416,
"grad_norm": 6.326101303100586,
"learning_rate": 8.639543350106532e-06,
"loss": 1.6620052337646485,
"memory(GiB)": 44.29,
"step": 4005,
"token_acc": 0.5577651515151515,
"train_speed(iter/s)": 0.579332
},
{
"epoch": 2.433252427184466,
"grad_norm": 5.722497463226318,
"learning_rate": 8.550490394157417e-06,
"loss": 2.129566192626953,
"memory(GiB)": 44.29,
"step": 4010,
"token_acc": 0.5157593123209169,
"train_speed(iter/s)": 0.57938
},
{
"epoch": 2.4362864077669903,
"grad_norm": 7.888674736022949,
"learning_rate": 8.46185584802106e-06,
"loss": 1.7735406875610351,
"memory(GiB)": 44.29,
"step": 4015,
"token_acc": 0.6041666666666666,
"train_speed(iter/s)": 0.579409
},
{
"epoch": 2.4393203883495147,
"grad_norm": 9.027255058288574,
"learning_rate": 8.373640606414096e-06,
"loss": 2.2499406814575194,
"memory(GiB)": 44.29,
"step": 4020,
"token_acc": 0.5216049382716049,
"train_speed(iter/s)": 0.579437
},
{
"epoch": 2.4423543689320386,
"grad_norm": 6.910282611846924,
"learning_rate": 8.285845559820427e-06,
"loss": 1.820733642578125,
"memory(GiB)": 44.29,
"step": 4025,
"token_acc": 0.5671641791044776,
"train_speed(iter/s)": 0.579456
},
{
"epoch": 2.445388349514563,
"grad_norm": 8.852483749389648,
"learning_rate": 8.198471594482376e-06,
"loss": 2.3667272567749023,
"memory(GiB)": 44.29,
"step": 4030,
"token_acc": 0.5066666666666667,
"train_speed(iter/s)": 0.579474
},
{
"epoch": 2.4484223300970873,
"grad_norm": 7.324892520904541,
"learning_rate": 8.111519592391669e-06,
"loss": 1.7319637298583985,
"memory(GiB)": 44.29,
"step": 4035,
"token_acc": 0.6045751633986928,
"train_speed(iter/s)": 0.579496
},
{
"epoch": 2.4514563106796117,
"grad_norm": 7.501872539520264,
"learning_rate": 8.024990431280543e-06,
"loss": 2.2290987014770507,
"memory(GiB)": 44.29,
"step": 4040,
"token_acc": 0.5184049079754601,
"train_speed(iter/s)": 0.57949
},
{
"epoch": 2.454490291262136,
"grad_norm": 11.032537460327148,
"learning_rate": 7.93888498461291e-06,
"loss": 1.7387943267822266,
"memory(GiB)": 44.29,
"step": 4045,
"token_acc": 0.5924657534246576,
"train_speed(iter/s)": 0.579449
},
{
"epoch": 2.4575242718446604,
"grad_norm": 7.408664226531982,
"learning_rate": 7.853204121575475e-06,
"loss": 2.0464914321899412,
"memory(GiB)": 44.29,
"step": 4050,
"token_acc": 0.5613496932515337,
"train_speed(iter/s)": 0.579515
},
{
"epoch": 2.4605582524271843,
"grad_norm": 8.151251792907715,
"learning_rate": 7.76794870706905e-06,
"loss": 1.9731042861938477,
"memory(GiB)": 44.29,
"step": 4055,
"token_acc": 0.5264900662251656,
"train_speed(iter/s)": 0.579583
},
{
"epoch": 2.4635922330097086,
"grad_norm": 7.136772632598877,
"learning_rate": 7.683119601699757e-06,
"loss": 1.9375322341918946,
"memory(GiB)": 44.29,
"step": 4060,
"token_acc": 0.5538922155688623,
"train_speed(iter/s)": 0.579566
},
{
"epoch": 2.466626213592233,
"grad_norm": 8.133397102355957,
"learning_rate": 7.598717661770377e-06,
"loss": 1.9626676559448242,
"memory(GiB)": 44.29,
"step": 4065,
"token_acc": 0.5642633228840125,
"train_speed(iter/s)": 0.579562
},
{
"epoch": 2.4696601941747574,
"grad_norm": 7.656953811645508,
"learning_rate": 7.514743739271696e-06,
"loss": 1.7955259323120116,
"memory(GiB)": 44.29,
"step": 4070,
"token_acc": 0.6,
"train_speed(iter/s)": 0.579478
},
{
"epoch": 2.4726941747572817,
"grad_norm": 9.205748558044434,
"learning_rate": 7.4311986818738685e-06,
"loss": 1.7786579132080078,
"memory(GiB)": 44.29,
"step": 4075,
"token_acc": 0.6013071895424836,
"train_speed(iter/s)": 0.579436
},
{
"epoch": 2.475728155339806,
"grad_norm": 9.025361061096191,
"learning_rate": 7.348083332917926e-06,
"loss": 2.109883689880371,
"memory(GiB)": 44.29,
"step": 4080,
"token_acc": 0.536,
"train_speed(iter/s)": 0.579399
},
{
"epoch": 2.47876213592233,
"grad_norm": 7.150624752044678,
"learning_rate": 7.26539853140723e-06,
"loss": 1.8995925903320312,
"memory(GiB)": 44.29,
"step": 4085,
"token_acc": 0.5714285714285714,
"train_speed(iter/s)": 0.57952
},
{
"epoch": 2.4817961165048543,
"grad_norm": 7.212602138519287,
"learning_rate": 7.1831451119989955e-06,
"loss": 2.258907508850098,
"memory(GiB)": 44.29,
"step": 4090,
"token_acc": 0.4742547425474255,
"train_speed(iter/s)": 0.579551
},
{
"epoch": 2.4848300970873787,
"grad_norm": 9.11235523223877,
"learning_rate": 7.1013239049958714e-06,
"loss": 1.7706048965454102,
"memory(GiB)": 44.29,
"step": 4095,
"token_acc": 0.5925925925925926,
"train_speed(iter/s)": 0.579509
},
{
"epoch": 2.487864077669903,
"grad_norm": 10.799226760864258,
"learning_rate": 7.019935736337585e-06,
"loss": 2.1821046829223634,
"memory(GiB)": 44.29,
"step": 4100,
"token_acc": 0.5335463258785943,
"train_speed(iter/s)": 0.579558
},
{
"epoch": 2.4908980582524274,
"grad_norm": 11.041994094848633,
"learning_rate": 6.938981427592534e-06,
"loss": 2.088601303100586,
"memory(GiB)": 44.29,
"step": 4105,
"token_acc": 0.5171339563862928,
"train_speed(iter/s)": 0.579608
},
{
"epoch": 2.4939320388349513,
"grad_norm": 10.685086250305176,
"learning_rate": 6.858461795949583e-06,
"loss": 1.5177223205566406,
"memory(GiB)": 44.29,
"step": 4110,
"token_acc": 0.6339285714285714,
"train_speed(iter/s)": 0.579595
},
{
"epoch": 2.4969660194174756,
"grad_norm": 8.770302772521973,
"learning_rate": 6.778377654209761e-06,
"loss": 1.7158885955810548,
"memory(GiB)": 44.29,
"step": 4115,
"token_acc": 0.5580524344569289,
"train_speed(iter/s)": 0.579559
},
{
"epoch": 2.5,
"grad_norm": 8.05949878692627,
"learning_rate": 6.698729810778065e-06,
"loss": 2.1136884689331055,
"memory(GiB)": 44.29,
"step": 4120,
"token_acc": 0.5434782608695652,
"train_speed(iter/s)": 0.579581
},
{
"epoch": 2.5030339805825244,
"grad_norm": 9.553973197937012,
"learning_rate": 6.619519069655322e-06,
"loss": 1.8230070114135741,
"memory(GiB)": 44.29,
"step": 4125,
"token_acc": 0.5797101449275363,
"train_speed(iter/s)": 0.579641
},
{
"epoch": 2.5060679611650487,
"grad_norm": 7.956108093261719,
"learning_rate": 6.54074623042999e-06,
"loss": 2.0894168853759765,
"memory(GiB)": 44.29,
"step": 4130,
"token_acc": 0.5565749235474006,
"train_speed(iter/s)": 0.579668
},
{
"epoch": 2.5091019417475726,
"grad_norm": 8.481484413146973,
"learning_rate": 6.4624120882702535e-06,
"loss": 1.8939842224121093,
"memory(GiB)": 44.29,
"step": 4135,
"token_acc": 0.5757575757575758,
"train_speed(iter/s)": 0.579678
},
{
"epoch": 2.512135922330097,
"grad_norm": 9.907540321350098,
"learning_rate": 6.384517433915793e-06,
"loss": 1.9347640991210937,
"memory(GiB)": 44.29,
"step": 4140,
"token_acc": 0.5447761194029851,
"train_speed(iter/s)": 0.579755
},
{
"epoch": 2.5151699029126213,
"grad_norm": 7.414953231811523,
"learning_rate": 6.30706305366996e-06,
"loss": 1.730459213256836,
"memory(GiB)": 44.29,
"step": 4145,
"token_acc": 0.5985401459854015,
"train_speed(iter/s)": 0.579655
},
{
"epoch": 2.5182038834951457,
"grad_norm": 8.353326797485352,
"learning_rate": 6.230049729391779e-06,
"loss": 1.9265541076660155,
"memory(GiB)": 44.29,
"step": 4150,
"token_acc": 0.5632183908045977,
"train_speed(iter/s)": 0.57963
},
{
"epoch": 2.52123786407767,
"grad_norm": 6.5633673667907715,
"learning_rate": 6.153478238488019e-06,
"loss": 1.7929351806640625,
"memory(GiB)": 44.29,
"step": 4155,
"token_acc": 0.5882352941176471,
"train_speed(iter/s)": 0.579611
},
{
"epoch": 2.524271844660194,
"grad_norm": 7.694858551025391,
"learning_rate": 6.077349353905465e-06,
"loss": 2.095606231689453,
"memory(GiB)": 44.29,
"step": 4160,
"token_acc": 0.5905511811023622,
"train_speed(iter/s)": 0.579685
},
{
"epoch": 2.5273058252427183,
"grad_norm": 6.272264003753662,
"learning_rate": 6.00166384412294e-06,
"loss": 2.1394012451171873,
"memory(GiB)": 44.29,
"step": 4165,
"token_acc": 0.5434782608695652,
"train_speed(iter/s)": 0.579716
},
{
"epoch": 2.5303398058252426,
"grad_norm": 8.841377258300781,
"learning_rate": 5.926422473143717e-06,
"loss": 1.972856330871582,
"memory(GiB)": 44.29,
"step": 4170,
"token_acc": 0.5674740484429066,
"train_speed(iter/s)": 0.579732
},
{
"epoch": 2.533373786407767,
"grad_norm": 7.734652996063232,
"learning_rate": 5.851626000487714e-06,
"loss": 1.771505355834961,
"memory(GiB)": 44.29,
"step": 4175,
"token_acc": 0.5695364238410596,
"train_speed(iter/s)": 0.57976
},
{
"epoch": 2.5364077669902914,
"grad_norm": 8.464856147766113,
"learning_rate": 5.7772751811838165e-06,
"loss": 2.1697675704956056,
"memory(GiB)": 44.29,
"step": 4180,
"token_acc": 0.5490196078431373,
"train_speed(iter/s)": 0.579766
},
{
"epoch": 2.5394417475728153,
"grad_norm": 6.249225616455078,
"learning_rate": 5.703370765762345e-06,
"loss": 2.03582706451416,
"memory(GiB)": 44.29,
"step": 4185,
"token_acc": 0.5623188405797102,
"train_speed(iter/s)": 0.57982
},
{
"epoch": 2.54247572815534,
"grad_norm": 9.059986114501953,
"learning_rate": 5.629913500247364e-06,
"loss": 2.067348098754883,
"memory(GiB)": 44.29,
"step": 4190,
"token_acc": 0.5376712328767124,
"train_speed(iter/s)": 0.579847
},
{
"epoch": 2.545509708737864,
"grad_norm": 7.16273307800293,
"learning_rate": 5.556904126149237e-06,
"loss": 1.619649314880371,
"memory(GiB)": 44.29,
"step": 4195,
"token_acc": 0.6006600660066007,
"train_speed(iter/s)": 0.579885
},
{
"epoch": 2.5485436893203883,
"grad_norm": 8.991573333740234,
"learning_rate": 5.484343380457125e-06,
"loss": 1.7998830795288085,
"memory(GiB)": 44.29,
"step": 4200,
"token_acc": 0.5662650602409639,
"train_speed(iter/s)": 0.57986
},
{
"epoch": 2.5515776699029127,
"grad_norm": 8.396170616149902,
"learning_rate": 5.412231995631473e-06,
"loss": 1.9479732513427734,
"memory(GiB)": 44.29,
"step": 4205,
"token_acc": 0.5418060200668896,
"train_speed(iter/s)": 0.579772
},
{
"epoch": 2.554611650485437,
"grad_norm": 9.159605979919434,
"learning_rate": 5.340570699596769e-06,
"loss": 1.8561626434326173,
"memory(GiB)": 44.29,
"step": 4210,
"token_acc": 0.5530973451327433,
"train_speed(iter/s)": 0.579768
},
{
"epoch": 2.5576456310679614,
"grad_norm": 9.651739120483398,
"learning_rate": 5.269360215734026e-06,
"loss": 2.021830940246582,
"memory(GiB)": 44.29,
"step": 4215,
"token_acc": 0.5821917808219178,
"train_speed(iter/s)": 0.579771
},
{
"epoch": 2.5606796116504853,
"grad_norm": 6.730819225311279,
"learning_rate": 5.198601262873593e-06,
"loss": 1.8237226486206055,
"memory(GiB)": 44.29,
"step": 4220,
"token_acc": 0.5780821917808219,
"train_speed(iter/s)": 0.57971
},
{
"epoch": 2.5637135922330097,
"grad_norm": 10.186707496643066,
"learning_rate": 5.12829455528786e-06,
"loss": 1.6941600799560548,
"memory(GiB)": 44.29,
"step": 4225,
"token_acc": 0.6114649681528662,
"train_speed(iter/s)": 0.579676
},
{
"epoch": 2.566747572815534,
"grad_norm": 11.422538757324219,
"learning_rate": 5.0584408026840555e-06,
"loss": 1.9525514602661134,
"memory(GiB)": 44.29,
"step": 4230,
"token_acc": 0.5769230769230769,
"train_speed(iter/s)": 0.57969
},
{
"epoch": 2.5697815533980584,
"grad_norm": 6.254408836364746,
"learning_rate": 4.989040710197068e-06,
"loss": 1.8417320251464844,
"memory(GiB)": 44.29,
"step": 4235,
"token_acc": 0.5893854748603352,
"train_speed(iter/s)": 0.579743
},
{
"epoch": 2.5728155339805827,
"grad_norm": 8.989594459533691,
"learning_rate": 4.920094978382339e-06,
"loss": 2.2028553009033205,
"memory(GiB)": 44.29,
"step": 4240,
"token_acc": 0.52,
"train_speed(iter/s)": 0.579778
},
{
"epoch": 2.5758495145631066,
"grad_norm": 8.0951566696167,
"learning_rate": 4.851604303208801e-06,
"loss": 1.881844711303711,
"memory(GiB)": 44.29,
"step": 4245,
"token_acc": 0.6153846153846154,
"train_speed(iter/s)": 0.579786
},
{
"epoch": 2.578883495145631,
"grad_norm": 7.341141700744629,
"learning_rate": 4.783569376051833e-06,
"loss": 2.057468223571777,
"memory(GiB)": 44.29,
"step": 4250,
"token_acc": 0.5373563218390804,
"train_speed(iter/s)": 0.579789
},
{
"epoch": 2.5819174757281553,
"grad_norm": 8.066463470458984,
"learning_rate": 4.7159908836862994e-06,
"loss": 1.9251741409301757,
"memory(GiB)": 44.29,
"step": 4255,
"token_acc": 0.559322033898305,
"train_speed(iter/s)": 0.579759
},
{
"epoch": 2.5849514563106797,
"grad_norm": 14.030436515808105,
"learning_rate": 4.648869508279613e-06,
"loss": 1.9517692565917968,
"memory(GiB)": 44.29,
"step": 4260,
"token_acc": 0.5652173913043478,
"train_speed(iter/s)": 0.579826
},
{
"epoch": 2.587985436893204,
"grad_norm": 7.453925132751465,
"learning_rate": 4.582205927384814e-06,
"loss": 1.7124622344970704,
"memory(GiB)": 44.29,
"step": 4265,
"token_acc": 0.6095890410958904,
"train_speed(iter/s)": 0.579851
},
{
"epoch": 2.591019417475728,
"grad_norm": 9.848562240600586,
"learning_rate": 4.51600081393379e-06,
"loss": 1.534929084777832,
"memory(GiB)": 44.29,
"step": 4270,
"token_acc": 0.6325757575757576,
"train_speed(iter/s)": 0.579913
},
{
"epoch": 2.5940533980582523,
"grad_norm": 8.778762817382812,
"learning_rate": 4.450254836230449e-06,
"loss": 1.9810653686523438,
"memory(GiB)": 44.29,
"step": 4275,
"token_acc": 0.5531914893617021,
"train_speed(iter/s)": 0.579963
},
{
"epoch": 2.5970873786407767,
"grad_norm": 7.1793742179870605,
"learning_rate": 4.384968657943972e-06,
"loss": 2.044744682312012,
"memory(GiB)": 44.29,
"step": 4280,
"token_acc": 0.5632530120481928,
"train_speed(iter/s)": 0.579985
},
{
"epoch": 2.600121359223301,
"grad_norm": 6.974610805511475,
"learning_rate": 4.3201429381021285e-06,
"loss": 1.750173568725586,
"memory(GiB)": 44.29,
"step": 4285,
"token_acc": 0.6114864864864865,
"train_speed(iter/s)": 0.580089
},
{
"epoch": 2.6031553398058254,
"grad_norm": 7.6272196769714355,
"learning_rate": 4.255778331084609e-06,
"loss": 2.1643795013427733,
"memory(GiB)": 44.29,
"step": 4290,
"token_acc": 0.5138539042821159,
"train_speed(iter/s)": 0.580145
},
{
"epoch": 2.6061893203883493,
"grad_norm": 8.453348159790039,
"learning_rate": 4.1918754866164205e-06,
"loss": 2.236542510986328,
"memory(GiB)": 44.29,
"step": 4295,
"token_acc": 0.4793650793650794,
"train_speed(iter/s)": 0.580165
},
{
"epoch": 2.6092233009708736,
"grad_norm": 10.852858543395996,
"learning_rate": 4.1284350497613426e-06,
"loss": 1.9886856079101562,
"memory(GiB)": 44.29,
"step": 4300,
"token_acc": 0.5439739413680782,
"train_speed(iter/s)": 0.58019
},
{
"epoch": 2.612257281553398,
"grad_norm": 9.123336791992188,
"learning_rate": 4.065457660915401e-06,
"loss": 1.9303335189819335,
"memory(GiB)": 44.29,
"step": 4305,
"token_acc": 0.5847457627118644,
"train_speed(iter/s)": 0.580152
},
{
"epoch": 2.6152912621359223,
"grad_norm": 7.784154891967773,
"learning_rate": 4.002943955800409e-06,
"loss": 2.0141778945922852,
"memory(GiB)": 44.29,
"step": 4310,
"token_acc": 0.5393258426966292,
"train_speed(iter/s)": 0.580169
},
{
"epoch": 2.6183252427184467,
"grad_norm": 9.202990531921387,
"learning_rate": 3.94089456545757e-06,
"loss": 1.936072540283203,
"memory(GiB)": 44.29,
"step": 4315,
"token_acc": 0.5570469798657718,
"train_speed(iter/s)": 0.580162
},
{
"epoch": 2.6213592233009706,
"grad_norm": 8.248907089233398,
"learning_rate": 3.879310116241042e-06,
"loss": 1.968276596069336,
"memory(GiB)": 44.29,
"step": 4320,
"token_acc": 0.5680272108843537,
"train_speed(iter/s)": 0.580185
},
{
"epoch": 2.6243932038834954,
"grad_norm": 10.208954811096191,
"learning_rate": 3.818191229811696e-06,
"loss": 1.9195415496826171,
"memory(GiB)": 44.29,
"step": 4325,
"token_acc": 0.5785123966942148,
"train_speed(iter/s)": 0.580191
},
{
"epoch": 2.6274271844660193,
"grad_norm": 8.11597728729248,
"learning_rate": 3.757538523130799e-06,
"loss": 2.197231674194336,
"memory(GiB)": 44.29,
"step": 4330,
"token_acc": 0.5173501577287066,
"train_speed(iter/s)": 0.580246
},
{
"epoch": 2.6304611650485437,
"grad_norm": 10.075161933898926,
"learning_rate": 3.697352608453791e-06,
"loss": 2.041206932067871,
"memory(GiB)": 44.29,
"step": 4335,
"token_acc": 0.5785714285714286,
"train_speed(iter/s)": 0.580244
},
{
"epoch": 2.633495145631068,
"grad_norm": 9.632774353027344,
"learning_rate": 3.6376340933241104e-06,
"loss": 1.9504831314086915,
"memory(GiB)": 44.29,
"step": 4340,
"token_acc": 0.5544217687074829,
"train_speed(iter/s)": 0.580233
},
{
"epoch": 2.6365291262135924,
"grad_norm": 7.268722057342529,
"learning_rate": 3.5783835805670183e-06,
"loss": 2.2769695281982423,
"memory(GiB)": 44.29,
"step": 4345,
"token_acc": 0.5015197568389058,
"train_speed(iter/s)": 0.580233
},
{
"epoch": 2.6395631067961167,
"grad_norm": 13.444320678710938,
"learning_rate": 3.519601668283623e-06,
"loss": 1.9888429641723633,
"memory(GiB)": 44.29,
"step": 4350,
"token_acc": 0.5563636363636364,
"train_speed(iter/s)": 0.580169
},
{
"epoch": 2.6425970873786406,
"grad_norm": 8.871038436889648,
"learning_rate": 3.4612889498447043e-06,
"loss": 1.7693092346191406,
"memory(GiB)": 44.29,
"step": 4355,
"token_acc": 0.5708812260536399,
"train_speed(iter/s)": 0.580141
},
{
"epoch": 2.645631067961165,
"grad_norm": 9.421436309814453,
"learning_rate": 3.40344601388482e-06,
"loss": 1.8508855819702148,
"memory(GiB)": 44.29,
"step": 4360,
"token_acc": 0.5864197530864198,
"train_speed(iter/s)": 0.580145
},
{
"epoch": 2.6486650485436893,
"grad_norm": 14.255351066589355,
"learning_rate": 3.346073444296338e-06,
"loss": 1.8605754852294922,
"memory(GiB)": 44.29,
"step": 4365,
"token_acc": 0.6238244514106583,
"train_speed(iter/s)": 0.580184
},
{
"epoch": 2.6516990291262137,
"grad_norm": 9.838223457336426,
"learning_rate": 3.289171820223519e-06,
"loss": 1.8943605422973633,
"memory(GiB)": 44.29,
"step": 4370,
"token_acc": 0.59375,
"train_speed(iter/s)": 0.580196
},
{
"epoch": 2.654733009708738,
"grad_norm": 7.1384148597717285,
"learning_rate": 3.2327417160567196e-06,
"loss": 1.945779037475586,
"memory(GiB)": 44.29,
"step": 4375,
"token_acc": 0.5589225589225589,
"train_speed(iter/s)": 0.580208
},
{
"epoch": 2.657766990291262,
"grad_norm": 7.130894184112549,
"learning_rate": 3.176783701426528e-06,
"loss": 1.920769500732422,
"memory(GiB)": 44.29,
"step": 4380,
"token_acc": 0.5652173913043478,
"train_speed(iter/s)": 0.5802
},
{
"epoch": 2.6608009708737863,
"grad_norm": 7.5801215171813965,
"learning_rate": 3.121298341198081e-06,
"loss": 2.089648628234863,
"memory(GiB)": 44.29,
"step": 4385,
"token_acc": 0.5445026178010471,
"train_speed(iter/s)": 0.580307
},
{
"epoch": 2.6638349514563107,
"grad_norm": 9.623913764953613,
"learning_rate": 3.0662861954653232e-06,
"loss": 2.102077674865723,
"memory(GiB)": 44.29,
"step": 4390,
"token_acc": 0.5238095238095238,
"train_speed(iter/s)": 0.580352
},
{
"epoch": 2.666868932038835,
"grad_norm": 8.056645393371582,
"learning_rate": 3.0117478195453353e-06,
"loss": 2.002307319641113,
"memory(GiB)": 44.29,
"step": 4395,
"token_acc": 0.5625,
"train_speed(iter/s)": 0.580375
},
{
"epoch": 2.6699029126213594,
"grad_norm": 9.664189338684082,
"learning_rate": 2.9576837639728073e-06,
"loss": 1.638421630859375,
"memory(GiB)": 44.29,
"step": 4400,
"token_acc": 0.6138328530259366,
"train_speed(iter/s)": 0.580313
},
{
"epoch": 2.6729368932038833,
"grad_norm": 7.514981269836426,
"learning_rate": 2.9040945744943757e-06,
"loss": 1.8152626037597657,
"memory(GiB)": 44.29,
"step": 4405,
"token_acc": 0.5535055350553506,
"train_speed(iter/s)": 0.580281
},
{
"epoch": 2.6759708737864076,
"grad_norm": 7.019512176513672,
"learning_rate": 2.850980792063196e-06,
"loss": 1.805082130432129,
"memory(GiB)": 44.29,
"step": 4410,
"token_acc": 0.5683890577507599,
"train_speed(iter/s)": 0.580336
},
{
"epoch": 2.679004854368932,
"grad_norm": 8.447052955627441,
"learning_rate": 2.798342952833455e-06,
"loss": 1.9645135879516602,
"memory(GiB)": 44.29,
"step": 4415,
"token_acc": 0.5562700964630225,
"train_speed(iter/s)": 0.580375
},
{
"epoch": 2.6820388349514563,
"grad_norm": 15.852560997009277,
"learning_rate": 2.7461815881549225e-06,
"loss": 1.9464908599853517,
"memory(GiB)": 44.29,
"step": 4420,
"token_acc": 0.5913978494623656,
"train_speed(iter/s)": 0.580409
},
{
"epoch": 2.6850728155339807,
"grad_norm": 8.933894157409668,
"learning_rate": 2.694497224567688e-06,
"loss": 2.005167007446289,
"memory(GiB)": 44.29,
"step": 4425,
"token_acc": 0.5362903225806451,
"train_speed(iter/s)": 0.580346
},
{
"epoch": 2.6881067961165046,
"grad_norm": 5.791989326477051,
"learning_rate": 2.6432903837967036e-06,
"loss": 1.905177116394043,
"memory(GiB)": 44.29,
"step": 4430,
"token_acc": 0.556923076923077,
"train_speed(iter/s)": 0.580417
},
{
"epoch": 2.6911407766990294,
"grad_norm": 7.198362350463867,
"learning_rate": 2.5925615827466444e-06,
"loss": 2.0099058151245117,
"memory(GiB)": 44.29,
"step": 4435,
"token_acc": 0.5861111111111111,
"train_speed(iter/s)": 0.580452
},
{
"epoch": 2.6941747572815533,
"grad_norm": 10.059782981872559,
"learning_rate": 2.542311333496622e-06,
"loss": 2.030255126953125,
"memory(GiB)": 44.29,
"step": 4440,
"token_acc": 0.5693215339233039,
"train_speed(iter/s)": 0.580427
},
{
"epoch": 2.6972087378640777,
"grad_norm": 8.237997055053711,
"learning_rate": 2.492540143295036e-06,
"loss": 1.9501361846923828,
"memory(GiB)": 44.29,
"step": 4445,
"token_acc": 0.5619335347432024,
"train_speed(iter/s)": 0.5805
},
{
"epoch": 2.700242718446602,
"grad_norm": 8.189979553222656,
"learning_rate": 2.4432485145544527e-06,
"loss": 2.0908411026000975,
"memory(GiB)": 44.29,
"step": 4450,
"token_acc": 0.5563139931740614,
"train_speed(iter/s)": 0.580487
},
{
"epoch": 2.7032766990291264,
"grad_norm": 7.352850437164307,
"learning_rate": 2.394436944846523e-06,
"loss": 1.8278610229492187,
"memory(GiB)": 44.29,
"step": 4455,
"token_acc": 0.5739130434782609,
"train_speed(iter/s)": 0.580446
},
{
"epoch": 2.7063106796116507,
"grad_norm": 7.38375997543335,
"learning_rate": 2.3461059268969744e-06,
"loss": 1.9157276153564453,
"memory(GiB)": 44.29,
"step": 4460,
"token_acc": 0.5667655786350149,
"train_speed(iter/s)": 0.580538
},
{
"epoch": 2.7093446601941746,
"grad_norm": 11.641793251037598,
"learning_rate": 2.29825594858063e-06,
"loss": 1.723676872253418,
"memory(GiB)": 44.29,
"step": 4465,
"token_acc": 0.5962962962962963,
"train_speed(iter/s)": 0.580636
},
{
"epoch": 2.712378640776699,
"grad_norm": 8.030855178833008,
"learning_rate": 2.250887492916487e-06,
"loss": 1.855816650390625,
"memory(GiB)": 44.29,
"step": 4470,
"token_acc": 0.5791044776119403,
"train_speed(iter/s)": 0.580678
},
{
"epoch": 2.7154126213592233,
"grad_norm": 8.97354793548584,
"learning_rate": 2.204001038062836e-06,
"loss": 1.9793785095214844,
"memory(GiB)": 44.29,
"step": 4475,
"token_acc": 0.5257731958762887,
"train_speed(iter/s)": 0.58069
},
{
"epoch": 2.7184466019417477,
"grad_norm": 10.212775230407715,
"learning_rate": 2.157597057312444e-06,
"loss": 2.099479103088379,
"memory(GiB)": 44.29,
"step": 4480,
"token_acc": 0.5294117647058824,
"train_speed(iter/s)": 0.580635
},
{
"epoch": 2.721480582524272,
"grad_norm": 9.049674034118652,
"learning_rate": 2.1116760190877437e-06,
"loss": 1.7141408920288086,
"memory(GiB)": 44.29,
"step": 4485,
"token_acc": 0.6265822784810127,
"train_speed(iter/s)": 0.580726
},
{
"epoch": 2.724514563106796,
"grad_norm": 10.3820219039917,
"learning_rate": 2.0662383869361645e-06,
"loss": 1.9986873626708985,
"memory(GiB)": 44.29,
"step": 4490,
"token_acc": 0.5757575757575758,
"train_speed(iter/s)": 0.580832
},
{
"epoch": 2.7275485436893203,
"grad_norm": 7.463447093963623,
"learning_rate": 2.0212846195253987e-06,
"loss": 2.1121494293212892,
"memory(GiB)": 44.29,
"step": 4495,
"token_acc": 0.5421052631578948,
"train_speed(iter/s)": 0.580767
},
{
"epoch": 2.7305825242718447,
"grad_norm": 8.770597457885742,
"learning_rate": 1.976815170638802e-06,
"loss": 2.0751237869262695,
"memory(GiB)": 44.29,
"step": 4500,
"token_acc": 0.5382165605095541,
"train_speed(iter/s)": 0.580785
},
{
"epoch": 2.7305825242718447,
"eval_loss": 1.9814581871032715,
"eval_runtime": 12.0819,
"eval_samples_per_second": 8.277,
"eval_steps_per_second": 8.277,
"eval_token_acc": 0.5314591700133868,
"step": 4500
},
{
"epoch": 2.733616504854369,
"grad_norm": 11.466297149658203,
"learning_rate": 1.9328304891708003e-06,
"loss": 1.710250473022461,
"memory(GiB)": 44.29,
"step": 4505,
"token_acc": 0.5572666025024061,
"train_speed(iter/s)": 0.579903
},
{
"epoch": 2.7366504854368934,
"grad_norm": 7.724918365478516,
"learning_rate": 1.8893310191223535e-06,
"loss": 1.6978034973144531,
"memory(GiB)": 44.29,
"step": 4510,
"token_acc": 0.5993377483443708,
"train_speed(iter/s)": 0.579938
},
{
"epoch": 2.7396844660194173,
"grad_norm": 7.455316543579102,
"learning_rate": 1.8463171995964978e-06,
"loss": 1.7312326431274414,
"memory(GiB)": 44.29,
"step": 4515,
"token_acc": 0.584717607973422,
"train_speed(iter/s)": 0.579877
},
{
"epoch": 2.7427184466019416,
"grad_norm": 9.585491180419922,
"learning_rate": 1.8037894647938758e-06,
"loss": 1.9202953338623048,
"memory(GiB)": 44.29,
"step": 4520,
"token_acc": 0.5942492012779552,
"train_speed(iter/s)": 0.579941
},
{
"epoch": 2.745752427184466,
"grad_norm": 7.608863830566406,
"learning_rate": 1.7617482440083931e-06,
"loss": 1.9673721313476562,
"memory(GiB)": 44.29,
"step": 4525,
"token_acc": 0.5555555555555556,
"train_speed(iter/s)": 0.579965
},
{
"epoch": 2.7487864077669903,
"grad_norm": 7.024211883544922,
"learning_rate": 1.7201939616228569e-06,
"loss": 1.9407548904418945,
"memory(GiB)": 44.29,
"step": 4530,
"token_acc": 0.5653333333333334,
"train_speed(iter/s)": 0.579964
},
{
"epoch": 2.7518203883495147,
"grad_norm": 8.106232643127441,
"learning_rate": 1.6791270371046997e-06,
"loss": 1.7603139877319336,
"memory(GiB)": 44.29,
"step": 4535,
"token_acc": 0.5956112852664577,
"train_speed(iter/s)": 0.579993
},
{
"epoch": 2.7548543689320386,
"grad_norm": 8.373075485229492,
"learning_rate": 1.638547885001762e-06,
"loss": 2.115154838562012,
"memory(GiB)": 44.29,
"step": 4540,
"token_acc": 0.5537459283387622,
"train_speed(iter/s)": 0.58
},
{
"epoch": 2.757888349514563,
"grad_norm": 8.133313179016113,
"learning_rate": 1.5984569149380678e-06,
"loss": 1.959366226196289,
"memory(GiB)": 44.29,
"step": 4545,
"token_acc": 0.592948717948718,
"train_speed(iter/s)": 0.580015
},
{
"epoch": 2.7609223300970873,
"grad_norm": 10.857476234436035,
"learning_rate": 1.5588545316097269e-06,
"loss": 1.636090087890625,
"memory(GiB)": 44.29,
"step": 4550,
"token_acc": 0.5871212121212122,
"train_speed(iter/s)": 0.580078
},
{
"epoch": 2.7639563106796117,
"grad_norm": 7.435564041137695,
"learning_rate": 1.51974113478085e-06,
"loss": 1.679054069519043,
"memory(GiB)": 44.29,
"step": 4555,
"token_acc": 0.597864768683274,
"train_speed(iter/s)": 0.580029
},
{
"epoch": 2.766990291262136,
"grad_norm": 8.17315673828125,
"learning_rate": 1.4811171192794627e-06,
"loss": 2.029444694519043,
"memory(GiB)": 44.29,
"step": 4560,
"token_acc": 0.568561872909699,
"train_speed(iter/s)": 0.579988
},
{
"epoch": 2.77002427184466,
"grad_norm": 10.141511917114258,
"learning_rate": 1.4429828749936092e-06,
"loss": 1.9936655044555665,
"memory(GiB)": 44.29,
"step": 4565,
"token_acc": 0.5769230769230769,
"train_speed(iter/s)": 0.580042
},
{
"epoch": 2.7730582524271847,
"grad_norm": 8.14281177520752,
"learning_rate": 1.4053387868673217e-06,
"loss": 1.8854488372802733,
"memory(GiB)": 44.29,
"step": 4570,
"token_acc": 0.582089552238806,
"train_speed(iter/s)": 0.580069
},
{
"epoch": 2.7760922330097086,
"grad_norm": 7.9403910636901855,
"learning_rate": 1.368185234896796e-06,
"loss": 2.005961608886719,
"memory(GiB)": 44.29,
"step": 4575,
"token_acc": 0.5279503105590062,
"train_speed(iter/s)": 0.580129
},
{
"epoch": 2.779126213592233,
"grad_norm": 8.195262908935547,
"learning_rate": 1.3315225941265386e-06,
"loss": 1.789654541015625,
"memory(GiB)": 44.29,
"step": 4580,
"token_acc": 0.5811688311688312,
"train_speed(iter/s)": 0.580137
},
{
"epoch": 2.7821601941747574,
"grad_norm": 7.385119438171387,
"learning_rate": 1.2953512346455643e-06,
"loss": 1.678761100769043,
"memory(GiB)": 44.29,
"step": 4585,
"token_acc": 0.5840978593272171,
"train_speed(iter/s)": 0.580145
},
{
"epoch": 2.7851941747572817,
"grad_norm": 6.9788336753845215,
"learning_rate": 1.2596715215836996e-06,
"loss": 1.8593015670776367,
"memory(GiB)": 44.29,
"step": 4590,
"token_acc": 0.5896551724137931,
"train_speed(iter/s)": 0.580152
},
{
"epoch": 2.788228155339806,
"grad_norm": 7.658742427825928,
"learning_rate": 1.224483815107863e-06,
"loss": 1.924429702758789,
"memory(GiB)": 44.29,
"step": 4595,
"token_acc": 0.5796610169491525,
"train_speed(iter/s)": 0.580188
},
{
"epoch": 2.79126213592233,
"grad_norm": 7.426290035247803,
"learning_rate": 1.1897884704184236e-06,
"loss": 1.8148229598999024,
"memory(GiB)": 44.29,
"step": 4600,
"token_acc": 0.6019108280254777,
"train_speed(iter/s)": 0.580172
},
{
"epoch": 2.7942961165048543,
"grad_norm": 9.051165580749512,
"learning_rate": 1.1555858377456596e-06,
"loss": 1.9418960571289063,
"memory(GiB)": 44.29,
"step": 4605,
"token_acc": 0.6119402985074627,
"train_speed(iter/s)": 0.580258
},
{
"epoch": 2.7973300970873787,
"grad_norm": 6.436223030090332,
"learning_rate": 1.1218762623461666e-06,
"loss": 1.7338180541992188,
"memory(GiB)": 44.29,
"step": 4610,
"token_acc": 0.6037735849056604,
"train_speed(iter/s)": 0.580253
},
{
"epoch": 2.800364077669903,
"grad_norm": 9.345931053161621,
"learning_rate": 1.0886600844994266e-06,
"loss": 2.1333446502685547,
"memory(GiB)": 44.29,
"step": 4615,
"token_acc": 0.5629139072847682,
"train_speed(iter/s)": 0.580306
},
{
"epoch": 2.8033980582524274,
"grad_norm": 9.715279579162598,
"learning_rate": 1.0559376395043285e-06,
"loss": 1.706222152709961,
"memory(GiB)": 44.29,
"step": 4620,
"token_acc": 0.6322314049586777,
"train_speed(iter/s)": 0.580351
},
{
"epoch": 2.8064320388349513,
"grad_norm": 5.747392654418945,
"learning_rate": 1.0237092576758034e-06,
"loss": 1.9026046752929688,
"memory(GiB)": 44.29,
"step": 4625,
"token_acc": 0.5642458100558659,
"train_speed(iter/s)": 0.580353
},
{
"epoch": 2.8094660194174756,
"grad_norm": 9.29836654663086,
"learning_rate": 9.919752643414992e-07,
"loss": 1.9644575119018555,
"memory(GiB)": 44.29,
"step": 4630,
"token_acc": 0.5217391304347826,
"train_speed(iter/s)": 0.580395
},
{
"epoch": 2.8125,
"grad_norm": 6.784262657165527,
"learning_rate": 9.607359798384785e-07,
"loss": 2.0778518676757813,
"memory(GiB)": 44.29,
"step": 4635,
"token_acc": 0.5848375451263538,
"train_speed(iter/s)": 0.58043
},
{
"epoch": 2.8155339805825244,
"grad_norm": 10.704444885253906,
"learning_rate": 9.299917195099927e-07,
"loss": 1.6303802490234376,
"memory(GiB)": 44.29,
"step": 4640,
"token_acc": 0.5941176470588235,
"train_speed(iter/s)": 0.580467
},
{
"epoch": 2.8185679611650487,
"grad_norm": 9.466361045837402,
"learning_rate": 8.997427937023018e-07,
"loss": 2.072785758972168,
"memory(GiB)": 44.29,
"step": 4645,
"token_acc": 0.5944272445820433,
"train_speed(iter/s)": 0.580569
},
{
"epoch": 2.8216019417475726,
"grad_norm": 8.331581115722656,
"learning_rate": 8.699895077615316e-07,
"loss": 1.9922773361206054,
"memory(GiB)": 44.29,
"step": 4650,
"token_acc": 0.5819935691318328,
"train_speed(iter/s)": 0.580614
},
{
"epoch": 2.824635922330097,
"grad_norm": 7.199705600738525,
"learning_rate": 8.407321620306108e-07,
"loss": 2.1337678909301756,
"memory(GiB)": 44.29,
"step": 4655,
"token_acc": 0.583011583011583,
"train_speed(iter/s)": 0.580633
},
{
"epoch": 2.8276699029126213,
"grad_norm": 11.327582359313965,
"learning_rate": 8.119710518462164e-07,
"loss": 1.815553855895996,
"memory(GiB)": 44.29,
"step": 4660,
"token_acc": 0.5860058309037901,
"train_speed(iter/s)": 0.58062
},
{
"epoch": 2.8307038834951457,
"grad_norm": 9.220823287963867,
"learning_rate": 7.837064675357997e-07,
"loss": 2.0095773696899415,
"memory(GiB)": 44.29,
"step": 4665,
"token_acc": 0.5482866043613707,
"train_speed(iter/s)": 0.580668
},
{
"epoch": 2.83373786407767,
"grad_norm": 8.487168312072754,
"learning_rate": 7.559386944146762e-07,
"loss": 1.874141311645508,
"memory(GiB)": 44.29,
"step": 4670,
"token_acc": 0.5662337662337662,
"train_speed(iter/s)": 0.58063
},
{
"epoch": 2.836771844660194,
"grad_norm": 10.926680564880371,
"learning_rate": 7.28668012783107e-07,
"loss": 1.9664880752563476,
"memory(GiB)": 44.29,
"step": 4675,
"token_acc": 0.567398119122257,
"train_speed(iter/s)": 0.580617
},
{
"epoch": 2.8398058252427183,
"grad_norm": 8.504984855651855,
"learning_rate": 7.018946979234997e-07,
"loss": 2.202426528930664,
"memory(GiB)": 44.29,
"step": 4680,
"token_acc": 0.5260416666666666,
"train_speed(iter/s)": 0.580652
},
{
"epoch": 2.8428398058252426,
"grad_norm": 8.054615020751953,
"learning_rate": 6.756190200976287e-07,
"loss": 2.008488082885742,
"memory(GiB)": 44.29,
"step": 4685,
"token_acc": 0.5735735735735735,
"train_speed(iter/s)": 0.580661
},
{
"epoch": 2.845873786407767,
"grad_norm": 7.760517597198486,
"learning_rate": 6.498412445438751e-07,
"loss": 1.9507659912109374,
"memory(GiB)": 44.29,
"step": 4690,
"token_acc": 0.5644699140401146,
"train_speed(iter/s)": 0.58068
},
{
"epoch": 2.8489077669902914,
"grad_norm": 8.335232734680176,
"learning_rate": 6.245616314746072e-07,
"loss": 2.067840576171875,
"memory(GiB)": 44.29,
"step": 4695,
"token_acc": 0.5325779036827195,
"train_speed(iter/s)": 0.580562
},
{
"epoch": 2.8519417475728153,
"grad_norm": 10.580134391784668,
"learning_rate": 5.997804360734827e-07,
"loss": 2.042892837524414,
"memory(GiB)": 44.29,
"step": 4700,
"token_acc": 0.5509554140127388,
"train_speed(iter/s)": 0.580537
},
{
"epoch": 2.85497572815534,
"grad_norm": 7.85345983505249,
"learning_rate": 5.754979084929335e-07,
"loss": 1.6745044708251953,
"memory(GiB)": 44.29,
"step": 4705,
"token_acc": 0.6067796610169491,
"train_speed(iter/s)": 0.580553
},
{
"epoch": 2.858009708737864,
"grad_norm": 6.51752233505249,
"learning_rate": 5.517142938516074e-07,
"loss": 1.8814077377319336,
"memory(GiB)": 44.29,
"step": 4710,
"token_acc": 0.5815384615384616,
"train_speed(iter/s)": 0.58047
},
{
"epoch": 2.8610436893203883,
"grad_norm": 9.65807819366455,
"learning_rate": 5.284298322319026e-07,
"loss": 2.0154050827026366,
"memory(GiB)": 44.29,
"step": 4715,
"token_acc": 0.5488958990536278,
"train_speed(iter/s)": 0.580497
},
{
"epoch": 2.8640776699029127,
"grad_norm": 6.690892696380615,
"learning_rate": 5.056447586775593e-07,
"loss": 1.9270032882690429,
"memory(GiB)": 44.29,
"step": 4720,
"token_acc": 0.589041095890411,
"train_speed(iter/s)": 0.580493
},
{
"epoch": 2.867111650485437,
"grad_norm": 7.775207996368408,
"learning_rate": 4.833593031912387e-07,
"loss": 1.9307134628295899,
"memory(GiB)": 44.29,
"step": 4725,
"token_acc": 0.584045584045584,
"train_speed(iter/s)": 0.580464
},
{
"epoch": 2.8701456310679614,
"grad_norm": 6.894526481628418,
"learning_rate": 4.6157369073226984e-07,
"loss": 1.5071632385253906,
"memory(GiB)": 44.29,
"step": 4730,
"token_acc": 0.6421725239616614,
"train_speed(iter/s)": 0.58048
},
{
"epoch": 2.8731796116504853,
"grad_norm": 6.513083457946777,
"learning_rate": 4.402881412143234e-07,
"loss": 2.146462249755859,
"memory(GiB)": 44.29,
"step": 4735,
"token_acc": 0.5506849315068493,
"train_speed(iter/s)": 0.580443
},
{
"epoch": 2.8762135922330097,
"grad_norm": 7.810274600982666,
"learning_rate": 4.1950286950321327e-07,
"loss": 1.9746414184570313,
"memory(GiB)": 44.29,
"step": 4740,
"token_acc": 0.5451713395638629,
"train_speed(iter/s)": 0.580426
},
{
"epoch": 2.879247572815534,
"grad_norm": 8.50667667388916,
"learning_rate": 3.9921808541474316e-07,
"loss": 1.7838300704956054,
"memory(GiB)": 44.29,
"step": 4745,
"token_acc": 0.5792880258899676,
"train_speed(iter/s)": 0.580387
},
{
"epoch": 2.8822815533980584,
"grad_norm": 7.629726886749268,
"learning_rate": 3.7943399371254686e-07,
"loss": 1.6623340606689454,
"memory(GiB)": 44.29,
"step": 4750,
"token_acc": 0.6351791530944625,
"train_speed(iter/s)": 0.580241
},
{
"epoch": 2.8853155339805827,
"grad_norm": 7.58314323425293,
"learning_rate": 3.601507941060622e-07,
"loss": 2.0338212966918947,
"memory(GiB)": 44.29,
"step": 4755,
"token_acc": 0.5410764872521246,
"train_speed(iter/s)": 0.580202
},
{
"epoch": 2.8883495145631066,
"grad_norm": 11.662416458129883,
"learning_rate": 3.41368681248494e-07,
"loss": 1.8530158996582031,
"memory(GiB)": 44.29,
"step": 4760,
"token_acc": 0.5753424657534246,
"train_speed(iter/s)": 0.580151
},
{
"epoch": 2.891383495145631,
"grad_norm": 9.764945030212402,
"learning_rate": 3.2308784473485956e-07,
"loss": 1.810487937927246,
"memory(GiB)": 44.29,
"step": 4765,
"token_acc": 0.6116071428571429,
"train_speed(iter/s)": 0.580171
},
{
"epoch": 2.8944174757281553,
"grad_norm": 7.281760215759277,
"learning_rate": 3.053084691000685e-07,
"loss": 1.762740707397461,
"memory(GiB)": 44.29,
"step": 4770,
"token_acc": 0.5967213114754099,
"train_speed(iter/s)": 0.580116
},
{
"epoch": 2.8974514563106797,
"grad_norm": 8.245015144348145,
"learning_rate": 2.8803073381704626e-07,
"loss": 1.84234561920166,
"memory(GiB)": 44.29,
"step": 4775,
"token_acc": 0.5969230769230769,
"train_speed(iter/s)": 0.580086
},
{
"epoch": 2.900485436893204,
"grad_norm": 6.5408935546875,
"learning_rate": 2.712548132949577e-07,
"loss": 1.842409896850586,
"memory(GiB)": 44.29,
"step": 4780,
"token_acc": 0.60790273556231,
"train_speed(iter/s)": 0.580074
},
{
"epoch": 2.903519417475728,
"grad_norm": 7.102424144744873,
"learning_rate": 2.5498087687741424e-07,
"loss": 1.610619354248047,
"memory(GiB)": 44.64,
"step": 4785,
"token_acc": 0.6106870229007634,
"train_speed(iter/s)": 0.580062
},
{
"epoch": 2.9065533980582523,
"grad_norm": 8.561509132385254,
"learning_rate": 2.3920908884078053e-07,
"loss": 1.9039691925048827,
"memory(GiB)": 44.64,
"step": 4790,
"token_acc": 0.5791245791245792,
"train_speed(iter/s)": 0.580072
},
{
"epoch": 2.9095873786407767,
"grad_norm": 10.727002143859863,
"learning_rate": 2.239396083925094e-07,
"loss": 1.9637014389038085,
"memory(GiB)": 44.64,
"step": 4795,
"token_acc": 0.5355029585798816,
"train_speed(iter/s)": 0.579908
},
{
"epoch": 2.912621359223301,
"grad_norm": 8.442927360534668,
"learning_rate": 2.0917258966953733e-07,
"loss": 2.2038265228271485,
"memory(GiB)": 44.64,
"step": 4800,
"token_acc": 0.5454545454545454,
"train_speed(iter/s)": 0.579874
},
{
"epoch": 2.9156553398058254,
"grad_norm": 8.77606201171875,
"learning_rate": 1.9490818173672486e-07,
"loss": 1.8866867065429687,
"memory(GiB)": 44.64,
"step": 4805,
"token_acc": 0.5782747603833865,
"train_speed(iter/s)": 0.579853
},
{
"epoch": 2.9186893203883493,
"grad_norm": 8.638134956359863,
"learning_rate": 1.8114652858536862e-07,
"loss": 1.8457630157470704,
"memory(GiB)": 44.64,
"step": 4810,
"token_acc": 0.5689655172413793,
"train_speed(iter/s)": 0.579838
},
{
"epoch": 2.9217233009708736,
"grad_norm": 6.532174587249756,
"learning_rate": 1.6788776913171932e-07,
"loss": 1.879047966003418,
"memory(GiB)": 44.64,
"step": 4815,
"token_acc": 0.5876923076923077,
"train_speed(iter/s)": 0.579795
},
{
"epoch": 2.924757281553398,
"grad_norm": 10.169187545776367,
"learning_rate": 1.5513203721559955e-07,
"loss": 2.0470817565917967,
"memory(GiB)": 44.64,
"step": 4820,
"token_acc": 0.5559322033898305,
"train_speed(iter/s)": 0.57982
},
{
"epoch": 2.9277912621359223,
"grad_norm": 7.9186015129089355,
"learning_rate": 1.428794615990603e-07,
"loss": 1.855224609375,
"memory(GiB)": 44.64,
"step": 4825,
"token_acc": 0.559322033898305,
"train_speed(iter/s)": 0.579822
},
{
"epoch": 2.9308252427184467,
"grad_norm": 8.12701416015625,
"learning_rate": 1.3113016596503769e-07,
"loss": 1.8893653869628906,
"memory(GiB)": 44.64,
"step": 4830,
"token_acc": 0.551948051948052,
"train_speed(iter/s)": 0.579803
},
{
"epoch": 2.9338592233009706,
"grad_norm": 6.349172592163086,
"learning_rate": 1.1988426891617054e-07,
"loss": 1.6970531463623046,
"memory(GiB)": 44.64,
"step": 4835,
"token_acc": 0.6104294478527608,
"train_speed(iter/s)": 0.579803
},
{
"epoch": 2.9368932038834954,
"grad_norm": 7.324570655822754,
"learning_rate": 1.0914188397355141e-07,
"loss": 1.8949806213378906,
"memory(GiB)": 44.64,
"step": 4840,
"token_acc": 0.5326797385620915,
"train_speed(iter/s)": 0.579773
},
{
"epoch": 2.9399271844660193,
"grad_norm": 7.8848114013671875,
"learning_rate": 9.890311957559406e-08,
"loss": 2.149030303955078,
"memory(GiB)": 44.64,
"step": 4845,
"token_acc": 0.5292207792207793,
"train_speed(iter/s)": 0.579762
},
{
"epoch": 2.9429611650485437,
"grad_norm": 7.4910783767700195,
"learning_rate": 8.916807907695113e-08,
"loss": 2.0563175201416017,
"memory(GiB)": 44.64,
"step": 4850,
"token_acc": 0.5728476821192053,
"train_speed(iter/s)": 0.5797
},
{
"epoch": 2.945995145631068,
"grad_norm": 10.111432075500488,
"learning_rate": 7.993686074744821e-08,
"loss": 1.8403484344482421,
"memory(GiB)": 44.64,
"step": 4855,
"token_acc": 0.6107594936708861,
"train_speed(iter/s)": 0.579672
},
{
"epoch": 2.9490291262135924,
"grad_norm": 8.500150680541992,
"learning_rate": 7.120955777112914e-08,
"loss": 1.9626632690429688,
"memory(GiB)": 44.64,
"step": 4860,
"token_acc": 0.5274390243902439,
"train_speed(iter/s)": 0.579711
},
{
"epoch": 2.9520631067961167,
"grad_norm": 9.124574661254883,
"learning_rate": 6.298625824527337e-08,
"loss": 2.131892776489258,
"memory(GiB)": 44.64,
"step": 4865,
"token_acc": 0.540625,
"train_speed(iter/s)": 0.579721
},
{
"epoch": 2.9550970873786406,
"grad_norm": 8.05234432220459,
"learning_rate": 5.526704517951897e-08,
"loss": 1.5919179916381836,
"memory(GiB)": 44.64,
"step": 4870,
"token_acc": 0.6409495548961425,
"train_speed(iter/s)": 0.579706
},
{
"epoch": 2.958131067961165,
"grad_norm": 9.051335334777832,
"learning_rate": 4.8051996495052096e-08,
"loss": 1.8254867553710938,
"memory(GiB)": 44.64,
"step": 4875,
"token_acc": 0.5714285714285714,
"train_speed(iter/s)": 0.579633
},
{
"epoch": 2.9611650485436893,
"grad_norm": 12.33359432220459,
"learning_rate": 4.134118502378548e-08,
"loss": 1.7505077362060546,
"memory(GiB)": 44.64,
"step": 4880,
"token_acc": 0.6245954692556634,
"train_speed(iter/s)": 0.579568
},
{
"epoch": 2.9641990291262137,
"grad_norm": 8.701041221618652,
"learning_rate": 3.5134678507636745e-08,
"loss": 1.7970073699951172,
"memory(GiB)": 44.64,
"step": 4885,
"token_acc": 0.5992366412213741,
"train_speed(iter/s)": 0.579584
},
{
"epoch": 2.967233009708738,
"grad_norm": 10.156620979309082,
"learning_rate": 2.9432539597851195e-08,
"loss": 2.0175302505493162,
"memory(GiB)": 44.64,
"step": 4890,
"token_acc": 0.5270758122743683,
"train_speed(iter/s)": 0.579648
},
{
"epoch": 2.970266990291262,
"grad_norm": 9.249608039855957,
"learning_rate": 2.423482585435788e-08,
"loss": 1.8950572967529298,
"memory(GiB)": 44.64,
"step": 4895,
"token_acc": 0.5878594249201278,
"train_speed(iter/s)": 0.579643
},
{
"epoch": 2.9733009708737863,
"grad_norm": 8.014542579650879,
"learning_rate": 1.9541589745186717e-08,
"loss": 1.8426591873168945,
"memory(GiB)": 44.64,
"step": 4900,
"token_acc": 0.5846153846153846,
"train_speed(iter/s)": 0.579651
},
{
"epoch": 2.9763349514563107,
"grad_norm": 7.611429691314697,
"learning_rate": 1.5352878645963352e-08,
"loss": 2.1125755310058594,
"memory(GiB)": 44.64,
"step": 4905,
"token_acc": 0.5577557755775577,
"train_speed(iter/s)": 0.579586
},
{
"epoch": 2.979368932038835,
"grad_norm": 12.364704132080078,
"learning_rate": 1.1668734839404006e-08,
"loss": 1.8508235931396484,
"memory(GiB)": 44.64,
"step": 4910,
"token_acc": 0.6196078431372549,
"train_speed(iter/s)": 0.579638
},
{
"epoch": 2.9824029126213594,
"grad_norm": 7.556114196777344,
"learning_rate": 8.489195514888027e-09,
"loss": 2.153472137451172,
"memory(GiB)": 44.64,
"step": 4915,
"token_acc": 0.5749235474006116,
"train_speed(iter/s)": 0.579639
},
{
"epoch": 2.9854368932038833,
"grad_norm": 5.7980546951293945,
"learning_rate": 5.814292768108187e-09,
"loss": 1.972738265991211,
"memory(GiB)": 44.64,
"step": 4920,
"token_acc": 0.5417721518987342,
"train_speed(iter/s)": 0.579645
},
{
"epoch": 2.9884708737864076,
"grad_norm": 8.18667221069336,
"learning_rate": 3.644053600726505e-09,
"loss": 2.019988441467285,
"memory(GiB)": 44.64,
"step": 4925,
"token_acc": 0.5647058823529412,
"train_speed(iter/s)": 0.579688
},
{
"epoch": 2.991504854368932,
"grad_norm": 7.362902641296387,
"learning_rate": 1.978499920096688e-09,
"loss": 1.9861087799072266,
"memory(GiB)": 44.64,
"step": 4930,
"token_acc": 0.5605095541401274,
"train_speed(iter/s)": 0.579756
},
{
"epoch": 2.9945388349514563,
"grad_norm": 7.812079906463623,
"learning_rate": 8.176485390642974e-10,
"loss": 1.789814567565918,
"memory(GiB)": 44.64,
"step": 4935,
"token_acc": 0.5348837209302325,
"train_speed(iter/s)": 0.579831
},
{
"epoch": 2.9975728155339807,
"grad_norm": 9.960822105407715,
"learning_rate": 1.6151117577800633e-10,
"loss": 2.1190773010253907,
"memory(GiB)": 44.64,
"step": 4940,
"token_acc": 0.5483870967741935,
"train_speed(iter/s)": 0.579872
},
{
"epoch": 3.0,
"eval_loss": 1.9859907627105713,
"eval_runtime": 12.2556,
"eval_samples_per_second": 8.16,
"eval_steps_per_second": 8.16,
"eval_token_acc": 0.5401554404145078,
"step": 4944
}
],
"logging_steps": 5,
"max_steps": 4944,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.826945200557008e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}