Decaf-Gen-22b / trainer_state.json
alexshypula's picture
Upload folder using huggingface_hub
54183fa verified
{
"best_global_step": 4170,
"best_metric": 0.9509243994462685,
"best_model_checkpoint": "/workspaces/decompile_search/data/models/jan_experiments/stripped_unstripped_22b_unstripped_stop/checkpoint-4170",
"epoch": 0.8347588717015468,
"eval_steps": 30,
"global_step": 4170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.9579070736061442,
"epoch": 0.0010009099181073704,
"grad_norm": 31.209657669067383,
"learning_rate": 8e-09,
"loss": 1.2257,
"mean_token_accuracy": 0.7700222129171544,
"num_tokens": 1145748.0,
"step": 5
},
{
"entropy": 1.011449571089311,
"epoch": 0.002001819836214741,
"grad_norm": 23.227827072143555,
"learning_rate": 1.8e-08,
"loss": 1.2785,
"mean_token_accuracy": 0.7474062916907397,
"num_tokens": 2249793.0,
"step": 10
},
{
"entropy": 1.05531109679829,
"epoch": 0.003002729754322111,
"grad_norm": 17.10189437866211,
"learning_rate": 2.8e-08,
"loss": 1.3173,
"mean_token_accuracy": 0.732902865247293,
"num_tokens": 3277722.0,
"step": 15
},
{
"entropy": 1.080242946473035,
"epoch": 0.004003639672429482,
"grad_norm": 10.145275115966797,
"learning_rate": 3.7999999999999996e-08,
"loss": 1.3569,
"mean_token_accuracy": 0.7288111207160083,
"num_tokens": 4232543.0,
"step": 20
},
{
"entropy": 1.1169310082088817,
"epoch": 0.005004549590536852,
"grad_norm": 41.83009338378906,
"learning_rate": 4.8e-08,
"loss": 1.548,
"mean_token_accuracy": 0.7169481502337889,
"num_tokens": 4962149.0,
"step": 25
},
{
"entropy": 0.9460688650608062,
"epoch": 0.006005459508644222,
"grad_norm": 29.797571182250977,
"learning_rate": 5.8e-08,
"loss": 1.1969,
"mean_token_accuracy": 0.7767708816311576,
"num_tokens": 6078015.0,
"step": 30
},
{
"epoch": 0.006005459508644222,
"eval_entropy": 0.8858045388440616,
"eval_loss": 1.0837815999984741,
"eval_mean_token_accuracy": 0.7985450283425753,
"eval_num_tokens": 6078015.0,
"eval_runtime": 7.1989,
"eval_samples_per_second": 135.159,
"eval_steps_per_second": 8.473,
"step": 30
},
{
"entropy": 1.004025985436006,
"epoch": 0.0070063694267515925,
"grad_norm": 20.322914123535156,
"learning_rate": 6.8e-08,
"loss": 1.2249,
"mean_token_accuracy": 0.757005724310875,
"num_tokens": 7170484.0,
"step": 35
},
{
"entropy": 1.0326186922463503,
"epoch": 0.008007279344858963,
"grad_norm": 14.3685302734375,
"learning_rate": 7.8e-08,
"loss": 1.2034,
"mean_token_accuracy": 0.7555106471885334,
"num_tokens": 8215978.0,
"step": 40
},
{
"entropy": 1.050350191376426,
"epoch": 0.009008189262966333,
"grad_norm": 8.891011238098145,
"learning_rate": 8.8e-08,
"loss": 1.2058,
"mean_token_accuracy": 0.7581022883003409,
"num_tokens": 9176310.0,
"step": 45
},
{
"entropy": 1.0481942902911794,
"epoch": 0.010009099181073703,
"grad_norm": 25.542049407958984,
"learning_rate": 9.8e-08,
"loss": 1.2043,
"mean_token_accuracy": 0.7758935884995894,
"num_tokens": 9907937.0,
"step": 50
},
{
"entropy": 0.8896388709545135,
"epoch": 0.011010009099181074,
"grad_norm": 17.134859085083008,
"learning_rate": 1.0799999999999999e-07,
"loss": 0.9333,
"mean_token_accuracy": 0.8181748888709328,
"num_tokens": 11040470.0,
"step": 55
},
{
"entropy": 0.9338755531744524,
"epoch": 0.012010919017288443,
"grad_norm": 11.280884742736816,
"learning_rate": 1.1799999999999998e-07,
"loss": 0.9171,
"mean_token_accuracy": 0.809243483435024,
"num_tokens": 12138554.0,
"step": 60
},
{
"epoch": 0.012010919017288443,
"eval_entropy": 0.827404188328102,
"eval_loss": 0.7739703059196472,
"eval_mean_token_accuracy": 0.8451327916051521,
"eval_num_tokens": 12138554.0,
"eval_runtime": 7.1713,
"eval_samples_per_second": 135.679,
"eval_steps_per_second": 8.506,
"step": 60
},
{
"entropy": 0.9394035225564783,
"epoch": 0.013011828935395814,
"grad_norm": 6.800612449645996,
"learning_rate": 1.28e-07,
"loss": 0.816,
"mean_token_accuracy": 0.8172335895625028,
"num_tokens": 13161971.0,
"step": 65
},
{
"entropy": 0.931338392604481,
"epoch": 0.014012738853503185,
"grad_norm": 3.9526984691619873,
"learning_rate": 1.3800000000000002e-07,
"loss": 0.729,
"mean_token_accuracy": 0.8396818897940895,
"num_tokens": 14121047.0,
"step": 70
},
{
"entropy": 0.9432308787649328,
"epoch": 0.015013648771610554,
"grad_norm": 13.202656745910645,
"learning_rate": 1.4799999999999998e-07,
"loss": 0.7149,
"mean_token_accuracy": 0.8470803531733426,
"num_tokens": 14853168.0,
"step": 75
},
{
"entropy": 0.8036102744666013,
"epoch": 0.016014558689717927,
"grad_norm": 5.494961261749268,
"learning_rate": 1.5799999999999999e-07,
"loss": 0.5533,
"mean_token_accuracy": 0.8729204730554061,
"num_tokens": 16006267.0,
"step": 80
},
{
"entropy": 0.8627965910868212,
"epoch": 0.017015468607825296,
"grad_norm": 3.109562635421753,
"learning_rate": 1.68e-07,
"loss": 0.527,
"mean_token_accuracy": 0.8722775372591886,
"num_tokens": 17103593.0,
"step": 85
},
{
"entropy": 0.8908803132447329,
"epoch": 0.018016378525932665,
"grad_norm": 1.8945680856704712,
"learning_rate": 1.7799999999999998e-07,
"loss": 0.5005,
"mean_token_accuracy": 0.8758173530752008,
"num_tokens": 18130627.0,
"step": 90
},
{
"epoch": 0.018016378525932665,
"eval_entropy": 0.7762856190321875,
"eval_loss": 0.40602007508277893,
"eval_mean_token_accuracy": 0.899208920900939,
"eval_num_tokens": 18130627.0,
"eval_runtime": 7.1881,
"eval_samples_per_second": 135.362,
"eval_steps_per_second": 8.486,
"step": 90
},
{
"entropy": 0.9098557423461567,
"epoch": 0.019017288444040038,
"grad_norm": 1.237993597984314,
"learning_rate": 1.88e-07,
"loss": 0.4722,
"mean_token_accuracy": 0.8838203982873396,
"num_tokens": 19076434.0,
"step": 95
},
{
"entropy": 0.9214537311684001,
"epoch": 0.020018198362147407,
"grad_norm": 5.1105427742004395,
"learning_rate": 1.98e-07,
"loss": 0.4761,
"mean_token_accuracy": 0.885197820446708,
"num_tokens": 19806235.0,
"step": 100
},
{
"entropy": 0.7998497681184249,
"epoch": 0.021019108280254776,
"grad_norm": 2.0025761127471924,
"learning_rate": 2.0799999999999998e-07,
"loss": 0.4172,
"mean_token_accuracy": 0.8957362061197107,
"num_tokens": 20936435.0,
"step": 105
},
{
"entropy": 0.8544187637892636,
"epoch": 0.02202001819836215,
"grad_norm": 1.4504088163375854,
"learning_rate": 2.18e-07,
"loss": 0.4103,
"mean_token_accuracy": 0.8934655557979237,
"num_tokens": 22013484.0,
"step": 110
},
{
"entropy": 0.8855489952997728,
"epoch": 0.023020928116469518,
"grad_norm": 1.1078195571899414,
"learning_rate": 2.28e-07,
"loss": 0.4117,
"mean_token_accuracy": 0.8933108893307773,
"num_tokens": 23045886.0,
"step": 115
},
{
"entropy": 0.9073191062970595,
"epoch": 0.024021838034576887,
"grad_norm": 0.9317752718925476,
"learning_rate": 2.38e-07,
"loss": 0.4015,
"mean_token_accuracy": 0.8976379817182367,
"num_tokens": 23997532.0,
"step": 120
},
{
"epoch": 0.024021838034576887,
"eval_entropy": 0.7722517046772066,
"eval_loss": 0.29370784759521484,
"eval_mean_token_accuracy": 0.9163104364129363,
"eval_num_tokens": 23997532.0,
"eval_runtime": 7.1752,
"eval_samples_per_second": 135.606,
"eval_steps_per_second": 8.502,
"step": 120
},
{
"entropy": 0.9209811920469457,
"epoch": 0.02502274795268426,
"grad_norm": 3.9668571949005127,
"learning_rate": 2.48e-07,
"loss": 0.4115,
"mean_token_accuracy": 0.8966231562874534,
"num_tokens": 24736731.0,
"step": 125
},
{
"entropy": 0.7977617914026434,
"epoch": 0.02602365787079163,
"grad_norm": 1.2996954917907715,
"learning_rate": 2.58e-07,
"loss": 0.3835,
"mean_token_accuracy": 0.9029265501282432,
"num_tokens": 25872435.0,
"step": 130
},
{
"entropy": 0.8485897243022918,
"epoch": 0.027024567788898998,
"grad_norm": 1.0686163902282715,
"learning_rate": 2.68e-07,
"loss": 0.3879,
"mean_token_accuracy": 0.8995744439688595,
"num_tokens": 26959841.0,
"step": 135
},
{
"entropy": 0.8709002348509702,
"epoch": 0.02802547770700637,
"grad_norm": 0.92051762342453,
"learning_rate": 2.7800000000000003e-07,
"loss": 0.3856,
"mean_token_accuracy": 0.9004006510431116,
"num_tokens": 27993013.0,
"step": 140
},
{
"entropy": 0.8909420809962533,
"epoch": 0.02902638762511374,
"grad_norm": 0.8413939476013184,
"learning_rate": 2.88e-07,
"loss": 0.388,
"mean_token_accuracy": 0.9014144734902816,
"num_tokens": 28946080.0,
"step": 145
},
{
"entropy": 0.9160865225575187,
"epoch": 0.03002729754322111,
"grad_norm": 4.02235221862793,
"learning_rate": 2.98e-07,
"loss": 0.3961,
"mean_token_accuracy": 0.9003071714531291,
"num_tokens": 29669559.0,
"step": 150
},
{
"epoch": 0.03002729754322111,
"eval_entropy": 0.773854380748311,
"eval_loss": 0.2762528359889984,
"eval_mean_token_accuracy": 0.9212246898744927,
"eval_num_tokens": 29669559.0,
"eval_runtime": 7.2541,
"eval_samples_per_second": 134.132,
"eval_steps_per_second": 8.409,
"step": 150
},
{
"entropy": 0.7960180721499703,
"epoch": 0.03102820746132848,
"grad_norm": 1.3376996517181396,
"learning_rate": 3.08e-07,
"loss": 0.3679,
"mean_token_accuracy": 0.9063912429592826,
"num_tokens": 30786460.0,
"step": 155
},
{
"entropy": 0.8393291793086312,
"epoch": 0.032029117379435854,
"grad_norm": 0.9782769083976746,
"learning_rate": 3.18e-07,
"loss": 0.3739,
"mean_token_accuracy": 0.9036351117220792,
"num_tokens": 31893205.0,
"step": 160
},
{
"entropy": 0.8657339849255302,
"epoch": 0.03303002729754322,
"grad_norm": 0.916830837726593,
"learning_rate": 3.28e-07,
"loss": 0.3732,
"mean_token_accuracy": 0.9034594573757865,
"num_tokens": 32917211.0,
"step": 165
},
{
"entropy": 0.8868166403336959,
"epoch": 0.03403093721565059,
"grad_norm": 0.8498401045799255,
"learning_rate": 3.38e-07,
"loss": 0.3753,
"mean_token_accuracy": 0.9043488231572238,
"num_tokens": 33872966.0,
"step": 170
},
{
"entropy": 0.9020868615670637,
"epoch": 0.03503184713375796,
"grad_norm": 3.632817506790161,
"learning_rate": 3.4799999999999994e-07,
"loss": 0.3814,
"mean_token_accuracy": 0.9035663994875821,
"num_tokens": 34603817.0,
"step": 175
},
{
"entropy": 0.794374041665684,
"epoch": 0.03603275705186533,
"grad_norm": 1.3152629137039185,
"learning_rate": 3.5799999999999995e-07,
"loss": 0.3598,
"mean_token_accuracy": 0.9083835087039254,
"num_tokens": 35738363.0,
"step": 180
},
{
"epoch": 0.03603275705186533,
"eval_entropy": 0.7766775371598416,
"eval_loss": 0.2652251720428467,
"eval_mean_token_accuracy": 0.9241212303521203,
"eval_num_tokens": 35738363.0,
"eval_runtime": 7.1835,
"eval_samples_per_second": 135.449,
"eval_steps_per_second": 8.492,
"step": 180
},
{
"entropy": 0.8374889547174628,
"epoch": 0.0370336669699727,
"grad_norm": 1.069807767868042,
"learning_rate": 3.6799999999999996e-07,
"loss": 0.362,
"mean_token_accuracy": 0.906164778362621,
"num_tokens": 36835769.0,
"step": 185
},
{
"entropy": 0.865638066963716,
"epoch": 0.038034576888080075,
"grad_norm": 0.9059945344924927,
"learning_rate": 3.7799999999999997e-07,
"loss": 0.3667,
"mean_token_accuracy": 0.9057478319514881,
"num_tokens": 37868585.0,
"step": 190
},
{
"entropy": 0.8880685941739516,
"epoch": 0.039035486806187444,
"grad_norm": 0.7509809136390686,
"learning_rate": 3.88e-07,
"loss": 0.3578,
"mean_token_accuracy": 0.9070773032578555,
"num_tokens": 38807693.0,
"step": 195
},
{
"entropy": 0.909015315771103,
"epoch": 0.040036396724294813,
"grad_norm": 3.0933594703674316,
"learning_rate": 3.98e-07,
"loss": 0.3733,
"mean_token_accuracy": 0.9064179236238653,
"num_tokens": 39535122.0,
"step": 200
},
{
"entropy": 0.7939805047078566,
"epoch": 0.04103730664240218,
"grad_norm": 1.179566740989685,
"learning_rate": 4.0799999999999995e-07,
"loss": 0.3551,
"mean_token_accuracy": 0.9092546192082491,
"num_tokens": 40666622.0,
"step": 205
},
{
"entropy": 0.846534158966758,
"epoch": 0.04203821656050955,
"grad_norm": 0.9151410460472107,
"learning_rate": 4.1799999999999996e-07,
"loss": 0.3589,
"mean_token_accuracy": 0.9077435683120381,
"num_tokens": 41748748.0,
"step": 210
},
{
"epoch": 0.04203821656050955,
"eval_entropy": 0.7851541091184147,
"eval_loss": 0.25734254717826843,
"eval_mean_token_accuracy": 0.9266526767464934,
"eval_num_tokens": 41748748.0,
"eval_runtime": 7.1633,
"eval_samples_per_second": 135.832,
"eval_steps_per_second": 8.516,
"step": 210
},
{
"entropy": 0.8759665044871243,
"epoch": 0.04303912647861693,
"grad_norm": 1.0037354230880737,
"learning_rate": 4.2799999999999997e-07,
"loss": 0.3613,
"mean_token_accuracy": 0.906467653946443,
"num_tokens": 42776021.0,
"step": 215
},
{
"entropy": 0.8946322251449932,
"epoch": 0.0440400363967243,
"grad_norm": 0.7233147621154785,
"learning_rate": 4.38e-07,
"loss": 0.3685,
"mean_token_accuracy": 0.9061482787132263,
"num_tokens": 43731864.0,
"step": 220
},
{
"entropy": 0.9084791925820437,
"epoch": 0.045040946314831666,
"grad_norm": 3.0358214378356934,
"learning_rate": 4.48e-07,
"loss": 0.3644,
"mean_token_accuracy": 0.9079454855485396,
"num_tokens": 44458397.0,
"step": 225
},
{
"entropy": 0.795162683725357,
"epoch": 0.046041856232939035,
"grad_norm": 1.060544490814209,
"learning_rate": 4.58e-07,
"loss": 0.347,
"mean_token_accuracy": 0.9115611629052596,
"num_tokens": 45582504.0,
"step": 230
},
{
"entropy": 0.8501786047762091,
"epoch": 0.047042766151046404,
"grad_norm": 0.8345633745193481,
"learning_rate": 4.68e-07,
"loss": 0.3518,
"mean_token_accuracy": 0.9087835637005892,
"num_tokens": 46691664.0,
"step": 235
},
{
"entropy": 0.8739784273234281,
"epoch": 0.04804367606915377,
"grad_norm": 0.8899365067481995,
"learning_rate": 4.779999999999999e-07,
"loss": 0.354,
"mean_token_accuracy": 0.9071491924199191,
"num_tokens": 47712971.0,
"step": 240
},
{
"epoch": 0.04804367606915377,
"eval_entropy": 0.7832569479942322,
"eval_loss": 0.25191357731819153,
"eval_mean_token_accuracy": 0.9278596326953075,
"eval_num_tokens": 47712971.0,
"eval_runtime": 7.1165,
"eval_samples_per_second": 136.725,
"eval_steps_per_second": 8.572,
"step": 240
},
{
"entropy": 0.8918125726959922,
"epoch": 0.04904458598726115,
"grad_norm": 0.7503334879875183,
"learning_rate": 4.879999999999999e-07,
"loss": 0.3536,
"mean_token_accuracy": 0.9086009448224848,
"num_tokens": 48657962.0,
"step": 245
},
{
"entropy": 0.9069400212981484,
"epoch": 0.05004549590536852,
"grad_norm": 2.7447826862335205,
"learning_rate": 4.979999999999999e-07,
"loss": 0.3607,
"mean_token_accuracy": 0.9092577500776811,
"num_tokens": 49382591.0,
"step": 250
},
{
"entropy": 0.7897281029007651,
"epoch": 0.05104640582347589,
"grad_norm": 1.0658516883850098,
"learning_rate": 5.079999999999999e-07,
"loss": 0.3343,
"mean_token_accuracy": 0.9143867644396695,
"num_tokens": 50493880.0,
"step": 255
},
{
"entropy": 0.8420896362174641,
"epoch": 0.05204731574158326,
"grad_norm": 0.9911607503890991,
"learning_rate": 5.18e-07,
"loss": 0.3467,
"mean_token_accuracy": 0.9096509906378659,
"num_tokens": 51593094.0,
"step": 260
},
{
"entropy": 0.8670889122919603,
"epoch": 0.053048225659690626,
"grad_norm": 0.8509455323219299,
"learning_rate": 5.28e-07,
"loss": 0.3429,
"mean_token_accuracy": 0.9105742725459012,
"num_tokens": 52624215.0,
"step": 265
},
{
"entropy": 0.8908169654282656,
"epoch": 0.054049135577797995,
"grad_norm": 1.258712649345398,
"learning_rate": 5.38e-07,
"loss": 0.3471,
"mean_token_accuracy": 0.9100607395172119,
"num_tokens": 53579583.0,
"step": 270
},
{
"epoch": 0.054049135577797995,
"eval_entropy": 0.781644055100738,
"eval_loss": 0.24734267592430115,
"eval_mean_token_accuracy": 0.9283195352945172,
"eval_num_tokens": 53579583.0,
"eval_runtime": 7.1344,
"eval_samples_per_second": 136.382,
"eval_steps_per_second": 8.55,
"step": 270
},
{
"entropy": 0.8974328610030088,
"epoch": 0.05505004549590537,
"grad_norm": 2.7811849117279053,
"learning_rate": 5.48e-07,
"loss": 0.3531,
"mean_token_accuracy": 0.9098310795697299,
"num_tokens": 54306230.0,
"step": 275
},
{
"entropy": 0.790298336202448,
"epoch": 0.05605095541401274,
"grad_norm": 1.0269526243209839,
"learning_rate": 5.58e-07,
"loss": 0.3426,
"mean_token_accuracy": 0.912391439893029,
"num_tokens": 55434008.0,
"step": 280
},
{
"entropy": 0.8415309033610604,
"epoch": 0.05705186533212011,
"grad_norm": 0.8907763957977295,
"learning_rate": 5.679999999999999e-07,
"loss": 0.3457,
"mean_token_accuracy": 0.9103805682875893,
"num_tokens": 56527326.0,
"step": 285
},
{
"entropy": 0.8678737878799438,
"epoch": 0.05805277525022748,
"grad_norm": 0.933925449848175,
"learning_rate": 5.779999999999999e-07,
"loss": 0.3435,
"mean_token_accuracy": 0.9097982775081288,
"num_tokens": 57577716.0,
"step": 290
},
{
"entropy": 0.8823735535144805,
"epoch": 0.05905368516833485,
"grad_norm": 0.7687504291534424,
"learning_rate": 5.879999999999999e-07,
"loss": 0.3448,
"mean_token_accuracy": 0.9104289829730987,
"num_tokens": 58525576.0,
"step": 295
},
{
"entropy": 0.8998256000605497,
"epoch": 0.06005459508644222,
"grad_norm": 2.8133366107940674,
"learning_rate": 5.979999999999999e-07,
"loss": 0.3474,
"mean_token_accuracy": 0.9119950543750416,
"num_tokens": 59255710.0,
"step": 300
},
{
"epoch": 0.06005459508644222,
"eval_entropy": 0.7829059782575388,
"eval_loss": 0.24350076913833618,
"eval_mean_token_accuracy": 0.9303004888237499,
"eval_num_tokens": 59255710.0,
"eval_runtime": 7.4044,
"eval_samples_per_second": 131.408,
"eval_steps_per_second": 8.238,
"step": 300
},
{
"entropy": 0.7927286684513092,
"epoch": 0.06105550500454959,
"grad_norm": 0.9818949103355408,
"learning_rate": 6.079999999999999e-07,
"loss": 0.3337,
"mean_token_accuracy": 0.9142222328619524,
"num_tokens": 60371359.0,
"step": 305
},
{
"entropy": 0.8404727691953833,
"epoch": 0.06205641492265696,
"grad_norm": 0.8817327618598938,
"learning_rate": 6.18e-07,
"loss": 0.336,
"mean_token_accuracy": 0.9129292596470225,
"num_tokens": 61459516.0,
"step": 310
},
{
"entropy": 0.8821817175908522,
"epoch": 0.06305732484076433,
"grad_norm": 1.0255104303359985,
"learning_rate": 6.28e-07,
"loss": 0.3445,
"mean_token_accuracy": 0.9102013338695872,
"num_tokens": 62511188.0,
"step": 315
},
{
"entropy": 0.8907575244253332,
"epoch": 0.06405823475887171,
"grad_norm": 0.6242014169692993,
"learning_rate": 6.38e-07,
"loss": 0.3306,
"mean_token_accuracy": 0.9132393593137914,
"num_tokens": 63462734.0,
"step": 320
},
{
"entropy": 0.9019130771810359,
"epoch": 0.06505914467697907,
"grad_norm": 2.4068405628204346,
"learning_rate": 6.48e-07,
"loss": 0.3431,
"mean_token_accuracy": 0.9133785475384105,
"num_tokens": 64195174.0,
"step": 325
},
{
"entropy": 0.7956698184663599,
"epoch": 0.06606005459508645,
"grad_norm": 1.0124462842941284,
"learning_rate": 6.58e-07,
"loss": 0.3232,
"mean_token_accuracy": 0.916981242461638,
"num_tokens": 65310690.0,
"step": 330
},
{
"epoch": 0.06606005459508645,
"eval_entropy": 0.7826571161629724,
"eval_loss": 0.2395564466714859,
"eval_mean_token_accuracy": 0.9312737300747731,
"eval_num_tokens": 65310690.0,
"eval_runtime": 7.1164,
"eval_samples_per_second": 136.726,
"eval_steps_per_second": 8.572,
"step": 330
},
{
"entropy": 0.8469071046872573,
"epoch": 0.06706096451319381,
"grad_norm": 0.8455765843391418,
"learning_rate": 6.68e-07,
"loss": 0.3336,
"mean_token_accuracy": 0.9130166606469587,
"num_tokens": 66407916.0,
"step": 335
},
{
"entropy": 0.8715857310728593,
"epoch": 0.06806187443130118,
"grad_norm": 0.848980188369751,
"learning_rate": 6.78e-07,
"loss": 0.332,
"mean_token_accuracy": 0.9133841227401387,
"num_tokens": 67462675.0,
"step": 340
},
{
"entropy": 0.8887097320773385,
"epoch": 0.06906278434940856,
"grad_norm": 0.7524086236953735,
"learning_rate": 6.879999999999999e-07,
"loss": 0.3293,
"mean_token_accuracy": 0.9141406866637143,
"num_tokens": 68430686.0,
"step": 345
},
{
"entropy": 0.9178955408659848,
"epoch": 0.07006369426751592,
"grad_norm": 2.6862826347351074,
"learning_rate": 6.979999999999999e-07,
"loss": 0.3503,
"mean_token_accuracy": 0.910759204084223,
"num_tokens": 69159538.0,
"step": 350
},
{
"entropy": 0.7949739504944194,
"epoch": 0.0710646041856233,
"grad_norm": 0.939888060092926,
"learning_rate": 7.079999999999999e-07,
"loss": 0.3226,
"mean_token_accuracy": 0.917046070098877,
"num_tokens": 70297859.0,
"step": 355
},
{
"entropy": 0.8522608453577215,
"epoch": 0.07206551410373066,
"grad_norm": 0.8733773827552795,
"learning_rate": 7.179999999999999e-07,
"loss": 0.3245,
"mean_token_accuracy": 0.9145838070999492,
"num_tokens": 71388877.0,
"step": 360
},
{
"epoch": 0.07206551410373066,
"eval_entropy": 0.7842210042672079,
"eval_loss": 0.23638135194778442,
"eval_mean_token_accuracy": 0.9317856763229996,
"eval_num_tokens": 71388877.0,
"eval_runtime": 7.0752,
"eval_samples_per_second": 137.523,
"eval_steps_per_second": 8.622,
"step": 360
},
{
"entropy": 0.8792784777554599,
"epoch": 0.07306642402183804,
"grad_norm": 0.9495289921760559,
"learning_rate": 7.28e-07,
"loss": 0.3357,
"mean_token_accuracy": 0.9130886435508728,
"num_tokens": 72425465.0,
"step": 365
},
{
"entropy": 0.8881763842972842,
"epoch": 0.0740673339399454,
"grad_norm": 0.7181362509727478,
"learning_rate": 7.38e-07,
"loss": 0.3273,
"mean_token_accuracy": 0.9161153722893108,
"num_tokens": 73377921.0,
"step": 370
},
{
"entropy": 0.9161553989757191,
"epoch": 0.07506824385805277,
"grad_norm": 2.3346126079559326,
"learning_rate": 7.48e-07,
"loss": 0.3416,
"mean_token_accuracy": 0.9133548351851377,
"num_tokens": 74109742.0,
"step": 375
},
{
"entropy": 0.7970652249726382,
"epoch": 0.07606915377616015,
"grad_norm": 0.9897024035453796,
"learning_rate": 7.58e-07,
"loss": 0.3206,
"mean_token_accuracy": 0.9177604393525557,
"num_tokens": 75254960.0,
"step": 380
},
{
"entropy": 0.8417174593968825,
"epoch": 0.07707006369426751,
"grad_norm": 1.0447583198547363,
"learning_rate": 7.68e-07,
"loss": 0.3204,
"mean_token_accuracy": 0.9170865096829154,
"num_tokens": 76368490.0,
"step": 385
},
{
"entropy": 0.8782219377431002,
"epoch": 0.07807097361237489,
"grad_norm": 0.9332379698753357,
"learning_rate": 7.78e-07,
"loss": 0.3301,
"mean_token_accuracy": 0.9143734498457475,
"num_tokens": 77412393.0,
"step": 390
},
{
"epoch": 0.07807097361237489,
"eval_entropy": 0.779330243829821,
"eval_loss": 0.23134513199329376,
"eval_mean_token_accuracy": 0.9325118944293163,
"eval_num_tokens": 77412393.0,
"eval_runtime": 7.1925,
"eval_samples_per_second": 135.279,
"eval_steps_per_second": 8.481,
"step": 390
},
{
"entropy": 0.8939531375061381,
"epoch": 0.07907188353048225,
"grad_norm": 0.7231958508491516,
"learning_rate": 7.88e-07,
"loss": 0.3267,
"mean_token_accuracy": 0.9155195750973442,
"num_tokens": 78362341.0,
"step": 395
},
{
"entropy": 0.9062723474069075,
"epoch": 0.08007279344858963,
"grad_norm": 2.0436787605285645,
"learning_rate": 7.98e-07,
"loss": 0.3365,
"mean_token_accuracy": 0.9146250042048367,
"num_tokens": 79096898.0,
"step": 400
},
{
"entropy": 0.7945969061418013,
"epoch": 0.081073703366697,
"grad_norm": 0.9359119534492493,
"learning_rate": 8.08e-07,
"loss": 0.3177,
"mean_token_accuracy": 0.9181306638500907,
"num_tokens": 80217692.0,
"step": 405
},
{
"entropy": 0.844357325813987,
"epoch": 0.08207461328480437,
"grad_norm": 0.8438592553138733,
"learning_rate": 8.179999999999999e-07,
"loss": 0.3203,
"mean_token_accuracy": 0.9169050964442167,
"num_tokens": 81307270.0,
"step": 410
},
{
"entropy": 0.8725964551622217,
"epoch": 0.08307552320291174,
"grad_norm": 0.7924903035163879,
"learning_rate": 8.28e-07,
"loss": 0.3209,
"mean_token_accuracy": 0.9159043675119226,
"num_tokens": 82357546.0,
"step": 415
},
{
"entropy": 0.8886379382827065,
"epoch": 0.0840764331210191,
"grad_norm": 0.7205538153648376,
"learning_rate": 8.38e-07,
"loss": 0.3235,
"mean_token_accuracy": 0.9162523784420707,
"num_tokens": 83328951.0,
"step": 420
},
{
"epoch": 0.0840764331210191,
"eval_entropy": 0.7785051123040622,
"eval_loss": 0.2309650033712387,
"eval_mean_token_accuracy": 0.9339301380954805,
"eval_num_tokens": 83328951.0,
"eval_runtime": 7.1338,
"eval_samples_per_second": 136.393,
"eval_steps_per_second": 8.551,
"step": 420
},
{
"entropy": 0.9015932933850722,
"epoch": 0.08507734303912648,
"grad_norm": 2.2584614753723145,
"learning_rate": 8.48e-07,
"loss": 0.3281,
"mean_token_accuracy": 0.9161337093873457,
"num_tokens": 84065002.0,
"step": 425
},
{
"entropy": 0.7703537437048825,
"epoch": 0.08607825295723386,
"grad_norm": 0.8600314855575562,
"learning_rate": 8.58e-07,
"loss": 0.3051,
"mean_token_accuracy": 0.9216179517182437,
"num_tokens": 85217505.0,
"step": 430
},
{
"entropy": 0.837453713200309,
"epoch": 0.08707916287534122,
"grad_norm": 0.837756335735321,
"learning_rate": 8.68e-07,
"loss": 0.3171,
"mean_token_accuracy": 0.9169958277182145,
"num_tokens": 86302592.0,
"step": 435
},
{
"entropy": 0.8648753382942893,
"epoch": 0.0880800727934486,
"grad_norm": 0.8668557405471802,
"learning_rate": 8.78e-07,
"loss": 0.3128,
"mean_token_accuracy": 0.9178522229194641,
"num_tokens": 87371908.0,
"step": 440
},
{
"entropy": 0.8813207095319574,
"epoch": 0.08908098271155596,
"grad_norm": 0.7655621767044067,
"learning_rate": 8.88e-07,
"loss": 0.325,
"mean_token_accuracy": 0.9160207412459633,
"num_tokens": 88338748.0,
"step": 445
},
{
"entropy": 0.8887845928018744,
"epoch": 0.09008189262966333,
"grad_norm": 1.8667429685592651,
"learning_rate": 8.98e-07,
"loss": 0.3201,
"mean_token_accuracy": 0.918709414655512,
"num_tokens": 89071435.0,
"step": 450
},
{
"epoch": 0.09008189262966333,
"eval_entropy": 0.7726943004326742,
"eval_loss": 0.2284156084060669,
"eval_mean_token_accuracy": 0.9343840216026932,
"eval_num_tokens": 89071435.0,
"eval_runtime": 7.0961,
"eval_samples_per_second": 137.118,
"eval_steps_per_second": 8.596,
"step": 450
},
{
"entropy": 0.7833973987535997,
"epoch": 0.0910828025477707,
"grad_norm": 0.9149163365364075,
"learning_rate": 9.08e-07,
"loss": 0.309,
"mean_token_accuracy": 0.9209194253791463,
"num_tokens": 90204895.0,
"step": 455
},
{
"entropy": 0.8328197641806169,
"epoch": 0.09208371246587807,
"grad_norm": 0.8828219771385193,
"learning_rate": 9.18e-07,
"loss": 0.3113,
"mean_token_accuracy": 0.9182033365423029,
"num_tokens": 91290928.0,
"step": 460
},
{
"entropy": 0.8542791778391058,
"epoch": 0.09308462238398545,
"grad_norm": 0.847339928150177,
"learning_rate": 9.28e-07,
"loss": 0.3111,
"mean_token_accuracy": 0.9180883992802013,
"num_tokens": 92353449.0,
"step": 465
},
{
"entropy": 0.8792417260733518,
"epoch": 0.09408553230209281,
"grad_norm": 0.6507946848869324,
"learning_rate": 9.379999999999998e-07,
"loss": 0.3093,
"mean_token_accuracy": 0.9185398605736819,
"num_tokens": 93329097.0,
"step": 470
},
{
"entropy": 0.8795178760181773,
"epoch": 0.09508644222020018,
"grad_norm": 1.9719516038894653,
"learning_rate": 9.479999999999999e-07,
"loss": 0.3248,
"mean_token_accuracy": 0.9174890155142004,
"num_tokens": 94076001.0,
"step": 475
},
{
"entropy": 0.7680907699194821,
"epoch": 0.09608735213830755,
"grad_norm": 0.9014910459518433,
"learning_rate": 9.58e-07,
"loss": 0.2999,
"mean_token_accuracy": 0.9230750652876767,
"num_tokens": 95199341.0,
"step": 480
},
{
"epoch": 0.09608735213830755,
"eval_entropy": 0.7766347244137624,
"eval_loss": 0.22492578625679016,
"eval_mean_token_accuracy": 0.9348700437389437,
"eval_num_tokens": 95199341.0,
"eval_runtime": 7.167,
"eval_samples_per_second": 135.761,
"eval_steps_per_second": 8.511,
"step": 480
},
{
"entropy": 0.8319806510751898,
"epoch": 0.09708826205641492,
"grad_norm": 0.8409860134124756,
"learning_rate": 9.679999999999999e-07,
"loss": 0.3096,
"mean_token_accuracy": 0.919183918020942,
"num_tokens": 96280405.0,
"step": 485
},
{
"entropy": 0.8512482968243685,
"epoch": 0.0980891719745223,
"grad_norm": 0.7460948824882507,
"learning_rate": 9.78e-07,
"loss": 0.3048,
"mean_token_accuracy": 0.9195660710334778,
"num_tokens": 97324958.0,
"step": 490
},
{
"entropy": 0.8670887063850056,
"epoch": 0.09909008189262966,
"grad_norm": 0.7144485116004944,
"learning_rate": 9.88e-07,
"loss": 0.3047,
"mean_token_accuracy": 0.9209712348201058,
"num_tokens": 98281184.0,
"step": 495
},
{
"entropy": 0.876132286678661,
"epoch": 0.10009099181073704,
"grad_norm": 2.102391242980957,
"learning_rate": 9.98e-07,
"loss": 0.3134,
"mean_token_accuracy": 0.9201786610213193,
"num_tokens": 99020369.0,
"step": 500
},
{
"entropy": 0.772329353202473,
"epoch": 0.1010919017288444,
"grad_norm": 0.9755299687385559,
"learning_rate": 1.008e-06,
"loss": 0.3018,
"mean_token_accuracy": 0.9223481546748769,
"num_tokens": 100156631.0,
"step": 505
},
{
"entropy": 0.8236495657400651,
"epoch": 0.10209281164695178,
"grad_norm": 0.8604740500450134,
"learning_rate": 1.018e-06,
"loss": 0.3032,
"mean_token_accuracy": 0.9208208913152868,
"num_tokens": 101235364.0,
"step": 510
},
{
"epoch": 0.10209281164695178,
"eval_entropy": 0.7710273295152382,
"eval_loss": 0.22457090020179749,
"eval_mean_token_accuracy": 0.9353827013344062,
"eval_num_tokens": 101235364.0,
"eval_runtime": 7.1319,
"eval_samples_per_second": 136.429,
"eval_steps_per_second": 8.553,
"step": 510
},
{
"entropy": 0.856683783639561,
"epoch": 0.10309372156505915,
"grad_norm": 0.9585539102554321,
"learning_rate": 1.028e-06,
"loss": 0.3121,
"mean_token_accuracy": 0.9176510073921897,
"num_tokens": 102253625.0,
"step": 515
},
{
"entropy": 0.8793982776728544,
"epoch": 0.10409463148316651,
"grad_norm": 0.6796140074729919,
"learning_rate": 1.038e-06,
"loss": 0.3192,
"mean_token_accuracy": 0.9176862608302724,
"num_tokens": 103206550.0,
"step": 520
},
{
"entropy": 0.8828882379965348,
"epoch": 0.10509554140127389,
"grad_norm": 1.9700580835342407,
"learning_rate": 1.048e-06,
"loss": 0.3138,
"mean_token_accuracy": 0.9197612930427898,
"num_tokens": 103937735.0,
"step": 525
},
{
"entropy": 0.7799841582775116,
"epoch": 0.10609645131938125,
"grad_norm": 0.9521822929382324,
"learning_rate": 1.058e-06,
"loss": 0.2993,
"mean_token_accuracy": 0.9223499368537557,
"num_tokens": 105062562.0,
"step": 530
},
{
"entropy": 0.8236062943935394,
"epoch": 0.10709736123748863,
"grad_norm": 0.8883262276649475,
"learning_rate": 1.068e-06,
"loss": 0.3043,
"mean_token_accuracy": 0.9200864168730649,
"num_tokens": 106148854.0,
"step": 535
},
{
"entropy": 0.855875781991265,
"epoch": 0.10809827115559599,
"grad_norm": 0.7604862451553345,
"learning_rate": 1.078e-06,
"loss": 0.301,
"mean_token_accuracy": 0.9207951112227006,
"num_tokens": 107184016.0,
"step": 540
},
{
"epoch": 0.10809827115559599,
"eval_entropy": 0.7688479355124177,
"eval_loss": 0.2222135066986084,
"eval_mean_token_accuracy": 0.9355378277966233,
"eval_num_tokens": 107184016.0,
"eval_runtime": 7.1389,
"eval_samples_per_second": 136.295,
"eval_steps_per_second": 8.545,
"step": 540
},
{
"entropy": 0.8719826313582334,
"epoch": 0.10909918107370337,
"grad_norm": 0.6990140676498413,
"learning_rate": 1.088e-06,
"loss": 0.3101,
"mean_token_accuracy": 0.9189783123406496,
"num_tokens": 108136311.0,
"step": 545
},
{
"entropy": 0.8727446463975039,
"epoch": 0.11010009099181074,
"grad_norm": 1.9771708250045776,
"learning_rate": 1.0980000000000001e-06,
"loss": 0.3056,
"mean_token_accuracy": 0.9217729514295404,
"num_tokens": 108870399.0,
"step": 550
},
{
"entropy": 0.7766597162593495,
"epoch": 0.1111010009099181,
"grad_norm": 0.9047304391860962,
"learning_rate": 1.108e-06,
"loss": 0.2989,
"mean_token_accuracy": 0.9228177655826916,
"num_tokens": 110010412.0,
"step": 555
},
{
"entropy": 0.821604372696443,
"epoch": 0.11210191082802548,
"grad_norm": 0.7859129905700684,
"learning_rate": 1.1180000000000001e-06,
"loss": 0.2986,
"mean_token_accuracy": 0.9216692274267023,
"num_tokens": 111101459.0,
"step": 560
},
{
"entropy": 0.8458013583313335,
"epoch": 0.11310282074613284,
"grad_norm": 0.7374240159988403,
"learning_rate": 1.1279999999999998e-06,
"loss": 0.3023,
"mean_token_accuracy": 0.9201881116086786,
"num_tokens": 112137324.0,
"step": 565
},
{
"entropy": 0.8739073032682593,
"epoch": 0.11410373066424022,
"grad_norm": 0.6672995686531067,
"learning_rate": 1.138e-06,
"loss": 0.3039,
"mean_token_accuracy": 0.9213175762783398,
"num_tokens": 113102193.0,
"step": 570
},
{
"epoch": 0.11410373066424022,
"eval_entropy": 0.7622656685407044,
"eval_loss": 0.21812133491039276,
"eval_mean_token_accuracy": 0.9362532193543481,
"eval_num_tokens": 113102193.0,
"eval_runtime": 7.083,
"eval_samples_per_second": 137.372,
"eval_steps_per_second": 8.612,
"step": 570
},
{
"entropy": 0.8674825570800088,
"epoch": 0.1151046405823476,
"grad_norm": 1.9667503833770752,
"learning_rate": 1.1479999999999999e-06,
"loss": 0.3032,
"mean_token_accuracy": 0.9225559597665614,
"num_tokens": 113844512.0,
"step": 575
},
{
"entropy": 0.7683469571850516,
"epoch": 0.11610555050045496,
"grad_norm": 0.8600199818611145,
"learning_rate": 1.158e-06,
"loss": 0.2874,
"mean_token_accuracy": 0.9255136945030906,
"num_tokens": 114973235.0,
"step": 580
},
{
"entropy": 0.8179097500714388,
"epoch": 0.11710646041856233,
"grad_norm": 0.7877588272094727,
"learning_rate": 1.1679999999999999e-06,
"loss": 0.2992,
"mean_token_accuracy": 0.9217895182696256,
"num_tokens": 116057315.0,
"step": 585
},
{
"entropy": 0.8371223628520965,
"epoch": 0.1181073703366697,
"grad_norm": 0.7893165946006775,
"learning_rate": 1.178e-06,
"loss": 0.2992,
"mean_token_accuracy": 0.9224891754713925,
"num_tokens": 117103213.0,
"step": 590
},
{
"entropy": 0.8535070988264951,
"epoch": 0.11910828025477707,
"grad_norm": 0.7347795367240906,
"learning_rate": 1.1879999999999999e-06,
"loss": 0.2933,
"mean_token_accuracy": 0.9236816352063959,
"num_tokens": 118057061.0,
"step": 595
},
{
"entropy": 0.873365730047226,
"epoch": 0.12010919017288443,
"grad_norm": 2.104503870010376,
"learning_rate": 1.1979999999999998e-06,
"loss": 0.3067,
"mean_token_accuracy": 0.9227487694133412,
"num_tokens": 118778234.0,
"step": 600
},
{
"epoch": 0.12010919017288443,
"eval_entropy": 0.7565031569512164,
"eval_loss": 0.21913714706897736,
"eval_mean_token_accuracy": 0.936484013424545,
"eval_num_tokens": 118778234.0,
"eval_runtime": 7.1122,
"eval_samples_per_second": 136.806,
"eval_steps_per_second": 8.577,
"step": 600
},
{
"entropy": 0.7622005002065139,
"epoch": 0.12111010009099181,
"grad_norm": 0.9007834196090698,
"learning_rate": 1.208e-06,
"loss": 0.2921,
"mean_token_accuracy": 0.9243286257440394,
"num_tokens": 119913350.0,
"step": 605
},
{
"entropy": 0.8152731066400355,
"epoch": 0.12211101000909919,
"grad_norm": 0.8577731251716614,
"learning_rate": 1.2179999999999998e-06,
"loss": 0.303,
"mean_token_accuracy": 0.9211578873070804,
"num_tokens": 121005111.0,
"step": 610
},
{
"entropy": 0.8311775321310216,
"epoch": 0.12311191992720655,
"grad_norm": 0.8874506950378418,
"learning_rate": 1.228e-06,
"loss": 0.2941,
"mean_token_accuracy": 0.9234593711116097,
"num_tokens": 122053423.0,
"step": 615
},
{
"entropy": 0.8455709652467207,
"epoch": 0.12411282984531392,
"grad_norm": 0.6410759687423706,
"learning_rate": 1.2379999999999998e-06,
"loss": 0.2869,
"mean_token_accuracy": 0.9240941790017214,
"num_tokens": 123023538.0,
"step": 620
},
{
"entropy": 0.865269153768366,
"epoch": 0.1251137397634213,
"grad_norm": 1.943968653678894,
"learning_rate": 1.248e-06,
"loss": 0.301,
"mean_token_accuracy": 0.9235067736018788,
"num_tokens": 123756604.0,
"step": 625
},
{
"entropy": 0.7617347641424699,
"epoch": 0.12611464968152866,
"grad_norm": 0.8967196941375732,
"learning_rate": 1.2579999999999999e-06,
"loss": 0.2934,
"mean_token_accuracy": 0.9244806538928639,
"num_tokens": 124876188.0,
"step": 630
},
{
"epoch": 0.12611464968152866,
"eval_entropy": 0.758461444104304,
"eval_loss": 0.21852023899555206,
"eval_mean_token_accuracy": 0.9367063485208105,
"eval_num_tokens": 124876188.0,
"eval_runtime": 7.0895,
"eval_samples_per_second": 137.246,
"eval_steps_per_second": 8.604,
"step": 630
},
{
"entropy": 0.8160103900866075,
"epoch": 0.12711555959963602,
"grad_norm": 0.7799694538116455,
"learning_rate": 1.268e-06,
"loss": 0.2946,
"mean_token_accuracy": 0.9232698521830819,
"num_tokens": 125977507.0,
"step": 635
},
{
"entropy": 0.8289537635716525,
"epoch": 0.12811646951774341,
"grad_norm": 0.784251868724823,
"learning_rate": 1.2779999999999999e-06,
"loss": 0.2917,
"mean_token_accuracy": 0.9241768289696086,
"num_tokens": 127032719.0,
"step": 640
},
{
"entropy": 0.8573456623337485,
"epoch": 0.12911737943585078,
"grad_norm": 0.7045451402664185,
"learning_rate": 1.288e-06,
"loss": 0.2984,
"mean_token_accuracy": 0.9235251740975814,
"num_tokens": 127988387.0,
"step": 645
},
{
"entropy": 0.8507555175911297,
"epoch": 0.13011828935395814,
"grad_norm": 1.819658637046814,
"learning_rate": 1.298e-06,
"loss": 0.2914,
"mean_token_accuracy": 0.924909613349221,
"num_tokens": 128721584.0,
"step": 650
},
{
"entropy": 0.7407966630025343,
"epoch": 0.1311191992720655,
"grad_norm": 0.8966879844665527,
"learning_rate": 1.308e-06,
"loss": 0.28,
"mean_token_accuracy": 0.92760883027857,
"num_tokens": 129860649.0,
"step": 655
},
{
"entropy": 0.8003853126005693,
"epoch": 0.1321201091901729,
"grad_norm": 0.8079231977462769,
"learning_rate": 1.318e-06,
"loss": 0.2894,
"mean_token_accuracy": 0.9240497372367166,
"num_tokens": 130965498.0,
"step": 660
},
{
"epoch": 0.1321201091901729,
"eval_entropy": 0.750291645526886,
"eval_loss": 0.213973730802536,
"eval_mean_token_accuracy": 0.9382940315809406,
"eval_num_tokens": 130965498.0,
"eval_runtime": 7.0264,
"eval_samples_per_second": 138.477,
"eval_steps_per_second": 8.681,
"step": 660
},
{
"entropy": 0.8198544599793174,
"epoch": 0.13312101910828025,
"grad_norm": 0.8206574320793152,
"learning_rate": 1.328e-06,
"loss": 0.2865,
"mean_token_accuracy": 0.9246467048471624,
"num_tokens": 132002292.0,
"step": 665
},
{
"entropy": 0.8489445995200764,
"epoch": 0.13412192902638762,
"grad_norm": 0.6715352535247803,
"learning_rate": 1.338e-06,
"loss": 0.2924,
"mean_token_accuracy": 0.9234138223257932,
"num_tokens": 132956333.0,
"step": 670
},
{
"entropy": 0.8532397329807282,
"epoch": 0.135122838944495,
"grad_norm": 1.8673855066299438,
"learning_rate": 1.348e-06,
"loss": 0.2997,
"mean_token_accuracy": 0.9234183029694991,
"num_tokens": 133677965.0,
"step": 675
},
{
"entropy": 0.7524514572186903,
"epoch": 0.13612374886260237,
"grad_norm": 0.8716238737106323,
"learning_rate": 1.358e-06,
"loss": 0.2841,
"mean_token_accuracy": 0.9266363842920824,
"num_tokens": 134802840.0,
"step": 680
},
{
"entropy": 0.8017716060985218,
"epoch": 0.13712465878070973,
"grad_norm": 0.7700985074043274,
"learning_rate": 1.368e-06,
"loss": 0.2855,
"mean_token_accuracy": 0.9250672015276822,
"num_tokens": 135888025.0,
"step": 685
},
{
"entropy": 0.8074500967155803,
"epoch": 0.13812556869881712,
"grad_norm": 0.8650295734405518,
"learning_rate": 1.3779999999999998e-06,
"loss": 0.286,
"mean_token_accuracy": 0.9245203012769873,
"num_tokens": 136924539.0,
"step": 690
},
{
"epoch": 0.13812556869881712,
"eval_entropy": 0.7438388193239931,
"eval_loss": 0.21345947682857513,
"eval_mean_token_accuracy": 0.9379716603482355,
"eval_num_tokens": 136924539.0,
"eval_runtime": 7.2288,
"eval_samples_per_second": 134.6,
"eval_steps_per_second": 8.438,
"step": 690
},
{
"entropy": 0.8413560314611955,
"epoch": 0.13912647861692448,
"grad_norm": 0.6126115918159485,
"learning_rate": 1.3879999999999999e-06,
"loss": 0.2903,
"mean_token_accuracy": 0.924701126055284,
"num_tokens": 137879499.0,
"step": 695
},
{
"entropy": 0.8566647876392711,
"epoch": 0.14012738853503184,
"grad_norm": 1.6679223775863647,
"learning_rate": 1.3979999999999998e-06,
"loss": 0.2998,
"mean_token_accuracy": 0.9239558404142206,
"num_tokens": 138603916.0,
"step": 700
},
{
"entropy": 0.7479892709038475,
"epoch": 0.1411282984531392,
"grad_norm": 0.8888144493103027,
"learning_rate": 1.408e-06,
"loss": 0.281,
"mean_token_accuracy": 0.9272219023921273,
"num_tokens": 139731892.0,
"step": 705
},
{
"entropy": 0.8010973101312464,
"epoch": 0.1421292083712466,
"grad_norm": 0.7999457716941833,
"learning_rate": 1.4179999999999998e-06,
"loss": 0.2862,
"mean_token_accuracy": 0.9249833226203918,
"num_tokens": 140823520.0,
"step": 710
},
{
"entropy": 0.8243546702645042,
"epoch": 0.14313011828935396,
"grad_norm": 0.7929534912109375,
"learning_rate": 1.428e-06,
"loss": 0.289,
"mean_token_accuracy": 0.9242551738565619,
"num_tokens": 141838998.0,
"step": 715
},
{
"entropy": 0.8342987900430506,
"epoch": 0.14413102820746132,
"grad_norm": 0.7797636985778809,
"learning_rate": 1.4379999999999998e-06,
"loss": 0.286,
"mean_token_accuracy": 0.9250240569764917,
"num_tokens": 142788817.0,
"step": 720
},
{
"epoch": 0.14413102820746132,
"eval_entropy": 0.7430196765993462,
"eval_loss": 0.21336835622787476,
"eval_mean_token_accuracy": 0.9381084842760055,
"eval_num_tokens": 142788817.0,
"eval_runtime": 7.0804,
"eval_samples_per_second": 137.421,
"eval_steps_per_second": 8.615,
"step": 720
},
{
"entropy": 0.8445068792863326,
"epoch": 0.1451319381255687,
"grad_norm": 1.6939398050308228,
"learning_rate": 1.448e-06,
"loss": 0.2895,
"mean_token_accuracy": 0.9256705939769745,
"num_tokens": 143528076.0,
"step": 725
},
{
"entropy": 0.7430232730778781,
"epoch": 0.14613284804367607,
"grad_norm": 0.9444680213928223,
"learning_rate": 1.4579999999999998e-06,
"loss": 0.2684,
"mean_token_accuracy": 0.9300248498266394,
"num_tokens": 144669748.0,
"step": 730
},
{
"entropy": 0.8000332848592238,
"epoch": 0.14713375796178343,
"grad_norm": 0.8172219395637512,
"learning_rate": 1.468e-06,
"loss": 0.2831,
"mean_token_accuracy": 0.9256041104143317,
"num_tokens": 145740545.0,
"step": 735
},
{
"entropy": 0.8183066297661175,
"epoch": 0.1481346678798908,
"grad_norm": 0.7712555527687073,
"learning_rate": 1.4779999999999999e-06,
"loss": 0.2832,
"mean_token_accuracy": 0.9251929564909501,
"num_tokens": 146777060.0,
"step": 740
},
{
"entropy": 0.8324488238854841,
"epoch": 0.1491355777979982,
"grad_norm": 0.6426140666007996,
"learning_rate": 1.488e-06,
"loss": 0.2842,
"mean_token_accuracy": 0.925419792803851,
"num_tokens": 147735794.0,
"step": 745
},
{
"entropy": 0.845582206140865,
"epoch": 0.15013648771610555,
"grad_norm": 1.8751927614212036,
"learning_rate": 1.4979999999999999e-06,
"loss": 0.2924,
"mean_token_accuracy": 0.9251068283211101,
"num_tokens": 148475511.0,
"step": 750
},
{
"epoch": 0.15013648771610555,
"eval_entropy": 0.7409035055363764,
"eval_loss": 0.20810416340827942,
"eval_mean_token_accuracy": 0.9396978216093095,
"eval_num_tokens": 148475511.0,
"eval_runtime": 7.0982,
"eval_samples_per_second": 137.077,
"eval_steps_per_second": 8.594,
"step": 750
},
{
"entropy": 0.738556033372879,
"epoch": 0.1511373976342129,
"grad_norm": 0.9034550189971924,
"learning_rate": 1.508e-06,
"loss": 0.2687,
"mean_token_accuracy": 0.9301519659432498,
"num_tokens": 149602202.0,
"step": 755
},
{
"entropy": 0.8039004320448095,
"epoch": 0.1521383075523203,
"grad_norm": 0.7859927415847778,
"learning_rate": 1.518e-06,
"loss": 0.2835,
"mean_token_accuracy": 0.9249755826863375,
"num_tokens": 150660031.0,
"step": 760
},
{
"entropy": 0.8235690740021793,
"epoch": 0.15313921747042766,
"grad_norm": 0.9304025769233704,
"learning_rate": 1.528e-06,
"loss": 0.2837,
"mean_token_accuracy": 0.9240913233973763,
"num_tokens": 151705865.0,
"step": 765
},
{
"entropy": 0.8295753197236495,
"epoch": 0.15414012738853503,
"grad_norm": 0.6954373717308044,
"learning_rate": 1.538e-06,
"loss": 0.2826,
"mean_token_accuracy": 0.9268539065664465,
"num_tokens": 152662320.0,
"step": 770
},
{
"entropy": 0.8385162537748163,
"epoch": 0.15514103730664242,
"grad_norm": 1.6783963441848755,
"learning_rate": 1.548e-06,
"loss": 0.2861,
"mean_token_accuracy": 0.9266390404917977,
"num_tokens": 153390397.0,
"step": 775
},
{
"entropy": 0.7379371079531583,
"epoch": 0.15614194722474978,
"grad_norm": 0.9129334092140198,
"learning_rate": 1.558e-06,
"loss": 0.2728,
"mean_token_accuracy": 0.9291019217534499,
"num_tokens": 154510653.0,
"step": 780
},
{
"epoch": 0.15614194722474978,
"eval_entropy": 0.744651442668477,
"eval_loss": 0.20912407338619232,
"eval_mean_token_accuracy": 0.9394876233866958,
"eval_num_tokens": 154510653.0,
"eval_runtime": 7.2729,
"eval_samples_per_second": 133.784,
"eval_steps_per_second": 8.387,
"step": 780
},
{
"entropy": 0.7914630618962375,
"epoch": 0.15714285714285714,
"grad_norm": 0.8796635270118713,
"learning_rate": 1.568e-06,
"loss": 0.279,
"mean_token_accuracy": 0.9265191034837202,
"num_tokens": 155598773.0,
"step": 785
},
{
"entropy": 0.8060251300985163,
"epoch": 0.1581437670609645,
"grad_norm": 0.722435474395752,
"learning_rate": 1.578e-06,
"loss": 0.2837,
"mean_token_accuracy": 0.9250888071276925,
"num_tokens": 156631931.0,
"step": 790
},
{
"entropy": 0.8348277311433445,
"epoch": 0.1591446769790719,
"grad_norm": 0.6317898631095886,
"learning_rate": 1.588e-06,
"loss": 0.2873,
"mean_token_accuracy": 0.9253057523207231,
"num_tokens": 157582000.0,
"step": 795
},
{
"entropy": 0.8389149953018535,
"epoch": 0.16014558689717925,
"grad_norm": 1.8928760290145874,
"learning_rate": 1.598e-06,
"loss": 0.2875,
"mean_token_accuracy": 0.9262628761204806,
"num_tokens": 158320297.0,
"step": 800
},
{
"entropy": 0.7366051608865911,
"epoch": 0.16114649681528662,
"grad_norm": 0.8541039228439331,
"learning_rate": 1.608e-06,
"loss": 0.2663,
"mean_token_accuracy": 0.9309936154972424,
"num_tokens": 159440189.0,
"step": 805
},
{
"entropy": 0.7848947503349998,
"epoch": 0.162147406733394,
"grad_norm": 0.752082884311676,
"learning_rate": 1.618e-06,
"loss": 0.2813,
"mean_token_accuracy": 0.9258821048519829,
"num_tokens": 160545730.0,
"step": 810
},
{
"epoch": 0.162147406733394,
"eval_entropy": 0.7330616829825229,
"eval_loss": 0.2090436816215515,
"eval_mean_token_accuracy": 0.9397259344820117,
"eval_num_tokens": 160545730.0,
"eval_runtime": 7.0312,
"eval_samples_per_second": 138.384,
"eval_steps_per_second": 8.676,
"step": 810
},
{
"entropy": 0.8028561315753243,
"epoch": 0.16314831665150137,
"grad_norm": 0.7342677116394043,
"learning_rate": 1.628e-06,
"loss": 0.2804,
"mean_token_accuracy": 0.9256297485394911,
"num_tokens": 161573472.0,
"step": 815
},
{
"entropy": 0.8302115288647738,
"epoch": 0.16414922656960873,
"grad_norm": 0.7893772721290588,
"learning_rate": 1.6379999999999998e-06,
"loss": 0.2836,
"mean_token_accuracy": 0.9263590422543613,
"num_tokens": 162530179.0,
"step": 820
},
{
"entropy": 0.827764184366573,
"epoch": 0.1651501364877161,
"grad_norm": 1.99238920211792,
"learning_rate": 1.648e-06,
"loss": 0.2817,
"mean_token_accuracy": 0.9277313232421875,
"num_tokens": 163263981.0,
"step": 825
},
{
"entropy": 0.722087900205092,
"epoch": 0.16615104640582348,
"grad_norm": 0.8378480076789856,
"learning_rate": 1.6579999999999998e-06,
"loss": 0.2627,
"mean_token_accuracy": 0.9315350006927143,
"num_tokens": 164398357.0,
"step": 830
},
{
"entropy": 0.7725223552096974,
"epoch": 0.16715195632393084,
"grad_norm": 0.7914682626724243,
"learning_rate": 1.668e-06,
"loss": 0.2716,
"mean_token_accuracy": 0.9280308051542803,
"num_tokens": 165468194.0,
"step": 835
},
{
"entropy": 0.8090368211269379,
"epoch": 0.1681528662420382,
"grad_norm": 0.8411707282066345,
"learning_rate": 1.6779999999999999e-06,
"loss": 0.2795,
"mean_token_accuracy": 0.9261684049259532,
"num_tokens": 166510298.0,
"step": 840
},
{
"epoch": 0.1681528662420382,
"eval_entropy": 0.726200912819534,
"eval_loss": 0.20932643115520477,
"eval_mean_token_accuracy": 0.9394924601570505,
"eval_num_tokens": 166510298.0,
"eval_runtime": 7.1551,
"eval_samples_per_second": 135.988,
"eval_steps_per_second": 8.525,
"step": 840
},
{
"entropy": 0.8117384894327684,
"epoch": 0.1691537761601456,
"grad_norm": 0.6859620809555054,
"learning_rate": 1.6879999999999998e-06,
"loss": 0.2775,
"mean_token_accuracy": 0.9275808865373785,
"num_tokens": 167466578.0,
"step": 845
},
{
"entropy": 0.8264376792040738,
"epoch": 0.17015468607825296,
"grad_norm": 1.9784756898880005,
"learning_rate": 1.6979999999999999e-06,
"loss": 0.2832,
"mean_token_accuracy": 0.9271657266400077,
"num_tokens": 168200534.0,
"step": 850
},
{
"entropy": 0.7185999631881714,
"epoch": 0.17115559599636032,
"grad_norm": 0.8721832036972046,
"learning_rate": 1.7079999999999998e-06,
"loss": 0.2669,
"mean_token_accuracy": 0.9305740556933663,
"num_tokens": 169340781.0,
"step": 855
},
{
"entropy": 0.7641947990114039,
"epoch": 0.1721565059144677,
"grad_norm": 0.7572875618934631,
"learning_rate": 1.718e-06,
"loss": 0.2713,
"mean_token_accuracy": 0.9280713466080752,
"num_tokens": 170422517.0,
"step": 860
},
{
"entropy": 0.7978116544810209,
"epoch": 0.17315741583257507,
"grad_norm": 0.7459094524383545,
"learning_rate": 1.7279999999999998e-06,
"loss": 0.2765,
"mean_token_accuracy": 0.9266852221705697,
"num_tokens": 171469260.0,
"step": 865
},
{
"entropy": 0.8156883180141449,
"epoch": 0.17415832575068244,
"grad_norm": 0.6680793166160583,
"learning_rate": 1.738e-06,
"loss": 0.2768,
"mean_token_accuracy": 0.9277672919360074,
"num_tokens": 172427066.0,
"step": 870
},
{
"epoch": 0.17415832575068244,
"eval_entropy": 0.7165868282318115,
"eval_loss": 0.20711387693881989,
"eval_mean_token_accuracy": 0.9398616259215308,
"eval_num_tokens": 172427066.0,
"eval_runtime": 7.0985,
"eval_samples_per_second": 137.071,
"eval_steps_per_second": 8.593,
"step": 870
},
{
"entropy": 0.8210206216031855,
"epoch": 0.1751592356687898,
"grad_norm": 1.6756982803344727,
"learning_rate": 1.7479999999999998e-06,
"loss": 0.2764,
"mean_token_accuracy": 0.9285656040365046,
"num_tokens": 173142991.0,
"step": 875
},
{
"entropy": 0.7199788321148265,
"epoch": 0.1761601455868972,
"grad_norm": 0.8849101066589355,
"learning_rate": 1.758e-06,
"loss": 0.2598,
"mean_token_accuracy": 0.9321709827943282,
"num_tokens": 174282576.0,
"step": 880
},
{
"entropy": 0.7870851630514318,
"epoch": 0.17716105550500455,
"grad_norm": 0.7448862791061401,
"learning_rate": 1.7679999999999998e-06,
"loss": 0.2737,
"mean_token_accuracy": 0.9282706352797422,
"num_tokens": 175369483.0,
"step": 885
},
{
"entropy": 0.8100219070911407,
"epoch": 0.1781619654231119,
"grad_norm": 0.8964276313781738,
"learning_rate": 1.778e-06,
"loss": 0.2773,
"mean_token_accuracy": 0.9265438226136294,
"num_tokens": 176409360.0,
"step": 890
},
{
"entropy": 0.8189065765250813,
"epoch": 0.1791628753412193,
"grad_norm": 0.7688677906990051,
"learning_rate": 1.7879999999999999e-06,
"loss": 0.2699,
"mean_token_accuracy": 0.9298083012754267,
"num_tokens": 177361894.0,
"step": 895
},
{
"entropy": 0.8263901694254442,
"epoch": 0.18016378525932666,
"grad_norm": 1.8425803184509277,
"learning_rate": 1.798e-06,
"loss": 0.2763,
"mean_token_accuracy": 0.9287969903512434,
"num_tokens": 178089733.0,
"step": 900
},
{
"epoch": 0.18016378525932666,
"eval_entropy": 0.7217419499256572,
"eval_loss": 0.20589642226696014,
"eval_mean_token_accuracy": 0.9406212060177912,
"eval_num_tokens": 178089733.0,
"eval_runtime": 7.1263,
"eval_samples_per_second": 136.536,
"eval_steps_per_second": 8.56,
"step": 900
},
{
"entropy": 0.72322096824646,
"epoch": 0.18116469517743403,
"grad_norm": 0.8959372043609619,
"learning_rate": 1.8079999999999999e-06,
"loss": 0.2666,
"mean_token_accuracy": 0.9303640398112211,
"num_tokens": 179231674.0,
"step": 905
},
{
"entropy": 0.7706199407577514,
"epoch": 0.1821656050955414,
"grad_norm": 0.8667539358139038,
"learning_rate": 1.818e-06,
"loss": 0.2765,
"mean_token_accuracy": 0.9267726708542217,
"num_tokens": 180315249.0,
"step": 910
},
{
"entropy": 0.7906596568497745,
"epoch": 0.18316651501364878,
"grad_norm": 0.7227725982666016,
"learning_rate": 1.828e-06,
"loss": 0.2686,
"mean_token_accuracy": 0.9294304208322005,
"num_tokens": 181340192.0,
"step": 915
},
{
"entropy": 0.8029232171448795,
"epoch": 0.18416742493175614,
"grad_norm": 0.6649354696273804,
"learning_rate": 1.838e-06,
"loss": 0.2667,
"mean_token_accuracy": 0.9300361882556568,
"num_tokens": 182307935.0,
"step": 920
},
{
"entropy": 0.7977046684785323,
"epoch": 0.1851683348498635,
"grad_norm": 1.6901991367340088,
"learning_rate": 1.848e-06,
"loss": 0.2723,
"mean_token_accuracy": 0.9294875199144537,
"num_tokens": 183060078.0,
"step": 925
},
{
"entropy": 0.7040667669339613,
"epoch": 0.1861692447679709,
"grad_norm": 0.8532208204269409,
"learning_rate": 1.858e-06,
"loss": 0.2571,
"mean_token_accuracy": 0.9326450272039933,
"num_tokens": 184221420.0,
"step": 930
},
{
"epoch": 0.1861692447679709,
"eval_entropy": 0.725787528225633,
"eval_loss": 0.20359419286251068,
"eval_mean_token_accuracy": 0.9418040504221057,
"eval_num_tokens": 184221420.0,
"eval_runtime": 7.1054,
"eval_samples_per_second": 136.938,
"eval_steps_per_second": 8.585,
"step": 930
},
{
"entropy": 0.7605256313627416,
"epoch": 0.18717015468607826,
"grad_norm": 0.740172266960144,
"learning_rate": 1.868e-06,
"loss": 0.2642,
"mean_token_accuracy": 0.9301990406079725,
"num_tokens": 185315366.0,
"step": 935
},
{
"entropy": 0.788138997554779,
"epoch": 0.18817106460418562,
"grad_norm": 0.7328667640686035,
"learning_rate": 1.8779999999999998e-06,
"loss": 0.2594,
"mean_token_accuracy": 0.9303668347272006,
"num_tokens": 186337490.0,
"step": 940
},
{
"entropy": 0.815633828531612,
"epoch": 0.189171974522293,
"grad_norm": 0.6744860410690308,
"learning_rate": 1.8879999999999998e-06,
"loss": 0.2702,
"mean_token_accuracy": 0.9292519065466794,
"num_tokens": 187269716.0,
"step": 945
},
{
"entropy": 0.8176946092735637,
"epoch": 0.19017288444040037,
"grad_norm": 1.8253427743911743,
"learning_rate": 1.8979999999999999e-06,
"loss": 0.2659,
"mean_token_accuracy": 0.931620988520709,
"num_tokens": 187991692.0,
"step": 950
},
{
"entropy": 0.7086463402618062,
"epoch": 0.19117379435850773,
"grad_norm": 0.9137970209121704,
"learning_rate": 1.9079999999999998e-06,
"loss": 0.2586,
"mean_token_accuracy": 0.9328472657637162,
"num_tokens": 189120364.0,
"step": 955
},
{
"entropy": 0.7753447532653809,
"epoch": 0.1921747042766151,
"grad_norm": 0.8782041668891907,
"learning_rate": 1.9179999999999997e-06,
"loss": 0.2659,
"mean_token_accuracy": 0.9294385693290017,
"num_tokens": 190191796.0,
"step": 960
},
{
"epoch": 0.1921747042766151,
"eval_entropy": 0.724771861170159,
"eval_loss": 0.2042693942785263,
"eval_mean_token_accuracy": 0.9414341498593815,
"eval_num_tokens": 190191796.0,
"eval_runtime": 7.0787,
"eval_samples_per_second": 137.455,
"eval_steps_per_second": 8.617,
"step": 960
},
{
"entropy": 0.7960949518463828,
"epoch": 0.19317561419472248,
"grad_norm": 0.713079035282135,
"learning_rate": 1.928e-06,
"loss": 0.2676,
"mean_token_accuracy": 0.9294228992678902,
"num_tokens": 191241957.0,
"step": 965
},
{
"entropy": 0.8051728243177587,
"epoch": 0.19417652411282985,
"grad_norm": 0.666320264339447,
"learning_rate": 1.938e-06,
"loss": 0.2652,
"mean_token_accuracy": 0.9296959736130455,
"num_tokens": 192200870.0,
"step": 970
},
{
"entropy": 0.8057123639366843,
"epoch": 0.1951774340309372,
"grad_norm": 1.6562741994857788,
"learning_rate": 1.948e-06,
"loss": 0.2605,
"mean_token_accuracy": 0.9326337846842679,
"num_tokens": 192934859.0,
"step": 975
},
{
"entropy": 0.7061334458264438,
"epoch": 0.1961783439490446,
"grad_norm": 0.8408867120742798,
"learning_rate": 1.9579999999999997e-06,
"loss": 0.2536,
"mean_token_accuracy": 0.9338537595488808,
"num_tokens": 194104139.0,
"step": 980
},
{
"entropy": 0.7579250015995719,
"epoch": 0.19717925386715196,
"grad_norm": 0.7840215563774109,
"learning_rate": 1.968e-06,
"loss": 0.2595,
"mean_token_accuracy": 0.9305404896085913,
"num_tokens": 195188455.0,
"step": 985
},
{
"entropy": 0.7914492504163222,
"epoch": 0.19818016378525932,
"grad_norm": 0.692529559135437,
"learning_rate": 1.978e-06,
"loss": 0.2671,
"mean_token_accuracy": 0.9293763789263638,
"num_tokens": 196209141.0,
"step": 990
},
{
"epoch": 0.19818016378525932,
"eval_entropy": 0.7139858779360037,
"eval_loss": 0.20102433860301971,
"eval_mean_token_accuracy": 0.9419027381255979,
"eval_num_tokens": 196209141.0,
"eval_runtime": 7.1947,
"eval_samples_per_second": 135.239,
"eval_steps_per_second": 8.479,
"step": 990
},
{
"entropy": 0.7997732845219698,
"epoch": 0.19918107370336668,
"grad_norm": 0.7436501979827881,
"learning_rate": 1.988e-06,
"loss": 0.2613,
"mean_token_accuracy": 0.9318017385222696,
"num_tokens": 197169823.0,
"step": 995
},
{
"entropy": 0.808041772517291,
"epoch": 0.20018198362147407,
"grad_norm": 1.8346213102340698,
"learning_rate": 1.9979999999999998e-06,
"loss": 0.2684,
"mean_token_accuracy": 0.9309409526261416,
"num_tokens": 197904294.0,
"step": 1000
},
{
"entropy": 0.7090240332213316,
"epoch": 0.20118289353958144,
"grad_norm": 0.8898105025291443,
"learning_rate": 1.9991103202846973e-06,
"loss": 0.2542,
"mean_token_accuracy": 0.9338583967902444,
"num_tokens": 199040537.0,
"step": 1005
},
{
"entropy": 0.7625901590694081,
"epoch": 0.2021838034576888,
"grad_norm": 0.7580350041389465,
"learning_rate": 1.997998220640569e-06,
"loss": 0.2684,
"mean_token_accuracy": 0.9290495872497558,
"num_tokens": 200122330.0,
"step": 1010
},
{
"entropy": 0.7868972290645946,
"epoch": 0.2031847133757962,
"grad_norm": 0.9172696471214294,
"learning_rate": 1.996886120996441e-06,
"loss": 0.2592,
"mean_token_accuracy": 0.9309038433161649,
"num_tokens": 201149457.0,
"step": 1015
},
{
"entropy": 0.7947816740382802,
"epoch": 0.20418562329390355,
"grad_norm": 0.6719794273376465,
"learning_rate": 1.9957740213523133e-06,
"loss": 0.2634,
"mean_token_accuracy": 0.9316002515229311,
"num_tokens": 202101608.0,
"step": 1020
},
{
"epoch": 0.20418562329390355,
"eval_entropy": 0.7147035344702298,
"eval_loss": 0.20133115351200104,
"eval_mean_token_accuracy": 0.9419465289741266,
"eval_num_tokens": 202101608.0,
"eval_runtime": 7.0701,
"eval_samples_per_second": 137.622,
"eval_steps_per_second": 8.628,
"step": 1020
},
{
"entropy": 0.813241909308867,
"epoch": 0.2051865332120109,
"grad_norm": 1.68107271194458,
"learning_rate": 1.994661921708185e-06,
"loss": 0.2721,
"mean_token_accuracy": 0.9300860870968212,
"num_tokens": 202823517.0,
"step": 1025
},
{
"entropy": 0.6989771512421694,
"epoch": 0.2061874431301183,
"grad_norm": 0.9269376397132874,
"learning_rate": 1.9935498220640566e-06,
"loss": 0.2535,
"mean_token_accuracy": 0.9341622206297788,
"num_tokens": 203958059.0,
"step": 1030
},
{
"entropy": 0.7591653926806017,
"epoch": 0.20718835304822567,
"grad_norm": 0.7755193114280701,
"learning_rate": 1.992437722419929e-06,
"loss": 0.2648,
"mean_token_accuracy": 0.9302975632927635,
"num_tokens": 205042771.0,
"step": 1035
},
{
"entropy": 0.7722339581359516,
"epoch": 0.20818926296633303,
"grad_norm": 0.8515006303787231,
"learning_rate": 1.9913256227758007e-06,
"loss": 0.2638,
"mean_token_accuracy": 0.9300298192284324,
"num_tokens": 206086748.0,
"step": 1040
},
{
"entropy": 0.7889559702439741,
"epoch": 0.2091901728844404,
"grad_norm": 0.6690332293510437,
"learning_rate": 1.9902135231316726e-06,
"loss": 0.2565,
"mean_token_accuracy": 0.9327416582541033,
"num_tokens": 207023751.0,
"step": 1045
},
{
"entropy": 0.7905822466720235,
"epoch": 0.21019108280254778,
"grad_norm": 1.524138331413269,
"learning_rate": 1.9891014234875445e-06,
"loss": 0.2618,
"mean_token_accuracy": 0.9318056187846444,
"num_tokens": 207751826.0,
"step": 1050
},
{
"epoch": 0.21019108280254778,
"eval_entropy": 0.6987361546422615,
"eval_loss": 0.20032314956188202,
"eval_mean_token_accuracy": 0.9415211052191063,
"eval_num_tokens": 207751826.0,
"eval_runtime": 7.1237,
"eval_samples_per_second": 136.587,
"eval_steps_per_second": 8.563,
"step": 1050
},
{
"entropy": 0.700323451649059,
"epoch": 0.21119199272065514,
"grad_norm": 0.9274206161499023,
"learning_rate": 1.9879893238434163e-06,
"loss": 0.2499,
"mean_token_accuracy": 0.9347092021595348,
"num_tokens": 208886557.0,
"step": 1055
},
{
"entropy": 0.7475979534062472,
"epoch": 0.2121929026387625,
"grad_norm": 0.8458713293075562,
"learning_rate": 1.986877224199288e-06,
"loss": 0.261,
"mean_token_accuracy": 0.9306270117109472,
"num_tokens": 209999842.0,
"step": 1060
},
{
"entropy": 0.7634694963693619,
"epoch": 0.2131938125568699,
"grad_norm": 0.7438536882400513,
"learning_rate": 1.98576512455516e-06,
"loss": 0.2612,
"mean_token_accuracy": 0.9316813165491278,
"num_tokens": 211047482.0,
"step": 1065
},
{
"entropy": 0.7860465927557512,
"epoch": 0.21419472247497726,
"grad_norm": 0.6679530739784241,
"learning_rate": 1.984653024911032e-06,
"loss": 0.2616,
"mean_token_accuracy": 0.932481362061067,
"num_tokens": 211999890.0,
"step": 1070
},
{
"entropy": 0.7879262474450198,
"epoch": 0.21519563239308462,
"grad_norm": 1.5317449569702148,
"learning_rate": 1.9835409252669037e-06,
"loss": 0.256,
"mean_token_accuracy": 0.9341791461814534,
"num_tokens": 212724971.0,
"step": 1075
},
{
"entropy": 0.6914473251862959,
"epoch": 0.21619654231119198,
"grad_norm": 0.9009571671485901,
"learning_rate": 1.9824288256227756e-06,
"loss": 0.2469,
"mean_token_accuracy": 0.935233576189388,
"num_tokens": 213865483.0,
"step": 1080
},
{
"epoch": 0.21619654231119198,
"eval_entropy": 0.6992622926586964,
"eval_loss": 0.19818614423274994,
"eval_mean_token_accuracy": 0.9426181824480901,
"eval_num_tokens": 213865483.0,
"eval_runtime": 7.3835,
"eval_samples_per_second": 131.78,
"eval_steps_per_second": 8.262,
"step": 1080
},
{
"entropy": 0.7396956460042433,
"epoch": 0.21719745222929937,
"grad_norm": 0.7676311135292053,
"learning_rate": 1.9813167259786475e-06,
"loss": 0.2553,
"mean_token_accuracy": 0.9329301888292486,
"num_tokens": 214946018.0,
"step": 1085
},
{
"entropy": 0.7576209339228543,
"epoch": 0.21819836214740673,
"grad_norm": 0.9512864351272583,
"learning_rate": 1.9802046263345197e-06,
"loss": 0.2574,
"mean_token_accuracy": 0.931071363254027,
"num_tokens": 215999988.0,
"step": 1090
},
{
"entropy": 0.7767835638739846,
"epoch": 0.2191992720655141,
"grad_norm": 0.6882670521736145,
"learning_rate": 1.979092526690391e-06,
"loss": 0.2518,
"mean_token_accuracy": 0.9337078777226535,
"num_tokens": 216962447.0,
"step": 1095
},
{
"entropy": 0.7832509934902191,
"epoch": 0.22020018198362148,
"grad_norm": 1.6970500946044922,
"learning_rate": 1.977980427046263e-06,
"loss": 0.2583,
"mean_token_accuracy": 0.9332552210851149,
"num_tokens": 217692537.0,
"step": 1100
},
{
"entropy": 0.6820299370722337,
"epoch": 0.22120109190172885,
"grad_norm": 0.8949645757675171,
"learning_rate": 1.9768683274021353e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.935930597782135,
"num_tokens": 218839476.0,
"step": 1105
},
{
"entropy": 0.72886228073727,
"epoch": 0.2222020018198362,
"grad_norm": 0.8621814846992493,
"learning_rate": 1.975756227758007e-06,
"loss": 0.2493,
"mean_token_accuracy": 0.9338542092930187,
"num_tokens": 219923390.0,
"step": 1110
},
{
"epoch": 0.2222020018198362,
"eval_entropy": 0.6884255741463333,
"eval_loss": 0.19926953315734863,
"eval_mean_token_accuracy": 0.9423930107570085,
"eval_num_tokens": 219923390.0,
"eval_runtime": 7.0927,
"eval_samples_per_second": 137.184,
"eval_steps_per_second": 8.6,
"step": 1110
},
{
"entropy": 0.7540641031482003,
"epoch": 0.22320291173794357,
"grad_norm": 0.971157431602478,
"learning_rate": 1.974644128113879e-06,
"loss": 0.2567,
"mean_token_accuracy": 0.932219631021673,
"num_tokens": 220957232.0,
"step": 1115
},
{
"entropy": 0.7798225131901828,
"epoch": 0.22420382165605096,
"grad_norm": 0.7949030995368958,
"learning_rate": 1.973532028469751e-06,
"loss": 0.2581,
"mean_token_accuracy": 0.9322475785558874,
"num_tokens": 221909237.0,
"step": 1120
},
{
"entropy": 0.7734460061246698,
"epoch": 0.22520473157415832,
"grad_norm": 1.671317219734192,
"learning_rate": 1.9724199288256227e-06,
"loss": 0.2532,
"mean_token_accuracy": 0.9343869902870872,
"num_tokens": 222629518.0,
"step": 1125
},
{
"entropy": 0.6769220758568156,
"epoch": 0.22620564149226569,
"grad_norm": 0.8417484164237976,
"learning_rate": 1.9713078291814946e-06,
"loss": 0.2432,
"mean_token_accuracy": 0.9365156341682781,
"num_tokens": 223771141.0,
"step": 1130
},
{
"entropy": 0.7289805867455222,
"epoch": 0.22720655141037308,
"grad_norm": 0.8334816694259644,
"learning_rate": 1.9701957295373665e-06,
"loss": 0.2564,
"mean_token_accuracy": 0.9321391544558785,
"num_tokens": 224858611.0,
"step": 1135
},
{
"entropy": 0.7575576175342906,
"epoch": 0.22820746132848044,
"grad_norm": 0.686861515045166,
"learning_rate": 1.9690836298932383e-06,
"loss": 0.2553,
"mean_token_accuracy": 0.932028527693315,
"num_tokens": 225904498.0,
"step": 1140
},
{
"epoch": 0.22820746132848044,
"eval_entropy": 0.687260666831595,
"eval_loss": 0.19723324477672577,
"eval_mean_token_accuracy": 0.9429298082336051,
"eval_num_tokens": 225904498.0,
"eval_runtime": 7.2193,
"eval_samples_per_second": 134.777,
"eval_steps_per_second": 8.45,
"step": 1140
},
{
"entropy": 0.7571648413484747,
"epoch": 0.2292083712465878,
"grad_norm": 0.6368003487586975,
"learning_rate": 1.96797153024911e-06,
"loss": 0.2484,
"mean_token_accuracy": 0.9342491680925543,
"num_tokens": 226858707.0,
"step": 1145
},
{
"entropy": 0.7685175494714217,
"epoch": 0.2302092811646952,
"grad_norm": 1.7895119190216064,
"learning_rate": 1.966859430604982e-06,
"loss": 0.2531,
"mean_token_accuracy": 0.9351052864031358,
"num_tokens": 227586735.0,
"step": 1150
},
{
"entropy": 0.6730130303989758,
"epoch": 0.23121019108280255,
"grad_norm": 0.8514677286148071,
"learning_rate": 1.9657473309608543e-06,
"loss": 0.2434,
"mean_token_accuracy": 0.9364338099956513,
"num_tokens": 228710792.0,
"step": 1155
},
{
"entropy": 0.7245557562871413,
"epoch": 0.23221110100090991,
"grad_norm": 0.7925510406494141,
"learning_rate": 1.9646352313167257e-06,
"loss": 0.2565,
"mean_token_accuracy": 0.9326732272451574,
"num_tokens": 229789807.0,
"step": 1160
},
{
"entropy": 0.7381821754303846,
"epoch": 0.23321201091901728,
"grad_norm": 0.7272951006889343,
"learning_rate": 1.9635231316725976e-06,
"loss": 0.2467,
"mean_token_accuracy": 0.9342716991901397,
"num_tokens": 230830474.0,
"step": 1165
},
{
"entropy": 0.7532747295769778,
"epoch": 0.23421292083712467,
"grad_norm": 0.6639147996902466,
"learning_rate": 1.96241103202847e-06,
"loss": 0.2521,
"mean_token_accuracy": 0.9335366579619321,
"num_tokens": 231790758.0,
"step": 1170
},
{
"epoch": 0.23421292083712467,
"eval_entropy": 0.6738434072400703,
"eval_loss": 0.19970019161701202,
"eval_mean_token_accuracy": 0.9427229283285923,
"eval_num_tokens": 231790758.0,
"eval_runtime": 7.0658,
"eval_samples_per_second": 137.705,
"eval_steps_per_second": 8.633,
"step": 1170
},
{
"entropy": 0.7472162235866894,
"epoch": 0.23521383075523203,
"grad_norm": 1.5396642684936523,
"learning_rate": 1.9612989323843417e-06,
"loss": 0.2494,
"mean_token_accuracy": 0.9352785722775893,
"num_tokens": 232530867.0,
"step": 1175
},
{
"entropy": 0.6697620332241059,
"epoch": 0.2362147406733394,
"grad_norm": 0.8647318482398987,
"learning_rate": 1.960186832740213e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.9363701712001454,
"num_tokens": 233651796.0,
"step": 1180
},
{
"entropy": 0.7114524765448137,
"epoch": 0.23721565059144678,
"grad_norm": 0.8350867629051208,
"learning_rate": 1.9590747330960855e-06,
"loss": 0.251,
"mean_token_accuracy": 0.9338924034075303,
"num_tokens": 234754552.0,
"step": 1185
},
{
"entropy": 0.7274992368438027,
"epoch": 0.23821656050955414,
"grad_norm": 0.6969212293624878,
"learning_rate": 1.9579626334519573e-06,
"loss": 0.2487,
"mean_token_accuracy": 0.9337175385518508,
"num_tokens": 235782960.0,
"step": 1190
},
{
"entropy": 0.7455267862840133,
"epoch": 0.2392174704276615,
"grad_norm": 0.624343752861023,
"learning_rate": 1.956850533807829e-06,
"loss": 0.2532,
"mean_token_accuracy": 0.9332292107018557,
"num_tokens": 236735963.0,
"step": 1195
},
{
"entropy": 0.7484802782535553,
"epoch": 0.24021838034576887,
"grad_norm": 1.5747654438018799,
"learning_rate": 1.955738434163701e-06,
"loss": 0.2506,
"mean_token_accuracy": 0.9349917281757701,
"num_tokens": 237476602.0,
"step": 1200
},
{
"epoch": 0.24021838034576887,
"eval_entropy": 0.6878872777594894,
"eval_loss": 0.19709168374538422,
"eval_mean_token_accuracy": 0.9429968146027111,
"eval_num_tokens": 237476602.0,
"eval_runtime": 7.0622,
"eval_samples_per_second": 137.775,
"eval_steps_per_second": 8.637,
"step": 1200
},
{
"entropy": 0.671830934827978,
"epoch": 0.24121929026387626,
"grad_norm": 0.8599943518638611,
"learning_rate": 1.954626334519573e-06,
"loss": 0.2366,
"mean_token_accuracy": 0.9383657791397788,
"num_tokens": 238617406.0,
"step": 1205
},
{
"entropy": 0.7293940170244737,
"epoch": 0.24222020018198362,
"grad_norm": 0.754350483417511,
"learning_rate": 1.9535142348754447e-06,
"loss": 0.2512,
"mean_token_accuracy": 0.9323639192364432,
"num_tokens": 239700088.0,
"step": 1210
},
{
"entropy": 0.7499282219193198,
"epoch": 0.24322111010009098,
"grad_norm": 0.7476288080215454,
"learning_rate": 1.9524021352313166e-06,
"loss": 0.2552,
"mean_token_accuracy": 0.9318941896611994,
"num_tokens": 240733335.0,
"step": 1215
},
{
"entropy": 0.7511982554739172,
"epoch": 0.24422202001819837,
"grad_norm": 0.6863506436347961,
"learning_rate": 1.9512900355871885e-06,
"loss": 0.243,
"mean_token_accuracy": 0.9356909887357192,
"num_tokens": 241687104.0,
"step": 1220
},
{
"entropy": 0.749161382154985,
"epoch": 0.24522292993630573,
"grad_norm": 1.631894826889038,
"learning_rate": 1.9501779359430603e-06,
"loss": 0.2514,
"mean_token_accuracy": 0.9346412853761152,
"num_tokens": 242426018.0,
"step": 1225
},
{
"entropy": 0.6726668021895669,
"epoch": 0.2462238398544131,
"grad_norm": 0.8596307635307312,
"learning_rate": 1.949065836298932e-06,
"loss": 0.2454,
"mean_token_accuracy": 0.9364431234923276,
"num_tokens": 243548718.0,
"step": 1230
},
{
"epoch": 0.2462238398544131,
"eval_entropy": 0.6812147097509416,
"eval_loss": 0.19748112559318542,
"eval_mean_token_accuracy": 0.943136929488573,
"eval_num_tokens": 243548718.0,
"eval_runtime": 7.0861,
"eval_samples_per_second": 137.311,
"eval_steps_per_second": 8.608,
"step": 1230
},
{
"entropy": 0.7223554464903745,
"epoch": 0.24722474977252049,
"grad_norm": 0.8182641863822937,
"learning_rate": 1.947953736654804e-06,
"loss": 0.2473,
"mean_token_accuracy": 0.9328928150913932,
"num_tokens": 244634262.0,
"step": 1235
},
{
"entropy": 0.7410072830590335,
"epoch": 0.24822565969062785,
"grad_norm": 0.831390380859375,
"learning_rate": 1.9468416370106763e-06,
"loss": 0.2458,
"mean_token_accuracy": 0.9338255047798156,
"num_tokens": 245677570.0,
"step": 1240
},
{
"entropy": 0.7599548085169359,
"epoch": 0.2492265696087352,
"grad_norm": 0.8275907635688782,
"learning_rate": 1.9457295373665477e-06,
"loss": 0.2424,
"mean_token_accuracy": 0.9356081453236667,
"num_tokens": 246647643.0,
"step": 1245
},
{
"entropy": 0.7624553501605987,
"epoch": 0.2502274795268426,
"grad_norm": 1.9468681812286377,
"learning_rate": 1.94461743772242e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.9364055861126293,
"num_tokens": 247388979.0,
"step": 1250
},
{
"entropy": 0.6826613940975883,
"epoch": 0.25122838944494996,
"grad_norm": 0.8892253041267395,
"learning_rate": 1.943505338078292e-06,
"loss": 0.2377,
"mean_token_accuracy": 0.937986614487388,
"num_tokens": 248507582.0,
"step": 1255
},
{
"entropy": 0.7349318878217177,
"epoch": 0.2522292993630573,
"grad_norm": 0.7683637738227844,
"learning_rate": 1.9423932384341637e-06,
"loss": 0.2494,
"mean_token_accuracy": 0.9336408035321669,
"num_tokens": 249580005.0,
"step": 1260
},
{
"epoch": 0.2522292993630573,
"eval_entropy": 0.6866021556932418,
"eval_loss": 0.19629527628421783,
"eval_mean_token_accuracy": 0.9435793952863725,
"eval_num_tokens": 249580005.0,
"eval_runtime": 7.0601,
"eval_samples_per_second": 137.817,
"eval_steps_per_second": 8.64,
"step": 1260
},
{
"entropy": 0.7540321504527873,
"epoch": 0.2532302092811647,
"grad_norm": 0.7559732794761658,
"learning_rate": 1.9412811387900356e-06,
"loss": 0.2516,
"mean_token_accuracy": 0.9330432496287606,
"num_tokens": 250621468.0,
"step": 1265
},
{
"entropy": 0.7423492084849964,
"epoch": 0.25423111919927205,
"grad_norm": 0.7324007153511047,
"learning_rate": 1.9401690391459075e-06,
"loss": 0.2381,
"mean_token_accuracy": 0.9373159939592535,
"num_tokens": 251581237.0,
"step": 1270
},
{
"entropy": 0.7672662193124945,
"epoch": 0.2552320291173794,
"grad_norm": 1.4408397674560547,
"learning_rate": 1.9390569395017793e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.9368164999918505,
"num_tokens": 252303125.0,
"step": 1275
},
{
"entropy": 0.6668464682318948,
"epoch": 0.25623293903548683,
"grad_norm": 0.9180498123168945,
"learning_rate": 1.937944839857651e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.9375127759846774,
"num_tokens": 253437743.0,
"step": 1280
},
{
"entropy": 0.7173917884176427,
"epoch": 0.2572338489535942,
"grad_norm": 0.7993113994598389,
"learning_rate": 1.936832740213523e-06,
"loss": 0.2435,
"mean_token_accuracy": 0.9355504203926434,
"num_tokens": 254543380.0,
"step": 1285
},
{
"entropy": 0.7427029658447613,
"epoch": 0.25823475887170155,
"grad_norm": 0.7974119186401367,
"learning_rate": 1.935720640569395e-06,
"loss": 0.2404,
"mean_token_accuracy": 0.9355508500879461,
"num_tokens": 255569004.0,
"step": 1290
},
{
"epoch": 0.25823475887170155,
"eval_entropy": 0.683321903963558,
"eval_loss": 0.197197824716568,
"eval_mean_token_accuracy": 0.9433203554544293,
"eval_num_tokens": 255569004.0,
"eval_runtime": 7.0779,
"eval_samples_per_second": 137.47,
"eval_steps_per_second": 8.618,
"step": 1290
},
{
"entropy": 0.7673221891576594,
"epoch": 0.2592356687898089,
"grad_norm": 0.6773776412010193,
"learning_rate": 1.9346085409252667e-06,
"loss": 0.2522,
"mean_token_accuracy": 0.9333479886705225,
"num_tokens": 256521955.0,
"step": 1295
},
{
"entropy": 0.7722088591618972,
"epoch": 0.2602365787079163,
"grad_norm": 1.5807671546936035,
"learning_rate": 1.9334964412811386e-06,
"loss": 0.2445,
"mean_token_accuracy": 0.9367749100381678,
"num_tokens": 257261891.0,
"step": 1300
},
{
"entropy": 0.6817871857773173,
"epoch": 0.26123748862602364,
"grad_norm": 0.8420500159263611,
"learning_rate": 1.932384341637011e-06,
"loss": 0.2307,
"mean_token_accuracy": 0.9392897643826225,
"num_tokens": 258422670.0,
"step": 1305
},
{
"entropy": 0.7291848995468834,
"epoch": 0.262238398544131,
"grad_norm": 0.8453850746154785,
"learning_rate": 1.9312722419928823e-06,
"loss": 0.2367,
"mean_token_accuracy": 0.9367011297832836,
"num_tokens": 259498893.0,
"step": 1310
},
{
"entropy": 0.7557943192395297,
"epoch": 0.2632393084622384,
"grad_norm": 0.7049674391746521,
"learning_rate": 1.930160142348754e-06,
"loss": 0.2394,
"mean_token_accuracy": 0.9362640223719857,
"num_tokens": 260532718.0,
"step": 1315
},
{
"entropy": 0.7660176255486228,
"epoch": 0.2642402183803458,
"grad_norm": 0.7112149596214294,
"learning_rate": 1.9290480427046265e-06,
"loss": 0.2442,
"mean_token_accuracy": 0.9359012392434207,
"num_tokens": 261477169.0,
"step": 1320
},
{
"epoch": 0.2642402183803458,
"eval_entropy": 0.6886299956040304,
"eval_loss": 0.19393064081668854,
"eval_mean_token_accuracy": 0.9443301275128224,
"eval_num_tokens": 261477169.0,
"eval_runtime": 7.0513,
"eval_samples_per_second": 137.989,
"eval_steps_per_second": 8.651,
"step": 1320
},
{
"entropy": 0.7740896999835968,
"epoch": 0.26524112829845314,
"grad_norm": 1.7373411655426025,
"learning_rate": 1.9279359430604983e-06,
"loss": 0.2382,
"mean_token_accuracy": 0.937723603031852,
"num_tokens": 262203299.0,
"step": 1325
},
{
"entropy": 0.6813056788661264,
"epoch": 0.2662420382165605,
"grad_norm": 0.8700944185256958,
"learning_rate": 1.9268238434163697e-06,
"loss": 0.2344,
"mean_token_accuracy": 0.9385422473604029,
"num_tokens": 263358422.0,
"step": 1330
},
{
"entropy": 0.7247711669314991,
"epoch": 0.26724294813466787,
"grad_norm": 0.7497351169586182,
"learning_rate": 1.925711743772242e-06,
"loss": 0.2399,
"mean_token_accuracy": 0.9360633611679077,
"num_tokens": 264437282.0,
"step": 1335
},
{
"entropy": 0.7492102563381196,
"epoch": 0.26824385805277523,
"grad_norm": 0.712761402130127,
"learning_rate": 1.924599644128114e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.9369383118369362,
"num_tokens": 265476998.0,
"step": 1340
},
{
"entropy": 0.7662336116487329,
"epoch": 0.26924476797088265,
"grad_norm": 1.0059868097305298,
"learning_rate": 1.9234875444839857e-06,
"loss": 0.235,
"mean_token_accuracy": 0.9384725668213584,
"num_tokens": 266433276.0,
"step": 1345
},
{
"entropy": 0.779201509735801,
"epoch": 0.27024567788899,
"grad_norm": 1.7948832511901855,
"learning_rate": 1.9223754448398576e-06,
"loss": 0.2454,
"mean_token_accuracy": 0.93651580973105,
"num_tokens": 267147370.0,
"step": 1350
},
{
"epoch": 0.27024567788899,
"eval_entropy": 0.6936582659111649,
"eval_loss": 0.18916112184524536,
"eval_mean_token_accuracy": 0.9457350138758049,
"eval_num_tokens": 267147370.0,
"eval_runtime": 7.0725,
"eval_samples_per_second": 137.575,
"eval_steps_per_second": 8.625,
"step": 1350
},
{
"entropy": 0.6807152347131209,
"epoch": 0.2712465878070974,
"grad_norm": 0.8464104533195496,
"learning_rate": 1.9212633451957295e-06,
"loss": 0.2364,
"mean_token_accuracy": 0.9381036953492599,
"num_tokens": 268288956.0,
"step": 1355
},
{
"entropy": 0.7280165471813895,
"epoch": 0.27224749772520473,
"grad_norm": 0.828230082988739,
"learning_rate": 1.9201512455516013e-06,
"loss": 0.2385,
"mean_token_accuracy": 0.9357607359235937,
"num_tokens": 269355784.0,
"step": 1360
},
{
"entropy": 0.7481856107711792,
"epoch": 0.2732484076433121,
"grad_norm": 0.7362084984779358,
"learning_rate": 1.919039145907473e-06,
"loss": 0.244,
"mean_token_accuracy": 0.9355522545901211,
"num_tokens": 270398456.0,
"step": 1365
},
{
"entropy": 0.7570679174228148,
"epoch": 0.27424931756141946,
"grad_norm": 0.6655718684196472,
"learning_rate": 1.917927046263345e-06,
"loss": 0.2337,
"mean_token_accuracy": 0.938739211992784,
"num_tokens": 271357374.0,
"step": 1370
},
{
"entropy": 0.7706907001408664,
"epoch": 0.2752502274795268,
"grad_norm": 1.7031316757202148,
"learning_rate": 1.916814946619217e-06,
"loss": 0.2383,
"mean_token_accuracy": 0.9374835350296714,
"num_tokens": 272091770.0,
"step": 1375
},
{
"entropy": 0.6735027275302193,
"epoch": 0.27625113739763424,
"grad_norm": 0.847005307674408,
"learning_rate": 1.9157028469750887e-06,
"loss": 0.2313,
"mean_token_accuracy": 0.9391226519237865,
"num_tokens": 273228350.0,
"step": 1380
},
{
"epoch": 0.27625113739763424,
"eval_entropy": 0.6851567674855716,
"eval_loss": 0.19188910722732544,
"eval_mean_token_accuracy": 0.9446489283295928,
"eval_num_tokens": 273228350.0,
"eval_runtime": 7.0137,
"eval_samples_per_second": 138.728,
"eval_steps_per_second": 8.697,
"step": 1380
},
{
"entropy": 0.7211063027381897,
"epoch": 0.2772520473157416,
"grad_norm": 0.7908993363380432,
"learning_rate": 1.914590747330961e-06,
"loss": 0.2372,
"mean_token_accuracy": 0.9372067868709564,
"num_tokens": 274295508.0,
"step": 1385
},
{
"entropy": 0.745275920087641,
"epoch": 0.27825295723384896,
"grad_norm": 0.7628899216651917,
"learning_rate": 1.913478647686833e-06,
"loss": 0.2376,
"mean_token_accuracy": 0.9368884086608886,
"num_tokens": 275339917.0,
"step": 1390
},
{
"entropy": 0.760984147678722,
"epoch": 0.2792538671519563,
"grad_norm": 0.6237201690673828,
"learning_rate": 1.9123665480427043e-06,
"loss": 0.2349,
"mean_token_accuracy": 0.9373009134422648,
"num_tokens": 276295274.0,
"step": 1395
},
{
"entropy": 0.7674431963400408,
"epoch": 0.2802547770700637,
"grad_norm": 1.5829390287399292,
"learning_rate": 1.9112544483985766e-06,
"loss": 0.2381,
"mean_token_accuracy": 0.9387489958242936,
"num_tokens": 277028591.0,
"step": 1400
},
{
"entropy": 0.6689763746478341,
"epoch": 0.28125568698817105,
"grad_norm": 0.9000157713890076,
"learning_rate": 1.9101423487544485e-06,
"loss": 0.2285,
"mean_token_accuracy": 0.9405502384359187,
"num_tokens": 278145836.0,
"step": 1405
},
{
"entropy": 0.7273861186070876,
"epoch": 0.2822565969062784,
"grad_norm": 0.7861266732215881,
"learning_rate": 1.9090302491103203e-06,
"loss": 0.2402,
"mean_token_accuracy": 0.9364809323440898,
"num_tokens": 279207214.0,
"step": 1410
},
{
"epoch": 0.2822565969062784,
"eval_entropy": 0.6891253717610093,
"eval_loss": 0.19434459507465363,
"eval_mean_token_accuracy": 0.9439800313261689,
"eval_num_tokens": 279207214.0,
"eval_runtime": 7.0731,
"eval_samples_per_second": 137.563,
"eval_steps_per_second": 8.624,
"step": 1410
},
{
"entropy": 0.7461361895908009,
"epoch": 0.28325750682438583,
"grad_norm": 0.7425960898399353,
"learning_rate": 1.907918149466192e-06,
"loss": 0.2326,
"mean_token_accuracy": 0.936630117893219,
"num_tokens": 280228528.0,
"step": 1415
},
{
"entropy": 0.7540929274125533,
"epoch": 0.2842584167424932,
"grad_norm": 0.6490366458892822,
"learning_rate": 1.906806049822064e-06,
"loss": 0.2303,
"mean_token_accuracy": 0.938701045513153,
"num_tokens": 281179143.0,
"step": 1420
},
{
"entropy": 0.7628308453343131,
"epoch": 0.28525932666060055,
"grad_norm": 1.7688848972320557,
"learning_rate": 1.9056939501779359e-06,
"loss": 0.2352,
"mean_token_accuracy": 0.938917446678335,
"num_tokens": 281909203.0,
"step": 1425
},
{
"entropy": 0.6694089114665985,
"epoch": 0.2862602365787079,
"grad_norm": 0.8820457458496094,
"learning_rate": 1.9045818505338077e-06,
"loss": 0.2298,
"mean_token_accuracy": 0.9397172857414592,
"num_tokens": 283042814.0,
"step": 1430
},
{
"entropy": 0.7282295411283319,
"epoch": 0.2872611464968153,
"grad_norm": 0.7975929379463196,
"learning_rate": 1.9034697508896796e-06,
"loss": 0.241,
"mean_token_accuracy": 0.9352296363223683,
"num_tokens": 284128852.0,
"step": 1435
},
{
"entropy": 0.7595264895395799,
"epoch": 0.28826205641492264,
"grad_norm": 0.734137773513794,
"learning_rate": 1.9023576512455515e-06,
"loss": 0.2438,
"mean_token_accuracy": 0.9358488120815971,
"num_tokens": 285148293.0,
"step": 1440
},
{
"epoch": 0.28826205641492264,
"eval_entropy": 0.6968571543693542,
"eval_loss": 0.19376739859580994,
"eval_mean_token_accuracy": 0.9440957708436934,
"eval_num_tokens": 285148293.0,
"eval_runtime": 7.0823,
"eval_samples_per_second": 137.384,
"eval_steps_per_second": 8.613,
"step": 1440
},
{
"entropy": 0.774249031868848,
"epoch": 0.28926296633303,
"grad_norm": 0.6629706025123596,
"learning_rate": 1.9012455516014233e-06,
"loss": 0.2369,
"mean_token_accuracy": 0.9372912900014357,
"num_tokens": 286110916.0,
"step": 1445
},
{
"entropy": 0.7704967883500186,
"epoch": 0.2902638762511374,
"grad_norm": 1.558838129043579,
"learning_rate": 1.9001334519572954e-06,
"loss": 0.2389,
"mean_token_accuracy": 0.9377495538104664,
"num_tokens": 286850014.0,
"step": 1450
},
{
"entropy": 0.6755463258786635,
"epoch": 0.2912647861692448,
"grad_norm": 0.8654264211654663,
"learning_rate": 1.899021352313167e-06,
"loss": 0.2256,
"mean_token_accuracy": 0.9411474087021567,
"num_tokens": 287979759.0,
"step": 1455
},
{
"entropy": 0.7257778595794331,
"epoch": 0.29226569608735214,
"grad_norm": 0.771135687828064,
"learning_rate": 1.897909252669039e-06,
"loss": 0.2318,
"mean_token_accuracy": 0.9382907439361919,
"num_tokens": 289095571.0,
"step": 1460
},
{
"entropy": 0.7421090098944577,
"epoch": 0.2932666060054595,
"grad_norm": 0.7648544907569885,
"learning_rate": 1.896797153024911e-06,
"loss": 0.2368,
"mean_token_accuracy": 0.9367521686987443,
"num_tokens": 290146846.0,
"step": 1465
},
{
"entropy": 0.7553750325332989,
"epoch": 0.29426751592356687,
"grad_norm": 0.7135232090950012,
"learning_rate": 1.8956850533807828e-06,
"loss": 0.2312,
"mean_token_accuracy": 0.9393588678403334,
"num_tokens": 291083010.0,
"step": 1470
},
{
"epoch": 0.29426751592356687,
"eval_entropy": 0.6849908721251566,
"eval_loss": 0.192779079079628,
"eval_mean_token_accuracy": 0.945117437448658,
"eval_num_tokens": 291083010.0,
"eval_runtime": 7.1063,
"eval_samples_per_second": 136.92,
"eval_steps_per_second": 8.584,
"step": 1470
},
{
"entropy": 0.764943554726514,
"epoch": 0.29526842584167423,
"grad_norm": 1.642973780632019,
"learning_rate": 1.8945729537366549e-06,
"loss": 0.2358,
"mean_token_accuracy": 0.9390517413616181,
"num_tokens": 291805745.0,
"step": 1475
},
{
"entropy": 0.6666032200509852,
"epoch": 0.2962693357597816,
"grad_norm": 0.9155055284500122,
"learning_rate": 1.8934608540925265e-06,
"loss": 0.2214,
"mean_token_accuracy": 0.9421197701584209,
"num_tokens": 292942990.0,
"step": 1480
},
{
"entropy": 0.7196945285255258,
"epoch": 0.297270245677889,
"grad_norm": 0.8412073254585266,
"learning_rate": 1.8923487544483984e-06,
"loss": 0.2319,
"mean_token_accuracy": 0.9379522231492129,
"num_tokens": 294035917.0,
"step": 1485
},
{
"entropy": 0.7461063379591162,
"epoch": 0.2982711555959964,
"grad_norm": 0.7782725095748901,
"learning_rate": 1.8912366548042705e-06,
"loss": 0.2237,
"mean_token_accuracy": 0.939490559426221,
"num_tokens": 295053863.0,
"step": 1490
},
{
"entropy": 0.7554086994041096,
"epoch": 0.29927206551410374,
"grad_norm": 0.6107756495475769,
"learning_rate": 1.8901245551601423e-06,
"loss": 0.2238,
"mean_token_accuracy": 0.9404414875940843,
"num_tokens": 296006683.0,
"step": 1495
},
{
"entropy": 0.7677417294545608,
"epoch": 0.3002729754322111,
"grad_norm": 1.474409580230713,
"learning_rate": 1.889012455516014e-06,
"loss": 0.2235,
"mean_token_accuracy": 0.9412395347248425,
"num_tokens": 296732879.0,
"step": 1500
},
{
"epoch": 0.3002729754322111,
"eval_entropy": 0.689288352845145,
"eval_loss": 0.19140712916851044,
"eval_mean_token_accuracy": 0.9447863365783066,
"eval_num_tokens": 296732879.0,
"eval_runtime": 7.0341,
"eval_samples_per_second": 138.327,
"eval_steps_per_second": 8.672,
"step": 1500
},
{
"entropy": 0.6815881165591153,
"epoch": 0.30127388535031846,
"grad_norm": 0.898077130317688,
"learning_rate": 1.887900355871886e-06,
"loss": 0.2252,
"mean_token_accuracy": 0.9407751598141411,
"num_tokens": 297865911.0,
"step": 1505
},
{
"entropy": 0.7220306786623868,
"epoch": 0.3022747952684258,
"grad_norm": 0.7506076693534851,
"learning_rate": 1.8867882562277579e-06,
"loss": 0.2271,
"mean_token_accuracy": 0.9396520457484505,
"num_tokens": 298941429.0,
"step": 1510
},
{
"entropy": 0.7438318740237843,
"epoch": 0.3032757051865332,
"grad_norm": 0.6981909275054932,
"learning_rate": 1.88567615658363e-06,
"loss": 0.2306,
"mean_token_accuracy": 0.9382837609811263,
"num_tokens": 299985729.0,
"step": 1515
},
{
"entropy": 0.7525452472946861,
"epoch": 0.3042766151046406,
"grad_norm": 0.6050431728363037,
"learning_rate": 1.8845640569395016e-06,
"loss": 0.2281,
"mean_token_accuracy": 0.9401122011921622,
"num_tokens": 300946365.0,
"step": 1520
},
{
"entropy": 0.7545472253452647,
"epoch": 0.30527752502274796,
"grad_norm": 1.560426115989685,
"learning_rate": 1.8834519572953735e-06,
"loss": 0.2295,
"mean_token_accuracy": 0.9403627395629883,
"num_tokens": 301692068.0,
"step": 1525
},
{
"entropy": 0.678287308324467,
"epoch": 0.3062784349408553,
"grad_norm": 0.9419786334037781,
"learning_rate": 1.8823398576512455e-06,
"loss": 0.2251,
"mean_token_accuracy": 0.9410501371730458,
"num_tokens": 302836523.0,
"step": 1530
},
{
"epoch": 0.3062784349408553,
"eval_entropy": 0.6856205536693823,
"eval_loss": 0.19144752621650696,
"eval_mean_token_accuracy": 0.9452652296081918,
"eval_num_tokens": 302836523.0,
"eval_runtime": 7.0363,
"eval_samples_per_second": 138.283,
"eval_steps_per_second": 8.669,
"step": 1530
},
{
"entropy": 0.7217613477598537,
"epoch": 0.3072793448589627,
"grad_norm": 0.7879256010055542,
"learning_rate": 1.8812277580071174e-06,
"loss": 0.2274,
"mean_token_accuracy": 0.9390739977359772,
"num_tokens": 303914943.0,
"step": 1535
},
{
"entropy": 0.7384155148809607,
"epoch": 0.30828025477707005,
"grad_norm": 0.7203854918479919,
"learning_rate": 1.880115658362989e-06,
"loss": 0.2313,
"mean_token_accuracy": 0.9380901526321065,
"num_tokens": 304955992.0,
"step": 1540
},
{
"entropy": 0.7574896769090133,
"epoch": 0.3092811646951774,
"grad_norm": 0.6372812986373901,
"learning_rate": 1.8790035587188611e-06,
"loss": 0.2324,
"mean_token_accuracy": 0.9386772296645425,
"num_tokens": 305902295.0,
"step": 1545
},
{
"entropy": 0.7646282634951852,
"epoch": 0.31028207461328483,
"grad_norm": 1.6246287822723389,
"learning_rate": 1.877891459074733e-06,
"loss": 0.2342,
"mean_token_accuracy": 0.93938661867922,
"num_tokens": 306621753.0,
"step": 1550
},
{
"entropy": 0.6615015620535071,
"epoch": 0.3112829845313922,
"grad_norm": 0.8894542455673218,
"learning_rate": 1.876779359430605e-06,
"loss": 0.2159,
"mean_token_accuracy": 0.9429294396530498,
"num_tokens": 307776834.0,
"step": 1555
},
{
"entropy": 0.7109584380279887,
"epoch": 0.31228389444949956,
"grad_norm": 0.7467630505561829,
"learning_rate": 1.8756672597864769e-06,
"loss": 0.2262,
"mean_token_accuracy": 0.9396391229196028,
"num_tokens": 308870945.0,
"step": 1560
},
{
"epoch": 0.31228389444949956,
"eval_entropy": 0.6883383735281522,
"eval_loss": 0.18963798880577087,
"eval_mean_token_accuracy": 0.94547944967864,
"eval_num_tokens": 308870945.0,
"eval_runtime": 7.0626,
"eval_samples_per_second": 137.768,
"eval_steps_per_second": 8.637,
"step": 1560
},
{
"entropy": 0.7411357695406133,
"epoch": 0.3132848043676069,
"grad_norm": 0.7028961181640625,
"learning_rate": 1.8745551601423485e-06,
"loss": 0.2327,
"mean_token_accuracy": 0.9375578728589145,
"num_tokens": 309929138.0,
"step": 1565
},
{
"entropy": 0.7481155969879844,
"epoch": 0.3142857142857143,
"grad_norm": 0.6543077230453491,
"learning_rate": 1.8734430604982206e-06,
"loss": 0.2229,
"mean_token_accuracy": 0.9416344767267054,
"num_tokens": 310870701.0,
"step": 1570
},
{
"entropy": 0.7611248016357421,
"epoch": 0.31528662420382164,
"grad_norm": 1.6195554733276367,
"learning_rate": 1.8723309608540925e-06,
"loss": 0.226,
"mean_token_accuracy": 0.9410505023869601,
"num_tokens": 311585943.0,
"step": 1575
},
{
"entropy": 0.6666329188780351,
"epoch": 0.316287534121929,
"grad_norm": 0.9081742167472839,
"learning_rate": 1.8712188612099643e-06,
"loss": 0.2203,
"mean_token_accuracy": 0.9423610427162864,
"num_tokens": 312697721.0,
"step": 1580
},
{
"entropy": 0.7309835661541332,
"epoch": 0.3172884440400364,
"grad_norm": 0.7687853574752808,
"learning_rate": 1.8701067615658362e-06,
"loss": 0.2313,
"mean_token_accuracy": 0.9385158609260212,
"num_tokens": 313772951.0,
"step": 1585
},
{
"entropy": 0.747816955501383,
"epoch": 0.3182893539581438,
"grad_norm": 0.7106137871742249,
"learning_rate": 1.868994661921708e-06,
"loss": 0.2306,
"mean_token_accuracy": 0.9387970268726349,
"num_tokens": 314805910.0,
"step": 1590
},
{
"epoch": 0.3182893539581438,
"eval_entropy": 0.6863706659098141,
"eval_loss": 0.1875828355550766,
"eval_mean_token_accuracy": 0.9464720380110819,
"eval_num_tokens": 314805910.0,
"eval_runtime": 7.0006,
"eval_samples_per_second": 138.987,
"eval_steps_per_second": 8.713,
"step": 1590
},
{
"entropy": 0.7534966772252863,
"epoch": 0.31929026387625115,
"grad_norm": 0.6492555141448975,
"learning_rate": 1.86788256227758e-06,
"loss": 0.2231,
"mean_token_accuracy": 0.9415834470228716,
"num_tokens": 315758423.0,
"step": 1595
},
{
"entropy": 0.7610360833731564,
"epoch": 0.3202911737943585,
"grad_norm": 1.5349500179290771,
"learning_rate": 1.866770462633452e-06,
"loss": 0.2281,
"mean_token_accuracy": 0.9405443429946899,
"num_tokens": 316494038.0,
"step": 1600
},
{
"entropy": 0.6749826358123259,
"epoch": 0.32129208371246587,
"grad_norm": 0.8564639091491699,
"learning_rate": 1.8656583629893236e-06,
"loss": 0.2168,
"mean_token_accuracy": 0.9434851581400091,
"num_tokens": 317637128.0,
"step": 1605
},
{
"entropy": 0.7281479911370711,
"epoch": 0.32229299363057323,
"grad_norm": 0.7530900239944458,
"learning_rate": 1.8645462633451957e-06,
"loss": 0.2306,
"mean_token_accuracy": 0.9385505968874152,
"num_tokens": 318711437.0,
"step": 1610
},
{
"entropy": 0.7480952777645805,
"epoch": 0.3232939035486806,
"grad_norm": 0.7651330828666687,
"learning_rate": 1.8634341637010675e-06,
"loss": 0.2222,
"mean_token_accuracy": 0.9407213232733986,
"num_tokens": 319762432.0,
"step": 1615
},
{
"entropy": 0.7627674433318051,
"epoch": 0.324294813466788,
"grad_norm": 1.319263219833374,
"learning_rate": 1.8623220640569394e-06,
"loss": 0.2291,
"mean_token_accuracy": 0.9400727407498793,
"num_tokens": 320714756.0,
"step": 1620
},
{
"epoch": 0.324294813466788,
"eval_entropy": 0.6917840105588319,
"eval_loss": 0.1892538070678711,
"eval_mean_token_accuracy": 0.9453025753380823,
"eval_num_tokens": 320714756.0,
"eval_runtime": 7.0555,
"eval_samples_per_second": 137.907,
"eval_steps_per_second": 8.646,
"step": 1620
},
{
"entropy": 0.7565292894840241,
"epoch": 0.3252957233848954,
"grad_norm": 1.500351905822754,
"learning_rate": 1.8612099644128113e-06,
"loss": 0.2253,
"mean_token_accuracy": 0.9414621531963349,
"num_tokens": 321455218.0,
"step": 1625
},
{
"entropy": 0.6765736005522988,
"epoch": 0.32629663330300274,
"grad_norm": 0.8855953216552734,
"learning_rate": 1.8600978647686831e-06,
"loss": 0.2215,
"mean_token_accuracy": 0.9421357078985735,
"num_tokens": 322588730.0,
"step": 1630
},
{
"entropy": 0.731200877644799,
"epoch": 0.3272975432211101,
"grad_norm": 0.8593675494194031,
"learning_rate": 1.8589857651245552e-06,
"loss": 0.2211,
"mean_token_accuracy": 0.940478920394724,
"num_tokens": 323655165.0,
"step": 1635
},
{
"entropy": 0.7489201041785154,
"epoch": 0.32829845313921746,
"grad_norm": 0.6950727105140686,
"learning_rate": 1.857873665480427e-06,
"loss": 0.2267,
"mean_token_accuracy": 0.9399041311307387,
"num_tokens": 324696694.0,
"step": 1640
},
{
"entropy": 0.7550950695167888,
"epoch": 0.3292993630573248,
"grad_norm": 0.6508896946907043,
"learning_rate": 1.8567615658362989e-06,
"loss": 0.2215,
"mean_token_accuracy": 0.9421004755930467,
"num_tokens": 325659634.0,
"step": 1645
},
{
"entropy": 0.7663576700470665,
"epoch": 0.3303002729754322,
"grad_norm": 1.634458065032959,
"learning_rate": 1.8556494661921708e-06,
"loss": 0.2207,
"mean_token_accuracy": 0.9429123352874409,
"num_tokens": 326395974.0,
"step": 1650
},
{
"epoch": 0.3303002729754322,
"eval_entropy": 0.6862643673771718,
"eval_loss": 0.18751998245716095,
"eval_mean_token_accuracy": 0.9456404789549405,
"eval_num_tokens": 326395974.0,
"eval_runtime": 7.1493,
"eval_samples_per_second": 136.097,
"eval_steps_per_second": 8.532,
"step": 1650
},
{
"entropy": 0.6814083841713992,
"epoch": 0.3313011828935396,
"grad_norm": 0.8884172439575195,
"learning_rate": 1.8545373665480426e-06,
"loss": 0.2205,
"mean_token_accuracy": 0.9427086288278753,
"num_tokens": 327518250.0,
"step": 1655
},
{
"entropy": 0.7267371454022148,
"epoch": 0.33230209281164697,
"grad_norm": 0.864007294178009,
"learning_rate": 1.8534252669039145e-06,
"loss": 0.225,
"mean_token_accuracy": 0.9397233930501071,
"num_tokens": 328618461.0,
"step": 1660
},
{
"entropy": 0.7420350654558702,
"epoch": 0.3333030027297543,
"grad_norm": 0.7210493087768555,
"learning_rate": 1.8523131672597865e-06,
"loss": 0.2179,
"mean_token_accuracy": 0.9413225569508292,
"num_tokens": 329644537.0,
"step": 1665
},
{
"entropy": 0.7561123300682414,
"epoch": 0.3343039126478617,
"grad_norm": 0.6487271785736084,
"learning_rate": 1.8512010676156582e-06,
"loss": 0.2254,
"mean_token_accuracy": 0.9409215840426358,
"num_tokens": 330598048.0,
"step": 1670
},
{
"entropy": 0.7530049925500696,
"epoch": 0.33530482256596905,
"grad_norm": 1.4161484241485596,
"learning_rate": 1.85008896797153e-06,
"loss": 0.2262,
"mean_token_accuracy": 0.9407056949355386,
"num_tokens": 331335749.0,
"step": 1675
},
{
"entropy": 0.6709848523139954,
"epoch": 0.3363057324840764,
"grad_norm": 0.8709940314292908,
"learning_rate": 1.8489768683274021e-06,
"loss": 0.2126,
"mean_token_accuracy": 0.9443153614347631,
"num_tokens": 332446173.0,
"step": 1680
},
{
"epoch": 0.3363057324840764,
"eval_entropy": 0.6809069924667234,
"eval_loss": 0.18685181438922882,
"eval_mean_token_accuracy": 0.9458920027388901,
"eval_num_tokens": 332446173.0,
"eval_runtime": 7.0534,
"eval_samples_per_second": 137.948,
"eval_steps_per_second": 8.648,
"step": 1680
},
{
"entropy": 0.7158765223893252,
"epoch": 0.3373066424021838,
"grad_norm": 0.7924162745475769,
"learning_rate": 1.847864768683274e-06,
"loss": 0.2193,
"mean_token_accuracy": 0.9411095483736558,
"num_tokens": 333540296.0,
"step": 1685
},
{
"entropy": 0.7432989163832231,
"epoch": 0.3383075523202912,
"grad_norm": 0.7170067429542542,
"learning_rate": 1.8467526690391458e-06,
"loss": 0.2231,
"mean_token_accuracy": 0.9399533082138408,
"num_tokens": 334567842.0,
"step": 1690
},
{
"entropy": 0.7567171161825007,
"epoch": 0.33930846223839856,
"grad_norm": 0.8179503679275513,
"learning_rate": 1.8456405693950177e-06,
"loss": 0.2163,
"mean_token_accuracy": 0.9416104576804422,
"num_tokens": 335513564.0,
"step": 1695
},
{
"entropy": 0.7489974737167359,
"epoch": 0.3403093721565059,
"grad_norm": 1.53611421585083,
"learning_rate": 1.8445284697508895e-06,
"loss": 0.2204,
"mean_token_accuracy": 0.942312642661008,
"num_tokens": 336241111.0,
"step": 1700
},
{
"entropy": 0.654525652256879,
"epoch": 0.3413102820746133,
"grad_norm": 0.8707150816917419,
"learning_rate": 1.8434163701067616e-06,
"loss": 0.2149,
"mean_token_accuracy": 0.9436011032624678,
"num_tokens": 337388829.0,
"step": 1705
},
{
"entropy": 0.7030858914960515,
"epoch": 0.34231119199272064,
"grad_norm": 0.7638726830482483,
"learning_rate": 1.8423042704626333e-06,
"loss": 0.2166,
"mean_token_accuracy": 0.9418383235281164,
"num_tokens": 338487950.0,
"step": 1710
},
{
"epoch": 0.34231119199272064,
"eval_entropy": 0.6751082666584702,
"eval_loss": 0.18752196431159973,
"eval_mean_token_accuracy": 0.9461400889959491,
"eval_num_tokens": 338487950.0,
"eval_runtime": 7.0608,
"eval_samples_per_second": 137.804,
"eval_steps_per_second": 8.639,
"step": 1710
},
{
"entropy": 0.7331013533202084,
"epoch": 0.343312101910828,
"grad_norm": 0.7417324781417847,
"learning_rate": 1.8411921708185051e-06,
"loss": 0.2186,
"mean_token_accuracy": 0.9412295021794059,
"num_tokens": 339528555.0,
"step": 1715
},
{
"entropy": 0.7409337301145901,
"epoch": 0.3443130118289354,
"grad_norm": 0.5683432817459106,
"learning_rate": 1.8400800711743772e-06,
"loss": 0.2216,
"mean_token_accuracy": 0.9424649195237593,
"num_tokens": 340471062.0,
"step": 1720
},
{
"entropy": 0.7573754261840474,
"epoch": 0.3453139217470428,
"grad_norm": 1.6265780925750732,
"learning_rate": 1.838967971530249e-06,
"loss": 0.2232,
"mean_token_accuracy": 0.9421446426348252,
"num_tokens": 341200791.0,
"step": 1725
},
{
"entropy": 0.6634017543359236,
"epoch": 0.34631483166515015,
"grad_norm": 0.8331648111343384,
"learning_rate": 1.8378558718861211e-06,
"loss": 0.215,
"mean_token_accuracy": 0.9439348957755349,
"num_tokens": 342354243.0,
"step": 1730
},
{
"entropy": 0.7234609690579501,
"epoch": 0.3473157415832575,
"grad_norm": 0.7795122861862183,
"learning_rate": 1.8367437722419928e-06,
"loss": 0.2218,
"mean_token_accuracy": 0.9413307238708842,
"num_tokens": 343460007.0,
"step": 1735
},
{
"entropy": 0.7389553557742726,
"epoch": 0.34831665150136487,
"grad_norm": 0.7650998830795288,
"learning_rate": 1.8356316725978646e-06,
"loss": 0.2221,
"mean_token_accuracy": 0.9402573309161446,
"num_tokens": 344485585.0,
"step": 1740
},
{
"epoch": 0.34831665150136487,
"eval_entropy": 0.6805569617474665,
"eval_loss": 0.18589681386947632,
"eval_mean_token_accuracy": 0.9470884516590932,
"eval_num_tokens": 344485585.0,
"eval_runtime": 7.0844,
"eval_samples_per_second": 137.343,
"eval_steps_per_second": 8.61,
"step": 1740
},
{
"entropy": 0.7529873455112631,
"epoch": 0.34931756141947223,
"grad_norm": 0.6545958518981934,
"learning_rate": 1.8345195729537367e-06,
"loss": 0.2154,
"mean_token_accuracy": 0.9425264336846092,
"num_tokens": 345437559.0,
"step": 1745
},
{
"entropy": 0.761711223558946,
"epoch": 0.3503184713375796,
"grad_norm": 1.5173709392547607,
"learning_rate": 1.8334074733096085e-06,
"loss": 0.2199,
"mean_token_accuracy": 0.9430206764828075,
"num_tokens": 346169843.0,
"step": 1750
},
{
"entropy": 0.6663856636394154,
"epoch": 0.351319381255687,
"grad_norm": 0.9439927935600281,
"learning_rate": 1.8322953736654802e-06,
"loss": 0.2083,
"mean_token_accuracy": 0.9455912454561753,
"num_tokens": 347311478.0,
"step": 1755
},
{
"entropy": 0.7240512457760897,
"epoch": 0.3523202911737944,
"grad_norm": 0.8567253351211548,
"learning_rate": 1.8311832740213523e-06,
"loss": 0.2201,
"mean_token_accuracy": 0.94097445119511,
"num_tokens": 348391019.0,
"step": 1760
},
{
"entropy": 0.7536248537627134,
"epoch": 0.35332120109190174,
"grad_norm": 0.7871220707893372,
"learning_rate": 1.8300711743772241e-06,
"loss": 0.2221,
"mean_token_accuracy": 0.9408430018208244,
"num_tokens": 349427873.0,
"step": 1765
},
{
"entropy": 0.752256919037212,
"epoch": 0.3543221110100091,
"grad_norm": 0.7607414722442627,
"learning_rate": 1.8289590747330962e-06,
"loss": 0.2133,
"mean_token_accuracy": 0.9429961264133453,
"num_tokens": 350368302.0,
"step": 1770
},
{
"epoch": 0.3543221110100091,
"eval_entropy": 0.6793914603405311,
"eval_loss": 0.18565388023853302,
"eval_mean_token_accuracy": 0.9467462975470746,
"eval_num_tokens": 350368302.0,
"eval_runtime": 7.0807,
"eval_samples_per_second": 137.416,
"eval_steps_per_second": 8.615,
"step": 1770
},
{
"entropy": 0.7683494849638506,
"epoch": 0.35532302092811646,
"grad_norm": 1.6403712034225464,
"learning_rate": 1.8278469750889678e-06,
"loss": 0.2249,
"mean_token_accuracy": 0.9412687144496225,
"num_tokens": 351095025.0,
"step": 1775
},
{
"entropy": 0.6670152826742692,
"epoch": 0.3563239308462238,
"grad_norm": 0.8832131624221802,
"learning_rate": 1.8267348754448397e-06,
"loss": 0.205,
"mean_token_accuracy": 0.9458218796686693,
"num_tokens": 352193150.0,
"step": 1780
},
{
"entropy": 0.7185638297687877,
"epoch": 0.3573248407643312,
"grad_norm": 0.8331535458564758,
"learning_rate": 1.8256227758007118e-06,
"loss": 0.2204,
"mean_token_accuracy": 0.9414176198569211,
"num_tokens": 353272050.0,
"step": 1785
},
{
"entropy": 0.7420683860778808,
"epoch": 0.3583257506824386,
"grad_norm": 0.8582054376602173,
"learning_rate": 1.8245106761565836e-06,
"loss": 0.2194,
"mean_token_accuracy": 0.9416669065302069,
"num_tokens": 354301218.0,
"step": 1790
},
{
"entropy": 0.7551353487101469,
"epoch": 0.35932666060054597,
"grad_norm": 0.6729021668434143,
"learning_rate": 1.8233985765124555e-06,
"loss": 0.2176,
"mean_token_accuracy": 0.9424347899176858,
"num_tokens": 355250303.0,
"step": 1795
},
{
"entropy": 0.751355068250136,
"epoch": 0.36032757051865333,
"grad_norm": 1.6505812406539917,
"learning_rate": 1.8222864768683273e-06,
"loss": 0.2153,
"mean_token_accuracy": 0.9439191401004792,
"num_tokens": 355986825.0,
"step": 1800
},
{
"epoch": 0.36032757051865333,
"eval_entropy": 0.6828553637520212,
"eval_loss": 0.18496987223625183,
"eval_mean_token_accuracy": 0.946765600657854,
"eval_num_tokens": 355986825.0,
"eval_runtime": 7.196,
"eval_samples_per_second": 135.214,
"eval_steps_per_second": 8.477,
"step": 1800
},
{
"entropy": 0.671533118052916,
"epoch": 0.3613284804367607,
"grad_norm": 0.9078927040100098,
"learning_rate": 1.8211743772241992e-06,
"loss": 0.2158,
"mean_token_accuracy": 0.9436379351399161,
"num_tokens": 357130061.0,
"step": 1805
},
{
"entropy": 0.7248334830457513,
"epoch": 0.36232939035486805,
"grad_norm": 0.8133084177970886,
"learning_rate": 1.820062277580071e-06,
"loss": 0.2168,
"mean_token_accuracy": 0.9420726684006777,
"num_tokens": 358203695.0,
"step": 1810
},
{
"entropy": 0.749659313396974,
"epoch": 0.3633303002729754,
"grad_norm": 0.7274289727210999,
"learning_rate": 1.8189501779359431e-06,
"loss": 0.2183,
"mean_token_accuracy": 0.9410731163891879,
"num_tokens": 359255272.0,
"step": 1815
},
{
"entropy": 0.7492427452044054,
"epoch": 0.3643312101910828,
"grad_norm": 0.6970122456550598,
"learning_rate": 1.8178380782918148e-06,
"loss": 0.2122,
"mean_token_accuracy": 0.9437575806270946,
"num_tokens": 360205760.0,
"step": 1820
},
{
"entropy": 0.7630361009727825,
"epoch": 0.3653321201091902,
"grad_norm": 1.5553841590881348,
"learning_rate": 1.8167259786476868e-06,
"loss": 0.2171,
"mean_token_accuracy": 0.9435903998938474,
"num_tokens": 360930756.0,
"step": 1825
},
{
"entropy": 0.6604581854560159,
"epoch": 0.36633303002729756,
"grad_norm": 0.8930894136428833,
"learning_rate": 1.8156138790035587e-06,
"loss": 0.2064,
"mean_token_accuracy": 0.9453142903067849,
"num_tokens": 362073768.0,
"step": 1830
},
{
"epoch": 0.36633303002729756,
"eval_entropy": 0.6797945323537607,
"eval_loss": 0.18627671897411346,
"eval_mean_token_accuracy": 0.9462826427866201,
"eval_num_tokens": 362073768.0,
"eval_runtime": 7.0766,
"eval_samples_per_second": 137.496,
"eval_steps_per_second": 8.62,
"step": 1830
},
{
"entropy": 0.7122279877012426,
"epoch": 0.3673339399454049,
"grad_norm": 0.7932081818580627,
"learning_rate": 1.8145017793594305e-06,
"loss": 0.2163,
"mean_token_accuracy": 0.9423632816834884,
"num_tokens": 363174699.0,
"step": 1835
},
{
"entropy": 0.7323072322390296,
"epoch": 0.3683348498635123,
"grad_norm": 0.8120712637901306,
"learning_rate": 1.8133896797153024e-06,
"loss": 0.2135,
"mean_token_accuracy": 0.9426818517121401,
"num_tokens": 364220503.0,
"step": 1840
},
{
"entropy": 0.7379515593702143,
"epoch": 0.36933575978161964,
"grad_norm": 0.6464109420776367,
"learning_rate": 1.8122775800711743e-06,
"loss": 0.2155,
"mean_token_accuracy": 0.9436933148990978,
"num_tokens": 365183344.0,
"step": 1845
},
{
"entropy": 0.747268967736851,
"epoch": 0.370336669699727,
"grad_norm": 1.5971249341964722,
"learning_rate": 1.8111654804270461e-06,
"loss": 0.2143,
"mean_token_accuracy": 0.9438003106550736,
"num_tokens": 365919847.0,
"step": 1850
},
{
"entropy": 0.6582850561900573,
"epoch": 0.37133757961783437,
"grad_norm": 0.863985538482666,
"learning_rate": 1.8100533807829182e-06,
"loss": 0.2095,
"mean_token_accuracy": 0.9448756602677432,
"num_tokens": 367064732.0,
"step": 1855
},
{
"entropy": 0.7067332503470507,
"epoch": 0.3723384895359418,
"grad_norm": 0.8197696208953857,
"learning_rate": 1.8089412811387898e-06,
"loss": 0.2105,
"mean_token_accuracy": 0.9437748323787343,
"num_tokens": 368151926.0,
"step": 1860
},
{
"epoch": 0.3723384895359418,
"eval_entropy": 0.6697740437554531,
"eval_loss": 0.18541671335697174,
"eval_mean_token_accuracy": 0.9467641076103586,
"eval_num_tokens": 368151926.0,
"eval_runtime": 7.2316,
"eval_samples_per_second": 134.548,
"eval_steps_per_second": 8.435,
"step": 1860
},
{
"entropy": 0.7188923353498632,
"epoch": 0.37333939945404915,
"grad_norm": 0.7241514921188354,
"learning_rate": 1.807829181494662e-06,
"loss": 0.2159,
"mean_token_accuracy": 0.9426933927969499,
"num_tokens": 369193883.0,
"step": 1865
},
{
"entropy": 0.7283467411994934,
"epoch": 0.3743403093721565,
"grad_norm": 0.6511433720588684,
"learning_rate": 1.8067170818505338e-06,
"loss": 0.2046,
"mean_token_accuracy": 0.9453721604563973,
"num_tokens": 370160842.0,
"step": 1870
},
{
"entropy": 0.7400695074688305,
"epoch": 0.37534121929026387,
"grad_norm": 1.732633113861084,
"learning_rate": 1.8056049822064056e-06,
"loss": 0.2073,
"mean_token_accuracy": 0.9456492299383337,
"num_tokens": 370892250.0,
"step": 1875
},
{
"entropy": 0.6502942296591672,
"epoch": 0.37634212920837123,
"grad_norm": 0.9057416915893555,
"learning_rate": 1.8044928825622777e-06,
"loss": 0.2069,
"mean_token_accuracy": 0.9456757783889771,
"num_tokens": 372022804.0,
"step": 1880
},
{
"entropy": 0.6950947940349579,
"epoch": 0.3773430391264786,
"grad_norm": 0.7539053559303284,
"learning_rate": 1.8033807829181493e-06,
"loss": 0.2123,
"mean_token_accuracy": 0.9439136678522283,
"num_tokens": 373111663.0,
"step": 1885
},
{
"entropy": 0.7123507954857566,
"epoch": 0.378343949044586,
"grad_norm": 0.7048158645629883,
"learning_rate": 1.8022686832740212e-06,
"loss": 0.2048,
"mean_token_accuracy": 0.944564142552289,
"num_tokens": 374149694.0,
"step": 1890
},
{
"epoch": 0.378343949044586,
"eval_entropy": 0.6676561910598005,
"eval_loss": 0.1879517138004303,
"eval_mean_token_accuracy": 0.9460887654882962,
"eval_num_tokens": 374149694.0,
"eval_runtime": 7.0566,
"eval_samples_per_second": 137.886,
"eval_steps_per_second": 8.644,
"step": 1890
},
{
"entropy": 0.7297647855498574,
"epoch": 0.3793448589626934,
"grad_norm": 0.6239880323410034,
"learning_rate": 1.8011565836298933e-06,
"loss": 0.2072,
"mean_token_accuracy": 0.9445389471270821,
"num_tokens": 375098313.0,
"step": 1895
},
{
"entropy": 0.7302077791907571,
"epoch": 0.38034576888080074,
"grad_norm": 1.6145358085632324,
"learning_rate": 1.8000444839857651e-06,
"loss": 0.203,
"mean_token_accuracy": 0.9472867906093597,
"num_tokens": 375816250.0,
"step": 1900
},
{
"entropy": 0.651517802476883,
"epoch": 0.3813466787989081,
"grad_norm": 0.9150720834732056,
"learning_rate": 1.7989323843416368e-06,
"loss": 0.2025,
"mean_token_accuracy": 0.9464400454000993,
"num_tokens": 376972772.0,
"step": 1905
},
{
"entropy": 0.7013754468072545,
"epoch": 0.38234758871701546,
"grad_norm": 0.7586289048194885,
"learning_rate": 1.7978202846975088e-06,
"loss": 0.2136,
"mean_token_accuracy": 0.9431337069381367,
"num_tokens": 378053259.0,
"step": 1910
},
{
"entropy": 0.7290194546634501,
"epoch": 0.3833484986351228,
"grad_norm": 0.7516461610794067,
"learning_rate": 1.7967081850533807e-06,
"loss": 0.2104,
"mean_token_accuracy": 0.9425576074556871,
"num_tokens": 379103040.0,
"step": 1915
},
{
"entropy": 0.729288539019498,
"epoch": 0.3843494085532302,
"grad_norm": 0.620888352394104,
"learning_rate": 1.7955960854092528e-06,
"loss": 0.2161,
"mean_token_accuracy": 0.9428860014135187,
"num_tokens": 380068398.0,
"step": 1920
},
{
"epoch": 0.3843494085532302,
"eval_entropy": 0.6771206176671826,
"eval_loss": 0.18380054831504822,
"eval_mean_token_accuracy": 0.9472967024709358,
"eval_num_tokens": 380068398.0,
"eval_runtime": 7.0603,
"eval_samples_per_second": 137.813,
"eval_steps_per_second": 8.64,
"step": 1920
},
{
"entropy": 0.7446090530265461,
"epoch": 0.3853503184713376,
"grad_norm": 1.8233861923217773,
"learning_rate": 1.7944839857651244e-06,
"loss": 0.2103,
"mean_token_accuracy": 0.9449324960058386,
"num_tokens": 380798426.0,
"step": 1925
},
{
"entropy": 0.6486159508878534,
"epoch": 0.38635122838944497,
"grad_norm": 0.8591449856758118,
"learning_rate": 1.7933718861209963e-06,
"loss": 0.1985,
"mean_token_accuracy": 0.9478086097673937,
"num_tokens": 381921279.0,
"step": 1930
},
{
"entropy": 0.6935458053242076,
"epoch": 0.38735213830755233,
"grad_norm": 0.7733218669891357,
"learning_rate": 1.7922597864768683e-06,
"loss": 0.2105,
"mean_token_accuracy": 0.9436502153223211,
"num_tokens": 383034624.0,
"step": 1935
},
{
"entropy": 0.7152742207050323,
"epoch": 0.3883530482256597,
"grad_norm": 0.7645531296730042,
"learning_rate": 1.7911476868327402e-06,
"loss": 0.2141,
"mean_token_accuracy": 0.9426664742556485,
"num_tokens": 384082302.0,
"step": 1940
},
{
"entropy": 0.72131880142472,
"epoch": 0.38935395814376705,
"grad_norm": 0.6350061893463135,
"learning_rate": 1.7900355871886118e-06,
"loss": 0.2139,
"mean_token_accuracy": 0.9435091322118586,
"num_tokens": 385027404.0,
"step": 1945
},
{
"entropy": 0.7312435442751104,
"epoch": 0.3903548680618744,
"grad_norm": 1.5735912322998047,
"learning_rate": 1.788923487544484e-06,
"loss": 0.2147,
"mean_token_accuracy": 0.9442145147106864,
"num_tokens": 385747602.0,
"step": 1950
},
{
"epoch": 0.3903548680618744,
"eval_entropy": 0.6652554879423047,
"eval_loss": 0.18200834095478058,
"eval_mean_token_accuracy": 0.9481365035791867,
"eval_num_tokens": 385747602.0,
"eval_runtime": 7.0932,
"eval_samples_per_second": 137.173,
"eval_steps_per_second": 8.6,
"step": 1950
},
{
"entropy": 0.6431225692684001,
"epoch": 0.3913557779799818,
"grad_norm": 0.8696854710578918,
"learning_rate": 1.7878113879003558e-06,
"loss": 0.2025,
"mean_token_accuracy": 0.9468498847701333,
"num_tokens": 386887767.0,
"step": 1955
},
{
"entropy": 0.7034454665400766,
"epoch": 0.3923566878980892,
"grad_norm": 0.7643694877624512,
"learning_rate": 1.7866992882562278e-06,
"loss": 0.2126,
"mean_token_accuracy": 0.9431325663219798,
"num_tokens": 387964944.0,
"step": 1960
},
{
"entropy": 0.7246899883855473,
"epoch": 0.39335759781619656,
"grad_norm": 0.7786898016929626,
"learning_rate": 1.7855871886120997e-06,
"loss": 0.2089,
"mean_token_accuracy": 0.9432727011767301,
"num_tokens": 389004308.0,
"step": 1965
},
{
"entropy": 0.7278256719762629,
"epoch": 0.3943585077343039,
"grad_norm": 0.6474554538726807,
"learning_rate": 1.7844750889679713e-06,
"loss": 0.2103,
"mean_token_accuracy": 0.9444690823554993,
"num_tokens": 389960897.0,
"step": 1970
},
{
"entropy": 0.7353235575285825,
"epoch": 0.3953594176524113,
"grad_norm": 1.4766101837158203,
"learning_rate": 1.7833629893238434e-06,
"loss": 0.2122,
"mean_token_accuracy": 0.9448792641813105,
"num_tokens": 390690930.0,
"step": 1975
},
{
"entropy": 0.6459478435191242,
"epoch": 0.39636032757051864,
"grad_norm": 0.8893265128135681,
"learning_rate": 1.7822508896797153e-06,
"loss": 0.1952,
"mean_token_accuracy": 0.9480626046657562,
"num_tokens": 391824072.0,
"step": 1980
},
{
"epoch": 0.39636032757051864,
"eval_entropy": 0.6699851712242502,
"eval_loss": 0.18414077162742615,
"eval_mean_token_accuracy": 0.9468610022888809,
"eval_num_tokens": 391824072.0,
"eval_runtime": 7.0451,
"eval_samples_per_second": 138.11,
"eval_steps_per_second": 8.659,
"step": 1980
},
{
"entropy": 0.6953091193329204,
"epoch": 0.397361237488626,
"grad_norm": 0.7777345776557922,
"learning_rate": 1.7811387900355871e-06,
"loss": 0.1999,
"mean_token_accuracy": 0.9463782939043912,
"num_tokens": 392922191.0,
"step": 1985
},
{
"entropy": 0.7238586826757951,
"epoch": 0.39836214740673337,
"grad_norm": 0.7229942679405212,
"learning_rate": 1.780026690391459e-06,
"loss": 0.2091,
"mean_token_accuracy": 0.9436769247055053,
"num_tokens": 393951915.0,
"step": 1990
},
{
"entropy": 0.7331149523908441,
"epoch": 0.3993630573248408,
"grad_norm": 4.645083427429199,
"learning_rate": 1.7789145907473308e-06,
"loss": 0.1999,
"mean_token_accuracy": 0.9466928687962619,
"num_tokens": 394911168.0,
"step": 1995
},
{
"entropy": 0.7309780456803062,
"epoch": 0.40036396724294815,
"grad_norm": 1.4829082489013672,
"learning_rate": 1.777802491103203e-06,
"loss": 0.2012,
"mean_token_accuracy": 0.9472140420566906,
"num_tokens": 395641494.0,
"step": 2000
},
{
"entropy": 0.6425018229267814,
"epoch": 0.4013648771610555,
"grad_norm": 0.9035446047782898,
"learning_rate": 1.7766903914590748e-06,
"loss": 0.2018,
"mean_token_accuracy": 0.9465541931715878,
"num_tokens": 396806735.0,
"step": 2005
},
{
"entropy": 0.6907033795660192,
"epoch": 0.4023657870791629,
"grad_norm": 0.7838383913040161,
"learning_rate": 1.7755782918149464e-06,
"loss": 0.202,
"mean_token_accuracy": 0.9456824893301183,
"num_tokens": 397897808.0,
"step": 2010
},
{
"epoch": 0.4023657870791629,
"eval_entropy": 0.6626140348246841,
"eval_loss": 0.18410223722457886,
"eval_mean_token_accuracy": 0.9472184445037216,
"eval_num_tokens": 397897808.0,
"eval_runtime": 7.0819,
"eval_samples_per_second": 137.392,
"eval_steps_per_second": 8.613,
"step": 2010
},
{
"entropy": 0.726021606000987,
"epoch": 0.40336669699727024,
"grad_norm": 0.7114729881286621,
"learning_rate": 1.7744661921708185e-06,
"loss": 0.2095,
"mean_token_accuracy": 0.9433100288564509,
"num_tokens": 398940578.0,
"step": 2015
},
{
"entropy": 0.734506199576638,
"epoch": 0.4043676069153776,
"grad_norm": 0.6169213652610779,
"learning_rate": 1.7733540925266903e-06,
"loss": 0.2013,
"mean_token_accuracy": 0.9469331833449277,
"num_tokens": 399905944.0,
"step": 2020
},
{
"entropy": 0.7397109557281841,
"epoch": 0.40536851683348496,
"grad_norm": 1.60407292842865,
"learning_rate": 1.7722419928825622e-06,
"loss": 0.2062,
"mean_token_accuracy": 0.945603883266449,
"num_tokens": 400634292.0,
"step": 2025
},
{
"entropy": 0.6445516106757251,
"epoch": 0.4063694267515924,
"grad_norm": 0.8769928216934204,
"learning_rate": 1.771129893238434e-06,
"loss": 0.1977,
"mean_token_accuracy": 0.9475978217341683,
"num_tokens": 401779381.0,
"step": 2030
},
{
"entropy": 0.7098718599839644,
"epoch": 0.40737033666969974,
"grad_norm": 0.7846065163612366,
"learning_rate": 1.770017793594306e-06,
"loss": 0.2102,
"mean_token_accuracy": 0.9430582647973841,
"num_tokens": 402847416.0,
"step": 2035
},
{
"entropy": 0.7275996368039738,
"epoch": 0.4083712465878071,
"grad_norm": 0.7160109877586365,
"learning_rate": 1.7689056939501778e-06,
"loss": 0.2096,
"mean_token_accuracy": 0.9433997192166068,
"num_tokens": 403874433.0,
"step": 2040
},
{
"epoch": 0.4083712465878071,
"eval_entropy": 0.6705086065120385,
"eval_loss": 0.18461880087852478,
"eval_mean_token_accuracy": 0.9473435878753662,
"eval_num_tokens": 403874433.0,
"eval_runtime": 7.0408,
"eval_samples_per_second": 138.194,
"eval_steps_per_second": 8.664,
"step": 2040
},
{
"entropy": 0.7333347418091514,
"epoch": 0.40937215650591446,
"grad_norm": 0.6281505823135376,
"learning_rate": 1.7677935943060498e-06,
"loss": 0.2069,
"mean_token_accuracy": 0.9447664049538699,
"num_tokens": 404833962.0,
"step": 2045
},
{
"entropy": 0.7319487474181435,
"epoch": 0.4103730664240218,
"grad_norm": 1.4577454328536987,
"learning_rate": 1.7666814946619217e-06,
"loss": 0.201,
"mean_token_accuracy": 0.9480147020383315,
"num_tokens": 405569513.0,
"step": 2050
},
{
"entropy": 0.6517360118302432,
"epoch": 0.4113739763421292,
"grad_norm": 0.8996165990829468,
"learning_rate": 1.7655693950177935e-06,
"loss": 0.2004,
"mean_token_accuracy": 0.9475146136500618,
"num_tokens": 406712661.0,
"step": 2055
},
{
"entropy": 0.7000083235177127,
"epoch": 0.4123748862602366,
"grad_norm": 0.8524026274681091,
"learning_rate": 1.7644572953736654e-06,
"loss": 0.2011,
"mean_token_accuracy": 0.9456206717274406,
"num_tokens": 407800620.0,
"step": 2060
},
{
"entropy": 0.7201575975526463,
"epoch": 0.41337579617834397,
"grad_norm": 0.7635099291801453,
"learning_rate": 1.7633451957295373e-06,
"loss": 0.2058,
"mean_token_accuracy": 0.9447005716237155,
"num_tokens": 408832671.0,
"step": 2065
},
{
"entropy": 0.7325827403502031,
"epoch": 0.41437670609645133,
"grad_norm": 0.6240025758743286,
"learning_rate": 1.7622330960854093e-06,
"loss": 0.2053,
"mean_token_accuracy": 0.9453733026981354,
"num_tokens": 409777940.0,
"step": 2070
},
{
"epoch": 0.41437670609645133,
"eval_entropy": 0.6683724786414474,
"eval_loss": 0.18311864137649536,
"eval_mean_token_accuracy": 0.9476352060427431,
"eval_num_tokens": 409777940.0,
"eval_runtime": 7.0794,
"eval_samples_per_second": 137.44,
"eval_steps_per_second": 8.617,
"step": 2070
},
{
"entropy": 0.7300297211516987,
"epoch": 0.4153776160145587,
"grad_norm": 1.6698415279388428,
"learning_rate": 1.761120996441281e-06,
"loss": 0.2006,
"mean_token_accuracy": 0.9477990069172599,
"num_tokens": 410500764.0,
"step": 2075
},
{
"entropy": 0.6337598004124382,
"epoch": 0.41637852593266605,
"grad_norm": 0.881401002407074,
"learning_rate": 1.7600088967971528e-06,
"loss": 0.1969,
"mean_token_accuracy": 0.9473873138427734,
"num_tokens": 411638552.0,
"step": 2080
},
{
"entropy": 0.6980531494725835,
"epoch": 0.4173794358507734,
"grad_norm": 0.7880724668502808,
"learning_rate": 1.758896797153025e-06,
"loss": 0.2046,
"mean_token_accuracy": 0.9452372453429482,
"num_tokens": 412726553.0,
"step": 2085
},
{
"entropy": 0.721489062092521,
"epoch": 0.4183803457688808,
"grad_norm": 0.7589449882507324,
"learning_rate": 1.7577846975088968e-06,
"loss": 0.2078,
"mean_token_accuracy": 0.9432421592148867,
"num_tokens": 413743869.0,
"step": 2090
},
{
"entropy": 0.7336713259870355,
"epoch": 0.4193812556869882,
"grad_norm": 1.2119146585464478,
"learning_rate": 1.7566725978647686e-06,
"loss": 0.2028,
"mean_token_accuracy": 0.9461878337643364,
"num_tokens": 414696099.0,
"step": 2095
},
{
"entropy": 0.7307607759128917,
"epoch": 0.42038216560509556,
"grad_norm": 1.6642305850982666,
"learning_rate": 1.7555604982206405e-06,
"loss": 0.2065,
"mean_token_accuracy": 0.9463564807718451,
"num_tokens": 415430423.0,
"step": 2100
},
{
"epoch": 0.42038216560509556,
"eval_entropy": 0.6673324142323166,
"eval_loss": 0.1833495795726776,
"eval_mean_token_accuracy": 0.9478640595420462,
"eval_num_tokens": 415430423.0,
"eval_runtime": 7.0408,
"eval_samples_per_second": 138.194,
"eval_steps_per_second": 8.664,
"step": 2100
},
{
"entropy": 0.6451762475750663,
"epoch": 0.4213830755232029,
"grad_norm": 0.9465289115905762,
"learning_rate": 1.7544483985765123e-06,
"loss": 0.1989,
"mean_token_accuracy": 0.9476843329993161,
"num_tokens": 416551707.0,
"step": 2105
},
{
"entropy": 0.6898594834587791,
"epoch": 0.4223839854413103,
"grad_norm": 0.7581867575645447,
"learning_rate": 1.7533362989323844e-06,
"loss": 0.2012,
"mean_token_accuracy": 0.9459905033761805,
"num_tokens": 417645350.0,
"step": 2110
},
{
"entropy": 0.71252573186701,
"epoch": 0.42338489535941765,
"grad_norm": 0.7181347608566284,
"learning_rate": 1.752224199288256e-06,
"loss": 0.2085,
"mean_token_accuracy": 0.9445584345947612,
"num_tokens": 418716115.0,
"step": 2115
},
{
"entropy": 0.7133055407892573,
"epoch": 0.424385805277525,
"grad_norm": 0.6408241987228394,
"learning_rate": 1.751112099644128e-06,
"loss": 0.2027,
"mean_token_accuracy": 0.9469801886515183,
"num_tokens": 419663774.0,
"step": 2120
},
{
"entropy": 0.711867922002619,
"epoch": 0.42538671519563237,
"grad_norm": 1.6646242141723633,
"learning_rate": 1.75e-06,
"loss": 0.1944,
"mean_token_accuracy": 0.9488317923112349,
"num_tokens": 420383624.0,
"step": 2125
},
{
"entropy": 0.6277808471159502,
"epoch": 0.4263876251137398,
"grad_norm": 0.8758793473243713,
"learning_rate": 1.7488879003558718e-06,
"loss": 0.1946,
"mean_token_accuracy": 0.949054852398959,
"num_tokens": 421525651.0,
"step": 2130
},
{
"epoch": 0.4263876251137398,
"eval_entropy": 0.6551624673311828,
"eval_loss": 0.18077890574932098,
"eval_mean_token_accuracy": 0.9479863975868851,
"eval_num_tokens": 421525651.0,
"eval_runtime": 7.2475,
"eval_samples_per_second": 134.253,
"eval_steps_per_second": 8.417,
"step": 2130
},
{
"entropy": 0.6754489562728189,
"epoch": 0.42738853503184715,
"grad_norm": 0.8031311631202698,
"learning_rate": 1.747775800711744e-06,
"loss": 0.2031,
"mean_token_accuracy": 0.9457703324881467,
"num_tokens": 422612779.0,
"step": 2135
},
{
"entropy": 0.7074506429108707,
"epoch": 0.4283894449499545,
"grad_norm": 0.7394730448722839,
"learning_rate": 1.7466637010676155e-06,
"loss": 0.2102,
"mean_token_accuracy": 0.9433455748991533,
"num_tokens": 423647547.0,
"step": 2140
},
{
"entropy": 0.719589533589103,
"epoch": 0.4293903548680619,
"grad_norm": 0.7077816128730774,
"learning_rate": 1.7455516014234874e-06,
"loss": 0.2003,
"mean_token_accuracy": 0.9472663749348034,
"num_tokens": 424592674.0,
"step": 2145
},
{
"entropy": 0.7104438646273179,
"epoch": 0.43039126478616924,
"grad_norm": 1.5614880323410034,
"learning_rate": 1.7444395017793595e-06,
"loss": 0.2026,
"mean_token_accuracy": 0.9470372357151725,
"num_tokens": 425315977.0,
"step": 2150
},
{
"entropy": 0.6318686853755604,
"epoch": 0.4313921747042766,
"grad_norm": 0.9640966057777405,
"learning_rate": 1.7433274021352313e-06,
"loss": 0.1927,
"mean_token_accuracy": 0.9490994160825555,
"num_tokens": 426462515.0,
"step": 2155
},
{
"entropy": 0.6816817018118771,
"epoch": 0.43239308462238396,
"grad_norm": 0.7858216762542725,
"learning_rate": 1.742215302491103e-06,
"loss": 0.2074,
"mean_token_accuracy": 0.9445357848297465,
"num_tokens": 427573865.0,
"step": 2160
},
{
"epoch": 0.43239308462238396,
"eval_entropy": 0.6511259249976424,
"eval_loss": 0.1821490079164505,
"eval_mean_token_accuracy": 0.9482881309556179,
"eval_num_tokens": 427573865.0,
"eval_runtime": 7.0417,
"eval_samples_per_second": 138.178,
"eval_steps_per_second": 8.663,
"step": 2160
},
{
"entropy": 0.6951464016329159,
"epoch": 0.4333939945404914,
"grad_norm": 0.7225281000137329,
"learning_rate": 1.741103202846975e-06,
"loss": 0.2008,
"mean_token_accuracy": 0.9459953931244937,
"num_tokens": 428613823.0,
"step": 2165
},
{
"entropy": 0.7063021380792964,
"epoch": 0.43439490445859874,
"grad_norm": 0.6970705986022949,
"learning_rate": 1.739991103202847e-06,
"loss": 0.201,
"mean_token_accuracy": 0.9472221742976795,
"num_tokens": 429555485.0,
"step": 2170
},
{
"entropy": 0.7152685084126212,
"epoch": 0.4353958143767061,
"grad_norm": 1.6057738065719604,
"learning_rate": 1.7388790035587188e-06,
"loss": 0.1993,
"mean_token_accuracy": 0.9480575875802474,
"num_tokens": 430289824.0,
"step": 2175
},
{
"entropy": 0.6313129712234844,
"epoch": 0.43639672429481347,
"grad_norm": 0.8678532838821411,
"learning_rate": 1.7377669039145906e-06,
"loss": 0.1918,
"mean_token_accuracy": 0.9491723103956743,
"num_tokens": 431416002.0,
"step": 2180
},
{
"entropy": 0.6787468869577754,
"epoch": 0.4373976342129208,
"grad_norm": 0.8015258312225342,
"learning_rate": 1.7366548042704625e-06,
"loss": 0.2065,
"mean_token_accuracy": 0.9452338890595869,
"num_tokens": 432495759.0,
"step": 2185
},
{
"entropy": 0.6967111151326787,
"epoch": 0.4383985441310282,
"grad_norm": 0.7446882128715515,
"learning_rate": 1.7355427046263345e-06,
"loss": 0.2016,
"mean_token_accuracy": 0.9461214417761022,
"num_tokens": 433540979.0,
"step": 2190
},
{
"epoch": 0.4383985441310282,
"eval_entropy": 0.6525588817283755,
"eval_loss": 0.1822170466184616,
"eval_mean_token_accuracy": 0.9486938636811053,
"eval_num_tokens": 433540979.0,
"eval_runtime": 7.0989,
"eval_samples_per_second": 137.064,
"eval_steps_per_second": 8.593,
"step": 2190
},
{
"entropy": 0.7109386972405694,
"epoch": 0.43939945404913555,
"grad_norm": 0.6667978763580322,
"learning_rate": 1.7344306049822064e-06,
"loss": 0.197,
"mean_token_accuracy": 0.947896112095226,
"num_tokens": 434493365.0,
"step": 2195
},
{
"entropy": 0.7043106214566665,
"epoch": 0.44040036396724297,
"grad_norm": 1.5319359302520752,
"learning_rate": 1.733318505338078e-06,
"loss": 0.1958,
"mean_token_accuracy": 0.9489297693425959,
"num_tokens": 435226568.0,
"step": 2200
},
{
"entropy": 0.6357729142362422,
"epoch": 0.44140127388535033,
"grad_norm": 0.9023846983909607,
"learning_rate": 1.7322064056939501e-06,
"loss": 0.1931,
"mean_token_accuracy": 0.9492196435278112,
"num_tokens": 436329189.0,
"step": 2205
},
{
"entropy": 0.6714753104881807,
"epoch": 0.4424021838034577,
"grad_norm": 0.8319141864776611,
"learning_rate": 1.731094306049822e-06,
"loss": 0.1977,
"mean_token_accuracy": 0.9467895637858997,
"num_tokens": 437424508.0,
"step": 2210
},
{
"entropy": 0.6920918749137358,
"epoch": 0.44340309372156506,
"grad_norm": 0.7682778239250183,
"learning_rate": 1.7299822064056938e-06,
"loss": 0.1971,
"mean_token_accuracy": 0.946890014410019,
"num_tokens": 438454313.0,
"step": 2215
},
{
"entropy": 0.7164150400595232,
"epoch": 0.4444040036396724,
"grad_norm": 0.7394376397132874,
"learning_rate": 1.728870106761566e-06,
"loss": 0.203,
"mean_token_accuracy": 0.9466155610301278,
"num_tokens": 439405721.0,
"step": 2220
},
{
"epoch": 0.4444040036396724,
"eval_entropy": 0.66180199140408,
"eval_loss": 0.18257291615009308,
"eval_mean_token_accuracy": 0.9479424806891895,
"eval_num_tokens": 439405721.0,
"eval_runtime": 7.0937,
"eval_samples_per_second": 137.163,
"eval_steps_per_second": 8.599,
"step": 2220
},
{
"entropy": 0.7252338122237812,
"epoch": 0.4454049135577798,
"grad_norm": 1.6949454545974731,
"learning_rate": 1.7277580071174375e-06,
"loss": 0.1974,
"mean_token_accuracy": 0.9479389017278498,
"num_tokens": 440135915.0,
"step": 2225
},
{
"entropy": 0.639058459888805,
"epoch": 0.44640582347588714,
"grad_norm": 0.9639801383018494,
"learning_rate": 1.7266459074733096e-06,
"loss": 0.1946,
"mean_token_accuracy": 0.9484925963661888,
"num_tokens": 441271311.0,
"step": 2230
},
{
"entropy": 0.6790904402732849,
"epoch": 0.44740673339399456,
"grad_norm": 0.9020050168037415,
"learning_rate": 1.7255338078291815e-06,
"loss": 0.1974,
"mean_token_accuracy": 0.9472232580184936,
"num_tokens": 442357624.0,
"step": 2235
},
{
"entropy": 0.7040961652994155,
"epoch": 0.4484076433121019,
"grad_norm": 0.7891590595245361,
"learning_rate": 1.7244217081850533e-06,
"loss": 0.2016,
"mean_token_accuracy": 0.9471836420622739,
"num_tokens": 443388566.0,
"step": 2240
},
{
"entropy": 0.726311549002474,
"epoch": 0.4494085532302093,
"grad_norm": 0.7292259931564331,
"learning_rate": 1.7233096085409252e-06,
"loss": 0.1985,
"mean_token_accuracy": 0.9474069486964832,
"num_tokens": 444351363.0,
"step": 2245
},
{
"entropy": 0.7288991510868073,
"epoch": 0.45040946314831665,
"grad_norm": 1.6061440706253052,
"learning_rate": 1.722197508896797e-06,
"loss": 0.1958,
"mean_token_accuracy": 0.9497716730291194,
"num_tokens": 445084627.0,
"step": 2250
},
{
"epoch": 0.45040946314831665,
"eval_entropy": 0.6615917594706426,
"eval_loss": 0.18212804198265076,
"eval_mean_token_accuracy": 0.9479947060835167,
"eval_num_tokens": 445084627.0,
"eval_runtime": 7.1076,
"eval_samples_per_second": 136.895,
"eval_steps_per_second": 8.582,
"step": 2250
},
{
"entropy": 0.6405609087510542,
"epoch": 0.451410373066424,
"grad_norm": 0.9403535723686218,
"learning_rate": 1.721085409252669e-06,
"loss": 0.1883,
"mean_token_accuracy": 0.9504117098721591,
"num_tokens": 446228365.0,
"step": 2255
},
{
"entropy": 0.6863735209811818,
"epoch": 0.45241128298453137,
"grad_norm": 0.9006670117378235,
"learning_rate": 1.719973309608541e-06,
"loss": 0.201,
"mean_token_accuracy": 0.9467043746601451,
"num_tokens": 447304171.0,
"step": 2260
},
{
"entropy": 0.7067891944538464,
"epoch": 0.4534121929026388,
"grad_norm": 0.7302769422531128,
"learning_rate": 1.7188612099644126e-06,
"loss": 0.2,
"mean_token_accuracy": 0.9461231302131307,
"num_tokens": 448339733.0,
"step": 2265
},
{
"entropy": 0.7219751954078675,
"epoch": 0.45441310282074615,
"grad_norm": 0.6865111589431763,
"learning_rate": 1.7177491103202845e-06,
"loss": 0.1941,
"mean_token_accuracy": 0.948776437477632,
"num_tokens": 449292470.0,
"step": 2270
},
{
"entropy": 0.7329446852207184,
"epoch": 0.4554140127388535,
"grad_norm": 1.658300757408142,
"learning_rate": 1.7166370106761565e-06,
"loss": 0.1954,
"mean_token_accuracy": 0.9491787666624243,
"num_tokens": 450008277.0,
"step": 2275
},
{
"entropy": 0.6389858687465841,
"epoch": 0.4564149226569609,
"grad_norm": 0.9273704290390015,
"learning_rate": 1.7155249110320284e-06,
"loss": 0.1898,
"mean_token_accuracy": 0.9499964730306105,
"num_tokens": 451148916.0,
"step": 2280
},
{
"epoch": 0.4564149226569609,
"eval_entropy": 0.6680871458327184,
"eval_loss": 0.18033559620380402,
"eval_mean_token_accuracy": 0.9485022337710272,
"eval_num_tokens": 451148916.0,
"eval_runtime": 7.0712,
"eval_samples_per_second": 137.6,
"eval_steps_per_second": 8.627,
"step": 2280
},
{
"entropy": 0.698534585129131,
"epoch": 0.45741583257506824,
"grad_norm": 0.8281445503234863,
"learning_rate": 1.7144128113879003e-06,
"loss": 0.1988,
"mean_token_accuracy": 0.9470174924893813,
"num_tokens": 452233124.0,
"step": 2285
},
{
"entropy": 0.7227046912366694,
"epoch": 0.4584167424931756,
"grad_norm": 0.7393911480903625,
"learning_rate": 1.7133007117437721e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9472934657877142,
"num_tokens": 453277649.0,
"step": 2290
},
{
"entropy": 0.7179521495645697,
"epoch": 0.45941765241128296,
"grad_norm": 0.6617256999015808,
"learning_rate": 1.712188612099644e-06,
"loss": 0.1963,
"mean_token_accuracy": 0.9475378876382654,
"num_tokens": 454237241.0,
"step": 2295
},
{
"entropy": 0.7308073119683699,
"epoch": 0.4604185623293904,
"grad_norm": 1.585522174835205,
"learning_rate": 1.711076512455516e-06,
"loss": 0.194,
"mean_token_accuracy": 0.9490763951431621,
"num_tokens": 454963043.0,
"step": 2300
},
{
"entropy": 0.6407526796514338,
"epoch": 0.46141947224749774,
"grad_norm": 0.9008808732032776,
"learning_rate": 1.709964412811388e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.9498597432266582,
"num_tokens": 456106735.0,
"step": 2305
},
{
"entropy": 0.6931318703022871,
"epoch": 0.4624203821656051,
"grad_norm": 0.8182635307312012,
"learning_rate": 1.7088523131672596e-06,
"loss": 0.198,
"mean_token_accuracy": 0.9471178504553708,
"num_tokens": 457176055.0,
"step": 2310
},
{
"epoch": 0.4624203821656051,
"eval_entropy": 0.6684376890542078,
"eval_loss": 0.18013876676559448,
"eval_mean_token_accuracy": 0.9488128394377037,
"eval_num_tokens": 457176055.0,
"eval_runtime": 7.0798,
"eval_samples_per_second": 137.434,
"eval_steps_per_second": 8.616,
"step": 2310
},
{
"entropy": 0.7130835376002572,
"epoch": 0.46342129208371247,
"grad_norm": 0.699058473110199,
"learning_rate": 1.7077402135231316e-06,
"loss": 0.1902,
"mean_token_accuracy": 0.9488946026021784,
"num_tokens": 458210295.0,
"step": 2315
},
{
"entropy": 0.7252059096639807,
"epoch": 0.46442220200181983,
"grad_norm": 0.6226775646209717,
"learning_rate": 1.7066281138790035e-06,
"loss": 0.1908,
"mean_token_accuracy": 0.9491387551481073,
"num_tokens": 459167123.0,
"step": 2320
},
{
"entropy": 0.7374506094238975,
"epoch": 0.4654231119199272,
"grad_norm": 1.667640209197998,
"learning_rate": 1.7055160142348755e-06,
"loss": 0.1973,
"mean_token_accuracy": 0.9483266061002558,
"num_tokens": 459902189.0,
"step": 2325
},
{
"entropy": 0.647046386924657,
"epoch": 0.46642402183803455,
"grad_norm": 0.9029215574264526,
"learning_rate": 1.7044039145907472e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9509288961237127,
"num_tokens": 461035882.0,
"step": 2330
},
{
"entropy": 0.6890198341824791,
"epoch": 0.46742493175614197,
"grad_norm": 0.8259357213973999,
"learning_rate": 1.703291814946619e-06,
"loss": 0.1888,
"mean_token_accuracy": 0.9496154855598103,
"num_tokens": 462118087.0,
"step": 2335
},
{
"entropy": 0.7170812931927768,
"epoch": 0.46842584167424933,
"grad_norm": 0.7089883685112,
"learning_rate": 1.7021797153024911e-06,
"loss": 0.1937,
"mean_token_accuracy": 0.9480150136080655,
"num_tokens": 463164516.0,
"step": 2340
},
{
"epoch": 0.46842584167424933,
"eval_entropy": 0.662775921039894,
"eval_loss": 0.18436135351657867,
"eval_mean_token_accuracy": 0.9480926853711488,
"eval_num_tokens": 463164516.0,
"eval_runtime": 7.3033,
"eval_samples_per_second": 133.228,
"eval_steps_per_second": 8.352,
"step": 2340
},
{
"entropy": 0.7226597574624148,
"epoch": 0.4694267515923567,
"grad_norm": 0.6807421445846558,
"learning_rate": 1.701067615658363e-06,
"loss": 0.1919,
"mean_token_accuracy": 0.9482672154903412,
"num_tokens": 464120922.0,
"step": 2345
},
{
"entropy": 0.7318018051710996,
"epoch": 0.47042766151046406,
"grad_norm": 1.6226071119308472,
"learning_rate": 1.6999555160142346e-06,
"loss": 0.1978,
"mean_token_accuracy": 0.9488935736092654,
"num_tokens": 464841415.0,
"step": 2350
},
{
"entropy": 0.6446803179654208,
"epoch": 0.4714285714285714,
"grad_norm": 0.8962976932525635,
"learning_rate": 1.6988434163701067e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.952340427311984,
"num_tokens": 465966413.0,
"step": 2355
},
{
"entropy": 0.6918975301764229,
"epoch": 0.4724294813466788,
"grad_norm": 0.7808786034584045,
"learning_rate": 1.6977313167259786e-06,
"loss": 0.1912,
"mean_token_accuracy": 0.9489104704423384,
"num_tokens": 467047557.0,
"step": 2360
},
{
"entropy": 0.713727774403312,
"epoch": 0.47343039126478614,
"grad_norm": 0.709165632724762,
"learning_rate": 1.6966192170818506e-06,
"loss": 0.1897,
"mean_token_accuracy": 0.948500750281594,
"num_tokens": 468103404.0,
"step": 2365
},
{
"entropy": 0.7288664657961238,
"epoch": 0.47443130118289356,
"grad_norm": 0.6595885753631592,
"learning_rate": 1.6955071174377223e-06,
"loss": 0.1932,
"mean_token_accuracy": 0.9496505059979179,
"num_tokens": 469059545.0,
"step": 2370
},
{
"epoch": 0.47443130118289356,
"eval_entropy": 0.6668027595418399,
"eval_loss": 0.1822473108768463,
"eval_mean_token_accuracy": 0.9485983565205434,
"eval_num_tokens": 469059545.0,
"eval_runtime": 7.0789,
"eval_samples_per_second": 137.45,
"eval_steps_per_second": 8.617,
"step": 2370
},
{
"entropy": 0.7264184512875297,
"epoch": 0.4754322111010009,
"grad_norm": 1.5922738313674927,
"learning_rate": 1.6943950177935941e-06,
"loss": 0.1903,
"mean_token_accuracy": 0.9503626883029938,
"num_tokens": 469786008.0,
"step": 2375
},
{
"entropy": 0.6371802779761228,
"epoch": 0.4764331210191083,
"grad_norm": 0.9065341949462891,
"learning_rate": 1.6932829181494662e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9514370045878671,
"num_tokens": 470929765.0,
"step": 2380
},
{
"entropy": 0.6833381346680901,
"epoch": 0.47743403093721565,
"grad_norm": 0.7842475175857544,
"learning_rate": 1.692170818505338e-06,
"loss": 0.1919,
"mean_token_accuracy": 0.9486998016184026,
"num_tokens": 472010349.0,
"step": 2385
},
{
"entropy": 0.7116641182791102,
"epoch": 0.478434940855323,
"grad_norm": 0.7250556349754333,
"learning_rate": 1.69105871886121e-06,
"loss": 0.1994,
"mean_token_accuracy": 0.9462671198628165,
"num_tokens": 473044662.0,
"step": 2390
},
{
"entropy": 0.7203712877902118,
"epoch": 0.47943585077343037,
"grad_norm": 0.6930083632469177,
"learning_rate": 1.6899466192170818e-06,
"loss": 0.1951,
"mean_token_accuracy": 0.9483953313394027,
"num_tokens": 473994468.0,
"step": 2395
},
{
"entropy": 0.7180362874811346,
"epoch": 0.48043676069153773,
"grad_norm": 1.6153886318206787,
"learning_rate": 1.6888345195729536e-06,
"loss": 0.1877,
"mean_token_accuracy": 0.9512313040820035,
"num_tokens": 474722958.0,
"step": 2400
},
{
"epoch": 0.48043676069153773,
"eval_entropy": 0.6513270115266081,
"eval_loss": 0.18145401775836945,
"eval_mean_token_accuracy": 0.9486565795101103,
"eval_num_tokens": 474722958.0,
"eval_runtime": 7.0749,
"eval_samples_per_second": 137.528,
"eval_steps_per_second": 8.622,
"step": 2400
},
{
"entropy": 0.631208659843965,
"epoch": 0.48143767060964515,
"grad_norm": 0.8330217003822327,
"learning_rate": 1.6877224199288255e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9511312663555145,
"num_tokens": 475864637.0,
"step": 2405
},
{
"entropy": 0.6784997463226319,
"epoch": 0.4824385805277525,
"grad_norm": 0.8162450194358826,
"learning_rate": 1.6866103202846975e-06,
"loss": 0.1877,
"mean_token_accuracy": 0.948906400528821,
"num_tokens": 476949278.0,
"step": 2410
},
{
"entropy": 0.7088199880990115,
"epoch": 0.4834394904458599,
"grad_norm": 0.7941007614135742,
"learning_rate": 1.6854982206405692e-06,
"loss": 0.1943,
"mean_token_accuracy": 0.9472430489280007,
"num_tokens": 477976989.0,
"step": 2415
},
{
"entropy": 0.7254412935538725,
"epoch": 0.48444040036396724,
"grad_norm": 0.6573625802993774,
"learning_rate": 1.6843861209964413e-06,
"loss": 0.1898,
"mean_token_accuracy": 0.9490737053481015,
"num_tokens": 478923190.0,
"step": 2420
},
{
"entropy": 0.7253351894291964,
"epoch": 0.4854413102820746,
"grad_norm": 1.5769827365875244,
"learning_rate": 1.6832740213523131e-06,
"loss": 0.1865,
"mean_token_accuracy": 0.9515808555212888,
"num_tokens": 479660032.0,
"step": 2425
},
{
"entropy": 0.6324506296352906,
"epoch": 0.48644222020018196,
"grad_norm": 0.8730674982070923,
"learning_rate": 1.682161921708185e-06,
"loss": 0.1722,
"mean_token_accuracy": 0.9541928350925446,
"num_tokens": 480777956.0,
"step": 2430
},
{
"epoch": 0.48644222020018196,
"eval_entropy": 0.6624198419148805,
"eval_loss": 0.18191221356391907,
"eval_mean_token_accuracy": 0.948937864577184,
"eval_num_tokens": 480777956.0,
"eval_runtime": 7.0984,
"eval_samples_per_second": 137.073,
"eval_steps_per_second": 8.594,
"step": 2430
},
{
"entropy": 0.6939558029174805,
"epoch": 0.4874431301182894,
"grad_norm": 0.8086444735527039,
"learning_rate": 1.6810498220640568e-06,
"loss": 0.1983,
"mean_token_accuracy": 0.9463418738408522,
"num_tokens": 481850511.0,
"step": 2435
},
{
"entropy": 0.7101692611520941,
"epoch": 0.48844404003639674,
"grad_norm": 0.7348644733428955,
"learning_rate": 1.6799377224199287e-06,
"loss": 0.1924,
"mean_token_accuracy": 0.9481533868746324,
"num_tokens": 482899017.0,
"step": 2440
},
{
"entropy": 0.71939834383401,
"epoch": 0.4894449499545041,
"grad_norm": 0.6322587132453918,
"learning_rate": 1.6788256227758006e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9497276311570948,
"num_tokens": 483867488.0,
"step": 2445
},
{
"entropy": 0.7129947499795394,
"epoch": 0.49044585987261147,
"grad_norm": 1.5901869535446167,
"learning_rate": 1.6777135231316726e-06,
"loss": 0.1817,
"mean_token_accuracy": 0.9520208895206451,
"num_tokens": 484607972.0,
"step": 2450
},
{
"entropy": 0.6236480347134851,
"epoch": 0.49144676979071883,
"grad_norm": 0.9655255079269409,
"learning_rate": 1.6766014234875443e-06,
"loss": 0.1794,
"mean_token_accuracy": 0.9524587219411677,
"num_tokens": 485738406.0,
"step": 2455
},
{
"entropy": 0.6700492823665792,
"epoch": 0.4924476797088262,
"grad_norm": 0.7714277505874634,
"learning_rate": 1.6754893238434163e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9482547689567913,
"num_tokens": 486828631.0,
"step": 2460
},
{
"epoch": 0.4924476797088262,
"eval_entropy": 0.6517836798409946,
"eval_loss": 0.18481405079364777,
"eval_mean_token_accuracy": 0.9480296187713498,
"eval_num_tokens": 486828631.0,
"eval_runtime": 7.0759,
"eval_samples_per_second": 137.509,
"eval_steps_per_second": 8.621,
"step": 2460
},
{
"entropy": 0.7008583583615043,
"epoch": 0.49344858962693355,
"grad_norm": 0.7727804780006409,
"learning_rate": 1.6743772241992882e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9479362059723248,
"num_tokens": 487864440.0,
"step": 2465
},
{
"entropy": 0.703695898977193,
"epoch": 0.49444949954504097,
"grad_norm": 0.6300666928291321,
"learning_rate": 1.67326512455516e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.950631813027642,
"num_tokens": 488824928.0,
"step": 2470
},
{
"entropy": 0.7131873472170396,
"epoch": 0.49545040946314833,
"grad_norm": 1.5668567419052124,
"learning_rate": 1.6721530249110321e-06,
"loss": 0.1891,
"mean_token_accuracy": 0.950605512749065,
"num_tokens": 489558417.0,
"step": 2475
},
{
"entropy": 0.6274211555719376,
"epoch": 0.4964513193812557,
"grad_norm": 0.8960671424865723,
"learning_rate": 1.6710409252669038e-06,
"loss": 0.1808,
"mean_token_accuracy": 0.9516859596425836,
"num_tokens": 490705705.0,
"step": 2480
},
{
"entropy": 0.6673310320485722,
"epoch": 0.49745222929936306,
"grad_norm": 0.7930068373680115,
"learning_rate": 1.6699288256227756e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9504123080860485,
"num_tokens": 491806156.0,
"step": 2485
},
{
"entropy": 0.6888049499555068,
"epoch": 0.4984531392174704,
"grad_norm": 0.7340999245643616,
"learning_rate": 1.6688167259786477e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9502738291567022,
"num_tokens": 492861174.0,
"step": 2490
},
{
"epoch": 0.4984531392174704,
"eval_entropy": 0.6484415804753538,
"eval_loss": 0.18193961679935455,
"eval_mean_token_accuracy": 0.9487551157591773,
"eval_num_tokens": 492861174.0,
"eval_runtime": 7.0365,
"eval_samples_per_second": 138.278,
"eval_steps_per_second": 8.669,
"step": 2490
},
{
"entropy": 0.7058779543096368,
"epoch": 0.4994540491355778,
"grad_norm": 0.6413493156433105,
"learning_rate": 1.6677046263345196e-06,
"loss": 0.1878,
"mean_token_accuracy": 0.9502467995340174,
"num_tokens": 493819119.0,
"step": 2495
},
{
"entropy": 0.7087403059005737,
"epoch": 0.5004549590536852,
"grad_norm": 1.787781834602356,
"learning_rate": 1.6665925266903912e-06,
"loss": 0.1787,
"mean_token_accuracy": 0.9526755853132768,
"num_tokens": 494543819.0,
"step": 2500
},
{
"entropy": 0.6390963711521842,
"epoch": 0.5014558689717925,
"grad_norm": 0.9745569825172424,
"learning_rate": 1.6654804270462633e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9514152586460114,
"num_tokens": 495662161.0,
"step": 2505
},
{
"entropy": 0.6857650220394135,
"epoch": 0.5024567788898999,
"grad_norm": 0.7823364734649658,
"learning_rate": 1.6643683274021351e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9487736772407185,
"num_tokens": 496756895.0,
"step": 2510
},
{
"entropy": 0.7007447817108848,
"epoch": 0.5034576888080072,
"grad_norm": 0.7864211201667786,
"learning_rate": 1.6632562277580072e-06,
"loss": 0.1886,
"mean_token_accuracy": 0.949703172120181,
"num_tokens": 497791268.0,
"step": 2515
},
{
"entropy": 0.7246824242851951,
"epoch": 0.5044585987261146,
"grad_norm": 0.7322613596916199,
"learning_rate": 1.6621441281138788e-06,
"loss": 0.1911,
"mean_token_accuracy": 0.9495543116872961,
"num_tokens": 498738442.0,
"step": 2520
},
{
"epoch": 0.5044585987261146,
"eval_entropy": 0.6589406611489468,
"eval_loss": 0.1833851933479309,
"eval_mean_token_accuracy": 0.9485713624563373,
"eval_num_tokens": 498738442.0,
"eval_runtime": 7.0805,
"eval_samples_per_second": 137.419,
"eval_steps_per_second": 8.615,
"step": 2520
},
{
"entropy": 0.7190561719916083,
"epoch": 0.5054595086442221,
"grad_norm": 1.6090604066848755,
"learning_rate": 1.6610320284697507e-06,
"loss": 0.1893,
"mean_token_accuracy": 0.9508534241806377,
"num_tokens": 499466730.0,
"step": 2525
},
{
"entropy": 0.6477170705795288,
"epoch": 0.5064604185623294,
"grad_norm": 0.9411379098892212,
"learning_rate": 1.6599199288256228e-06,
"loss": 0.1869,
"mean_token_accuracy": 0.950556813586842,
"num_tokens": 500573434.0,
"step": 2530
},
{
"entropy": 0.6901047151197087,
"epoch": 0.5074613284804368,
"grad_norm": 0.8624676465988159,
"learning_rate": 1.6588078291814946e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.9498747451738878,
"num_tokens": 501667610.0,
"step": 2535
},
{
"entropy": 0.6946311796253378,
"epoch": 0.5084622383985441,
"grad_norm": 0.7686476707458496,
"learning_rate": 1.6576957295373665e-06,
"loss": 0.1867,
"mean_token_accuracy": 0.9502540312030099,
"num_tokens": 502710197.0,
"step": 2540
},
{
"entropy": 0.7155399157242341,
"epoch": 0.5094631483166515,
"grad_norm": 0.6873733997344971,
"learning_rate": 1.6565836298932383e-06,
"loss": 0.1864,
"mean_token_accuracy": 0.9502958644520153,
"num_tokens": 503664692.0,
"step": 2545
},
{
"entropy": 0.7136711597442627,
"epoch": 0.5104640582347588,
"grad_norm": 1.6547688245773315,
"learning_rate": 1.6554715302491102e-06,
"loss": 0.1833,
"mean_token_accuracy": 0.9522597675973719,
"num_tokens": 504389185.0,
"step": 2550
},
{
"epoch": 0.5104640582347588,
"eval_entropy": 0.6552407546121566,
"eval_loss": 0.18142500519752502,
"eval_mean_token_accuracy": 0.9480548022223301,
"eval_num_tokens": 504389185.0,
"eval_runtime": 7.0471,
"eval_samples_per_second": 138.071,
"eval_steps_per_second": 8.656,
"step": 2550
},
{
"entropy": 0.6178887445818294,
"epoch": 0.5114649681528662,
"grad_norm": 0.913406491279602,
"learning_rate": 1.6543594306049823e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9535938934846357,
"num_tokens": 505531871.0,
"step": 2555
},
{
"entropy": 0.6729612290859223,
"epoch": 0.5124658780709737,
"grad_norm": 0.944691002368927,
"learning_rate": 1.6532473309608541e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9497326493263245,
"num_tokens": 506624859.0,
"step": 2560
},
{
"entropy": 0.6915393555706197,
"epoch": 0.513466787989081,
"grad_norm": 0.7944353818893433,
"learning_rate": 1.6521352313167258e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.951506213166497,
"num_tokens": 507654709.0,
"step": 2565
},
{
"entropy": 0.7096973836421967,
"epoch": 0.5144676979071884,
"grad_norm": 0.6219519972801208,
"learning_rate": 1.6510231316725978e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9509352364323356,
"num_tokens": 508600841.0,
"step": 2570
},
{
"entropy": 0.7088993831114335,
"epoch": 0.5154686078252957,
"grad_norm": 1.649739384651184,
"learning_rate": 1.6499110320284697e-06,
"loss": 0.1774,
"mean_token_accuracy": 0.9526963141831485,
"num_tokens": 509326741.0,
"step": 2575
},
{
"entropy": 0.6317986461249265,
"epoch": 0.5164695177434031,
"grad_norm": 0.8715526461601257,
"learning_rate": 1.6487989323843416e-06,
"loss": 0.1724,
"mean_token_accuracy": 0.9545585063370792,
"num_tokens": 510445233.0,
"step": 2580
},
{
"epoch": 0.5164695177434031,
"eval_entropy": 0.65945937154723,
"eval_loss": 0.18114233016967773,
"eval_mean_token_accuracy": 0.9490967617660272,
"eval_num_tokens": 510445233.0,
"eval_runtime": 7.1179,
"eval_samples_per_second": 136.698,
"eval_steps_per_second": 8.57,
"step": 2580
},
{
"entropy": 0.6800177308646116,
"epoch": 0.5174704276615104,
"grad_norm": 0.8096470832824707,
"learning_rate": 1.6476868327402134e-06,
"loss": 0.1868,
"mean_token_accuracy": 0.9498075154694644,
"num_tokens": 511535453.0,
"step": 2585
},
{
"entropy": 0.6936602221293883,
"epoch": 0.5184713375796178,
"grad_norm": 0.7464794516563416,
"learning_rate": 1.6465747330960853e-06,
"loss": 0.1846,
"mean_token_accuracy": 0.950649511272257,
"num_tokens": 512578283.0,
"step": 2590
},
{
"entropy": 0.7119996157559482,
"epoch": 0.5194722474977252,
"grad_norm": 0.6432804465293884,
"learning_rate": 1.6454626334519573e-06,
"loss": 0.1886,
"mean_token_accuracy": 0.9499502631750973,
"num_tokens": 513537315.0,
"step": 2595
},
{
"entropy": 0.7189283625646071,
"epoch": 0.5204731574158326,
"grad_norm": 1.5802396535873413,
"learning_rate": 1.6443505338078292e-06,
"loss": 0.1834,
"mean_token_accuracy": 0.9515245573087172,
"num_tokens": 514261680.0,
"step": 2600
},
{
"entropy": 0.6315281949260018,
"epoch": 0.52147406733394,
"grad_norm": 0.8882037401199341,
"learning_rate": 1.6432384341637008e-06,
"loss": 0.1771,
"mean_token_accuracy": 0.9534684544259852,
"num_tokens": 515406090.0,
"step": 2605
},
{
"entropy": 0.6785016363317317,
"epoch": 0.5224749772520473,
"grad_norm": 0.8530360460281372,
"learning_rate": 1.642126334519573e-06,
"loss": 0.1837,
"mean_token_accuracy": 0.9502465849572962,
"num_tokens": 516486142.0,
"step": 2610
},
{
"epoch": 0.5224749772520473,
"eval_entropy": 0.6628204916344315,
"eval_loss": 0.1812705546617508,
"eval_mean_token_accuracy": 0.948223913302187,
"eval_num_tokens": 516486142.0,
"eval_runtime": 7.0951,
"eval_samples_per_second": 137.138,
"eval_steps_per_second": 8.598,
"step": 2610
},
{
"entropy": 0.7054530333388935,
"epoch": 0.5234758871701547,
"grad_norm": 0.7850746512413025,
"learning_rate": 1.6410142348754448e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9509090060537512,
"num_tokens": 517532763.0,
"step": 2615
},
{
"entropy": 0.7255071458491412,
"epoch": 0.524476797088262,
"grad_norm": 0.6404737234115601,
"learning_rate": 1.6399021352313166e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.9504103817723014,
"num_tokens": 518475304.0,
"step": 2620
},
{
"entropy": 0.7156178160147233,
"epoch": 0.5254777070063694,
"grad_norm": 1.8426103591918945,
"learning_rate": 1.6387900355871887e-06,
"loss": 0.1817,
"mean_token_accuracy": 0.9521985969760202,
"num_tokens": 519192476.0,
"step": 2625
},
{
"entropy": 0.6239164311777462,
"epoch": 0.5264786169244768,
"grad_norm": 0.9522146582603455,
"learning_rate": 1.6376779359430603e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9539390303871849,
"num_tokens": 520333033.0,
"step": 2630
},
{
"entropy": 0.6752398347312754,
"epoch": 0.5274795268425841,
"grad_norm": 0.809895396232605,
"learning_rate": 1.6365658362989322e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.9511864114891398,
"num_tokens": 521436825.0,
"step": 2635
},
{
"entropy": 0.690356595678763,
"epoch": 0.5284804367606916,
"grad_norm": 0.7257580161094666,
"learning_rate": 1.6354537366548043e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9509031973101876,
"num_tokens": 522480077.0,
"step": 2640
},
{
"epoch": 0.5284804367606916,
"eval_entropy": 0.6481362190402922,
"eval_loss": 0.18130838871002197,
"eval_mean_token_accuracy": 0.9484924390667775,
"eval_num_tokens": 522480077.0,
"eval_runtime": 7.2611,
"eval_samples_per_second": 134.002,
"eval_steps_per_second": 8.401,
"step": 2640
},
{
"entropy": 0.704322841221636,
"epoch": 0.5294813466787989,
"grad_norm": 0.6928062438964844,
"learning_rate": 1.6343416370106761e-06,
"loss": 0.1791,
"mean_token_accuracy": 0.9527296142144637,
"num_tokens": 523435044.0,
"step": 2645
},
{
"entropy": 0.7071298014033924,
"epoch": 0.5304822565969063,
"grad_norm": 1.618299961090088,
"learning_rate": 1.633229537366548e-06,
"loss": 0.1773,
"mean_token_accuracy": 0.9537473727356304,
"num_tokens": 524169591.0,
"step": 2650
},
{
"entropy": 0.6280927267941562,
"epoch": 0.5314831665150136,
"grad_norm": 0.8909013867378235,
"learning_rate": 1.6321174377224198e-06,
"loss": 0.1787,
"mean_token_accuracy": 0.9528289025480097,
"num_tokens": 525350154.0,
"step": 2655
},
{
"entropy": 0.6755202797326174,
"epoch": 0.532484076433121,
"grad_norm": 0.7887572646141052,
"learning_rate": 1.6310053380782917e-06,
"loss": 0.1805,
"mean_token_accuracy": 0.9506685668771917,
"num_tokens": 526444507.0,
"step": 2660
},
{
"entropy": 0.6938620551065965,
"epoch": 0.5334849863512284,
"grad_norm": 0.7199150919914246,
"learning_rate": 1.6298932384341638e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.9513141361149875,
"num_tokens": 527503125.0,
"step": 2665
},
{
"entropy": 0.7077038569883867,
"epoch": 0.5344858962693357,
"grad_norm": 0.7154355645179749,
"learning_rate": 1.6287811387900354e-06,
"loss": 0.1768,
"mean_token_accuracy": 0.9529465122656389,
"num_tokens": 528449606.0,
"step": 2670
},
{
"epoch": 0.5344858962693357,
"eval_entropy": 0.6517275425254322,
"eval_loss": 0.17996351420879364,
"eval_mean_token_accuracy": 0.9491330697888234,
"eval_num_tokens": 528449606.0,
"eval_runtime": 7.0362,
"eval_samples_per_second": 138.285,
"eval_steps_per_second": 8.669,
"step": 2670
},
{
"entropy": 0.7160382747650147,
"epoch": 0.5354868061874432,
"grad_norm": 1.6408438682556152,
"learning_rate": 1.6276690391459073e-06,
"loss": 0.1817,
"mean_token_accuracy": 0.9525136871771379,
"num_tokens": 529188247.0,
"step": 2675
},
{
"entropy": 0.6233914153142409,
"epoch": 0.5364877161055505,
"grad_norm": 0.9246336221694946,
"learning_rate": 1.6265569395017793e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.954184738072482,
"num_tokens": 530332960.0,
"step": 2680
},
{
"entropy": 0.6778095136989247,
"epoch": 0.5374886260236579,
"grad_norm": 0.8488349914550781,
"learning_rate": 1.6254448398576512e-06,
"loss": 0.184,
"mean_token_accuracy": 0.950068386034532,
"num_tokens": 531420665.0,
"step": 2685
},
{
"entropy": 0.6974740269509229,
"epoch": 0.5384895359417653,
"grad_norm": 0.7441538572311401,
"learning_rate": 1.624332740213523e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.9505076592618769,
"num_tokens": 532457212.0,
"step": 2690
},
{
"entropy": 0.7071244949644262,
"epoch": 0.5394904458598726,
"grad_norm": 0.6449328660964966,
"learning_rate": 1.623220640569395e-06,
"loss": 0.1789,
"mean_token_accuracy": 0.9523183091120286,
"num_tokens": 533421804.0,
"step": 2695
},
{
"entropy": 0.7078518325632269,
"epoch": 0.54049135577798,
"grad_norm": 1.4397194385528564,
"learning_rate": 1.6221085409252668e-06,
"loss": 0.1795,
"mean_token_accuracy": 0.9529074343768034,
"num_tokens": 534147596.0,
"step": 2700
},
{
"epoch": 0.54049135577798,
"eval_entropy": 0.6496536575379919,
"eval_loss": 0.18150362372398376,
"eval_mean_token_accuracy": 0.948933268179659,
"eval_num_tokens": 534147596.0,
"eval_runtime": 7.1231,
"eval_samples_per_second": 136.598,
"eval_steps_per_second": 8.564,
"step": 2700
},
{
"entropy": 0.6244452785361897,
"epoch": 0.5414922656960873,
"grad_norm": 0.8995323181152344,
"learning_rate": 1.6209964412811388e-06,
"loss": 0.173,
"mean_token_accuracy": 0.9548817623745312,
"num_tokens": 535275617.0,
"step": 2705
},
{
"entropy": 0.6707280232147736,
"epoch": 0.5424931756141947,
"grad_norm": 0.776010274887085,
"learning_rate": 1.6198843416370107e-06,
"loss": 0.182,
"mean_token_accuracy": 0.9512561657211998,
"num_tokens": 536378664.0,
"step": 2710
},
{
"entropy": 0.6926113960417835,
"epoch": 0.543494085532302,
"grad_norm": 0.7570468783378601,
"learning_rate": 1.6187722419928823e-06,
"loss": 0.1795,
"mean_token_accuracy": 0.9521033758466894,
"num_tokens": 537435601.0,
"step": 2715
},
{
"entropy": 0.7047263752330434,
"epoch": 0.5444949954504095,
"grad_norm": 0.726445198059082,
"learning_rate": 1.6176601423487544e-06,
"loss": 0.1784,
"mean_token_accuracy": 0.9523573181845925,
"num_tokens": 538399382.0,
"step": 2720
},
{
"entropy": 0.7051477047530088,
"epoch": 0.5454959053685169,
"grad_norm": 1.5214438438415527,
"learning_rate": 1.6165480427046263e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9527212950316343,
"num_tokens": 539145348.0,
"step": 2725
},
{
"entropy": 0.6262573410164226,
"epoch": 0.5464968152866242,
"grad_norm": 0.8618422150611877,
"learning_rate": 1.6154359430604983e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.9551108999685808,
"num_tokens": 540256367.0,
"step": 2730
},
{
"epoch": 0.5464968152866242,
"eval_entropy": 0.6563674035619517,
"eval_loss": 0.17830216884613037,
"eval_mean_token_accuracy": 0.9493675964777587,
"eval_num_tokens": 540256367.0,
"eval_runtime": 7.0929,
"eval_samples_per_second": 137.179,
"eval_steps_per_second": 8.6,
"step": 2730
},
{
"entropy": 0.6828876172954386,
"epoch": 0.5474977252047316,
"grad_norm": 0.819709062576294,
"learning_rate": 1.61432384341637e-06,
"loss": 0.1797,
"mean_token_accuracy": 0.9517435491085052,
"num_tokens": 541348519.0,
"step": 2735
},
{
"entropy": 0.7007702973755923,
"epoch": 0.5484986351228389,
"grad_norm": 0.8111042976379395,
"learning_rate": 1.6132117437722418e-06,
"loss": 0.1811,
"mean_token_accuracy": 0.9512450467456471,
"num_tokens": 542379850.0,
"step": 2740
},
{
"entropy": 0.720324994488196,
"epoch": 0.5494995450409463,
"grad_norm": 0.8011656403541565,
"learning_rate": 1.612099644128114e-06,
"loss": 0.181,
"mean_token_accuracy": 0.9518745259805159,
"num_tokens": 543352827.0,
"step": 2745
},
{
"entropy": 0.7194907562299209,
"epoch": 0.5505004549590536,
"grad_norm": 1.7628045082092285,
"learning_rate": 1.6109875444839858e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9550723487680609,
"num_tokens": 544083387.0,
"step": 2750
},
{
"entropy": 0.6324405716224151,
"epoch": 0.5515013648771611,
"grad_norm": 0.9303938746452332,
"learning_rate": 1.6098754448398574e-06,
"loss": 0.1765,
"mean_token_accuracy": 0.9533820531585,
"num_tokens": 545240701.0,
"step": 2755
},
{
"entropy": 0.6824088400060481,
"epoch": 0.5525022747952685,
"grad_norm": 0.7898913025856018,
"learning_rate": 1.6087633451957295e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9526371836662293,
"num_tokens": 546313405.0,
"step": 2760
},
{
"epoch": 0.5525022747952685,
"eval_entropy": 0.6607245241032272,
"eval_loss": 0.18013106286525726,
"eval_mean_token_accuracy": 0.9495335397173147,
"eval_num_tokens": 546313405.0,
"eval_runtime": 6.9999,
"eval_samples_per_second": 139.003,
"eval_steps_per_second": 8.714,
"step": 2760
},
{
"entropy": 0.7003410195762461,
"epoch": 0.5535031847133758,
"grad_norm": 0.8289366364479065,
"learning_rate": 1.6076512455516013e-06,
"loss": 0.1794,
"mean_token_accuracy": 0.9513268579136241,
"num_tokens": 547348958.0,
"step": 2765
},
{
"entropy": 0.7149239838123321,
"epoch": 0.5545040946314832,
"grad_norm": 0.6572934985160828,
"learning_rate": 1.6065391459074732e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.9532779801975597,
"num_tokens": 548314371.0,
"step": 2770
},
{
"entropy": 0.7151028931140899,
"epoch": 0.5555050045495905,
"grad_norm": 1.5846747159957886,
"learning_rate": 1.605427046263345e-06,
"loss": 0.174,
"mean_token_accuracy": 0.9540536219423468,
"num_tokens": 549045682.0,
"step": 2775
},
{
"entropy": 0.6212464993650263,
"epoch": 0.5565059144676979,
"grad_norm": 0.9082188010215759,
"learning_rate": 1.604314946619217e-06,
"loss": 0.169,
"mean_token_accuracy": 0.9555608651854776,
"num_tokens": 550218972.0,
"step": 2780
},
{
"entropy": 0.6785877959294753,
"epoch": 0.5575068243858052,
"grad_norm": 0.8211308121681213,
"learning_rate": 1.603202846975089e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9523198788816278,
"num_tokens": 551311414.0,
"step": 2785
},
{
"entropy": 0.6972319600257006,
"epoch": 0.5585077343039127,
"grad_norm": 0.7803236246109009,
"learning_rate": 1.6020907473309608e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9511200400916013,
"num_tokens": 552341292.0,
"step": 2790
},
{
"epoch": 0.5585077343039127,
"eval_entropy": 0.6579079383709392,
"eval_loss": 0.17929911613464355,
"eval_mean_token_accuracy": 0.9495372889471836,
"eval_num_tokens": 552341292.0,
"eval_runtime": 7.0916,
"eval_samples_per_second": 137.205,
"eval_steps_per_second": 8.602,
"step": 2790
},
{
"entropy": 0.7150760515169664,
"epoch": 0.5595086442220201,
"grad_norm": 0.7405098080635071,
"learning_rate": 1.6009786476868327e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9534146623177961,
"num_tokens": 553288742.0,
"step": 2795
},
{
"entropy": 0.7124831140041351,
"epoch": 0.5605095541401274,
"grad_norm": 1.5424504280090332,
"learning_rate": 1.5998665480427046e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.9544130991805684,
"num_tokens": 554020302.0,
"step": 2800
},
{
"entropy": 0.6266002264889804,
"epoch": 0.5615104640582348,
"grad_norm": 0.8750737309455872,
"learning_rate": 1.5987544483985764e-06,
"loss": 0.1693,
"mean_token_accuracy": 0.955253835699775,
"num_tokens": 555155713.0,
"step": 2805
},
{
"entropy": 0.6803346837108786,
"epoch": 0.5625113739763421,
"grad_norm": 0.8157915472984314,
"learning_rate": 1.5976423487544483e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.950763221762397,
"num_tokens": 556265345.0,
"step": 2810
},
{
"entropy": 0.6968496783213182,
"epoch": 0.5635122838944495,
"grad_norm": 0.7749494910240173,
"learning_rate": 1.5965302491103203e-06,
"loss": 0.1783,
"mean_token_accuracy": 0.9526708814230832,
"num_tokens": 557289078.0,
"step": 2815
},
{
"entropy": 0.7060149173844944,
"epoch": 0.5645131938125568,
"grad_norm": 0.6400516033172607,
"learning_rate": 1.595418149466192e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9539100381461056,
"num_tokens": 558245587.0,
"step": 2820
},
{
"epoch": 0.5645131938125568,
"eval_entropy": 0.6540780389895204,
"eval_loss": 0.18272945284843445,
"eval_mean_token_accuracy": 0.9483291203858423,
"eval_num_tokens": 558245587.0,
"eval_runtime": 7.0762,
"eval_samples_per_second": 137.503,
"eval_steps_per_second": 8.62,
"step": 2820
},
{
"entropy": 0.7129332022233443,
"epoch": 0.5655141037306642,
"grad_norm": 1.6981102228164673,
"learning_rate": 1.594306049822064e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9546095479618419,
"num_tokens": 558972025.0,
"step": 2825
},
{
"entropy": 0.6247928223826669,
"epoch": 0.5665150136487717,
"grad_norm": 0.9023392796516418,
"learning_rate": 1.593193950177936e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9542244569821792,
"num_tokens": 560119210.0,
"step": 2830
},
{
"entropy": 0.6733576593073931,
"epoch": 0.567515923566879,
"grad_norm": 0.7821236252784729,
"learning_rate": 1.5920818505338078e-06,
"loss": 0.1767,
"mean_token_accuracy": 0.951957995783199,
"num_tokens": 561195725.0,
"step": 2835
},
{
"entropy": 0.7028111894022334,
"epoch": 0.5685168334849864,
"grad_norm": 0.745370626449585,
"learning_rate": 1.5909697508896796e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9535439290783622,
"num_tokens": 562247195.0,
"step": 2840
},
{
"entropy": 0.7104091267694127,
"epoch": 0.5695177434030937,
"grad_norm": 0.606145977973938,
"learning_rate": 1.5898576512455515e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.9540031877431002,
"num_tokens": 563213222.0,
"step": 2845
},
{
"entropy": 0.7080640597776933,
"epoch": 0.5705186533212011,
"grad_norm": 1.5943220853805542,
"learning_rate": 1.5887455516014233e-06,
"loss": 0.1775,
"mean_token_accuracy": 0.9533238221298564,
"num_tokens": 563958366.0,
"step": 2850
},
{
"epoch": 0.5705186533212011,
"eval_entropy": 0.6547147309193846,
"eval_loss": 0.18087640404701233,
"eval_mean_token_accuracy": 0.9491668822335415,
"eval_num_tokens": 563958366.0,
"eval_runtime": 7.112,
"eval_samples_per_second": 136.811,
"eval_steps_per_second": 8.577,
"step": 2850
},
{
"entropy": 0.6236652623523365,
"epoch": 0.5715195632393084,
"grad_norm": 0.9184789061546326,
"learning_rate": 1.5876334519572954e-06,
"loss": 0.1674,
"mean_token_accuracy": 0.9561629105697979,
"num_tokens": 565085789.0,
"step": 2855
},
{
"entropy": 0.677449183030562,
"epoch": 0.5725204731574158,
"grad_norm": 0.8137527704238892,
"learning_rate": 1.586521352313167e-06,
"loss": 0.1773,
"mean_token_accuracy": 0.9524840208617124,
"num_tokens": 566182679.0,
"step": 2860
},
{
"entropy": 0.6954392140561884,
"epoch": 0.5735213830755233,
"grad_norm": 0.7889710068702698,
"learning_rate": 1.585409252669039e-06,
"loss": 0.1776,
"mean_token_accuracy": 0.9529242997819727,
"num_tokens": 567225094.0,
"step": 2865
},
{
"entropy": 0.6996893541379409,
"epoch": 0.5745222929936306,
"grad_norm": 0.678756058216095,
"learning_rate": 1.584297153024911e-06,
"loss": 0.17,
"mean_token_accuracy": 0.9549407476728613,
"num_tokens": 568176148.0,
"step": 2870
},
{
"entropy": 0.701075277003375,
"epoch": 0.575523202911738,
"grad_norm": 1.6416276693344116,
"learning_rate": 1.5831850533807828e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9550453679128127,
"num_tokens": 568919638.0,
"step": 2875
},
{
"entropy": 0.6135414800860665,
"epoch": 0.5765241128298453,
"grad_norm": 0.9266515374183655,
"learning_rate": 1.582072953736655e-06,
"loss": 0.1698,
"mean_token_accuracy": 0.9551390783353285,
"num_tokens": 570088747.0,
"step": 2880
},
{
"epoch": 0.5765241128298453,
"eval_entropy": 0.6452143426801338,
"eval_loss": 0.17846497893333435,
"eval_mean_token_accuracy": 0.9499536039399319,
"eval_num_tokens": 570088747.0,
"eval_runtime": 7.1363,
"eval_samples_per_second": 136.344,
"eval_steps_per_second": 8.548,
"step": 2880
},
{
"entropy": 0.6611321086233313,
"epoch": 0.5775250227479527,
"grad_norm": 0.8491476774215698,
"learning_rate": 1.5809608540925266e-06,
"loss": 0.1699,
"mean_token_accuracy": 0.9547013607892123,
"num_tokens": 571174715.0,
"step": 2885
},
{
"entropy": 0.6863727271556854,
"epoch": 0.57852593266606,
"grad_norm": 0.7314031720161438,
"learning_rate": 1.5798487544483984e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9533765917474574,
"num_tokens": 572199578.0,
"step": 2890
},
{
"entropy": 0.6956823023882779,
"epoch": 0.5795268425841674,
"grad_norm": 0.6401289105415344,
"learning_rate": 1.5787366548042705e-06,
"loss": 0.167,
"mean_token_accuracy": 0.9559003060514276,
"num_tokens": 573157944.0,
"step": 2895
},
{
"entropy": 0.7030810995535417,
"epoch": 0.5805277525022748,
"grad_norm": 1.6768332719802856,
"learning_rate": 1.5776245551601423e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.954583527283235,
"num_tokens": 573890695.0,
"step": 2900
},
{
"entropy": 0.6071485982699828,
"epoch": 0.5815286624203821,
"grad_norm": 0.887630045413971,
"learning_rate": 1.576512455516014e-06,
"loss": 0.1616,
"mean_token_accuracy": 0.9568493794311177,
"num_tokens": 575040060.0,
"step": 2905
},
{
"entropy": 0.670040900869803,
"epoch": 0.5825295723384896,
"grad_norm": 0.8263089060783386,
"learning_rate": 1.575400355871886e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9534888061610135,
"num_tokens": 576133302.0,
"step": 2910
},
{
"epoch": 0.5825295723384896,
"eval_entropy": 0.6536685472629109,
"eval_loss": 0.18158204853534698,
"eval_mean_token_accuracy": 0.9491394347831851,
"eval_num_tokens": 576133302.0,
"eval_runtime": 7.0798,
"eval_samples_per_second": 137.433,
"eval_steps_per_second": 8.616,
"step": 2910
},
{
"entropy": 0.6866442734544927,
"epoch": 0.5835304822565969,
"grad_norm": 0.7574155926704407,
"learning_rate": 1.574288256227758e-06,
"loss": 0.1705,
"mean_token_accuracy": 0.9546013420278375,
"num_tokens": 577182306.0,
"step": 2915
},
{
"entropy": 0.6976790409196507,
"epoch": 0.5845313921747043,
"grad_norm": 0.7004702687263489,
"learning_rate": 1.57317615658363e-06,
"loss": 0.1683,
"mean_token_accuracy": 0.9545241507616911,
"num_tokens": 578143268.0,
"step": 2920
},
{
"entropy": 0.6992522610859437,
"epoch": 0.5855323020928116,
"grad_norm": 1.5859788656234741,
"learning_rate": 1.5720640569395016e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.957004501061006,
"num_tokens": 578880587.0,
"step": 2925
},
{
"entropy": 0.6218387118794702,
"epoch": 0.586533212010919,
"grad_norm": 0.9949880838394165,
"learning_rate": 1.5709519572953735e-06,
"loss": 0.1693,
"mean_token_accuracy": 0.955338594046506,
"num_tokens": 580027281.0,
"step": 2930
},
{
"entropy": 0.6679159836335615,
"epoch": 0.5875341219290264,
"grad_norm": 0.7988151907920837,
"learning_rate": 1.5698398576512456e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9529053601351651,
"num_tokens": 581093873.0,
"step": 2935
},
{
"entropy": 0.698068075559356,
"epoch": 0.5885350318471337,
"grad_norm": 0.7373477816581726,
"learning_rate": 1.5687277580071174e-06,
"loss": 0.17,
"mean_token_accuracy": 0.9542166119272059,
"num_tokens": 582126221.0,
"step": 2940
},
{
"epoch": 0.5885350318471337,
"eval_entropy": 0.6519819942654156,
"eval_loss": 0.1814005970954895,
"eval_mean_token_accuracy": 0.9493043364071455,
"eval_num_tokens": 582126221.0,
"eval_runtime": 7.1253,
"eval_samples_per_second": 136.555,
"eval_steps_per_second": 8.561,
"step": 2940
},
{
"entropy": 0.7105110601945357,
"epoch": 0.5895359417652412,
"grad_norm": 0.6741722822189331,
"learning_rate": 1.567615658362989e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.9536024895581332,
"num_tokens": 583074496.0,
"step": 2945
},
{
"entropy": 0.7069875858046791,
"epoch": 0.5905368516833485,
"grad_norm": 1.6635076999664307,
"learning_rate": 1.5665035587188611e-06,
"loss": 0.1708,
"mean_token_accuracy": 0.9553401833230799,
"num_tokens": 583801034.0,
"step": 2950
},
{
"entropy": 0.6250316668640483,
"epoch": 0.5915377616014559,
"grad_norm": 0.9111377596855164,
"learning_rate": 1.565391459074733e-06,
"loss": 0.1675,
"mean_token_accuracy": 0.9554700141603296,
"num_tokens": 584923018.0,
"step": 2955
},
{
"entropy": 0.6771905362606049,
"epoch": 0.5925386715195632,
"grad_norm": 0.8384516835212708,
"learning_rate": 1.564279359430605e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.9523442192511125,
"num_tokens": 586002885.0,
"step": 2960
},
{
"entropy": 0.6946805049072612,
"epoch": 0.5935395814376706,
"grad_norm": 0.8512565493583679,
"learning_rate": 1.563167259786477e-06,
"loss": 0.1748,
"mean_token_accuracy": 0.9534324310042641,
"num_tokens": 587033715.0,
"step": 2965
},
{
"entropy": 0.7199535329233516,
"epoch": 0.594540491355778,
"grad_norm": 0.9583800435066223,
"learning_rate": 1.5620551601423486e-06,
"loss": 0.1718,
"mean_token_accuracy": 0.9540681860663675,
"num_tokens": 587973781.0,
"step": 2970
},
{
"epoch": 0.594540491355778,
"eval_entropy": 0.6598269665827516,
"eval_loss": 0.17958512902259827,
"eval_mean_token_accuracy": 0.9494309190843926,
"eval_num_tokens": 587973781.0,
"eval_runtime": 7.1961,
"eval_samples_per_second": 135.212,
"eval_steps_per_second": 8.477,
"step": 2970
},
{
"entropy": 0.7052282398397273,
"epoch": 0.5955414012738853,
"grad_norm": 1.6504220962524414,
"learning_rate": 1.5609430604982206e-06,
"loss": 0.1642,
"mean_token_accuracy": 0.9569182553074577,
"num_tokens": 588703254.0,
"step": 2975
},
{
"entropy": 0.6265523303638805,
"epoch": 0.5965423111919927,
"grad_norm": 0.9333195090293884,
"learning_rate": 1.5598309608540925e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.95631789077412,
"num_tokens": 589845740.0,
"step": 2980
},
{
"entropy": 0.679840994423086,
"epoch": 0.5975432211101,
"grad_norm": 0.8418663740158081,
"learning_rate": 1.5587188612099643e-06,
"loss": 0.1789,
"mean_token_accuracy": 0.9520544881170446,
"num_tokens": 590927438.0,
"step": 2985
},
{
"entropy": 0.7017120361328125,
"epoch": 0.5985441310282075,
"grad_norm": 0.7685884833335876,
"learning_rate": 1.5576067615658362e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.9537156251343814,
"num_tokens": 591971110.0,
"step": 2990
},
{
"entropy": 0.7107149928808212,
"epoch": 0.5995450409463148,
"grad_norm": 0.7431087493896484,
"learning_rate": 1.556494661921708e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.954865367304195,
"num_tokens": 592933967.0,
"step": 2995
},
{
"entropy": 0.7013540771874515,
"epoch": 0.6005459508644222,
"grad_norm": 1.5792326927185059,
"learning_rate": 1.55538256227758e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9574482348832217,
"num_tokens": 593674210.0,
"step": 3000
},
{
"epoch": 0.6005459508644222,
"eval_entropy": 0.6583689304648853,
"eval_loss": 0.18258357048034668,
"eval_mean_token_accuracy": 0.9492516068161511,
"eval_num_tokens": 593674210.0,
"eval_runtime": 7.2234,
"eval_samples_per_second": 134.702,
"eval_steps_per_second": 8.445,
"step": 3000
},
{
"entropy": 0.6216048316522078,
"epoch": 0.6015468607825296,
"grad_norm": 0.9363070130348206,
"learning_rate": 1.554270462633452e-06,
"loss": 0.167,
"mean_token_accuracy": 0.9563505985520103,
"num_tokens": 594804599.0,
"step": 3005
},
{
"entropy": 0.6832993455908515,
"epoch": 0.6025477707006369,
"grad_norm": 0.8691864609718323,
"learning_rate": 1.5531583629893236e-06,
"loss": 0.1699,
"mean_token_accuracy": 0.9539537310600281,
"num_tokens": 595873162.0,
"step": 3010
},
{
"entropy": 0.7007348017259077,
"epoch": 0.6035486806187443,
"grad_norm": 0.7877841591835022,
"learning_rate": 1.5520462633451957e-06,
"loss": 0.1714,
"mean_token_accuracy": 0.9534028855237093,
"num_tokens": 596910846.0,
"step": 3015
},
{
"entropy": 0.7141089824112978,
"epoch": 0.6045495905368516,
"grad_norm": 0.6574937105178833,
"learning_rate": 1.5509341637010676e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9544164771383459,
"num_tokens": 597853139.0,
"step": 3020
},
{
"entropy": 0.7040485745126551,
"epoch": 0.6055505004549591,
"grad_norm": 1.726340889930725,
"learning_rate": 1.5498220640569394e-06,
"loss": 0.1645,
"mean_token_accuracy": 0.9566915214061738,
"num_tokens": 598577635.0,
"step": 3025
},
{
"entropy": 0.6146221702749078,
"epoch": 0.6065514103730664,
"grad_norm": 0.9489019513130188,
"learning_rate": 1.5487099644128113e-06,
"loss": 0.1643,
"mean_token_accuracy": 0.9564465994184668,
"num_tokens": 599712819.0,
"step": 3030
},
{
"epoch": 0.6065514103730664,
"eval_entropy": 0.6538055861582521,
"eval_loss": 0.18010924756526947,
"eval_mean_token_accuracy": 0.9496048923398628,
"eval_num_tokens": 599712819.0,
"eval_runtime": 6.9982,
"eval_samples_per_second": 139.035,
"eval_steps_per_second": 8.716,
"step": 3030
},
{
"entropy": 0.6649877328764309,
"epoch": 0.6075523202911738,
"grad_norm": 0.7716711163520813,
"learning_rate": 1.5475978647686831e-06,
"loss": 0.1674,
"mean_token_accuracy": 0.9545083528215235,
"num_tokens": 600810295.0,
"step": 3035
},
{
"entropy": 0.6926416860385375,
"epoch": 0.6085532302092812,
"grad_norm": 0.7539160251617432,
"learning_rate": 1.546485765124555e-06,
"loss": 0.1712,
"mean_token_accuracy": 0.9541196048259735,
"num_tokens": 601845533.0,
"step": 3040
},
{
"entropy": 0.7097321336919611,
"epoch": 0.6095541401273885,
"grad_norm": 0.954349160194397,
"learning_rate": 1.545373665480427e-06,
"loss": 0.1657,
"mean_token_accuracy": 0.9558960502797907,
"num_tokens": 602800251.0,
"step": 3045
},
{
"entropy": 0.7092912332578138,
"epoch": 0.6105550500454959,
"grad_norm": 1.6350897550582886,
"learning_rate": 1.544261565836299e-06,
"loss": 0.1696,
"mean_token_accuracy": 0.9553542906587774,
"num_tokens": 603536113.0,
"step": 3050
},
{
"entropy": 0.6223157600923018,
"epoch": 0.6115559599636032,
"grad_norm": 0.913377583026886,
"learning_rate": 1.5431494661921708e-06,
"loss": 0.1621,
"mean_token_accuracy": 0.9570931732654572,
"num_tokens": 604670520.0,
"step": 3055
},
{
"entropy": 0.6658590446818958,
"epoch": 0.6125568698817107,
"grad_norm": 0.8451462984085083,
"learning_rate": 1.5420373665480426e-06,
"loss": 0.1634,
"mean_token_accuracy": 0.9555998970161784,
"num_tokens": 605754295.0,
"step": 3060
},
{
"epoch": 0.6125568698817107,
"eval_entropy": 0.6519810190943421,
"eval_loss": 0.18205370008945465,
"eval_mean_token_accuracy": 0.9495203221430544,
"eval_num_tokens": 605754295.0,
"eval_runtime": 7.058,
"eval_samples_per_second": 137.859,
"eval_steps_per_second": 8.643,
"step": 3060
},
{
"entropy": 0.691884211789478,
"epoch": 0.6135577797998181,
"grad_norm": 0.7430902123451233,
"learning_rate": 1.5409252669039145e-06,
"loss": 0.1689,
"mean_token_accuracy": 0.9547361292622306,
"num_tokens": 606778333.0,
"step": 3065
},
{
"entropy": 0.7035522225228223,
"epoch": 0.6145586897179254,
"grad_norm": 0.6056403517723083,
"learning_rate": 1.5398131672597866e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9561255595900796,
"num_tokens": 607736004.0,
"step": 3070
},
{
"entropy": 0.7008915657346899,
"epoch": 0.6155595996360328,
"grad_norm": 1.5326563119888306,
"learning_rate": 1.5387010676156582e-06,
"loss": 0.1652,
"mean_token_accuracy": 0.9567563999782909,
"num_tokens": 608474829.0,
"step": 3075
},
{
"entropy": 0.6191458604552529,
"epoch": 0.6165605095541401,
"grad_norm": 0.9065665602684021,
"learning_rate": 1.53758896797153e-06,
"loss": 0.1582,
"mean_token_accuracy": 0.9579779603264549,
"num_tokens": 609604074.0,
"step": 3080
},
{
"entropy": 0.6666526832363823,
"epoch": 0.6175614194722475,
"grad_norm": 0.8556954860687256,
"learning_rate": 1.5364768683274021e-06,
"loss": 0.167,
"mean_token_accuracy": 0.9543328859589316,
"num_tokens": 610710246.0,
"step": 3085
},
{
"entropy": 0.6837726238099011,
"epoch": 0.6185623293903548,
"grad_norm": 0.8487630486488342,
"learning_rate": 1.535364768683274e-06,
"loss": 0.1688,
"mean_token_accuracy": 0.955160356651653,
"num_tokens": 611758840.0,
"step": 3090
},
{
"epoch": 0.6185623293903548,
"eval_entropy": 0.6574973884176035,
"eval_loss": 0.179812490940094,
"eval_mean_token_accuracy": 0.949747121724926,
"eval_num_tokens": 611758840.0,
"eval_runtime": 7.067,
"eval_samples_per_second": 137.683,
"eval_steps_per_second": 8.632,
"step": 3090
},
{
"entropy": 0.7014839009805159,
"epoch": 0.6195632393084622,
"grad_norm": 0.6837453246116638,
"learning_rate": 1.5342526690391456e-06,
"loss": 0.1687,
"mean_token_accuracy": 0.9557089160789143,
"num_tokens": 612699784.0,
"step": 3095
},
{
"entropy": 0.7075679730285298,
"epoch": 0.6205641492265697,
"grad_norm": 1.7436314821243286,
"learning_rate": 1.5331405693950177e-06,
"loss": 0.169,
"mean_token_accuracy": 0.9560685184868899,
"num_tokens": 613436633.0,
"step": 3100
},
{
"entropy": 0.6202307202599265,
"epoch": 0.621565059144677,
"grad_norm": 0.9475667476654053,
"learning_rate": 1.5320284697508896e-06,
"loss": 0.1597,
"mean_token_accuracy": 0.9571539521217346,
"num_tokens": 614596800.0,
"step": 3105
},
{
"entropy": 0.6672575666145845,
"epoch": 0.6225659690627844,
"grad_norm": 0.8185185194015503,
"learning_rate": 1.5309163701067616e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.9542947286909277,
"num_tokens": 615704382.0,
"step": 3110
},
{
"entropy": 0.6795628358017315,
"epoch": 0.6235668789808917,
"grad_norm": 0.7307755351066589,
"learning_rate": 1.5298042704626333e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9561434664509513,
"num_tokens": 616757442.0,
"step": 3115
},
{
"entropy": 0.6966196049343456,
"epoch": 0.6245677888989991,
"grad_norm": 0.6424974799156189,
"learning_rate": 1.5286921708185051e-06,
"loss": 0.167,
"mean_token_accuracy": 0.9560314021327279,
"num_tokens": 617714286.0,
"step": 3120
},
{
"epoch": 0.6245677888989991,
"eval_entropy": 0.6488942484386632,
"eval_loss": 0.18082934617996216,
"eval_mean_token_accuracy": 0.9497447346077591,
"eval_num_tokens": 617714286.0,
"eval_runtime": 7.0333,
"eval_samples_per_second": 138.342,
"eval_steps_per_second": 8.673,
"step": 3120
},
{
"entropy": 0.6996615810827775,
"epoch": 0.6255686988171064,
"grad_norm": 1.6187618970870972,
"learning_rate": 1.5275800711743772e-06,
"loss": 0.1604,
"mean_token_accuracy": 0.9579841077327729,
"num_tokens": 618452310.0,
"step": 3125
},
{
"entropy": 0.6223932883956216,
"epoch": 0.6265696087352138,
"grad_norm": 0.9200411438941956,
"learning_rate": 1.526467971530249e-06,
"loss": 0.1598,
"mean_token_accuracy": 0.9576196242462505,
"num_tokens": 619611422.0,
"step": 3130
},
{
"entropy": 0.6693487199870023,
"epoch": 0.6275705186533213,
"grad_norm": 0.8405710458755493,
"learning_rate": 1.525355871886121e-06,
"loss": 0.1661,
"mean_token_accuracy": 0.9551270468668505,
"num_tokens": 620685183.0,
"step": 3135
},
{
"entropy": 0.6933458956805143,
"epoch": 0.6285714285714286,
"grad_norm": 0.9356978535652161,
"learning_rate": 1.5242437722419928e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9554397490891543,
"num_tokens": 621712160.0,
"step": 3140
},
{
"entropy": 0.6994579350406473,
"epoch": 0.629572338489536,
"grad_norm": 0.8684320449829102,
"learning_rate": 1.5231316725978646e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.9550400712273338,
"num_tokens": 622669547.0,
"step": 3145
},
{
"entropy": 0.7034677063876932,
"epoch": 0.6305732484076433,
"grad_norm": 1.671410083770752,
"learning_rate": 1.5220195729537367e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9572344660758972,
"num_tokens": 623402046.0,
"step": 3150
},
{
"epoch": 0.6305732484076433,
"eval_entropy": 0.6508955388772683,
"eval_loss": 0.18214590847492218,
"eval_mean_token_accuracy": 0.9497071631619187,
"eval_num_tokens": 623402046.0,
"eval_runtime": 7.0379,
"eval_samples_per_second": 138.251,
"eval_steps_per_second": 8.667,
"step": 3150
},
{
"entropy": 0.6169803651896391,
"epoch": 0.6315741583257507,
"grad_norm": 0.9965471625328064,
"learning_rate": 1.5209074733096086e-06,
"loss": 0.159,
"mean_token_accuracy": 0.9576476606455716,
"num_tokens": 624557115.0,
"step": 3155
},
{
"entropy": 0.6633053736253218,
"epoch": 0.632575068243858,
"grad_norm": 0.8597959280014038,
"learning_rate": 1.5197953736654802e-06,
"loss": 0.1664,
"mean_token_accuracy": 0.9550956438888203,
"num_tokens": 625630354.0,
"step": 3160
},
{
"entropy": 0.6893996604464271,
"epoch": 0.6335759781619654,
"grad_norm": 0.7524270415306091,
"learning_rate": 1.5186832740213523e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9566006205298684,
"num_tokens": 626655720.0,
"step": 3165
},
{
"entropy": 0.7058270321650939,
"epoch": 0.6345768880800728,
"grad_norm": 0.6807648539543152,
"learning_rate": 1.5175711743772241e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9561190323396163,
"num_tokens": 627597028.0,
"step": 3170
},
{
"entropy": 0.703032106702978,
"epoch": 0.6355777979981801,
"grad_norm": 1.6004669666290283,
"learning_rate": 1.516459074733096e-06,
"loss": 0.158,
"mean_token_accuracy": 0.9586949603124099,
"num_tokens": 628310494.0,
"step": 3175
},
{
"entropy": 0.6183769884434613,
"epoch": 0.6365787079162876,
"grad_norm": 0.9058781862258911,
"learning_rate": 1.5153469750889679e-06,
"loss": 0.1559,
"mean_token_accuracy": 0.9587956016713922,
"num_tokens": 629439229.0,
"step": 3180
},
{
"epoch": 0.6365787079162876,
"eval_entropy": 0.646690167364527,
"eval_loss": 0.18040700256824493,
"eval_mean_token_accuracy": 0.9494497678318962,
"eval_num_tokens": 629439229.0,
"eval_runtime": 7.0218,
"eval_samples_per_second": 138.569,
"eval_steps_per_second": 8.687,
"step": 3180
},
{
"entropy": 0.6690399782224135,
"epoch": 0.6375796178343949,
"grad_norm": 0.8933009505271912,
"learning_rate": 1.5142348754448397e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.955499666929245,
"num_tokens": 630526225.0,
"step": 3185
},
{
"entropy": 0.6908215509219603,
"epoch": 0.6385805277525023,
"grad_norm": 0.7805888056755066,
"learning_rate": 1.5131227758007118e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9543212841857563,
"num_tokens": 631544968.0,
"step": 3190
},
{
"entropy": 0.7041902406649156,
"epoch": 0.6395814376706096,
"grad_norm": 0.6156824827194214,
"learning_rate": 1.5120106761565836e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.957361562685533,
"num_tokens": 632498779.0,
"step": 3195
},
{
"entropy": 0.7018732940608805,
"epoch": 0.640582347588717,
"grad_norm": 1.7362315654754639,
"learning_rate": 1.5108985765124555e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.9588197816501964,
"num_tokens": 633231622.0,
"step": 3200
},
{
"entropy": 0.6234849360856143,
"epoch": 0.6415832575068244,
"grad_norm": 0.9099482297897339,
"learning_rate": 1.5097864768683274e-06,
"loss": 0.1613,
"mean_token_accuracy": 0.9571101091124795,
"num_tokens": 634372922.0,
"step": 3205
},
{
"entropy": 0.6708677102218975,
"epoch": 0.6425841674249317,
"grad_norm": 0.8247345089912415,
"learning_rate": 1.5086743772241992e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9549422193657268,
"num_tokens": 635453965.0,
"step": 3210
},
{
"epoch": 0.6425841674249317,
"eval_entropy": 0.6494777632541344,
"eval_loss": 0.18078424036502838,
"eval_mean_token_accuracy": 0.9497652522853164,
"eval_num_tokens": 635453965.0,
"eval_runtime": 7.0204,
"eval_samples_per_second": 138.595,
"eval_steps_per_second": 8.689,
"step": 3210
},
{
"entropy": 0.6948196053504944,
"epoch": 0.6435850773430392,
"grad_norm": 0.7620670795440674,
"learning_rate": 1.507562277580071e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.9559367472475225,
"num_tokens": 636472033.0,
"step": 3215
},
{
"entropy": 0.7079345066439021,
"epoch": 0.6445859872611465,
"grad_norm": 0.6674084663391113,
"learning_rate": 1.5064501779359431e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9555874754082073,
"num_tokens": 637412606.0,
"step": 3220
},
{
"entropy": 0.7107769147916274,
"epoch": 0.6455868971792539,
"grad_norm": 1.6964831352233887,
"learning_rate": 1.5053380782918148e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.9574269002134149,
"num_tokens": 638133615.0,
"step": 3225
},
{
"entropy": 0.6168817777525295,
"epoch": 0.6465878070973612,
"grad_norm": 0.9298244118690491,
"learning_rate": 1.5042259786476866e-06,
"loss": 0.1569,
"mean_token_accuracy": 0.9585052159699526,
"num_tokens": 639270319.0,
"step": 3230
},
{
"entropy": 0.6654906495050951,
"epoch": 0.6475887170154686,
"grad_norm": 0.8299368023872375,
"learning_rate": 1.5031138790035587e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9553114105354655,
"num_tokens": 640343322.0,
"step": 3235
},
{
"entropy": 0.6908837380734357,
"epoch": 0.648589626933576,
"grad_norm": 0.7933794260025024,
"learning_rate": 1.5020017793594306e-06,
"loss": 0.1643,
"mean_token_accuracy": 0.9559738993644714,
"num_tokens": 641375050.0,
"step": 3240
},
{
"epoch": 0.648589626933576,
"eval_entropy": 0.6472069806739932,
"eval_loss": 0.18293221294879913,
"eval_mean_token_accuracy": 0.9496043148587962,
"eval_num_tokens": 641375050.0,
"eval_runtime": 7.0336,
"eval_samples_per_second": 138.335,
"eval_steps_per_second": 8.673,
"step": 3240
},
{
"entropy": 0.702154829827222,
"epoch": 0.6495905368516833,
"grad_norm": 0.6860081553459167,
"learning_rate": 1.5008896797153024e-06,
"loss": 0.1632,
"mean_token_accuracy": 0.9570187379013408,
"num_tokens": 642328426.0,
"step": 3245
},
{
"entropy": 0.6984548650004647,
"epoch": 0.6505914467697907,
"grad_norm": 1.5585992336273193,
"learning_rate": 1.4997775800711743e-06,
"loss": 0.1568,
"mean_token_accuracy": 0.9584351718425751,
"num_tokens": 643060158.0,
"step": 3250
},
{
"entropy": 0.6129773226651278,
"epoch": 0.6515923566878981,
"grad_norm": 0.9925711750984192,
"learning_rate": 1.4986654804270461e-06,
"loss": 0.1585,
"mean_token_accuracy": 0.958180884881453,
"num_tokens": 644209016.0,
"step": 3255
},
{
"entropy": 0.6673231913284822,
"epoch": 0.6525932666060055,
"grad_norm": 0.8757086992263794,
"learning_rate": 1.4975533807829182e-06,
"loss": 0.1648,
"mean_token_accuracy": 0.9552432694218376,
"num_tokens": 645283312.0,
"step": 3260
},
{
"entropy": 0.6877216878262433,
"epoch": 0.6535941765241128,
"grad_norm": 0.7658048272132874,
"learning_rate": 1.4964412811387899e-06,
"loss": 0.1604,
"mean_token_accuracy": 0.9569750054316087,
"num_tokens": 646304671.0,
"step": 3265
},
{
"entropy": 0.7039268119768662,
"epoch": 0.6545950864422202,
"grad_norm": 0.6302322149276733,
"learning_rate": 1.4953291814946617e-06,
"loss": 0.1626,
"mean_token_accuracy": 0.9568539722399279,
"num_tokens": 647257247.0,
"step": 3270
},
{
"epoch": 0.6545950864422202,
"eval_entropy": 0.6477563601048266,
"eval_loss": 0.1822510063648224,
"eval_mean_token_accuracy": 0.9499242520723187,
"eval_num_tokens": 647257247.0,
"eval_runtime": 7.0229,
"eval_samples_per_second": 138.547,
"eval_steps_per_second": 8.686,
"step": 3270
},
{
"entropy": 0.6974148641933094,
"epoch": 0.6555959963603276,
"grad_norm": 1.5133696794509888,
"learning_rate": 1.4942170818505338e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.9593302407047966,
"num_tokens": 647996410.0,
"step": 3275
},
{
"entropy": 0.6135740900581533,
"epoch": 0.6565969062784349,
"grad_norm": 0.9020703434944153,
"learning_rate": 1.4931049822064056e-06,
"loss": 0.1536,
"mean_token_accuracy": 0.9590423145077446,
"num_tokens": 649158135.0,
"step": 3280
},
{
"entropy": 0.6622402567755092,
"epoch": 0.6575978161965423,
"grad_norm": 0.8561988472938538,
"learning_rate": 1.4919928825622777e-06,
"loss": 0.1606,
"mean_token_accuracy": 0.9568440372293646,
"num_tokens": 650238501.0,
"step": 3285
},
{
"entropy": 0.6838861806826158,
"epoch": 0.6585987261146496,
"grad_norm": 0.8391448259353638,
"learning_rate": 1.4908807829181494e-06,
"loss": 0.1585,
"mean_token_accuracy": 0.9573293057355013,
"num_tokens": 651266847.0,
"step": 3290
},
{
"entropy": 0.7013624326749281,
"epoch": 0.6595996360327571,
"grad_norm": 0.8588127493858337,
"learning_rate": 1.4897686832740212e-06,
"loss": 0.1598,
"mean_token_accuracy": 0.9582955512133512,
"num_tokens": 652220456.0,
"step": 3295
},
{
"entropy": 0.7058557136492296,
"epoch": 0.6606005459508644,
"grad_norm": 1.6215109825134277,
"learning_rate": 1.4886565836298933e-06,
"loss": 0.1558,
"mean_token_accuracy": 0.9595519033345309,
"num_tokens": 652954302.0,
"step": 3300
},
{
"epoch": 0.6606005459508644,
"eval_entropy": 0.6499209511475484,
"eval_loss": 0.1842976212501526,
"eval_mean_token_accuracy": 0.948709644255091,
"eval_num_tokens": 652954302.0,
"eval_runtime": 7.0384,
"eval_samples_per_second": 138.241,
"eval_steps_per_second": 8.667,
"step": 3300
},
{
"entropy": 0.621334047480063,
"epoch": 0.6616014558689718,
"grad_norm": 0.9623603820800781,
"learning_rate": 1.4875444839857651e-06,
"loss": 0.1577,
"mean_token_accuracy": 0.9584027409553528,
"num_tokens": 654085373.0,
"step": 3305
},
{
"entropy": 0.6754447023976933,
"epoch": 0.6626023657870792,
"grad_norm": 0.8001890778541565,
"learning_rate": 1.4864323843416368e-06,
"loss": 0.1625,
"mean_token_accuracy": 0.9564595872705633,
"num_tokens": 655160148.0,
"step": 3310
},
{
"entropy": 0.6801682867787101,
"epoch": 0.6636032757051865,
"grad_norm": 0.7729578018188477,
"learning_rate": 1.4853202846975089e-06,
"loss": 0.1614,
"mean_token_accuracy": 0.956423576853492,
"num_tokens": 656214275.0,
"step": 3315
},
{
"entropy": 0.6868817286057906,
"epoch": 0.6646041856232939,
"grad_norm": 0.6810210943222046,
"learning_rate": 1.4842081850533807e-06,
"loss": 0.1566,
"mean_token_accuracy": 0.9585105110298503,
"num_tokens": 657178543.0,
"step": 3320
},
{
"entropy": 0.7001429920846766,
"epoch": 0.6656050955414012,
"grad_norm": 1.6506801843643188,
"learning_rate": 1.4830960854092528e-06,
"loss": 0.1543,
"mean_token_accuracy": 0.9600406169891358,
"num_tokens": 657908037.0,
"step": 3325
},
{
"entropy": 0.6132907515222376,
"epoch": 0.6666060054595087,
"grad_norm": 0.8838356733322144,
"learning_rate": 1.4819839857651244e-06,
"loss": 0.1537,
"mean_token_accuracy": 0.9591153843836351,
"num_tokens": 659055420.0,
"step": 3330
},
{
"epoch": 0.6666060054595087,
"eval_entropy": 0.64075055913847,
"eval_loss": 0.18135496973991394,
"eval_mean_token_accuracy": 0.949864628862162,
"eval_num_tokens": 659055420.0,
"eval_runtime": 7.0585,
"eval_samples_per_second": 137.848,
"eval_steps_per_second": 8.642,
"step": 3330
},
{
"entropy": 0.6668086566708304,
"epoch": 0.667606915377616,
"grad_norm": 0.8098176121711731,
"learning_rate": 1.4808718861209963e-06,
"loss": 0.1621,
"mean_token_accuracy": 0.9567366887222637,
"num_tokens": 660143889.0,
"step": 3335
},
{
"entropy": 0.6945254163308577,
"epoch": 0.6686078252957234,
"grad_norm": 0.8045607209205627,
"learning_rate": 1.4797597864768684e-06,
"loss": 0.1623,
"mean_token_accuracy": 0.9565726925026287,
"num_tokens": 661162373.0,
"step": 3340
},
{
"entropy": 0.7033212970603596,
"epoch": 0.6696087352138308,
"grad_norm": 0.6349719762802124,
"learning_rate": 1.4786476868327402e-06,
"loss": 0.1617,
"mean_token_accuracy": 0.9573416639458049,
"num_tokens": 662112855.0,
"step": 3345
},
{
"entropy": 0.7066628607836637,
"epoch": 0.6706096451319381,
"grad_norm": 1.7244939804077148,
"learning_rate": 1.4775355871886119e-06,
"loss": 0.1561,
"mean_token_accuracy": 0.9585809138688174,
"num_tokens": 662839231.0,
"step": 3350
},
{
"entropy": 0.6230565829710527,
"epoch": 0.6716105550500455,
"grad_norm": 0.9981881380081177,
"learning_rate": 1.476423487544484e-06,
"loss": 0.1544,
"mean_token_accuracy": 0.9593272588469766,
"num_tokens": 663977485.0,
"step": 3355
},
{
"entropy": 0.6668084019964392,
"epoch": 0.6726114649681528,
"grad_norm": 0.8376649618148804,
"learning_rate": 1.4753113879003558e-06,
"loss": 0.1577,
"mean_token_accuracy": 0.9574681758880615,
"num_tokens": 665089323.0,
"step": 3360
},
{
"epoch": 0.6726114649681528,
"eval_entropy": 0.6494350780229099,
"eval_loss": 0.18222181499004364,
"eval_mean_token_accuracy": 0.9497965378839461,
"eval_num_tokens": 665089323.0,
"eval_runtime": 7.0401,
"eval_samples_per_second": 138.209,
"eval_steps_per_second": 8.665,
"step": 3360
},
{
"entropy": 0.6925623609261079,
"epoch": 0.6736123748862602,
"grad_norm": 0.786201000213623,
"learning_rate": 1.4741992882562276e-06,
"loss": 0.1547,
"mean_token_accuracy": 0.9576037005944685,
"num_tokens": 666117675.0,
"step": 3365
},
{
"entropy": 0.7104368925094604,
"epoch": 0.6746132848043676,
"grad_norm": 0.736659049987793,
"learning_rate": 1.4730871886120997e-06,
"loss": 0.1572,
"mean_token_accuracy": 0.9585932124744762,
"num_tokens": 667063051.0,
"step": 3370
},
{
"entropy": 0.7050894742662256,
"epoch": 0.675614194722475,
"grad_norm": 1.705169916152954,
"learning_rate": 1.4719750889679714e-06,
"loss": 0.1547,
"mean_token_accuracy": 0.9589919149875641,
"num_tokens": 667787190.0,
"step": 3375
},
{
"entropy": 0.6144024074077606,
"epoch": 0.6766151046405824,
"grad_norm": 0.940698504447937,
"learning_rate": 1.4708629893238434e-06,
"loss": 0.1504,
"mean_token_accuracy": 0.9602103607221083,
"num_tokens": 668931786.0,
"step": 3380
},
{
"entropy": 0.6675797638568011,
"epoch": 0.6776160145586897,
"grad_norm": 0.859804093837738,
"learning_rate": 1.4697508896797153e-06,
"loss": 0.1538,
"mean_token_accuracy": 0.9588768585161729,
"num_tokens": 669987112.0,
"step": 3385
},
{
"entropy": 0.6947231168096716,
"epoch": 0.6786169244767971,
"grad_norm": 0.7688744068145752,
"learning_rate": 1.4686387900355871e-06,
"loss": 0.1608,
"mean_token_accuracy": 0.9564679145812989,
"num_tokens": 671037275.0,
"step": 3390
},
{
"epoch": 0.6786169244767971,
"eval_entropy": 0.6460012275664533,
"eval_loss": 0.18215857446193695,
"eval_mean_token_accuracy": 0.9496130151826827,
"eval_num_tokens": 671037275.0,
"eval_runtime": 7.0121,
"eval_samples_per_second": 138.76,
"eval_steps_per_second": 8.699,
"step": 3390
},
{
"entropy": 0.7101943942633542,
"epoch": 0.6796178343949044,
"grad_norm": 0.6788628101348877,
"learning_rate": 1.467526690391459e-06,
"loss": 0.1595,
"mean_token_accuracy": 0.9584887022321874,
"num_tokens": 672001957.0,
"step": 3395
},
{
"entropy": 0.7138026226650585,
"epoch": 0.6806187443130118,
"grad_norm": 1.73914635181427,
"learning_rate": 1.4664145907473309e-06,
"loss": 0.1551,
"mean_token_accuracy": 0.9598342695019462,
"num_tokens": 672725611.0,
"step": 3400
},
{
"entropy": 0.6170999803326347,
"epoch": 0.6816196542311193,
"grad_norm": 0.9073975682258606,
"learning_rate": 1.4653024911032027e-06,
"loss": 0.1485,
"mean_token_accuracy": 0.960613077878952,
"num_tokens": 673851656.0,
"step": 3405
},
{
"entropy": 0.6696691445328973,
"epoch": 0.6826205641492266,
"grad_norm": 0.8153337836265564,
"learning_rate": 1.4641903914590748e-06,
"loss": 0.1569,
"mean_token_accuracy": 0.957689621773633,
"num_tokens": 674934713.0,
"step": 3410
},
{
"entropy": 0.6971980799328197,
"epoch": 0.683621474067334,
"grad_norm": 0.7351928949356079,
"learning_rate": 1.4630782918149464e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.9583224740895357,
"num_tokens": 675967339.0,
"step": 3415
},
{
"entropy": 0.6999681651592254,
"epoch": 0.6846223839854413,
"grad_norm": 0.9492703676223755,
"learning_rate": 1.4619661921708185e-06,
"loss": 0.1508,
"mean_token_accuracy": 0.9597205470908772,
"num_tokens": 676914390.0,
"step": 3420
},
{
"epoch": 0.6846223839854413,
"eval_entropy": 0.6426876177553271,
"eval_loss": 0.18153499066829681,
"eval_mean_token_accuracy": 0.9500644890988459,
"eval_num_tokens": 676914390.0,
"eval_runtime": 7.0075,
"eval_samples_per_second": 138.851,
"eval_steps_per_second": 8.705,
"step": 3420
},
{
"entropy": 0.7060933086005124,
"epoch": 0.6856232939035487,
"grad_norm": 1.5791497230529785,
"learning_rate": 1.4608540925266904e-06,
"loss": 0.1552,
"mean_token_accuracy": 0.9589927136898041,
"num_tokens": 677638601.0,
"step": 3425
},
{
"entropy": 0.6128041752360084,
"epoch": 0.686624203821656,
"grad_norm": 0.9395958781242371,
"learning_rate": 1.4597419928825622e-06,
"loss": 0.1481,
"mean_token_accuracy": 0.9608120690692555,
"num_tokens": 678785268.0,
"step": 3430
},
{
"entropy": 0.6682237459854646,
"epoch": 0.6876251137397634,
"grad_norm": 0.8034733533859253,
"learning_rate": 1.458629893238434e-06,
"loss": 0.1563,
"mean_token_accuracy": 0.9580443588170138,
"num_tokens": 679846215.0,
"step": 3435
},
{
"entropy": 0.6912163682959297,
"epoch": 0.6886260236578708,
"grad_norm": 0.7852122187614441,
"learning_rate": 1.457517793594306e-06,
"loss": 0.1562,
"mean_token_accuracy": 0.957829516584223,
"num_tokens": 680888257.0,
"step": 3440
},
{
"entropy": 0.7082947004925121,
"epoch": 0.6896269335759782,
"grad_norm": 0.6746036410331726,
"learning_rate": 1.4564056939501778e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.9582202104004947,
"num_tokens": 681859900.0,
"step": 3445
},
{
"entropy": 0.7029309023510326,
"epoch": 0.6906278434940856,
"grad_norm": 1.6336463689804077,
"learning_rate": 1.4552935943060499e-06,
"loss": 0.1518,
"mean_token_accuracy": 0.9600752207365904,
"num_tokens": 682593875.0,
"step": 3450
},
{
"epoch": 0.6906278434940856,
"eval_entropy": 0.650386592403787,
"eval_loss": 0.18218755722045898,
"eval_mean_token_accuracy": 0.9497824721649045,
"eval_num_tokens": 682593875.0,
"eval_runtime": 6.9925,
"eval_samples_per_second": 139.149,
"eval_steps_per_second": 8.724,
"step": 3450
},
{
"entropy": 0.6209653827277097,
"epoch": 0.6916287534121929,
"grad_norm": 0.9056838750839233,
"learning_rate": 1.4541814946619217e-06,
"loss": 0.1525,
"mean_token_accuracy": 0.9596792773766951,
"num_tokens": 683728552.0,
"step": 3455
},
{
"entropy": 0.6720722063021226,
"epoch": 0.6926296633303003,
"grad_norm": 0.848111093044281,
"learning_rate": 1.4530693950177934e-06,
"loss": 0.1556,
"mean_token_accuracy": 0.9583455557172949,
"num_tokens": 684820291.0,
"step": 3460
},
{
"entropy": 0.6981573473323476,
"epoch": 0.6936305732484076,
"grad_norm": 0.8181219100952148,
"learning_rate": 1.4519572953736654e-06,
"loss": 0.1619,
"mean_token_accuracy": 0.9571636861020868,
"num_tokens": 685843222.0,
"step": 3465
},
{
"entropy": 0.6986917571587996,
"epoch": 0.694631483166515,
"grad_norm": 0.6309542059898376,
"learning_rate": 1.4508451957295373e-06,
"loss": 0.1487,
"mean_token_accuracy": 0.960549614646218,
"num_tokens": 686795079.0,
"step": 3470
},
{
"entropy": 0.6992609934373335,
"epoch": 0.6956323930846224,
"grad_norm": 1.6340988874435425,
"learning_rate": 1.4497330960854094e-06,
"loss": 0.1526,
"mean_token_accuracy": 0.9600770901549947,
"num_tokens": 687534250.0,
"step": 3475
},
{
"entropy": 0.6203337536616759,
"epoch": 0.6966333030027297,
"grad_norm": 0.9461851119995117,
"learning_rate": 1.448620996441281e-06,
"loss": 0.1462,
"mean_token_accuracy": 0.9612834345210682,
"num_tokens": 688659595.0,
"step": 3480
},
{
"epoch": 0.6966333030027297,
"eval_entropy": 0.6457425873787677,
"eval_loss": 0.18017198145389557,
"eval_mean_token_accuracy": 0.9507011683260809,
"eval_num_tokens": 688659595.0,
"eval_runtime": 7.1026,
"eval_samples_per_second": 136.992,
"eval_steps_per_second": 8.588,
"step": 3480
},
{
"entropy": 0.6704084252769297,
"epoch": 0.6976342129208372,
"grad_norm": 0.8556516766548157,
"learning_rate": 1.4475088967971529e-06,
"loss": 0.1551,
"mean_token_accuracy": 0.9571811556816101,
"num_tokens": 689757561.0,
"step": 3485
},
{
"entropy": 0.6925178121436726,
"epoch": 0.6986351228389445,
"grad_norm": 0.7813107967376709,
"learning_rate": 1.446396797153025e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.958230844410983,
"num_tokens": 690812177.0,
"step": 3490
},
{
"entropy": 0.716183881055225,
"epoch": 0.6996360327570519,
"grad_norm": 0.6608054637908936,
"learning_rate": 1.4452846975088968e-06,
"loss": 0.1551,
"mean_token_accuracy": 0.9585723102092742,
"num_tokens": 691767285.0,
"step": 3495
},
{
"entropy": 0.7106182558970018,
"epoch": 0.7006369426751592,
"grad_norm": 1.6784389019012451,
"learning_rate": 1.4441725978647684e-06,
"loss": 0.1527,
"mean_token_accuracy": 0.959889015826312,
"num_tokens": 692508977.0,
"step": 3500
},
{
"entropy": 0.6119228395548734,
"epoch": 0.7016378525932666,
"grad_norm": 0.9040566086769104,
"learning_rate": 1.4430604982206405e-06,
"loss": 0.1463,
"mean_token_accuracy": 0.9611663525754756,
"num_tokens": 693677838.0,
"step": 3505
},
{
"entropy": 0.6678372830152511,
"epoch": 0.702638762511374,
"grad_norm": 0.8394715189933777,
"learning_rate": 1.4419483985765124e-06,
"loss": 0.1591,
"mean_token_accuracy": 0.9574539330872622,
"num_tokens": 694757367.0,
"step": 3510
},
{
"epoch": 0.702638762511374,
"eval_entropy": 0.6457846052333956,
"eval_loss": 0.18403349816799164,
"eval_mean_token_accuracy": 0.9498394170745474,
"eval_num_tokens": 694757367.0,
"eval_runtime": 7.199,
"eval_samples_per_second": 135.158,
"eval_steps_per_second": 8.473,
"step": 3510
},
{
"entropy": 0.6866304833780635,
"epoch": 0.7036396724294813,
"grad_norm": 0.7923183441162109,
"learning_rate": 1.4408362989323844e-06,
"loss": 0.1548,
"mean_token_accuracy": 0.9585600186478008,
"num_tokens": 695807809.0,
"step": 3515
},
{
"entropy": 0.698677041313865,
"epoch": 0.7046405823475888,
"grad_norm": 0.6395448446273804,
"learning_rate": 1.439724199288256e-06,
"loss": 0.1552,
"mean_token_accuracy": 0.9591563501141288,
"num_tokens": 696762341.0,
"step": 3520
},
{
"entropy": 0.7000049211762168,
"epoch": 0.7056414922656961,
"grad_norm": 1.704610824584961,
"learning_rate": 1.438612099644128e-06,
"loss": 0.1483,
"mean_token_accuracy": 0.9610144132917577,
"num_tokens": 697491178.0,
"step": 3525
},
{
"entropy": 0.616519127108834,
"epoch": 0.7066424021838035,
"grad_norm": 0.938034176826477,
"learning_rate": 1.4375e-06,
"loss": 0.1466,
"mean_token_accuracy": 0.961005428162488,
"num_tokens": 698636465.0,
"step": 3530
},
{
"entropy": 0.6671069808981636,
"epoch": 0.7076433121019108,
"grad_norm": 0.8367746472358704,
"learning_rate": 1.4363879003558719e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.957984118569981,
"num_tokens": 699733654.0,
"step": 3535
},
{
"entropy": 0.686641216007146,
"epoch": 0.7086442220200182,
"grad_norm": 0.7497020959854126,
"learning_rate": 1.4352758007117437e-06,
"loss": 0.1528,
"mean_token_accuracy": 0.9585920035839081,
"num_tokens": 700769270.0,
"step": 3540
},
{
"epoch": 0.7086442220200182,
"eval_entropy": 0.6463534255496791,
"eval_loss": 0.1813378483057022,
"eval_mean_token_accuracy": 0.9505206639649438,
"eval_num_tokens": 700769270.0,
"eval_runtime": 7.0738,
"eval_samples_per_second": 137.55,
"eval_steps_per_second": 8.623,
"step": 3540
},
{
"entropy": 0.6957107446410439,
"epoch": 0.7096451319381256,
"grad_norm": 0.6717329025268555,
"learning_rate": 1.4341637010676156e-06,
"loss": 0.1505,
"mean_token_accuracy": 0.960306400602514,
"num_tokens": 701718494.0,
"step": 3545
},
{
"entropy": 0.7116701342842796,
"epoch": 0.7106460418562329,
"grad_norm": 1.768558144569397,
"learning_rate": 1.4330516014234874e-06,
"loss": 0.1514,
"mean_token_accuracy": 0.9600772873921828,
"num_tokens": 702436336.0,
"step": 3550
},
{
"entropy": 0.6127610867673701,
"epoch": 0.7116469517743403,
"grad_norm": 1.0162396430969238,
"learning_rate": 1.4319395017793595e-06,
"loss": 0.1489,
"mean_token_accuracy": 0.9602883994579315,
"num_tokens": 703581833.0,
"step": 3555
},
{
"entropy": 0.6664238596504385,
"epoch": 0.7126478616924476,
"grad_norm": 1.0451632738113403,
"learning_rate": 1.4308274021352314e-06,
"loss": 0.154,
"mean_token_accuracy": 0.9591329000212929,
"num_tokens": 704674589.0,
"step": 3560
},
{
"entropy": 0.6827213899655775,
"epoch": 0.7136487716105551,
"grad_norm": 1.1186326742172241,
"learning_rate": 1.429715302491103e-06,
"loss": 0.1515,
"mean_token_accuracy": 0.9586131052537398,
"num_tokens": 705709505.0,
"step": 3565
},
{
"entropy": 0.7037599785761399,
"epoch": 0.7146496815286624,
"grad_norm": 1.0326136350631714,
"learning_rate": 1.428603202846975e-06,
"loss": 0.1504,
"mean_token_accuracy": 0.9594932919198816,
"num_tokens": 706667750.0,
"step": 3570
},
{
"epoch": 0.7146496815286624,
"eval_entropy": 0.646297568180522,
"eval_loss": 0.18333254754543304,
"eval_mean_token_accuracy": 0.9501433626550143,
"eval_num_tokens": 706667750.0,
"eval_runtime": 7.0291,
"eval_samples_per_second": 138.425,
"eval_steps_per_second": 8.678,
"step": 3570
},
{
"entropy": 0.7007351406595924,
"epoch": 0.7156505914467698,
"grad_norm": 1.76486337184906,
"learning_rate": 1.427491103202847e-06,
"loss": 0.1477,
"mean_token_accuracy": 0.9616134372624484,
"num_tokens": 707404649.0,
"step": 3575
},
{
"entropy": 0.6178434678099373,
"epoch": 0.7166515013648772,
"grad_norm": 0.9774680137634277,
"learning_rate": 1.4263790035587188e-06,
"loss": 0.1466,
"mean_token_accuracy": 0.9608237499540503,
"num_tokens": 708535434.0,
"step": 3580
},
{
"entropy": 0.6685388267040253,
"epoch": 0.7176524112829845,
"grad_norm": 0.8269554972648621,
"learning_rate": 1.4252669039145906e-06,
"loss": 0.1526,
"mean_token_accuracy": 0.9589363054795699,
"num_tokens": 709623931.0,
"step": 3585
},
{
"entropy": 0.6837363061579791,
"epoch": 0.7186533212010919,
"grad_norm": 0.759075939655304,
"learning_rate": 1.4241548042704625e-06,
"loss": 0.1502,
"mean_token_accuracy": 0.9594019618901339,
"num_tokens": 710666538.0,
"step": 3590
},
{
"entropy": 0.6967312319712206,
"epoch": 0.7196542311191992,
"grad_norm": 0.7646484971046448,
"learning_rate": 1.4230427046263344e-06,
"loss": 0.1507,
"mean_token_accuracy": 0.9600866355679252,
"num_tokens": 711616703.0,
"step": 3595
},
{
"entropy": 0.7033360708843578,
"epoch": 0.7206551410373067,
"grad_norm": 1.6652473211288452,
"learning_rate": 1.4219306049822064e-06,
"loss": 0.1527,
"mean_token_accuracy": 0.9600472737442364,
"num_tokens": 712339902.0,
"step": 3600
},
{
"epoch": 0.7206551410373067,
"eval_entropy": 0.6492331389520989,
"eval_loss": 0.18077199161052704,
"eval_mean_token_accuracy": 0.9501383842014876,
"eval_num_tokens": 712339902.0,
"eval_runtime": 7.1169,
"eval_samples_per_second": 136.717,
"eval_steps_per_second": 8.571,
"step": 3600
},
{
"entropy": 0.6149807561527599,
"epoch": 0.721656050955414,
"grad_norm": 0.928352415561676,
"learning_rate": 1.420818505338078e-06,
"loss": 0.1451,
"mean_token_accuracy": 0.9617514366453345,
"num_tokens": 713469631.0,
"step": 3605
},
{
"entropy": 0.6708079273050481,
"epoch": 0.7226569608735214,
"grad_norm": 0.8746845722198486,
"learning_rate": 1.4197064056939501e-06,
"loss": 0.1553,
"mean_token_accuracy": 0.9591595194556496,
"num_tokens": 714568149.0,
"step": 3610
},
{
"entropy": 0.6869456586512652,
"epoch": 0.7236578707916288,
"grad_norm": 0.7715699076652527,
"learning_rate": 1.418594306049822e-06,
"loss": 0.1499,
"mean_token_accuracy": 0.9593700820749457,
"num_tokens": 715610113.0,
"step": 3615
},
{
"entropy": 0.6990539791909132,
"epoch": 0.7246587807097361,
"grad_norm": 0.7243727445602417,
"learning_rate": 1.4174822064056939e-06,
"loss": 0.1513,
"mean_token_accuracy": 0.9602306030013344,
"num_tokens": 716584071.0,
"step": 3620
},
{
"entropy": 0.7046049112623388,
"epoch": 0.7256596906278435,
"grad_norm": 1.6954907178878784,
"learning_rate": 1.416370106761566e-06,
"loss": 0.1451,
"mean_token_accuracy": 0.962337300452319,
"num_tokens": 717316967.0,
"step": 3625
},
{
"entropy": 0.6160386166789314,
"epoch": 0.7266606005459508,
"grad_norm": 0.9596716165542603,
"learning_rate": 1.4152580071174376e-06,
"loss": 0.1451,
"mean_token_accuracy": 0.9615418428724463,
"num_tokens": 718445509.0,
"step": 3630
},
{
"epoch": 0.7266606005459508,
"eval_entropy": 0.6465183843354709,
"eval_loss": 0.1834045797586441,
"eval_mean_token_accuracy": 0.94984726143665,
"eval_num_tokens": 718445509.0,
"eval_runtime": 7.0491,
"eval_samples_per_second": 138.031,
"eval_steps_per_second": 8.654,
"step": 3630
},
{
"entropy": 0.6670124200257388,
"epoch": 0.7276615104640582,
"grad_norm": 0.8411712646484375,
"learning_rate": 1.4141459074733094e-06,
"loss": 0.1517,
"mean_token_accuracy": 0.9583657335151325,
"num_tokens": 719527229.0,
"step": 3635
},
{
"entropy": 0.691483823819594,
"epoch": 0.7286624203821656,
"grad_norm": 0.8118385672569275,
"learning_rate": 1.4130338078291815e-06,
"loss": 0.152,
"mean_token_accuracy": 0.9590539748018438,
"num_tokens": 720562230.0,
"step": 3640
},
{
"entropy": 0.6978759061206471,
"epoch": 0.729663330300273,
"grad_norm": 0.6536839604377747,
"learning_rate": 1.4119217081850534e-06,
"loss": 0.1491,
"mean_token_accuracy": 0.9607249758460304,
"num_tokens": 721510665.0,
"step": 3645
},
{
"entropy": 0.692423168908466,
"epoch": 0.7306642402183804,
"grad_norm": 1.5654547214508057,
"learning_rate": 1.4108096085409252e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9622391191395846,
"num_tokens": 722236574.0,
"step": 3650
},
{
"entropy": 0.6100410997867585,
"epoch": 0.7316651501364877,
"grad_norm": 0.9501330852508545,
"learning_rate": 1.409697508896797e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.9622214566577565,
"num_tokens": 723386101.0,
"step": 3655
},
{
"entropy": 0.6609509679404172,
"epoch": 0.7326660600545951,
"grad_norm": 0.8339934945106506,
"learning_rate": 1.408585409252669e-06,
"loss": 0.1517,
"mean_token_accuracy": 0.958848465572704,
"num_tokens": 724478073.0,
"step": 3660
},
{
"epoch": 0.7326660600545951,
"eval_entropy": 0.6399203392325855,
"eval_loss": 0.18328502774238586,
"eval_mean_token_accuracy": 0.9502141133683627,
"eval_num_tokens": 724478073.0,
"eval_runtime": 7.0325,
"eval_samples_per_second": 138.358,
"eval_steps_per_second": 8.674,
"step": 3660
},
{
"entropy": 0.6784357306632128,
"epoch": 0.7336669699727024,
"grad_norm": 0.7843255996704102,
"learning_rate": 1.407473309608541e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.9594681257551366,
"num_tokens": 725530725.0,
"step": 3665
},
{
"entropy": 0.6910421122204173,
"epoch": 0.7346678798908098,
"grad_norm": 0.6633345484733582,
"learning_rate": 1.4063612099644126e-06,
"loss": 0.1475,
"mean_token_accuracy": 0.9606767145070163,
"num_tokens": 726494609.0,
"step": 3670
},
{
"entropy": 0.6835452020168304,
"epoch": 0.7356687898089171,
"grad_norm": 1.633773922920227,
"learning_rate": 1.4052491103202845e-06,
"loss": 0.1456,
"mean_token_accuracy": 0.9621837160804055,
"num_tokens": 727238125.0,
"step": 3675
},
{
"entropy": 0.603841777823188,
"epoch": 0.7366696997270246,
"grad_norm": 0.9435672760009766,
"learning_rate": 1.4041370106761566e-06,
"loss": 0.1412,
"mean_token_accuracy": 0.9622807643630288,
"num_tokens": 728383176.0,
"step": 3680
},
{
"entropy": 0.6598623080687089,
"epoch": 0.737670609645132,
"grad_norm": 0.9847542643547058,
"learning_rate": 1.4030249110320284e-06,
"loss": 0.1476,
"mean_token_accuracy": 0.960059937021949,
"num_tokens": 729471217.0,
"step": 3685
},
{
"entropy": 0.6746501594781875,
"epoch": 0.7386715195632393,
"grad_norm": 0.8633275628089905,
"learning_rate": 1.4019128113879003e-06,
"loss": 0.1535,
"mean_token_accuracy": 0.9588108068162745,
"num_tokens": 730506240.0,
"step": 3690
},
{
"epoch": 0.7386715195632393,
"eval_entropy": 0.6451824957230052,
"eval_loss": 0.1821545660495758,
"eval_mean_token_accuracy": 0.9503177375089927,
"eval_num_tokens": 730506240.0,
"eval_runtime": 7.0055,
"eval_samples_per_second": 138.891,
"eval_steps_per_second": 8.707,
"step": 3690
},
{
"entropy": 0.6850697804581035,
"epoch": 0.7396724294813467,
"grad_norm": 0.7276564836502075,
"learning_rate": 1.4008007117437721e-06,
"loss": 0.1467,
"mean_token_accuracy": 0.9607731239362196,
"num_tokens": 731447363.0,
"step": 3695
},
{
"entropy": 0.6955865442752838,
"epoch": 0.740673339399454,
"grad_norm": 1.6277596950531006,
"learning_rate": 1.399688612099644e-06,
"loss": 0.1486,
"mean_token_accuracy": 0.9611599418249998,
"num_tokens": 732166557.0,
"step": 3700
},
{
"entropy": 0.605426854707978,
"epoch": 0.7416742493175614,
"grad_norm": 0.9806519746780396,
"learning_rate": 1.398576512455516e-06,
"loss": 0.1463,
"mean_token_accuracy": 0.9612675450064919,
"num_tokens": 733291659.0,
"step": 3705
},
{
"entropy": 0.6565829529003664,
"epoch": 0.7426751592356687,
"grad_norm": 0.8315209746360779,
"learning_rate": 1.397464412811388e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.9587733626365662,
"num_tokens": 734373176.0,
"step": 3710
},
{
"entropy": 0.6744102640585465,
"epoch": 0.7436760691537762,
"grad_norm": 0.7913417816162109,
"learning_rate": 1.3963523131672596e-06,
"loss": 0.1491,
"mean_token_accuracy": 0.9598020212216811,
"num_tokens": 735417403.0,
"step": 3715
},
{
"entropy": 0.6832040979103609,
"epoch": 0.7446769790718836,
"grad_norm": 0.6335570216178894,
"learning_rate": 1.3952402135231316e-06,
"loss": 0.1478,
"mean_token_accuracy": 0.9613436964425174,
"num_tokens": 736379877.0,
"step": 3720
},
{
"epoch": 0.7446769790718836,
"eval_entropy": 0.6417029988570292,
"eval_loss": 0.18389441072940826,
"eval_mean_token_accuracy": 0.9506415773610599,
"eval_num_tokens": 736379877.0,
"eval_runtime": 6.9931,
"eval_samples_per_second": 139.137,
"eval_steps_per_second": 8.723,
"step": 3720
},
{
"entropy": 0.6932811298153617,
"epoch": 0.7456778889899909,
"grad_norm": 1.6904598474502563,
"learning_rate": 1.3941281138790035e-06,
"loss": 0.1446,
"mean_token_accuracy": 0.9619651420549913,
"num_tokens": 737125840.0,
"step": 3725
},
{
"entropy": 0.6003106886690314,
"epoch": 0.7466787989080983,
"grad_norm": 0.9378305077552795,
"learning_rate": 1.3930160142348756e-06,
"loss": 0.14,
"mean_token_accuracy": 0.9627970738844438,
"num_tokens": 738254662.0,
"step": 3730
},
{
"entropy": 0.6617660430344668,
"epoch": 0.7476797088262056,
"grad_norm": 0.8425555229187012,
"learning_rate": 1.3919039145907472e-06,
"loss": 0.1504,
"mean_token_accuracy": 0.9594602817838842,
"num_tokens": 739351062.0,
"step": 3735
},
{
"entropy": 0.6762841075658799,
"epoch": 0.748680618744313,
"grad_norm": 0.7894054055213928,
"learning_rate": 1.390791814946619e-06,
"loss": 0.1473,
"mean_token_accuracy": 0.9604934166778217,
"num_tokens": 740390420.0,
"step": 3740
},
{
"entropy": 0.6912749702280218,
"epoch": 0.7496815286624203,
"grad_norm": 0.674893856048584,
"learning_rate": 1.3896797153024911e-06,
"loss": 0.1498,
"mean_token_accuracy": 0.9602149833332408,
"num_tokens": 741347590.0,
"step": 3745
},
{
"entropy": 0.6899059181863612,
"epoch": 0.7506824385805277,
"grad_norm": 1.6593986749649048,
"learning_rate": 1.388567615658363e-06,
"loss": 0.1446,
"mean_token_accuracy": 0.9622206286950545,
"num_tokens": 742079535.0,
"step": 3750
},
{
"epoch": 0.7506824385805277,
"eval_entropy": 0.6426626101869052,
"eval_loss": 0.18340618908405304,
"eval_mean_token_accuracy": 0.950067000310929,
"eval_num_tokens": 742079535.0,
"eval_runtime": 7.0509,
"eval_samples_per_second": 137.997,
"eval_steps_per_second": 8.651,
"step": 3750
},
{
"entropy": 0.6002561170946468,
"epoch": 0.7516833484986352,
"grad_norm": 0.9747891426086426,
"learning_rate": 1.3874555160142347e-06,
"loss": 0.1398,
"mean_token_accuracy": 0.9628812367265874,
"num_tokens": 743216959.0,
"step": 3755
},
{
"entropy": 0.6451197315346111,
"epoch": 0.7526842584167425,
"grad_norm": 0.8404658436775208,
"learning_rate": 1.3863434163701067e-06,
"loss": 0.1482,
"mean_token_accuracy": 0.9605841652913527,
"num_tokens": 744296247.0,
"step": 3760
},
{
"entropy": 0.6663664116100831,
"epoch": 0.7536851683348499,
"grad_norm": 0.8358835577964783,
"learning_rate": 1.3852313167259786e-06,
"loss": 0.1478,
"mean_token_accuracy": 0.9607979530637915,
"num_tokens": 745340381.0,
"step": 3765
},
{
"entropy": 0.6912969998338006,
"epoch": 0.7546860782529572,
"grad_norm": 0.6832510828971863,
"learning_rate": 1.3841192170818504e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.9614791436628862,
"num_tokens": 746282199.0,
"step": 3770
},
{
"entropy": 0.6806035925041546,
"epoch": 0.7556869881710646,
"grad_norm": 1.5931488275527954,
"learning_rate": 1.3830071174377223e-06,
"loss": 0.1384,
"mean_token_accuracy": 0.9636109731414101,
"num_tokens": 747012358.0,
"step": 3775
},
{
"entropy": 0.599098454280333,
"epoch": 0.756687898089172,
"grad_norm": 0.974331796169281,
"learning_rate": 1.3818950177935942e-06,
"loss": 0.1401,
"mean_token_accuracy": 0.962726751240817,
"num_tokens": 748131245.0,
"step": 3780
},
{
"epoch": 0.756687898089172,
"eval_entropy": 0.6385055829267032,
"eval_loss": 0.18179599940776825,
"eval_mean_token_accuracy": 0.9507253433837265,
"eval_num_tokens": 748131245.0,
"eval_runtime": 7.076,
"eval_samples_per_second": 137.507,
"eval_steps_per_second": 8.621,
"step": 3780
},
{
"entropy": 0.6537848071618514,
"epoch": 0.7576888080072793,
"grad_norm": 0.834989607334137,
"learning_rate": 1.3807829181494662e-06,
"loss": 0.1487,
"mean_token_accuracy": 0.9598635267127644,
"num_tokens": 749231997.0,
"step": 3785
},
{
"entropy": 0.677148444273255,
"epoch": 0.7586897179253868,
"grad_norm": 0.8472671508789062,
"learning_rate": 1.379670818505338e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.959491520578211,
"num_tokens": 750255932.0,
"step": 3790
},
{
"entropy": 0.6869501252066005,
"epoch": 0.7596906278434941,
"grad_norm": 0.8762997984886169,
"learning_rate": 1.37855871886121e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.9598960773511367,
"num_tokens": 751208745.0,
"step": 3795
},
{
"entropy": 0.6860447016629305,
"epoch": 0.7606915377616015,
"grad_norm": 1.5384591817855835,
"learning_rate": 1.3774466192170818e-06,
"loss": 0.1446,
"mean_token_accuracy": 0.962565876678987,
"num_tokens": 751930047.0,
"step": 3800
},
{
"entropy": 0.6060644594105807,
"epoch": 0.7616924476797088,
"grad_norm": 1.1648614406585693,
"learning_rate": 1.3763345195729537e-06,
"loss": 0.1388,
"mean_token_accuracy": 0.9629409508271651,
"num_tokens": 753058832.0,
"step": 3805
},
{
"entropy": 0.6572167962789536,
"epoch": 0.7626933575978162,
"grad_norm": 0.8445199131965637,
"learning_rate": 1.3752224199288255e-06,
"loss": 0.1498,
"mean_token_accuracy": 0.9599610285325484,
"num_tokens": 754149822.0,
"step": 3810
},
{
"epoch": 0.7626933575978162,
"eval_entropy": 0.6403389221332112,
"eval_loss": 0.18195439875125885,
"eval_mean_token_accuracy": 0.9507655999699577,
"eval_num_tokens": 754149822.0,
"eval_runtime": 7.0319,
"eval_samples_per_second": 138.37,
"eval_steps_per_second": 8.675,
"step": 3810
},
{
"entropy": 0.6741142516786401,
"epoch": 0.7636942675159236,
"grad_norm": 0.7991525530815125,
"learning_rate": 1.3741103202846976e-06,
"loss": 0.1478,
"mean_token_accuracy": 0.9602746784687042,
"num_tokens": 755198711.0,
"step": 3815
},
{
"entropy": 0.6725743767890063,
"epoch": 0.7646951774340309,
"grad_norm": 0.7079398036003113,
"learning_rate": 1.3729982206405692e-06,
"loss": 0.1394,
"mean_token_accuracy": 0.9624946919354526,
"num_tokens": 756163341.0,
"step": 3820
},
{
"entropy": 0.6698882056908174,
"epoch": 0.7656960873521383,
"grad_norm": 1.74517822265625,
"learning_rate": 1.3718861209964413e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9628500087694688,
"num_tokens": 756905491.0,
"step": 3825
},
{
"entropy": 0.5904871030287309,
"epoch": 0.7666969972702457,
"grad_norm": 0.9414699077606201,
"learning_rate": 1.3707740213523132e-06,
"loss": 0.141,
"mean_token_accuracy": 0.9624408916993574,
"num_tokens": 758060283.0,
"step": 3830
},
{
"entropy": 0.6385924938050184,
"epoch": 0.7676979071883531,
"grad_norm": 0.8462045788764954,
"learning_rate": 1.369661921708185e-06,
"loss": 0.1453,
"mean_token_accuracy": 0.9610617979006334,
"num_tokens": 759147397.0,
"step": 3835
},
{
"entropy": 0.6599980823018334,
"epoch": 0.7686988171064604,
"grad_norm": 0.7839226126670837,
"learning_rate": 1.3685498220640569e-06,
"loss": 0.1471,
"mean_token_accuracy": 0.9601600359786641,
"num_tokens": 760195635.0,
"step": 3840
},
{
"epoch": 0.7686988171064604,
"eval_entropy": 0.6299580189048267,
"eval_loss": 0.18395261466503143,
"eval_mean_token_accuracy": 0.9503186227845364,
"eval_num_tokens": 760195635.0,
"eval_runtime": 7.0623,
"eval_samples_per_second": 137.773,
"eval_steps_per_second": 8.637,
"step": 3840
},
{
"entropy": 0.6697973516854373,
"epoch": 0.7696997270245678,
"grad_norm": 0.7063069939613342,
"learning_rate": 1.3674377224199287e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.962298633293672,
"num_tokens": 761156776.0,
"step": 3845
},
{
"entropy": 0.6719630772417242,
"epoch": 0.7707006369426752,
"grad_norm": 1.6459494829177856,
"learning_rate": 1.3663256227758006e-06,
"loss": 0.1417,
"mean_token_accuracy": 0.9631251654841683,
"num_tokens": 761884891.0,
"step": 3850
},
{
"entropy": 0.5801961367780512,
"epoch": 0.7717015468607825,
"grad_norm": 0.9611675143241882,
"learning_rate": 1.3652135231316726e-06,
"loss": 0.1371,
"mean_token_accuracy": 0.963437082008882,
"num_tokens": 763026278.0,
"step": 3855
},
{
"entropy": 0.6409909424456683,
"epoch": 0.7727024567788899,
"grad_norm": 0.904933750629425,
"learning_rate": 1.3641014234875443e-06,
"loss": 0.145,
"mean_token_accuracy": 0.9613065215674313,
"num_tokens": 764133161.0,
"step": 3860
},
{
"entropy": 0.6667505315758965,
"epoch": 0.7737033666969972,
"grad_norm": 0.8145974278450012,
"learning_rate": 1.3629893238434162e-06,
"loss": 0.1426,
"mean_token_accuracy": 0.9608027013865384,
"num_tokens": 765182864.0,
"step": 3865
},
{
"entropy": 0.6728833125396209,
"epoch": 0.7747042766151047,
"grad_norm": 0.6522021889686584,
"learning_rate": 1.3618772241992882e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.96253671429374,
"num_tokens": 766151392.0,
"step": 3870
},
{
"epoch": 0.7747042766151047,
"eval_entropy": 0.6360755171932158,
"eval_loss": 0.18470442295074463,
"eval_mean_token_accuracy": 0.9502559939368826,
"eval_num_tokens": 766151392.0,
"eval_runtime": 7.0452,
"eval_samples_per_second": 138.108,
"eval_steps_per_second": 8.658,
"step": 3870
},
{
"entropy": 0.6780615603381938,
"epoch": 0.775705186533212,
"grad_norm": 1.7449678182601929,
"learning_rate": 1.36076512455516e-06,
"loss": 0.1426,
"mean_token_accuracy": 0.9625177778980949,
"num_tokens": 766878686.0,
"step": 3875
},
{
"entropy": 0.5946085046638142,
"epoch": 0.7767060964513194,
"grad_norm": 0.929898202419281,
"learning_rate": 1.3596530249110321e-06,
"loss": 0.142,
"mean_token_accuracy": 0.9622558994726701,
"num_tokens": 768017789.0,
"step": 3880
},
{
"entropy": 0.6389347041195089,
"epoch": 0.7777070063694268,
"grad_norm": 0.8491219878196716,
"learning_rate": 1.3585409252669038e-06,
"loss": 0.1447,
"mean_token_accuracy": 0.9610752945596521,
"num_tokens": 769116436.0,
"step": 3885
},
{
"entropy": 0.652755316008221,
"epoch": 0.7787079162875341,
"grad_norm": 0.7734766006469727,
"learning_rate": 1.3574288256227757e-06,
"loss": 0.1448,
"mean_token_accuracy": 0.9613142360340465,
"num_tokens": 770163077.0,
"step": 3890
},
{
"entropy": 0.6724637372927232,
"epoch": 0.7797088262056415,
"grad_norm": 0.7407189607620239,
"learning_rate": 1.3563167259786477e-06,
"loss": 0.1449,
"mean_token_accuracy": 0.9617013438181443,
"num_tokens": 771128050.0,
"step": 3895
},
{
"entropy": 0.6701836732300845,
"epoch": 0.7807097361237488,
"grad_norm": 1.544957160949707,
"learning_rate": 1.3552046263345196e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9635271229527214,
"num_tokens": 771862832.0,
"step": 3900
},
{
"epoch": 0.7807097361237488,
"eval_entropy": 0.6336434786436987,
"eval_loss": 0.18497776985168457,
"eval_mean_token_accuracy": 0.9496565441616246,
"eval_num_tokens": 771862832.0,
"eval_runtime": 7.2324,
"eval_samples_per_second": 134.533,
"eval_steps_per_second": 8.434,
"step": 3900
},
{
"entropy": 0.6014582438902422,
"epoch": 0.7817106460418562,
"grad_norm": 0.9415779709815979,
"learning_rate": 1.3540925266903912e-06,
"loss": 0.1389,
"mean_token_accuracy": 0.9630598826841874,
"num_tokens": 772987977.0,
"step": 3905
},
{
"entropy": 0.6386843873695893,
"epoch": 0.7827115559599636,
"grad_norm": 0.8790847063064575,
"learning_rate": 1.3529804270462633e-06,
"loss": 0.1427,
"mean_token_accuracy": 0.9621189491315322,
"num_tokens": 774081270.0,
"step": 3910
},
{
"entropy": 0.6616236562078649,
"epoch": 0.783712465878071,
"grad_norm": 0.8104801177978516,
"learning_rate": 1.3518683274021352e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.9597180469469591,
"num_tokens": 775149795.0,
"step": 3915
},
{
"entropy": 0.6556011256846515,
"epoch": 0.7847133757961784,
"grad_norm": 0.672337532043457,
"learning_rate": 1.3507562277580072e-06,
"loss": 0.1356,
"mean_token_accuracy": 0.9633873435583982,
"num_tokens": 776126187.0,
"step": 3920
},
{
"entropy": 0.6662999545986003,
"epoch": 0.7857142857142857,
"grad_norm": 1.6722906827926636,
"learning_rate": 1.3496441281138789e-06,
"loss": 0.1377,
"mean_token_accuracy": 0.9638949724760922,
"num_tokens": 776852493.0,
"step": 3925
},
{
"entropy": 0.5871605239131233,
"epoch": 0.7867151956323931,
"grad_norm": 0.9676291942596436,
"learning_rate": 1.3485320284697507e-06,
"loss": 0.1399,
"mean_token_accuracy": 0.9627551940354434,
"num_tokens": 777990863.0,
"step": 3930
},
{
"epoch": 0.7867151956323931,
"eval_entropy": 0.6308732477367901,
"eval_loss": 0.18097124993801117,
"eval_mean_token_accuracy": 0.9505264221644792,
"eval_num_tokens": 777990863.0,
"eval_runtime": 7.0433,
"eval_samples_per_second": 138.145,
"eval_steps_per_second": 8.661,
"step": 3930
},
{
"entropy": 0.6423692865805193,
"epoch": 0.7877161055505004,
"grad_norm": 0.8802201747894287,
"learning_rate": 1.3474199288256228e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.961167681759054,
"num_tokens": 779081702.0,
"step": 3935
},
{
"entropy": 0.6667531590570103,
"epoch": 0.7887170154686078,
"grad_norm": 0.7931551933288574,
"learning_rate": 1.3463078291814947e-06,
"loss": 0.14,
"mean_token_accuracy": 0.9613506761464206,
"num_tokens": 780111893.0,
"step": 3940
},
{
"entropy": 0.6715754471041939,
"epoch": 0.7897179253867151,
"grad_norm": 0.7701306939125061,
"learning_rate": 1.3451957295373665e-06,
"loss": 0.1411,
"mean_token_accuracy": 0.9625847984444011,
"num_tokens": 781073403.0,
"step": 3945
},
{
"entropy": 0.6649499947374518,
"epoch": 0.7907188353048226,
"grad_norm": 1.5195958614349365,
"learning_rate": 1.3440836298932384e-06,
"loss": 0.1359,
"mean_token_accuracy": 0.9639710442586379,
"num_tokens": 781810124.0,
"step": 3950
},
{
"entropy": 0.583565341071649,
"epoch": 0.79171974522293,
"grad_norm": 0.8992461562156677,
"learning_rate": 1.3429715302491102e-06,
"loss": 0.1341,
"mean_token_accuracy": 0.9644046192819422,
"num_tokens": 782971983.0,
"step": 3955
},
{
"entropy": 0.640065108646046,
"epoch": 0.7927206551410373,
"grad_norm": 0.8525144457817078,
"learning_rate": 1.3418594306049823e-06,
"loss": 0.14,
"mean_token_accuracy": 0.9617968380451203,
"num_tokens": 784058621.0,
"step": 3960
},
{
"epoch": 0.7927206551410373,
"eval_entropy": 0.635074506529042,
"eval_loss": 0.18335242569446564,
"eval_mean_token_accuracy": 0.9508119587038384,
"eval_num_tokens": 784058621.0,
"eval_runtime": 7.1026,
"eval_samples_per_second": 136.992,
"eval_steps_per_second": 8.588,
"step": 3960
},
{
"entropy": 0.6650575345212763,
"epoch": 0.7937215650591447,
"grad_norm": 0.8279107213020325,
"learning_rate": 1.3407473309608542e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.9613479056141593,
"num_tokens": 785094205.0,
"step": 3965
},
{
"entropy": 0.6705412731929259,
"epoch": 0.794722474977252,
"grad_norm": 0.6302039623260498,
"learning_rate": 1.3396352313167258e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9632759739052166,
"num_tokens": 786054123.0,
"step": 3970
},
{
"entropy": 0.672145242582668,
"epoch": 0.7957233848953594,
"grad_norm": 1.680174708366394,
"learning_rate": 1.3385231316725979e-06,
"loss": 0.1349,
"mean_token_accuracy": 0.964633910222487,
"num_tokens": 786786537.0,
"step": 3975
},
{
"entropy": 0.5964198713952845,
"epoch": 0.7967242948134667,
"grad_norm": 0.9663762450218201,
"learning_rate": 1.3374110320284697e-06,
"loss": 0.1359,
"mean_token_accuracy": 0.963967761668292,
"num_tokens": 787899948.0,
"step": 3980
},
{
"entropy": 0.6433003907853907,
"epoch": 0.7977252047315742,
"grad_norm": 0.8532348275184631,
"learning_rate": 1.3362989323843416e-06,
"loss": 0.1421,
"mean_token_accuracy": 0.961438084190542,
"num_tokens": 788992170.0,
"step": 3985
},
{
"entropy": 0.6622095113450831,
"epoch": 0.7987261146496816,
"grad_norm": 0.762026309967041,
"learning_rate": 1.3351868327402134e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9612802380865271,
"num_tokens": 790043605.0,
"step": 3990
},
{
"epoch": 0.7987261146496816,
"eval_entropy": 0.6395599700388361,
"eval_loss": 0.18441607058048248,
"eval_mean_token_accuracy": 0.9507672083182414,
"eval_num_tokens": 790043605.0,
"eval_runtime": 7.0864,
"eval_samples_per_second": 137.305,
"eval_steps_per_second": 8.608,
"step": 3990
},
{
"entropy": 0.6702088984576139,
"epoch": 0.7997270245677889,
"grad_norm": 0.6676604151725769,
"learning_rate": 1.3340747330960853e-06,
"loss": 0.1413,
"mean_token_accuracy": 0.9625859986652028,
"num_tokens": 791005452.0,
"step": 3995
},
{
"entropy": 0.672667917880145,
"epoch": 0.8007279344858963,
"grad_norm": 1.7214157581329346,
"learning_rate": 1.3329626334519572e-06,
"loss": 0.1364,
"mean_token_accuracy": 0.9645379922606728,
"num_tokens": 791732168.0,
"step": 4000
},
{
"entropy": 0.5873585137453946,
"epoch": 0.8017288444040036,
"grad_norm": 0.9482730031013489,
"learning_rate": 1.3318505338078292e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9647230906919999,
"num_tokens": 792872336.0,
"step": 4005
},
{
"entropy": 0.6405410547148097,
"epoch": 0.802729754322111,
"grad_norm": 0.8936473727226257,
"learning_rate": 1.3307384341637009e-06,
"loss": 0.14,
"mean_token_accuracy": 0.9622243496504697,
"num_tokens": 793939516.0,
"step": 4010
},
{
"entropy": 0.6661894949999723,
"epoch": 0.8037306642402183,
"grad_norm": 0.7600812315940857,
"learning_rate": 1.329626334519573e-06,
"loss": 0.1397,
"mean_token_accuracy": 0.961715197021311,
"num_tokens": 794994800.0,
"step": 4015
},
{
"entropy": 0.6729139336130836,
"epoch": 0.8047315741583257,
"grad_norm": 0.6852670907974243,
"learning_rate": 1.3285142348754448e-06,
"loss": 0.1411,
"mean_token_accuracy": 0.9618587027896535,
"num_tokens": 795942393.0,
"step": 4020
},
{
"epoch": 0.8047315741583257,
"eval_entropy": 0.6385497704881137,
"eval_loss": 0.18298886716365814,
"eval_mean_token_accuracy": 0.95083493092021,
"eval_num_tokens": 795942393.0,
"eval_runtime": 7.22,
"eval_samples_per_second": 134.764,
"eval_steps_per_second": 8.449,
"step": 4020
},
{
"entropy": 0.6744548082351685,
"epoch": 0.8057324840764332,
"grad_norm": 1.6530650854110718,
"learning_rate": 1.3274021352313167e-06,
"loss": 0.1324,
"mean_token_accuracy": 0.965755269202319,
"num_tokens": 796668510.0,
"step": 4025
},
{
"entropy": 0.5910497521812266,
"epoch": 0.8067333939945405,
"grad_norm": 0.9345577359199524,
"learning_rate": 1.3262900355871887e-06,
"loss": 0.1355,
"mean_token_accuracy": 0.9639625316316431,
"num_tokens": 797807447.0,
"step": 4030
},
{
"entropy": 0.6318013099106875,
"epoch": 0.8077343039126479,
"grad_norm": 0.8773898482322693,
"learning_rate": 1.3251779359430604e-06,
"loss": 0.1404,
"mean_token_accuracy": 0.9624045025218617,
"num_tokens": 798900021.0,
"step": 4035
},
{
"entropy": 0.6586670068177309,
"epoch": 0.8087352138307552,
"grad_norm": 0.7887452244758606,
"learning_rate": 1.3240658362989322e-06,
"loss": 0.1415,
"mean_token_accuracy": 0.9622185707092286,
"num_tokens": 799936750.0,
"step": 4040
},
{
"entropy": 0.6714841157197953,
"epoch": 0.8097361237488626,
"grad_norm": 0.6408624649047852,
"learning_rate": 1.3229537366548043e-06,
"loss": 0.1395,
"mean_token_accuracy": 0.9631155973131006,
"num_tokens": 800890339.0,
"step": 4045
},
{
"entropy": 0.671166700666601,
"epoch": 0.8107370336669699,
"grad_norm": 1.5876646041870117,
"learning_rate": 1.3218416370106762e-06,
"loss": 0.1345,
"mean_token_accuracy": 0.9646015541120009,
"num_tokens": 801621520.0,
"step": 4050
},
{
"epoch": 0.8107370336669699,
"eval_entropy": 0.6287256723544636,
"eval_loss": 0.18470019102096558,
"eval_mean_token_accuracy": 0.950807981803769,
"eval_num_tokens": 801621520.0,
"eval_runtime": 7.0146,
"eval_samples_per_second": 138.711,
"eval_steps_per_second": 8.696,
"step": 4050
},
{
"entropy": 0.5836854978041215,
"epoch": 0.8117379435850773,
"grad_norm": 0.9563764929771423,
"learning_rate": 1.320729537366548e-06,
"loss": 0.137,
"mean_token_accuracy": 0.9636379382827065,
"num_tokens": 802760229.0,
"step": 4055
},
{
"entropy": 0.6356291532516479,
"epoch": 0.8127388535031848,
"grad_norm": 0.8574218153953552,
"learning_rate": 1.3196174377224199e-06,
"loss": 0.1469,
"mean_token_accuracy": 0.9603158907456831,
"num_tokens": 803856177.0,
"step": 4060
},
{
"entropy": 0.6495377407832579,
"epoch": 0.8137397634212921,
"grad_norm": 0.8074198365211487,
"learning_rate": 1.3185053380782917e-06,
"loss": 0.1386,
"mean_token_accuracy": 0.9626891878518191,
"num_tokens": 804911386.0,
"step": 4065
},
{
"entropy": 0.6640802432190288,
"epoch": 0.8147406733393995,
"grad_norm": 0.7216334342956543,
"learning_rate": 1.3173932384341638e-06,
"loss": 0.1376,
"mean_token_accuracy": 0.9628864521330053,
"num_tokens": 805866645.0,
"step": 4070
},
{
"entropy": 0.6685602345249869,
"epoch": 0.8157415832575068,
"grad_norm": 1.7459522485733032,
"learning_rate": 1.3162811387900354e-06,
"loss": 0.1392,
"mean_token_accuracy": 0.9641159864989194,
"num_tokens": 806595104.0,
"step": 4075
},
{
"entropy": 0.5927186499942433,
"epoch": 0.8167424931756142,
"grad_norm": 0.9222381114959717,
"learning_rate": 1.3151690391459073e-06,
"loss": 0.1358,
"mean_token_accuracy": 0.9640862822532654,
"num_tokens": 807727469.0,
"step": 4080
},
{
"epoch": 0.8167424931756142,
"eval_entropy": 0.6358805328118996,
"eval_loss": 0.18385587632656097,
"eval_mean_token_accuracy": 0.9509023574532055,
"eval_num_tokens": 807727469.0,
"eval_runtime": 7.0308,
"eval_samples_per_second": 138.392,
"eval_steps_per_second": 8.676,
"step": 4080
},
{
"entropy": 0.6393450959162279,
"epoch": 0.8177434030937215,
"grad_norm": 0.84666907787323,
"learning_rate": 1.3140569395017794e-06,
"loss": 0.1404,
"mean_token_accuracy": 0.961673707853664,
"num_tokens": 808810685.0,
"step": 4085
},
{
"entropy": 0.6551467876542698,
"epoch": 0.8187443130118289,
"grad_norm": 0.8185970783233643,
"learning_rate": 1.3129448398576512e-06,
"loss": 0.1404,
"mean_token_accuracy": 0.9627237601713701,
"num_tokens": 809853102.0,
"step": 4090
},
{
"entropy": 0.6675399541854858,
"epoch": 0.8197452229299363,
"grad_norm": 0.6775302886962891,
"learning_rate": 1.3118327402135229e-06,
"loss": 0.1372,
"mean_token_accuracy": 0.9637189258228649,
"num_tokens": 810804376.0,
"step": 4095
},
{
"entropy": 0.668658479235389,
"epoch": 0.8207461328480437,
"grad_norm": 1.633571743965149,
"learning_rate": 1.310720640569395e-06,
"loss": 0.1355,
"mean_token_accuracy": 0.9643102878873998,
"num_tokens": 811535129.0,
"step": 4100
},
{
"entropy": 0.5880933046340943,
"epoch": 0.8217470427661511,
"grad_norm": 0.9041305780410767,
"learning_rate": 1.3096085409252668e-06,
"loss": 0.1334,
"mean_token_accuracy": 0.9647295447913083,
"num_tokens": 812666591.0,
"step": 4105
},
{
"entropy": 0.6378572691570629,
"epoch": 0.8227479526842584,
"grad_norm": 0.8185557723045349,
"learning_rate": 1.3084964412811389e-06,
"loss": 0.1345,
"mean_token_accuracy": 0.963734941590916,
"num_tokens": 813752886.0,
"step": 4110
},
{
"epoch": 0.8227479526842584,
"eval_entropy": 0.6318896456820066,
"eval_loss": 0.18597018718719482,
"eval_mean_token_accuracy": 0.950390275384559,
"eval_num_tokens": 813752886.0,
"eval_runtime": 7.083,
"eval_samples_per_second": 137.37,
"eval_steps_per_second": 8.612,
"step": 4110
},
{
"entropy": 0.6565879512916911,
"epoch": 0.8237488626023658,
"grad_norm": 0.8251848816871643,
"learning_rate": 1.3073843416370107e-06,
"loss": 0.1373,
"mean_token_accuracy": 0.9621736976233396,
"num_tokens": 814780578.0,
"step": 4115
},
{
"entropy": 0.6591654620387337,
"epoch": 0.8247497725204732,
"grad_norm": 0.680801510810852,
"learning_rate": 1.3062722419928824e-06,
"loss": 0.134,
"mean_token_accuracy": 0.9642984504049474,
"num_tokens": 815727309.0,
"step": 4120
},
{
"entropy": 0.6689842855388468,
"epoch": 0.8257506824385805,
"grad_norm": 1.6310633420944214,
"learning_rate": 1.3051601423487544e-06,
"loss": 0.132,
"mean_token_accuracy": 0.9654631246219981,
"num_tokens": 816454050.0,
"step": 4125
},
{
"entropy": 0.5842076683586294,
"epoch": 0.8267515923566879,
"grad_norm": 1.0107406377792358,
"learning_rate": 1.3040480427046263e-06,
"loss": 0.1314,
"mean_token_accuracy": 0.965171016888185,
"num_tokens": 817616130.0,
"step": 4130
},
{
"entropy": 0.6371192531152206,
"epoch": 0.8277525022747952,
"grad_norm": 0.8968419432640076,
"learning_rate": 1.3029359430604982e-06,
"loss": 0.1369,
"mean_token_accuracy": 0.9628690156069669,
"num_tokens": 818690928.0,
"step": 4135
},
{
"entropy": 0.6599432755600322,
"epoch": 0.8287534121929027,
"grad_norm": 0.9083675146102905,
"learning_rate": 1.30182384341637e-06,
"loss": 0.1427,
"mean_token_accuracy": 0.9619063783775677,
"num_tokens": 819719290.0,
"step": 4140
},
{
"epoch": 0.8287534121929027,
"eval_entropy": 0.627330626620621,
"eval_loss": 0.18673530220985413,
"eval_mean_token_accuracy": 0.9505173554185962,
"eval_num_tokens": 819719290.0,
"eval_runtime": 7.0694,
"eval_samples_per_second": 137.636,
"eval_steps_per_second": 8.629,
"step": 4140
},
{
"entropy": 0.6650434326041829,
"epoch": 0.82975432211101,
"grad_norm": 0.7410187721252441,
"learning_rate": 1.3007117437722419e-06,
"loss": 0.1343,
"mean_token_accuracy": 0.9638854568654841,
"num_tokens": 820658717.0,
"step": 4145
},
{
"entropy": 0.6653602497144179,
"epoch": 0.8307552320291174,
"grad_norm": 1.657799243927002,
"learning_rate": 1.299599644128114e-06,
"loss": 0.1314,
"mean_token_accuracy": 0.9655446870760485,
"num_tokens": 821387042.0,
"step": 4150
},
{
"entropy": 0.5887753250924024,
"epoch": 0.8317561419472248,
"grad_norm": 0.9166984558105469,
"learning_rate": 1.2984875444839858e-06,
"loss": 0.1305,
"mean_token_accuracy": 0.9651481601324948,
"num_tokens": 822524995.0,
"step": 4155
},
{
"entropy": 0.6301268073645505,
"epoch": 0.8327570518653321,
"grad_norm": 0.8409593105316162,
"learning_rate": 1.2973754448398574e-06,
"loss": 0.1316,
"mean_token_accuracy": 0.9642227351665497,
"num_tokens": 823595330.0,
"step": 4160
},
{
"entropy": 0.6482060646468942,
"epoch": 0.8337579617834395,
"grad_norm": 0.828996479511261,
"learning_rate": 1.2962633451957295e-06,
"loss": 0.141,
"mean_token_accuracy": 0.9620839937166734,
"num_tokens": 824625650.0,
"step": 4165
},
{
"entropy": 0.6599689732898365,
"epoch": 0.8347588717015468,
"grad_norm": 0.6699210405349731,
"learning_rate": 1.2951512455516014e-06,
"loss": 0.1399,
"mean_token_accuracy": 0.9629984969442541,
"num_tokens": 825583528.0,
"step": 4170
},
{
"epoch": 0.8347588717015468,
"eval_entropy": 0.629080096229178,
"eval_loss": 0.18436747789382935,
"eval_mean_token_accuracy": 0.9509243994462685,
"eval_num_tokens": 825583528.0,
"eval_runtime": 7.0802,
"eval_samples_per_second": 137.426,
"eval_steps_per_second": 8.616,
"step": 4170
}
],
"logging_steps": 5,
"max_steps": 9992,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 30,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3661070247602422e+19,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}