| { |
| "best_global_step": 4170, |
| "best_metric": 0.9509243994462685, |
| "best_model_checkpoint": "/workspaces/decompile_search/data/models/jan_experiments/stripped_unstripped_22b_unstripped_stop/checkpoint-4170", |
| "epoch": 0.8347588717015468, |
| "eval_steps": 30, |
| "global_step": 4170, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.9579070736061442, |
| "epoch": 0.0010009099181073704, |
| "grad_norm": 31.209657669067383, |
| "learning_rate": 8e-09, |
| "loss": 1.2257, |
| "mean_token_accuracy": 0.7700222129171544, |
| "num_tokens": 1145748.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.011449571089311, |
| "epoch": 0.002001819836214741, |
| "grad_norm": 23.227827072143555, |
| "learning_rate": 1.8e-08, |
| "loss": 1.2785, |
| "mean_token_accuracy": 0.7474062916907397, |
| "num_tokens": 2249793.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.05531109679829, |
| "epoch": 0.003002729754322111, |
| "grad_norm": 17.10189437866211, |
| "learning_rate": 2.8e-08, |
| "loss": 1.3173, |
| "mean_token_accuracy": 0.732902865247293, |
| "num_tokens": 3277722.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.080242946473035, |
| "epoch": 0.004003639672429482, |
| "grad_norm": 10.145275115966797, |
| "learning_rate": 3.7999999999999996e-08, |
| "loss": 1.3569, |
| "mean_token_accuracy": 0.7288111207160083, |
| "num_tokens": 4232543.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.1169310082088817, |
| "epoch": 0.005004549590536852, |
| "grad_norm": 41.83009338378906, |
| "learning_rate": 4.8e-08, |
| "loss": 1.548, |
| "mean_token_accuracy": 0.7169481502337889, |
| "num_tokens": 4962149.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.9460688650608062, |
| "epoch": 0.006005459508644222, |
| "grad_norm": 29.797571182250977, |
| "learning_rate": 5.8e-08, |
| "loss": 1.1969, |
| "mean_token_accuracy": 0.7767708816311576, |
| "num_tokens": 6078015.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.006005459508644222, |
| "eval_entropy": 0.8858045388440616, |
| "eval_loss": 1.0837815999984741, |
| "eval_mean_token_accuracy": 0.7985450283425753, |
| "eval_num_tokens": 6078015.0, |
| "eval_runtime": 7.1989, |
| "eval_samples_per_second": 135.159, |
| "eval_steps_per_second": 8.473, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.004025985436006, |
| "epoch": 0.0070063694267515925, |
| "grad_norm": 20.322914123535156, |
| "learning_rate": 6.8e-08, |
| "loss": 1.2249, |
| "mean_token_accuracy": 0.757005724310875, |
| "num_tokens": 7170484.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 1.0326186922463503, |
| "epoch": 0.008007279344858963, |
| "grad_norm": 14.3685302734375, |
| "learning_rate": 7.8e-08, |
| "loss": 1.2034, |
| "mean_token_accuracy": 0.7555106471885334, |
| "num_tokens": 8215978.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.050350191376426, |
| "epoch": 0.009008189262966333, |
| "grad_norm": 8.891011238098145, |
| "learning_rate": 8.8e-08, |
| "loss": 1.2058, |
| "mean_token_accuracy": 0.7581022883003409, |
| "num_tokens": 9176310.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 1.0481942902911794, |
| "epoch": 0.010009099181073703, |
| "grad_norm": 25.542049407958984, |
| "learning_rate": 9.8e-08, |
| "loss": 1.2043, |
| "mean_token_accuracy": 0.7758935884995894, |
| "num_tokens": 9907937.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.8896388709545135, |
| "epoch": 0.011010009099181074, |
| "grad_norm": 17.134859085083008, |
| "learning_rate": 1.0799999999999999e-07, |
| "loss": 0.9333, |
| "mean_token_accuracy": 0.8181748888709328, |
| "num_tokens": 11040470.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.9338755531744524, |
| "epoch": 0.012010919017288443, |
| "grad_norm": 11.280884742736816, |
| "learning_rate": 1.1799999999999998e-07, |
| "loss": 0.9171, |
| "mean_token_accuracy": 0.809243483435024, |
| "num_tokens": 12138554.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.012010919017288443, |
| "eval_entropy": 0.827404188328102, |
| "eval_loss": 0.7739703059196472, |
| "eval_mean_token_accuracy": 0.8451327916051521, |
| "eval_num_tokens": 12138554.0, |
| "eval_runtime": 7.1713, |
| "eval_samples_per_second": 135.679, |
| "eval_steps_per_second": 8.506, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.9394035225564783, |
| "epoch": 0.013011828935395814, |
| "grad_norm": 6.800612449645996, |
| "learning_rate": 1.28e-07, |
| "loss": 0.816, |
| "mean_token_accuracy": 0.8172335895625028, |
| "num_tokens": 13161971.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.931338392604481, |
| "epoch": 0.014012738853503185, |
| "grad_norm": 3.9526984691619873, |
| "learning_rate": 1.3800000000000002e-07, |
| "loss": 0.729, |
| "mean_token_accuracy": 0.8396818897940895, |
| "num_tokens": 14121047.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.9432308787649328, |
| "epoch": 0.015013648771610554, |
| "grad_norm": 13.202656745910645, |
| "learning_rate": 1.4799999999999998e-07, |
| "loss": 0.7149, |
| "mean_token_accuracy": 0.8470803531733426, |
| "num_tokens": 14853168.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.8036102744666013, |
| "epoch": 0.016014558689717927, |
| "grad_norm": 5.494961261749268, |
| "learning_rate": 1.5799999999999999e-07, |
| "loss": 0.5533, |
| "mean_token_accuracy": 0.8729204730554061, |
| "num_tokens": 16006267.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8627965910868212, |
| "epoch": 0.017015468607825296, |
| "grad_norm": 3.109562635421753, |
| "learning_rate": 1.68e-07, |
| "loss": 0.527, |
| "mean_token_accuracy": 0.8722775372591886, |
| "num_tokens": 17103593.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.8908803132447329, |
| "epoch": 0.018016378525932665, |
| "grad_norm": 1.8945680856704712, |
| "learning_rate": 1.7799999999999998e-07, |
| "loss": 0.5005, |
| "mean_token_accuracy": 0.8758173530752008, |
| "num_tokens": 18130627.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.018016378525932665, |
| "eval_entropy": 0.7762856190321875, |
| "eval_loss": 0.40602007508277893, |
| "eval_mean_token_accuracy": 0.899208920900939, |
| "eval_num_tokens": 18130627.0, |
| "eval_runtime": 7.1881, |
| "eval_samples_per_second": 135.362, |
| "eval_steps_per_second": 8.486, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.9098557423461567, |
| "epoch": 0.019017288444040038, |
| "grad_norm": 1.237993597984314, |
| "learning_rate": 1.88e-07, |
| "loss": 0.4722, |
| "mean_token_accuracy": 0.8838203982873396, |
| "num_tokens": 19076434.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.9214537311684001, |
| "epoch": 0.020018198362147407, |
| "grad_norm": 5.1105427742004395, |
| "learning_rate": 1.98e-07, |
| "loss": 0.4761, |
| "mean_token_accuracy": 0.885197820446708, |
| "num_tokens": 19806235.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.7998497681184249, |
| "epoch": 0.021019108280254776, |
| "grad_norm": 2.0025761127471924, |
| "learning_rate": 2.0799999999999998e-07, |
| "loss": 0.4172, |
| "mean_token_accuracy": 0.8957362061197107, |
| "num_tokens": 20936435.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 0.8544187637892636, |
| "epoch": 0.02202001819836215, |
| "grad_norm": 1.4504088163375854, |
| "learning_rate": 2.18e-07, |
| "loss": 0.4103, |
| "mean_token_accuracy": 0.8934655557979237, |
| "num_tokens": 22013484.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.8855489952997728, |
| "epoch": 0.023020928116469518, |
| "grad_norm": 1.1078195571899414, |
| "learning_rate": 2.28e-07, |
| "loss": 0.4117, |
| "mean_token_accuracy": 0.8933108893307773, |
| "num_tokens": 23045886.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 0.9073191062970595, |
| "epoch": 0.024021838034576887, |
| "grad_norm": 0.9317752718925476, |
| "learning_rate": 2.38e-07, |
| "loss": 0.4015, |
| "mean_token_accuracy": 0.8976379817182367, |
| "num_tokens": 23997532.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.024021838034576887, |
| "eval_entropy": 0.7722517046772066, |
| "eval_loss": 0.29370784759521484, |
| "eval_mean_token_accuracy": 0.9163104364129363, |
| "eval_num_tokens": 23997532.0, |
| "eval_runtime": 7.1752, |
| "eval_samples_per_second": 135.606, |
| "eval_steps_per_second": 8.502, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.9209811920469457, |
| "epoch": 0.02502274795268426, |
| "grad_norm": 3.9668571949005127, |
| "learning_rate": 2.48e-07, |
| "loss": 0.4115, |
| "mean_token_accuracy": 0.8966231562874534, |
| "num_tokens": 24736731.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.7977617914026434, |
| "epoch": 0.02602365787079163, |
| "grad_norm": 1.2996954917907715, |
| "learning_rate": 2.58e-07, |
| "loss": 0.3835, |
| "mean_token_accuracy": 0.9029265501282432, |
| "num_tokens": 25872435.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.8485897243022918, |
| "epoch": 0.027024567788898998, |
| "grad_norm": 1.0686163902282715, |
| "learning_rate": 2.68e-07, |
| "loss": 0.3879, |
| "mean_token_accuracy": 0.8995744439688595, |
| "num_tokens": 26959841.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 0.8709002348509702, |
| "epoch": 0.02802547770700637, |
| "grad_norm": 0.92051762342453, |
| "learning_rate": 2.7800000000000003e-07, |
| "loss": 0.3856, |
| "mean_token_accuracy": 0.9004006510431116, |
| "num_tokens": 27993013.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.8909420809962533, |
| "epoch": 0.02902638762511374, |
| "grad_norm": 0.8413939476013184, |
| "learning_rate": 2.88e-07, |
| "loss": 0.388, |
| "mean_token_accuracy": 0.9014144734902816, |
| "num_tokens": 28946080.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 0.9160865225575187, |
| "epoch": 0.03002729754322111, |
| "grad_norm": 4.02235221862793, |
| "learning_rate": 2.98e-07, |
| "loss": 0.3961, |
| "mean_token_accuracy": 0.9003071714531291, |
| "num_tokens": 29669559.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03002729754322111, |
| "eval_entropy": 0.773854380748311, |
| "eval_loss": 0.2762528359889984, |
| "eval_mean_token_accuracy": 0.9212246898744927, |
| "eval_num_tokens": 29669559.0, |
| "eval_runtime": 7.2541, |
| "eval_samples_per_second": 134.132, |
| "eval_steps_per_second": 8.409, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.7960180721499703, |
| "epoch": 0.03102820746132848, |
| "grad_norm": 1.3376996517181396, |
| "learning_rate": 3.08e-07, |
| "loss": 0.3679, |
| "mean_token_accuracy": 0.9063912429592826, |
| "num_tokens": 30786460.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 0.8393291793086312, |
| "epoch": 0.032029117379435854, |
| "grad_norm": 0.9782769083976746, |
| "learning_rate": 3.18e-07, |
| "loss": 0.3739, |
| "mean_token_accuracy": 0.9036351117220792, |
| "num_tokens": 31893205.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.8657339849255302, |
| "epoch": 0.03303002729754322, |
| "grad_norm": 0.916830837726593, |
| "learning_rate": 3.28e-07, |
| "loss": 0.3732, |
| "mean_token_accuracy": 0.9034594573757865, |
| "num_tokens": 32917211.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 0.8868166403336959, |
| "epoch": 0.03403093721565059, |
| "grad_norm": 0.8498401045799255, |
| "learning_rate": 3.38e-07, |
| "loss": 0.3753, |
| "mean_token_accuracy": 0.9043488231572238, |
| "num_tokens": 33872966.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.9020868615670637, |
| "epoch": 0.03503184713375796, |
| "grad_norm": 3.632817506790161, |
| "learning_rate": 3.4799999999999994e-07, |
| "loss": 0.3814, |
| "mean_token_accuracy": 0.9035663994875821, |
| "num_tokens": 34603817.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.794374041665684, |
| "epoch": 0.03603275705186533, |
| "grad_norm": 1.3152629137039185, |
| "learning_rate": 3.5799999999999995e-07, |
| "loss": 0.3598, |
| "mean_token_accuracy": 0.9083835087039254, |
| "num_tokens": 35738363.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.03603275705186533, |
| "eval_entropy": 0.7766775371598416, |
| "eval_loss": 0.2652251720428467, |
| "eval_mean_token_accuracy": 0.9241212303521203, |
| "eval_num_tokens": 35738363.0, |
| "eval_runtime": 7.1835, |
| "eval_samples_per_second": 135.449, |
| "eval_steps_per_second": 8.492, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.8374889547174628, |
| "epoch": 0.0370336669699727, |
| "grad_norm": 1.069807767868042, |
| "learning_rate": 3.6799999999999996e-07, |
| "loss": 0.362, |
| "mean_token_accuracy": 0.906164778362621, |
| "num_tokens": 36835769.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 0.865638066963716, |
| "epoch": 0.038034576888080075, |
| "grad_norm": 0.9059945344924927, |
| "learning_rate": 3.7799999999999997e-07, |
| "loss": 0.3667, |
| "mean_token_accuracy": 0.9057478319514881, |
| "num_tokens": 37868585.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.8880685941739516, |
| "epoch": 0.039035486806187444, |
| "grad_norm": 0.7509809136390686, |
| "learning_rate": 3.88e-07, |
| "loss": 0.3578, |
| "mean_token_accuracy": 0.9070773032578555, |
| "num_tokens": 38807693.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 0.909015315771103, |
| "epoch": 0.040036396724294813, |
| "grad_norm": 3.0933594703674316, |
| "learning_rate": 3.98e-07, |
| "loss": 0.3733, |
| "mean_token_accuracy": 0.9064179236238653, |
| "num_tokens": 39535122.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.7939805047078566, |
| "epoch": 0.04103730664240218, |
| "grad_norm": 1.179566740989685, |
| "learning_rate": 4.0799999999999995e-07, |
| "loss": 0.3551, |
| "mean_token_accuracy": 0.9092546192082491, |
| "num_tokens": 40666622.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 0.846534158966758, |
| "epoch": 0.04203821656050955, |
| "grad_norm": 0.9151410460472107, |
| "learning_rate": 4.1799999999999996e-07, |
| "loss": 0.3589, |
| "mean_token_accuracy": 0.9077435683120381, |
| "num_tokens": 41748748.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.04203821656050955, |
| "eval_entropy": 0.7851541091184147, |
| "eval_loss": 0.25734254717826843, |
| "eval_mean_token_accuracy": 0.9266526767464934, |
| "eval_num_tokens": 41748748.0, |
| "eval_runtime": 7.1633, |
| "eval_samples_per_second": 135.832, |
| "eval_steps_per_second": 8.516, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.8759665044871243, |
| "epoch": 0.04303912647861693, |
| "grad_norm": 1.0037354230880737, |
| "learning_rate": 4.2799999999999997e-07, |
| "loss": 0.3613, |
| "mean_token_accuracy": 0.906467653946443, |
| "num_tokens": 42776021.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 0.8946322251449932, |
| "epoch": 0.0440400363967243, |
| "grad_norm": 0.7233147621154785, |
| "learning_rate": 4.38e-07, |
| "loss": 0.3685, |
| "mean_token_accuracy": 0.9061482787132263, |
| "num_tokens": 43731864.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.9084791925820437, |
| "epoch": 0.045040946314831666, |
| "grad_norm": 3.0358214378356934, |
| "learning_rate": 4.48e-07, |
| "loss": 0.3644, |
| "mean_token_accuracy": 0.9079454855485396, |
| "num_tokens": 44458397.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.795162683725357, |
| "epoch": 0.046041856232939035, |
| "grad_norm": 1.060544490814209, |
| "learning_rate": 4.58e-07, |
| "loss": 0.347, |
| "mean_token_accuracy": 0.9115611629052596, |
| "num_tokens": 45582504.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.8501786047762091, |
| "epoch": 0.047042766151046404, |
| "grad_norm": 0.8345633745193481, |
| "learning_rate": 4.68e-07, |
| "loss": 0.3518, |
| "mean_token_accuracy": 0.9087835637005892, |
| "num_tokens": 46691664.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 0.8739784273234281, |
| "epoch": 0.04804367606915377, |
| "grad_norm": 0.8899365067481995, |
| "learning_rate": 4.779999999999999e-07, |
| "loss": 0.354, |
| "mean_token_accuracy": 0.9071491924199191, |
| "num_tokens": 47712971.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.04804367606915377, |
| "eval_entropy": 0.7832569479942322, |
| "eval_loss": 0.25191357731819153, |
| "eval_mean_token_accuracy": 0.9278596326953075, |
| "eval_num_tokens": 47712971.0, |
| "eval_runtime": 7.1165, |
| "eval_samples_per_second": 136.725, |
| "eval_steps_per_second": 8.572, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.8918125726959922, |
| "epoch": 0.04904458598726115, |
| "grad_norm": 0.7503334879875183, |
| "learning_rate": 4.879999999999999e-07, |
| "loss": 0.3536, |
| "mean_token_accuracy": 0.9086009448224848, |
| "num_tokens": 48657962.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 0.9069400212981484, |
| "epoch": 0.05004549590536852, |
| "grad_norm": 2.7447826862335205, |
| "learning_rate": 4.979999999999999e-07, |
| "loss": 0.3607, |
| "mean_token_accuracy": 0.9092577500776811, |
| "num_tokens": 49382591.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.7897281029007651, |
| "epoch": 0.05104640582347589, |
| "grad_norm": 1.0658516883850098, |
| "learning_rate": 5.079999999999999e-07, |
| "loss": 0.3343, |
| "mean_token_accuracy": 0.9143867644396695, |
| "num_tokens": 50493880.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 0.8420896362174641, |
| "epoch": 0.05204731574158326, |
| "grad_norm": 0.9911607503890991, |
| "learning_rate": 5.18e-07, |
| "loss": 0.3467, |
| "mean_token_accuracy": 0.9096509906378659, |
| "num_tokens": 51593094.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.8670889122919603, |
| "epoch": 0.053048225659690626, |
| "grad_norm": 0.8509455323219299, |
| "learning_rate": 5.28e-07, |
| "loss": 0.3429, |
| "mean_token_accuracy": 0.9105742725459012, |
| "num_tokens": 52624215.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 0.8908169654282656, |
| "epoch": 0.054049135577797995, |
| "grad_norm": 1.258712649345398, |
| "learning_rate": 5.38e-07, |
| "loss": 0.3471, |
| "mean_token_accuracy": 0.9100607395172119, |
| "num_tokens": 53579583.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.054049135577797995, |
| "eval_entropy": 0.781644055100738, |
| "eval_loss": 0.24734267592430115, |
| "eval_mean_token_accuracy": 0.9283195352945172, |
| "eval_num_tokens": 53579583.0, |
| "eval_runtime": 7.1344, |
| "eval_samples_per_second": 136.382, |
| "eval_steps_per_second": 8.55, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.8974328610030088, |
| "epoch": 0.05505004549590537, |
| "grad_norm": 2.7811849117279053, |
| "learning_rate": 5.48e-07, |
| "loss": 0.3531, |
| "mean_token_accuracy": 0.9098310795697299, |
| "num_tokens": 54306230.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.790298336202448, |
| "epoch": 0.05605095541401274, |
| "grad_norm": 1.0269526243209839, |
| "learning_rate": 5.58e-07, |
| "loss": 0.3426, |
| "mean_token_accuracy": 0.912391439893029, |
| "num_tokens": 55434008.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.8415309033610604, |
| "epoch": 0.05705186533212011, |
| "grad_norm": 0.8907763957977295, |
| "learning_rate": 5.679999999999999e-07, |
| "loss": 0.3457, |
| "mean_token_accuracy": 0.9103805682875893, |
| "num_tokens": 56527326.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 0.8678737878799438, |
| "epoch": 0.05805277525022748, |
| "grad_norm": 0.933925449848175, |
| "learning_rate": 5.779999999999999e-07, |
| "loss": 0.3435, |
| "mean_token_accuracy": 0.9097982775081288, |
| "num_tokens": 57577716.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.8823735535144805, |
| "epoch": 0.05905368516833485, |
| "grad_norm": 0.7687504291534424, |
| "learning_rate": 5.879999999999999e-07, |
| "loss": 0.3448, |
| "mean_token_accuracy": 0.9104289829730987, |
| "num_tokens": 58525576.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 0.8998256000605497, |
| "epoch": 0.06005459508644222, |
| "grad_norm": 2.8133366107940674, |
| "learning_rate": 5.979999999999999e-07, |
| "loss": 0.3474, |
| "mean_token_accuracy": 0.9119950543750416, |
| "num_tokens": 59255710.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.06005459508644222, |
| "eval_entropy": 0.7829059782575388, |
| "eval_loss": 0.24350076913833618, |
| "eval_mean_token_accuracy": 0.9303004888237499, |
| "eval_num_tokens": 59255710.0, |
| "eval_runtime": 7.4044, |
| "eval_samples_per_second": 131.408, |
| "eval_steps_per_second": 8.238, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.7927286684513092, |
| "epoch": 0.06105550500454959, |
| "grad_norm": 0.9818949103355408, |
| "learning_rate": 6.079999999999999e-07, |
| "loss": 0.3337, |
| "mean_token_accuracy": 0.9142222328619524, |
| "num_tokens": 60371359.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 0.8404727691953833, |
| "epoch": 0.06205641492265696, |
| "grad_norm": 0.8817327618598938, |
| "learning_rate": 6.18e-07, |
| "loss": 0.336, |
| "mean_token_accuracy": 0.9129292596470225, |
| "num_tokens": 61459516.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.8821817175908522, |
| "epoch": 0.06305732484076433, |
| "grad_norm": 1.0255104303359985, |
| "learning_rate": 6.28e-07, |
| "loss": 0.3445, |
| "mean_token_accuracy": 0.9102013338695872, |
| "num_tokens": 62511188.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 0.8907575244253332, |
| "epoch": 0.06405823475887171, |
| "grad_norm": 0.6242014169692993, |
| "learning_rate": 6.38e-07, |
| "loss": 0.3306, |
| "mean_token_accuracy": 0.9132393593137914, |
| "num_tokens": 63462734.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.9019130771810359, |
| "epoch": 0.06505914467697907, |
| "grad_norm": 2.4068405628204346, |
| "learning_rate": 6.48e-07, |
| "loss": 0.3431, |
| "mean_token_accuracy": 0.9133785475384105, |
| "num_tokens": 64195174.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.7956698184663599, |
| "epoch": 0.06606005459508645, |
| "grad_norm": 1.0124462842941284, |
| "learning_rate": 6.58e-07, |
| "loss": 0.3232, |
| "mean_token_accuracy": 0.916981242461638, |
| "num_tokens": 65310690.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.06606005459508645, |
| "eval_entropy": 0.7826571161629724, |
| "eval_loss": 0.2395564466714859, |
| "eval_mean_token_accuracy": 0.9312737300747731, |
| "eval_num_tokens": 65310690.0, |
| "eval_runtime": 7.1164, |
| "eval_samples_per_second": 136.726, |
| "eval_steps_per_second": 8.572, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.8469071046872573, |
| "epoch": 0.06706096451319381, |
| "grad_norm": 0.8455765843391418, |
| "learning_rate": 6.68e-07, |
| "loss": 0.3336, |
| "mean_token_accuracy": 0.9130166606469587, |
| "num_tokens": 66407916.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.8715857310728593, |
| "epoch": 0.06806187443130118, |
| "grad_norm": 0.848980188369751, |
| "learning_rate": 6.78e-07, |
| "loss": 0.332, |
| "mean_token_accuracy": 0.9133841227401387, |
| "num_tokens": 67462675.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.8887097320773385, |
| "epoch": 0.06906278434940856, |
| "grad_norm": 0.7524086236953735, |
| "learning_rate": 6.879999999999999e-07, |
| "loss": 0.3293, |
| "mean_token_accuracy": 0.9141406866637143, |
| "num_tokens": 68430686.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 0.9178955408659848, |
| "epoch": 0.07006369426751592, |
| "grad_norm": 2.6862826347351074, |
| "learning_rate": 6.979999999999999e-07, |
| "loss": 0.3503, |
| "mean_token_accuracy": 0.910759204084223, |
| "num_tokens": 69159538.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.7949739504944194, |
| "epoch": 0.0710646041856233, |
| "grad_norm": 0.939888060092926, |
| "learning_rate": 7.079999999999999e-07, |
| "loss": 0.3226, |
| "mean_token_accuracy": 0.917046070098877, |
| "num_tokens": 70297859.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 0.8522608453577215, |
| "epoch": 0.07206551410373066, |
| "grad_norm": 0.8733773827552795, |
| "learning_rate": 7.179999999999999e-07, |
| "loss": 0.3245, |
| "mean_token_accuracy": 0.9145838070999492, |
| "num_tokens": 71388877.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.07206551410373066, |
| "eval_entropy": 0.7842210042672079, |
| "eval_loss": 0.23638135194778442, |
| "eval_mean_token_accuracy": 0.9317856763229996, |
| "eval_num_tokens": 71388877.0, |
| "eval_runtime": 7.0752, |
| "eval_samples_per_second": 137.523, |
| "eval_steps_per_second": 8.622, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.8792784777554599, |
| "epoch": 0.07306642402183804, |
| "grad_norm": 0.9495289921760559, |
| "learning_rate": 7.28e-07, |
| "loss": 0.3357, |
| "mean_token_accuracy": 0.9130886435508728, |
| "num_tokens": 72425465.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 0.8881763842972842, |
| "epoch": 0.0740673339399454, |
| "grad_norm": 0.7181362509727478, |
| "learning_rate": 7.38e-07, |
| "loss": 0.3273, |
| "mean_token_accuracy": 0.9161153722893108, |
| "num_tokens": 73377921.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.9161553989757191, |
| "epoch": 0.07506824385805277, |
| "grad_norm": 2.3346126079559326, |
| "learning_rate": 7.48e-07, |
| "loss": 0.3416, |
| "mean_token_accuracy": 0.9133548351851377, |
| "num_tokens": 74109742.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.7970652249726382, |
| "epoch": 0.07606915377616015, |
| "grad_norm": 0.9897024035453796, |
| "learning_rate": 7.58e-07, |
| "loss": 0.3206, |
| "mean_token_accuracy": 0.9177604393525557, |
| "num_tokens": 75254960.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.8417174593968825, |
| "epoch": 0.07707006369426751, |
| "grad_norm": 1.0447583198547363, |
| "learning_rate": 7.68e-07, |
| "loss": 0.3204, |
| "mean_token_accuracy": 0.9170865096829154, |
| "num_tokens": 76368490.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 0.8782219377431002, |
| "epoch": 0.07807097361237489, |
| "grad_norm": 0.9332379698753357, |
| "learning_rate": 7.78e-07, |
| "loss": 0.3301, |
| "mean_token_accuracy": 0.9143734498457475, |
| "num_tokens": 77412393.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.07807097361237489, |
| "eval_entropy": 0.779330243829821, |
| "eval_loss": 0.23134513199329376, |
| "eval_mean_token_accuracy": 0.9325118944293163, |
| "eval_num_tokens": 77412393.0, |
| "eval_runtime": 7.1925, |
| "eval_samples_per_second": 135.279, |
| "eval_steps_per_second": 8.481, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.8939531375061381, |
| "epoch": 0.07907188353048225, |
| "grad_norm": 0.7231958508491516, |
| "learning_rate": 7.88e-07, |
| "loss": 0.3267, |
| "mean_token_accuracy": 0.9155195750973442, |
| "num_tokens": 78362341.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 0.9062723474069075, |
| "epoch": 0.08007279344858963, |
| "grad_norm": 2.0436787605285645, |
| "learning_rate": 7.98e-07, |
| "loss": 0.3365, |
| "mean_token_accuracy": 0.9146250042048367, |
| "num_tokens": 79096898.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.7945969061418013, |
| "epoch": 0.081073703366697, |
| "grad_norm": 0.9359119534492493, |
| "learning_rate": 8.08e-07, |
| "loss": 0.3177, |
| "mean_token_accuracy": 0.9181306638500907, |
| "num_tokens": 80217692.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 0.844357325813987, |
| "epoch": 0.08207461328480437, |
| "grad_norm": 0.8438592553138733, |
| "learning_rate": 8.179999999999999e-07, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.9169050964442167, |
| "num_tokens": 81307270.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.8725964551622217, |
| "epoch": 0.08307552320291174, |
| "grad_norm": 0.7924903035163879, |
| "learning_rate": 8.28e-07, |
| "loss": 0.3209, |
| "mean_token_accuracy": 0.9159043675119226, |
| "num_tokens": 82357546.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 0.8886379382827065, |
| "epoch": 0.0840764331210191, |
| "grad_norm": 0.7205538153648376, |
| "learning_rate": 8.38e-07, |
| "loss": 0.3235, |
| "mean_token_accuracy": 0.9162523784420707, |
| "num_tokens": 83328951.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.0840764331210191, |
| "eval_entropy": 0.7785051123040622, |
| "eval_loss": 0.2309650033712387, |
| "eval_mean_token_accuracy": 0.9339301380954805, |
| "eval_num_tokens": 83328951.0, |
| "eval_runtime": 7.1338, |
| "eval_samples_per_second": 136.393, |
| "eval_steps_per_second": 8.551, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.9015932933850722, |
| "epoch": 0.08507734303912648, |
| "grad_norm": 2.2584614753723145, |
| "learning_rate": 8.48e-07, |
| "loss": 0.3281, |
| "mean_token_accuracy": 0.9161337093873457, |
| "num_tokens": 84065002.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.7703537437048825, |
| "epoch": 0.08607825295723386, |
| "grad_norm": 0.8600314855575562, |
| "learning_rate": 8.58e-07, |
| "loss": 0.3051, |
| "mean_token_accuracy": 0.9216179517182437, |
| "num_tokens": 85217505.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.837453713200309, |
| "epoch": 0.08707916287534122, |
| "grad_norm": 0.837756335735321, |
| "learning_rate": 8.68e-07, |
| "loss": 0.3171, |
| "mean_token_accuracy": 0.9169958277182145, |
| "num_tokens": 86302592.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 0.8648753382942893, |
| "epoch": 0.0880800727934486, |
| "grad_norm": 0.8668557405471802, |
| "learning_rate": 8.78e-07, |
| "loss": 0.3128, |
| "mean_token_accuracy": 0.9178522229194641, |
| "num_tokens": 87371908.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.8813207095319574, |
| "epoch": 0.08908098271155596, |
| "grad_norm": 0.7655621767044067, |
| "learning_rate": 8.88e-07, |
| "loss": 0.325, |
| "mean_token_accuracy": 0.9160207412459633, |
| "num_tokens": 88338748.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 0.8887845928018744, |
| "epoch": 0.09008189262966333, |
| "grad_norm": 1.8667429685592651, |
| "learning_rate": 8.98e-07, |
| "loss": 0.3201, |
| "mean_token_accuracy": 0.918709414655512, |
| "num_tokens": 89071435.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.09008189262966333, |
| "eval_entropy": 0.7726943004326742, |
| "eval_loss": 0.2284156084060669, |
| "eval_mean_token_accuracy": 0.9343840216026932, |
| "eval_num_tokens": 89071435.0, |
| "eval_runtime": 7.0961, |
| "eval_samples_per_second": 137.118, |
| "eval_steps_per_second": 8.596, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.7833973987535997, |
| "epoch": 0.0910828025477707, |
| "grad_norm": 0.9149163365364075, |
| "learning_rate": 9.08e-07, |
| "loss": 0.309, |
| "mean_token_accuracy": 0.9209194253791463, |
| "num_tokens": 90204895.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 0.8328197641806169, |
| "epoch": 0.09208371246587807, |
| "grad_norm": 0.8828219771385193, |
| "learning_rate": 9.18e-07, |
| "loss": 0.3113, |
| "mean_token_accuracy": 0.9182033365423029, |
| "num_tokens": 91290928.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.8542791778391058, |
| "epoch": 0.09308462238398545, |
| "grad_norm": 0.847339928150177, |
| "learning_rate": 9.28e-07, |
| "loss": 0.3111, |
| "mean_token_accuracy": 0.9180883992802013, |
| "num_tokens": 92353449.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 0.8792417260733518, |
| "epoch": 0.09408553230209281, |
| "grad_norm": 0.6507946848869324, |
| "learning_rate": 9.379999999999998e-07, |
| "loss": 0.3093, |
| "mean_token_accuracy": 0.9185398605736819, |
| "num_tokens": 93329097.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.8795178760181773, |
| "epoch": 0.09508644222020018, |
| "grad_norm": 1.9719516038894653, |
| "learning_rate": 9.479999999999999e-07, |
| "loss": 0.3248, |
| "mean_token_accuracy": 0.9174890155142004, |
| "num_tokens": 94076001.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 0.7680907699194821, |
| "epoch": 0.09608735213830755, |
| "grad_norm": 0.9014910459518433, |
| "learning_rate": 9.58e-07, |
| "loss": 0.2999, |
| "mean_token_accuracy": 0.9230750652876767, |
| "num_tokens": 95199341.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.09608735213830755, |
| "eval_entropy": 0.7766347244137624, |
| "eval_loss": 0.22492578625679016, |
| "eval_mean_token_accuracy": 0.9348700437389437, |
| "eval_num_tokens": 95199341.0, |
| "eval_runtime": 7.167, |
| "eval_samples_per_second": 135.761, |
| "eval_steps_per_second": 8.511, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.8319806510751898, |
| "epoch": 0.09708826205641492, |
| "grad_norm": 0.8409860134124756, |
| "learning_rate": 9.679999999999999e-07, |
| "loss": 0.3096, |
| "mean_token_accuracy": 0.919183918020942, |
| "num_tokens": 96280405.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 0.8512482968243685, |
| "epoch": 0.0980891719745223, |
| "grad_norm": 0.7460948824882507, |
| "learning_rate": 9.78e-07, |
| "loss": 0.3048, |
| "mean_token_accuracy": 0.9195660710334778, |
| "num_tokens": 97324958.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.8670887063850056, |
| "epoch": 0.09909008189262966, |
| "grad_norm": 0.7144485116004944, |
| "learning_rate": 9.88e-07, |
| "loss": 0.3047, |
| "mean_token_accuracy": 0.9209712348201058, |
| "num_tokens": 98281184.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 0.876132286678661, |
| "epoch": 0.10009099181073704, |
| "grad_norm": 2.102391242980957, |
| "learning_rate": 9.98e-07, |
| "loss": 0.3134, |
| "mean_token_accuracy": 0.9201786610213193, |
| "num_tokens": 99020369.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.772329353202473, |
| "epoch": 0.1010919017288444, |
| "grad_norm": 0.9755299687385559, |
| "learning_rate": 1.008e-06, |
| "loss": 0.3018, |
| "mean_token_accuracy": 0.9223481546748769, |
| "num_tokens": 100156631.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 0.8236495657400651, |
| "epoch": 0.10209281164695178, |
| "grad_norm": 0.8604740500450134, |
| "learning_rate": 1.018e-06, |
| "loss": 0.3032, |
| "mean_token_accuracy": 0.9208208913152868, |
| "num_tokens": 101235364.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.10209281164695178, |
| "eval_entropy": 0.7710273295152382, |
| "eval_loss": 0.22457090020179749, |
| "eval_mean_token_accuracy": 0.9353827013344062, |
| "eval_num_tokens": 101235364.0, |
| "eval_runtime": 7.1319, |
| "eval_samples_per_second": 136.429, |
| "eval_steps_per_second": 8.553, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.856683783639561, |
| "epoch": 0.10309372156505915, |
| "grad_norm": 0.9585539102554321, |
| "learning_rate": 1.028e-06, |
| "loss": 0.3121, |
| "mean_token_accuracy": 0.9176510073921897, |
| "num_tokens": 102253625.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 0.8793982776728544, |
| "epoch": 0.10409463148316651, |
| "grad_norm": 0.6796140074729919, |
| "learning_rate": 1.038e-06, |
| "loss": 0.3192, |
| "mean_token_accuracy": 0.9176862608302724, |
| "num_tokens": 103206550.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.8828882379965348, |
| "epoch": 0.10509554140127389, |
| "grad_norm": 1.9700580835342407, |
| "learning_rate": 1.048e-06, |
| "loss": 0.3138, |
| "mean_token_accuracy": 0.9197612930427898, |
| "num_tokens": 103937735.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 0.7799841582775116, |
| "epoch": 0.10609645131938125, |
| "grad_norm": 0.9521822929382324, |
| "learning_rate": 1.058e-06, |
| "loss": 0.2993, |
| "mean_token_accuracy": 0.9223499368537557, |
| "num_tokens": 105062562.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.8236062943935394, |
| "epoch": 0.10709736123748863, |
| "grad_norm": 0.8883262276649475, |
| "learning_rate": 1.068e-06, |
| "loss": 0.3043, |
| "mean_token_accuracy": 0.9200864168730649, |
| "num_tokens": 106148854.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 0.855875781991265, |
| "epoch": 0.10809827115559599, |
| "grad_norm": 0.7604862451553345, |
| "learning_rate": 1.078e-06, |
| "loss": 0.301, |
| "mean_token_accuracy": 0.9207951112227006, |
| "num_tokens": 107184016.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.10809827115559599, |
| "eval_entropy": 0.7688479355124177, |
| "eval_loss": 0.2222135066986084, |
| "eval_mean_token_accuracy": 0.9355378277966233, |
| "eval_num_tokens": 107184016.0, |
| "eval_runtime": 7.1389, |
| "eval_samples_per_second": 136.295, |
| "eval_steps_per_second": 8.545, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.8719826313582334, |
| "epoch": 0.10909918107370337, |
| "grad_norm": 0.6990140676498413, |
| "learning_rate": 1.088e-06, |
| "loss": 0.3101, |
| "mean_token_accuracy": 0.9189783123406496, |
| "num_tokens": 108136311.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 0.8727446463975039, |
| "epoch": 0.11010009099181074, |
| "grad_norm": 1.9771708250045776, |
| "learning_rate": 1.0980000000000001e-06, |
| "loss": 0.3056, |
| "mean_token_accuracy": 0.9217729514295404, |
| "num_tokens": 108870399.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.7766597162593495, |
| "epoch": 0.1111010009099181, |
| "grad_norm": 0.9047304391860962, |
| "learning_rate": 1.108e-06, |
| "loss": 0.2989, |
| "mean_token_accuracy": 0.9228177655826916, |
| "num_tokens": 110010412.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 0.821604372696443, |
| "epoch": 0.11210191082802548, |
| "grad_norm": 0.7859129905700684, |
| "learning_rate": 1.1180000000000001e-06, |
| "loss": 0.2986, |
| "mean_token_accuracy": 0.9216692274267023, |
| "num_tokens": 111101459.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.8458013583313335, |
| "epoch": 0.11310282074613284, |
| "grad_norm": 0.7374240159988403, |
| "learning_rate": 1.1279999999999998e-06, |
| "loss": 0.3023, |
| "mean_token_accuracy": 0.9201881116086786, |
| "num_tokens": 112137324.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 0.8739073032682593, |
| "epoch": 0.11410373066424022, |
| "grad_norm": 0.6672995686531067, |
| "learning_rate": 1.138e-06, |
| "loss": 0.3039, |
| "mean_token_accuracy": 0.9213175762783398, |
| "num_tokens": 113102193.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.11410373066424022, |
| "eval_entropy": 0.7622656685407044, |
| "eval_loss": 0.21812133491039276, |
| "eval_mean_token_accuracy": 0.9362532193543481, |
| "eval_num_tokens": 113102193.0, |
| "eval_runtime": 7.083, |
| "eval_samples_per_second": 137.372, |
| "eval_steps_per_second": 8.612, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.8674825570800088, |
| "epoch": 0.1151046405823476, |
| "grad_norm": 1.9667503833770752, |
| "learning_rate": 1.1479999999999999e-06, |
| "loss": 0.3032, |
| "mean_token_accuracy": 0.9225559597665614, |
| "num_tokens": 113844512.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 0.7683469571850516, |
| "epoch": 0.11610555050045496, |
| "grad_norm": 0.8600199818611145, |
| "learning_rate": 1.158e-06, |
| "loss": 0.2874, |
| "mean_token_accuracy": 0.9255136945030906, |
| "num_tokens": 114973235.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.8179097500714388, |
| "epoch": 0.11710646041856233, |
| "grad_norm": 0.7877588272094727, |
| "learning_rate": 1.1679999999999999e-06, |
| "loss": 0.2992, |
| "mean_token_accuracy": 0.9217895182696256, |
| "num_tokens": 116057315.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 0.8371223628520965, |
| "epoch": 0.1181073703366697, |
| "grad_norm": 0.7893165946006775, |
| "learning_rate": 1.178e-06, |
| "loss": 0.2992, |
| "mean_token_accuracy": 0.9224891754713925, |
| "num_tokens": 117103213.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.8535070988264951, |
| "epoch": 0.11910828025477707, |
| "grad_norm": 0.7347795367240906, |
| "learning_rate": 1.1879999999999999e-06, |
| "loss": 0.2933, |
| "mean_token_accuracy": 0.9236816352063959, |
| "num_tokens": 118057061.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 0.873365730047226, |
| "epoch": 0.12010919017288443, |
| "grad_norm": 2.104503870010376, |
| "learning_rate": 1.1979999999999998e-06, |
| "loss": 0.3067, |
| "mean_token_accuracy": 0.9227487694133412, |
| "num_tokens": 118778234.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.12010919017288443, |
| "eval_entropy": 0.7565031569512164, |
| "eval_loss": 0.21913714706897736, |
| "eval_mean_token_accuracy": 0.936484013424545, |
| "eval_num_tokens": 118778234.0, |
| "eval_runtime": 7.1122, |
| "eval_samples_per_second": 136.806, |
| "eval_steps_per_second": 8.577, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.7622005002065139, |
| "epoch": 0.12111010009099181, |
| "grad_norm": 0.9007834196090698, |
| "learning_rate": 1.208e-06, |
| "loss": 0.2921, |
| "mean_token_accuracy": 0.9243286257440394, |
| "num_tokens": 119913350.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 0.8152731066400355, |
| "epoch": 0.12211101000909919, |
| "grad_norm": 0.8577731251716614, |
| "learning_rate": 1.2179999999999998e-06, |
| "loss": 0.303, |
| "mean_token_accuracy": 0.9211578873070804, |
| "num_tokens": 121005111.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.8311775321310216, |
| "epoch": 0.12311191992720655, |
| "grad_norm": 0.8874506950378418, |
| "learning_rate": 1.228e-06, |
| "loss": 0.2941, |
| "mean_token_accuracy": 0.9234593711116097, |
| "num_tokens": 122053423.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 0.8455709652467207, |
| "epoch": 0.12411282984531392, |
| "grad_norm": 0.6410759687423706, |
| "learning_rate": 1.2379999999999998e-06, |
| "loss": 0.2869, |
| "mean_token_accuracy": 0.9240941790017214, |
| "num_tokens": 123023538.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.865269153768366, |
| "epoch": 0.1251137397634213, |
| "grad_norm": 1.943968653678894, |
| "learning_rate": 1.248e-06, |
| "loss": 0.301, |
| "mean_token_accuracy": 0.9235067736018788, |
| "num_tokens": 123756604.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 0.7617347641424699, |
| "epoch": 0.12611464968152866, |
| "grad_norm": 0.8967196941375732, |
| "learning_rate": 1.2579999999999999e-06, |
| "loss": 0.2934, |
| "mean_token_accuracy": 0.9244806538928639, |
| "num_tokens": 124876188.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.12611464968152866, |
| "eval_entropy": 0.758461444104304, |
| "eval_loss": 0.21852023899555206, |
| "eval_mean_token_accuracy": 0.9367063485208105, |
| "eval_num_tokens": 124876188.0, |
| "eval_runtime": 7.0895, |
| "eval_samples_per_second": 137.246, |
| "eval_steps_per_second": 8.604, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.8160103900866075, |
| "epoch": 0.12711555959963602, |
| "grad_norm": 0.7799694538116455, |
| "learning_rate": 1.268e-06, |
| "loss": 0.2946, |
| "mean_token_accuracy": 0.9232698521830819, |
| "num_tokens": 125977507.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 0.8289537635716525, |
| "epoch": 0.12811646951774341, |
| "grad_norm": 0.784251868724823, |
| "learning_rate": 1.2779999999999999e-06, |
| "loss": 0.2917, |
| "mean_token_accuracy": 0.9241768289696086, |
| "num_tokens": 127032719.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.8573456623337485, |
| "epoch": 0.12911737943585078, |
| "grad_norm": 0.7045451402664185, |
| "learning_rate": 1.288e-06, |
| "loss": 0.2984, |
| "mean_token_accuracy": 0.9235251740975814, |
| "num_tokens": 127988387.0, |
| "step": 645 |
| }, |
| { |
| "entropy": 0.8507555175911297, |
| "epoch": 0.13011828935395814, |
| "grad_norm": 1.819658637046814, |
| "learning_rate": 1.298e-06, |
| "loss": 0.2914, |
| "mean_token_accuracy": 0.924909613349221, |
| "num_tokens": 128721584.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.7407966630025343, |
| "epoch": 0.1311191992720655, |
| "grad_norm": 0.8966879844665527, |
| "learning_rate": 1.308e-06, |
| "loss": 0.28, |
| "mean_token_accuracy": 0.92760883027857, |
| "num_tokens": 129860649.0, |
| "step": 655 |
| }, |
| { |
| "entropy": 0.8003853126005693, |
| "epoch": 0.1321201091901729, |
| "grad_norm": 0.8079231977462769, |
| "learning_rate": 1.318e-06, |
| "loss": 0.2894, |
| "mean_token_accuracy": 0.9240497372367166, |
| "num_tokens": 130965498.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.1321201091901729, |
| "eval_entropy": 0.750291645526886, |
| "eval_loss": 0.213973730802536, |
| "eval_mean_token_accuracy": 0.9382940315809406, |
| "eval_num_tokens": 130965498.0, |
| "eval_runtime": 7.0264, |
| "eval_samples_per_second": 138.477, |
| "eval_steps_per_second": 8.681, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.8198544599793174, |
| "epoch": 0.13312101910828025, |
| "grad_norm": 0.8206574320793152, |
| "learning_rate": 1.328e-06, |
| "loss": 0.2865, |
| "mean_token_accuracy": 0.9246467048471624, |
| "num_tokens": 132002292.0, |
| "step": 665 |
| }, |
| { |
| "entropy": 0.8489445995200764, |
| "epoch": 0.13412192902638762, |
| "grad_norm": 0.6715352535247803, |
| "learning_rate": 1.338e-06, |
| "loss": 0.2924, |
| "mean_token_accuracy": 0.9234138223257932, |
| "num_tokens": 132956333.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.8532397329807282, |
| "epoch": 0.135122838944495, |
| "grad_norm": 1.8673855066299438, |
| "learning_rate": 1.348e-06, |
| "loss": 0.2997, |
| "mean_token_accuracy": 0.9234183029694991, |
| "num_tokens": 133677965.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 0.7524514572186903, |
| "epoch": 0.13612374886260237, |
| "grad_norm": 0.8716238737106323, |
| "learning_rate": 1.358e-06, |
| "loss": 0.2841, |
| "mean_token_accuracy": 0.9266363842920824, |
| "num_tokens": 134802840.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.8017716060985218, |
| "epoch": 0.13712465878070973, |
| "grad_norm": 0.7700985074043274, |
| "learning_rate": 1.368e-06, |
| "loss": 0.2855, |
| "mean_token_accuracy": 0.9250672015276822, |
| "num_tokens": 135888025.0, |
| "step": 685 |
| }, |
| { |
| "entropy": 0.8074500967155803, |
| "epoch": 0.13812556869881712, |
| "grad_norm": 0.8650295734405518, |
| "learning_rate": 1.3779999999999998e-06, |
| "loss": 0.286, |
| "mean_token_accuracy": 0.9245203012769873, |
| "num_tokens": 136924539.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.13812556869881712, |
| "eval_entropy": 0.7438388193239931, |
| "eval_loss": 0.21345947682857513, |
| "eval_mean_token_accuracy": 0.9379716603482355, |
| "eval_num_tokens": 136924539.0, |
| "eval_runtime": 7.2288, |
| "eval_samples_per_second": 134.6, |
| "eval_steps_per_second": 8.438, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.8413560314611955, |
| "epoch": 0.13912647861692448, |
| "grad_norm": 0.6126115918159485, |
| "learning_rate": 1.3879999999999999e-06, |
| "loss": 0.2903, |
| "mean_token_accuracy": 0.924701126055284, |
| "num_tokens": 137879499.0, |
| "step": 695 |
| }, |
| { |
| "entropy": 0.8566647876392711, |
| "epoch": 0.14012738853503184, |
| "grad_norm": 1.6679223775863647, |
| "learning_rate": 1.3979999999999998e-06, |
| "loss": 0.2998, |
| "mean_token_accuracy": 0.9239558404142206, |
| "num_tokens": 138603916.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.7479892709038475, |
| "epoch": 0.1411282984531392, |
| "grad_norm": 0.8888144493103027, |
| "learning_rate": 1.408e-06, |
| "loss": 0.281, |
| "mean_token_accuracy": 0.9272219023921273, |
| "num_tokens": 139731892.0, |
| "step": 705 |
| }, |
| { |
| "entropy": 0.8010973101312464, |
| "epoch": 0.1421292083712466, |
| "grad_norm": 0.7999457716941833, |
| "learning_rate": 1.4179999999999998e-06, |
| "loss": 0.2862, |
| "mean_token_accuracy": 0.9249833226203918, |
| "num_tokens": 140823520.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.8243546702645042, |
| "epoch": 0.14313011828935396, |
| "grad_norm": 0.7929534912109375, |
| "learning_rate": 1.428e-06, |
| "loss": 0.289, |
| "mean_token_accuracy": 0.9242551738565619, |
| "num_tokens": 141838998.0, |
| "step": 715 |
| }, |
| { |
| "entropy": 0.8342987900430506, |
| "epoch": 0.14413102820746132, |
| "grad_norm": 0.7797636985778809, |
| "learning_rate": 1.4379999999999998e-06, |
| "loss": 0.286, |
| "mean_token_accuracy": 0.9250240569764917, |
| "num_tokens": 142788817.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.14413102820746132, |
| "eval_entropy": 0.7430196765993462, |
| "eval_loss": 0.21336835622787476, |
| "eval_mean_token_accuracy": 0.9381084842760055, |
| "eval_num_tokens": 142788817.0, |
| "eval_runtime": 7.0804, |
| "eval_samples_per_second": 137.421, |
| "eval_steps_per_second": 8.615, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.8445068792863326, |
| "epoch": 0.1451319381255687, |
| "grad_norm": 1.6939398050308228, |
| "learning_rate": 1.448e-06, |
| "loss": 0.2895, |
| "mean_token_accuracy": 0.9256705939769745, |
| "num_tokens": 143528076.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 0.7430232730778781, |
| "epoch": 0.14613284804367607, |
| "grad_norm": 0.9444680213928223, |
| "learning_rate": 1.4579999999999998e-06, |
| "loss": 0.2684, |
| "mean_token_accuracy": 0.9300248498266394, |
| "num_tokens": 144669748.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.8000332848592238, |
| "epoch": 0.14713375796178343, |
| "grad_norm": 0.8172219395637512, |
| "learning_rate": 1.468e-06, |
| "loss": 0.2831, |
| "mean_token_accuracy": 0.9256041104143317, |
| "num_tokens": 145740545.0, |
| "step": 735 |
| }, |
| { |
| "entropy": 0.8183066297661175, |
| "epoch": 0.1481346678798908, |
| "grad_norm": 0.7712555527687073, |
| "learning_rate": 1.4779999999999999e-06, |
| "loss": 0.2832, |
| "mean_token_accuracy": 0.9251929564909501, |
| "num_tokens": 146777060.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.8324488238854841, |
| "epoch": 0.1491355777979982, |
| "grad_norm": 0.6426140666007996, |
| "learning_rate": 1.488e-06, |
| "loss": 0.2842, |
| "mean_token_accuracy": 0.925419792803851, |
| "num_tokens": 147735794.0, |
| "step": 745 |
| }, |
| { |
| "entropy": 0.845582206140865, |
| "epoch": 0.15013648771610555, |
| "grad_norm": 1.8751927614212036, |
| "learning_rate": 1.4979999999999999e-06, |
| "loss": 0.2924, |
| "mean_token_accuracy": 0.9251068283211101, |
| "num_tokens": 148475511.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.15013648771610555, |
| "eval_entropy": 0.7409035055363764, |
| "eval_loss": 0.20810416340827942, |
| "eval_mean_token_accuracy": 0.9396978216093095, |
| "eval_num_tokens": 148475511.0, |
| "eval_runtime": 7.0982, |
| "eval_samples_per_second": 137.077, |
| "eval_steps_per_second": 8.594, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.738556033372879, |
| "epoch": 0.1511373976342129, |
| "grad_norm": 0.9034550189971924, |
| "learning_rate": 1.508e-06, |
| "loss": 0.2687, |
| "mean_token_accuracy": 0.9301519659432498, |
| "num_tokens": 149602202.0, |
| "step": 755 |
| }, |
| { |
| "entropy": 0.8039004320448095, |
| "epoch": 0.1521383075523203, |
| "grad_norm": 0.7859927415847778, |
| "learning_rate": 1.518e-06, |
| "loss": 0.2835, |
| "mean_token_accuracy": 0.9249755826863375, |
| "num_tokens": 150660031.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.8235690740021793, |
| "epoch": 0.15313921747042766, |
| "grad_norm": 0.9304025769233704, |
| "learning_rate": 1.528e-06, |
| "loss": 0.2837, |
| "mean_token_accuracy": 0.9240913233973763, |
| "num_tokens": 151705865.0, |
| "step": 765 |
| }, |
| { |
| "entropy": 0.8295753197236495, |
| "epoch": 0.15414012738853503, |
| "grad_norm": 0.6954373717308044, |
| "learning_rate": 1.538e-06, |
| "loss": 0.2826, |
| "mean_token_accuracy": 0.9268539065664465, |
| "num_tokens": 152662320.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.8385162537748163, |
| "epoch": 0.15514103730664242, |
| "grad_norm": 1.6783963441848755, |
| "learning_rate": 1.548e-06, |
| "loss": 0.2861, |
| "mean_token_accuracy": 0.9266390404917977, |
| "num_tokens": 153390397.0, |
| "step": 775 |
| }, |
| { |
| "entropy": 0.7379371079531583, |
| "epoch": 0.15614194722474978, |
| "grad_norm": 0.9129334092140198, |
| "learning_rate": 1.558e-06, |
| "loss": 0.2728, |
| "mean_token_accuracy": 0.9291019217534499, |
| "num_tokens": 154510653.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.15614194722474978, |
| "eval_entropy": 0.744651442668477, |
| "eval_loss": 0.20912407338619232, |
| "eval_mean_token_accuracy": 0.9394876233866958, |
| "eval_num_tokens": 154510653.0, |
| "eval_runtime": 7.2729, |
| "eval_samples_per_second": 133.784, |
| "eval_steps_per_second": 8.387, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.7914630618962375, |
| "epoch": 0.15714285714285714, |
| "grad_norm": 0.8796635270118713, |
| "learning_rate": 1.568e-06, |
| "loss": 0.279, |
| "mean_token_accuracy": 0.9265191034837202, |
| "num_tokens": 155598773.0, |
| "step": 785 |
| }, |
| { |
| "entropy": 0.8060251300985163, |
| "epoch": 0.1581437670609645, |
| "grad_norm": 0.722435474395752, |
| "learning_rate": 1.578e-06, |
| "loss": 0.2837, |
| "mean_token_accuracy": 0.9250888071276925, |
| "num_tokens": 156631931.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.8348277311433445, |
| "epoch": 0.1591446769790719, |
| "grad_norm": 0.6317898631095886, |
| "learning_rate": 1.588e-06, |
| "loss": 0.2873, |
| "mean_token_accuracy": 0.9253057523207231, |
| "num_tokens": 157582000.0, |
| "step": 795 |
| }, |
| { |
| "entropy": 0.8389149953018535, |
| "epoch": 0.16014558689717925, |
| "grad_norm": 1.8928760290145874, |
| "learning_rate": 1.598e-06, |
| "loss": 0.2875, |
| "mean_token_accuracy": 0.9262628761204806, |
| "num_tokens": 158320297.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.7366051608865911, |
| "epoch": 0.16114649681528662, |
| "grad_norm": 0.8541039228439331, |
| "learning_rate": 1.608e-06, |
| "loss": 0.2663, |
| "mean_token_accuracy": 0.9309936154972424, |
| "num_tokens": 159440189.0, |
| "step": 805 |
| }, |
| { |
| "entropy": 0.7848947503349998, |
| "epoch": 0.162147406733394, |
| "grad_norm": 0.752082884311676, |
| "learning_rate": 1.618e-06, |
| "loss": 0.2813, |
| "mean_token_accuracy": 0.9258821048519829, |
| "num_tokens": 160545730.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.162147406733394, |
| "eval_entropy": 0.7330616829825229, |
| "eval_loss": 0.2090436816215515, |
| "eval_mean_token_accuracy": 0.9397259344820117, |
| "eval_num_tokens": 160545730.0, |
| "eval_runtime": 7.0312, |
| "eval_samples_per_second": 138.384, |
| "eval_steps_per_second": 8.676, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.8028561315753243, |
| "epoch": 0.16314831665150137, |
| "grad_norm": 0.7342677116394043, |
| "learning_rate": 1.628e-06, |
| "loss": 0.2804, |
| "mean_token_accuracy": 0.9256297485394911, |
| "num_tokens": 161573472.0, |
| "step": 815 |
| }, |
| { |
| "entropy": 0.8302115288647738, |
| "epoch": 0.16414922656960873, |
| "grad_norm": 0.7893772721290588, |
| "learning_rate": 1.6379999999999998e-06, |
| "loss": 0.2836, |
| "mean_token_accuracy": 0.9263590422543613, |
| "num_tokens": 162530179.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.827764184366573, |
| "epoch": 0.1651501364877161, |
| "grad_norm": 1.99238920211792, |
| "learning_rate": 1.648e-06, |
| "loss": 0.2817, |
| "mean_token_accuracy": 0.9277313232421875, |
| "num_tokens": 163263981.0, |
| "step": 825 |
| }, |
| { |
| "entropy": 0.722087900205092, |
| "epoch": 0.16615104640582348, |
| "grad_norm": 0.8378480076789856, |
| "learning_rate": 1.6579999999999998e-06, |
| "loss": 0.2627, |
| "mean_token_accuracy": 0.9315350006927143, |
| "num_tokens": 164398357.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.7725223552096974, |
| "epoch": 0.16715195632393084, |
| "grad_norm": 0.7914682626724243, |
| "learning_rate": 1.668e-06, |
| "loss": 0.2716, |
| "mean_token_accuracy": 0.9280308051542803, |
| "num_tokens": 165468194.0, |
| "step": 835 |
| }, |
| { |
| "entropy": 0.8090368211269379, |
| "epoch": 0.1681528662420382, |
| "grad_norm": 0.8411707282066345, |
| "learning_rate": 1.6779999999999999e-06, |
| "loss": 0.2795, |
| "mean_token_accuracy": 0.9261684049259532, |
| "num_tokens": 166510298.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.1681528662420382, |
| "eval_entropy": 0.726200912819534, |
| "eval_loss": 0.20932643115520477, |
| "eval_mean_token_accuracy": 0.9394924601570505, |
| "eval_num_tokens": 166510298.0, |
| "eval_runtime": 7.1551, |
| "eval_samples_per_second": 135.988, |
| "eval_steps_per_second": 8.525, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.8117384894327684, |
| "epoch": 0.1691537761601456, |
| "grad_norm": 0.6859620809555054, |
| "learning_rate": 1.6879999999999998e-06, |
| "loss": 0.2775, |
| "mean_token_accuracy": 0.9275808865373785, |
| "num_tokens": 167466578.0, |
| "step": 845 |
| }, |
| { |
| "entropy": 0.8264376792040738, |
| "epoch": 0.17015468607825296, |
| "grad_norm": 1.9784756898880005, |
| "learning_rate": 1.6979999999999999e-06, |
| "loss": 0.2832, |
| "mean_token_accuracy": 0.9271657266400077, |
| "num_tokens": 168200534.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.7185999631881714, |
| "epoch": 0.17115559599636032, |
| "grad_norm": 0.8721832036972046, |
| "learning_rate": 1.7079999999999998e-06, |
| "loss": 0.2669, |
| "mean_token_accuracy": 0.9305740556933663, |
| "num_tokens": 169340781.0, |
| "step": 855 |
| }, |
| { |
| "entropy": 0.7641947990114039, |
| "epoch": 0.1721565059144677, |
| "grad_norm": 0.7572875618934631, |
| "learning_rate": 1.718e-06, |
| "loss": 0.2713, |
| "mean_token_accuracy": 0.9280713466080752, |
| "num_tokens": 170422517.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.7978116544810209, |
| "epoch": 0.17315741583257507, |
| "grad_norm": 0.7459094524383545, |
| "learning_rate": 1.7279999999999998e-06, |
| "loss": 0.2765, |
| "mean_token_accuracy": 0.9266852221705697, |
| "num_tokens": 171469260.0, |
| "step": 865 |
| }, |
| { |
| "entropy": 0.8156883180141449, |
| "epoch": 0.17415832575068244, |
| "grad_norm": 0.6680793166160583, |
| "learning_rate": 1.738e-06, |
| "loss": 0.2768, |
| "mean_token_accuracy": 0.9277672919360074, |
| "num_tokens": 172427066.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.17415832575068244, |
| "eval_entropy": 0.7165868282318115, |
| "eval_loss": 0.20711387693881989, |
| "eval_mean_token_accuracy": 0.9398616259215308, |
| "eval_num_tokens": 172427066.0, |
| "eval_runtime": 7.0985, |
| "eval_samples_per_second": 137.071, |
| "eval_steps_per_second": 8.593, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.8210206216031855, |
| "epoch": 0.1751592356687898, |
| "grad_norm": 1.6756982803344727, |
| "learning_rate": 1.7479999999999998e-06, |
| "loss": 0.2764, |
| "mean_token_accuracy": 0.9285656040365046, |
| "num_tokens": 173142991.0, |
| "step": 875 |
| }, |
| { |
| "entropy": 0.7199788321148265, |
| "epoch": 0.1761601455868972, |
| "grad_norm": 0.8849101066589355, |
| "learning_rate": 1.758e-06, |
| "loss": 0.2598, |
| "mean_token_accuracy": 0.9321709827943282, |
| "num_tokens": 174282576.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.7870851630514318, |
| "epoch": 0.17716105550500455, |
| "grad_norm": 0.7448862791061401, |
| "learning_rate": 1.7679999999999998e-06, |
| "loss": 0.2737, |
| "mean_token_accuracy": 0.9282706352797422, |
| "num_tokens": 175369483.0, |
| "step": 885 |
| }, |
| { |
| "entropy": 0.8100219070911407, |
| "epoch": 0.1781619654231119, |
| "grad_norm": 0.8964276313781738, |
| "learning_rate": 1.778e-06, |
| "loss": 0.2773, |
| "mean_token_accuracy": 0.9265438226136294, |
| "num_tokens": 176409360.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.8189065765250813, |
| "epoch": 0.1791628753412193, |
| "grad_norm": 0.7688677906990051, |
| "learning_rate": 1.7879999999999999e-06, |
| "loss": 0.2699, |
| "mean_token_accuracy": 0.9298083012754267, |
| "num_tokens": 177361894.0, |
| "step": 895 |
| }, |
| { |
| "entropy": 0.8263901694254442, |
| "epoch": 0.18016378525932666, |
| "grad_norm": 1.8425803184509277, |
| "learning_rate": 1.798e-06, |
| "loss": 0.2763, |
| "mean_token_accuracy": 0.9287969903512434, |
| "num_tokens": 178089733.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.18016378525932666, |
| "eval_entropy": 0.7217419499256572, |
| "eval_loss": 0.20589642226696014, |
| "eval_mean_token_accuracy": 0.9406212060177912, |
| "eval_num_tokens": 178089733.0, |
| "eval_runtime": 7.1263, |
| "eval_samples_per_second": 136.536, |
| "eval_steps_per_second": 8.56, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.72322096824646, |
| "epoch": 0.18116469517743403, |
| "grad_norm": 0.8959372043609619, |
| "learning_rate": 1.8079999999999999e-06, |
| "loss": 0.2666, |
| "mean_token_accuracy": 0.9303640398112211, |
| "num_tokens": 179231674.0, |
| "step": 905 |
| }, |
| { |
| "entropy": 0.7706199407577514, |
| "epoch": 0.1821656050955414, |
| "grad_norm": 0.8667539358139038, |
| "learning_rate": 1.818e-06, |
| "loss": 0.2765, |
| "mean_token_accuracy": 0.9267726708542217, |
| "num_tokens": 180315249.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.7906596568497745, |
| "epoch": 0.18316651501364878, |
| "grad_norm": 0.7227725982666016, |
| "learning_rate": 1.828e-06, |
| "loss": 0.2686, |
| "mean_token_accuracy": 0.9294304208322005, |
| "num_tokens": 181340192.0, |
| "step": 915 |
| }, |
| { |
| "entropy": 0.8029232171448795, |
| "epoch": 0.18416742493175614, |
| "grad_norm": 0.6649354696273804, |
| "learning_rate": 1.838e-06, |
| "loss": 0.2667, |
| "mean_token_accuracy": 0.9300361882556568, |
| "num_tokens": 182307935.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.7977046684785323, |
| "epoch": 0.1851683348498635, |
| "grad_norm": 1.6901991367340088, |
| "learning_rate": 1.848e-06, |
| "loss": 0.2723, |
| "mean_token_accuracy": 0.9294875199144537, |
| "num_tokens": 183060078.0, |
| "step": 925 |
| }, |
| { |
| "entropy": 0.7040667669339613, |
| "epoch": 0.1861692447679709, |
| "grad_norm": 0.8532208204269409, |
| "learning_rate": 1.858e-06, |
| "loss": 0.2571, |
| "mean_token_accuracy": 0.9326450272039933, |
| "num_tokens": 184221420.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.1861692447679709, |
| "eval_entropy": 0.725787528225633, |
| "eval_loss": 0.20359419286251068, |
| "eval_mean_token_accuracy": 0.9418040504221057, |
| "eval_num_tokens": 184221420.0, |
| "eval_runtime": 7.1054, |
| "eval_samples_per_second": 136.938, |
| "eval_steps_per_second": 8.585, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.7605256313627416, |
| "epoch": 0.18717015468607826, |
| "grad_norm": 0.740172266960144, |
| "learning_rate": 1.868e-06, |
| "loss": 0.2642, |
| "mean_token_accuracy": 0.9301990406079725, |
| "num_tokens": 185315366.0, |
| "step": 935 |
| }, |
| { |
| "entropy": 0.788138997554779, |
| "epoch": 0.18817106460418562, |
| "grad_norm": 0.7328667640686035, |
| "learning_rate": 1.8779999999999998e-06, |
| "loss": 0.2594, |
| "mean_token_accuracy": 0.9303668347272006, |
| "num_tokens": 186337490.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.815633828531612, |
| "epoch": 0.189171974522293, |
| "grad_norm": 0.6744860410690308, |
| "learning_rate": 1.8879999999999998e-06, |
| "loss": 0.2702, |
| "mean_token_accuracy": 0.9292519065466794, |
| "num_tokens": 187269716.0, |
| "step": 945 |
| }, |
| { |
| "entropy": 0.8176946092735637, |
| "epoch": 0.19017288444040037, |
| "grad_norm": 1.8253427743911743, |
| "learning_rate": 1.8979999999999999e-06, |
| "loss": 0.2659, |
| "mean_token_accuracy": 0.931620988520709, |
| "num_tokens": 187991692.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.7086463402618062, |
| "epoch": 0.19117379435850773, |
| "grad_norm": 0.9137970209121704, |
| "learning_rate": 1.9079999999999998e-06, |
| "loss": 0.2586, |
| "mean_token_accuracy": 0.9328472657637162, |
| "num_tokens": 189120364.0, |
| "step": 955 |
| }, |
| { |
| "entropy": 0.7753447532653809, |
| "epoch": 0.1921747042766151, |
| "grad_norm": 0.8782041668891907, |
| "learning_rate": 1.9179999999999997e-06, |
| "loss": 0.2659, |
| "mean_token_accuracy": 0.9294385693290017, |
| "num_tokens": 190191796.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.1921747042766151, |
| "eval_entropy": 0.724771861170159, |
| "eval_loss": 0.2042693942785263, |
| "eval_mean_token_accuracy": 0.9414341498593815, |
| "eval_num_tokens": 190191796.0, |
| "eval_runtime": 7.0787, |
| "eval_samples_per_second": 137.455, |
| "eval_steps_per_second": 8.617, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.7960949518463828, |
| "epoch": 0.19317561419472248, |
| "grad_norm": 0.713079035282135, |
| "learning_rate": 1.928e-06, |
| "loss": 0.2676, |
| "mean_token_accuracy": 0.9294228992678902, |
| "num_tokens": 191241957.0, |
| "step": 965 |
| }, |
| { |
| "entropy": 0.8051728243177587, |
| "epoch": 0.19417652411282985, |
| "grad_norm": 0.666320264339447, |
| "learning_rate": 1.938e-06, |
| "loss": 0.2652, |
| "mean_token_accuracy": 0.9296959736130455, |
| "num_tokens": 192200870.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.8057123639366843, |
| "epoch": 0.1951774340309372, |
| "grad_norm": 1.6562741994857788, |
| "learning_rate": 1.948e-06, |
| "loss": 0.2605, |
| "mean_token_accuracy": 0.9326337846842679, |
| "num_tokens": 192934859.0, |
| "step": 975 |
| }, |
| { |
| "entropy": 0.7061334458264438, |
| "epoch": 0.1961783439490446, |
| "grad_norm": 0.8408867120742798, |
| "learning_rate": 1.9579999999999997e-06, |
| "loss": 0.2536, |
| "mean_token_accuracy": 0.9338537595488808, |
| "num_tokens": 194104139.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.7579250015995719, |
| "epoch": 0.19717925386715196, |
| "grad_norm": 0.7840215563774109, |
| "learning_rate": 1.968e-06, |
| "loss": 0.2595, |
| "mean_token_accuracy": 0.9305404896085913, |
| "num_tokens": 195188455.0, |
| "step": 985 |
| }, |
| { |
| "entropy": 0.7914492504163222, |
| "epoch": 0.19818016378525932, |
| "grad_norm": 0.692529559135437, |
| "learning_rate": 1.978e-06, |
| "loss": 0.2671, |
| "mean_token_accuracy": 0.9293763789263638, |
| "num_tokens": 196209141.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.19818016378525932, |
| "eval_entropy": 0.7139858779360037, |
| "eval_loss": 0.20102433860301971, |
| "eval_mean_token_accuracy": 0.9419027381255979, |
| "eval_num_tokens": 196209141.0, |
| "eval_runtime": 7.1947, |
| "eval_samples_per_second": 135.239, |
| "eval_steps_per_second": 8.479, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.7997732845219698, |
| "epoch": 0.19918107370336668, |
| "grad_norm": 0.7436501979827881, |
| "learning_rate": 1.988e-06, |
| "loss": 0.2613, |
| "mean_token_accuracy": 0.9318017385222696, |
| "num_tokens": 197169823.0, |
| "step": 995 |
| }, |
| { |
| "entropy": 0.808041772517291, |
| "epoch": 0.20018198362147407, |
| "grad_norm": 1.8346213102340698, |
| "learning_rate": 1.9979999999999998e-06, |
| "loss": 0.2684, |
| "mean_token_accuracy": 0.9309409526261416, |
| "num_tokens": 197904294.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.7090240332213316, |
| "epoch": 0.20118289353958144, |
| "grad_norm": 0.8898105025291443, |
| "learning_rate": 1.9991103202846973e-06, |
| "loss": 0.2542, |
| "mean_token_accuracy": 0.9338583967902444, |
| "num_tokens": 199040537.0, |
| "step": 1005 |
| }, |
| { |
| "entropy": 0.7625901590694081, |
| "epoch": 0.2021838034576888, |
| "grad_norm": 0.7580350041389465, |
| "learning_rate": 1.997998220640569e-06, |
| "loss": 0.2684, |
| "mean_token_accuracy": 0.9290495872497558, |
| "num_tokens": 200122330.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.7868972290645946, |
| "epoch": 0.2031847133757962, |
| "grad_norm": 0.9172696471214294, |
| "learning_rate": 1.996886120996441e-06, |
| "loss": 0.2592, |
| "mean_token_accuracy": 0.9309038433161649, |
| "num_tokens": 201149457.0, |
| "step": 1015 |
| }, |
| { |
| "entropy": 0.7947816740382802, |
| "epoch": 0.20418562329390355, |
| "grad_norm": 0.6719794273376465, |
| "learning_rate": 1.9957740213523133e-06, |
| "loss": 0.2634, |
| "mean_token_accuracy": 0.9316002515229311, |
| "num_tokens": 202101608.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.20418562329390355, |
| "eval_entropy": 0.7147035344702298, |
| "eval_loss": 0.20133115351200104, |
| "eval_mean_token_accuracy": 0.9419465289741266, |
| "eval_num_tokens": 202101608.0, |
| "eval_runtime": 7.0701, |
| "eval_samples_per_second": 137.622, |
| "eval_steps_per_second": 8.628, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.813241909308867, |
| "epoch": 0.2051865332120109, |
| "grad_norm": 1.68107271194458, |
| "learning_rate": 1.994661921708185e-06, |
| "loss": 0.2721, |
| "mean_token_accuracy": 0.9300860870968212, |
| "num_tokens": 202823517.0, |
| "step": 1025 |
| }, |
| { |
| "entropy": 0.6989771512421694, |
| "epoch": 0.2061874431301183, |
| "grad_norm": 0.9269376397132874, |
| "learning_rate": 1.9935498220640566e-06, |
| "loss": 0.2535, |
| "mean_token_accuracy": 0.9341622206297788, |
| "num_tokens": 203958059.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.7591653926806017, |
| "epoch": 0.20718835304822567, |
| "grad_norm": 0.7755193114280701, |
| "learning_rate": 1.992437722419929e-06, |
| "loss": 0.2648, |
| "mean_token_accuracy": 0.9302975632927635, |
| "num_tokens": 205042771.0, |
| "step": 1035 |
| }, |
| { |
| "entropy": 0.7722339581359516, |
| "epoch": 0.20818926296633303, |
| "grad_norm": 0.8515006303787231, |
| "learning_rate": 1.9913256227758007e-06, |
| "loss": 0.2638, |
| "mean_token_accuracy": 0.9300298192284324, |
| "num_tokens": 206086748.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.7889559702439741, |
| "epoch": 0.2091901728844404, |
| "grad_norm": 0.6690332293510437, |
| "learning_rate": 1.9902135231316726e-06, |
| "loss": 0.2565, |
| "mean_token_accuracy": 0.9327416582541033, |
| "num_tokens": 207023751.0, |
| "step": 1045 |
| }, |
| { |
| "entropy": 0.7905822466720235, |
| "epoch": 0.21019108280254778, |
| "grad_norm": 1.524138331413269, |
| "learning_rate": 1.9891014234875445e-06, |
| "loss": 0.2618, |
| "mean_token_accuracy": 0.9318056187846444, |
| "num_tokens": 207751826.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.21019108280254778, |
| "eval_entropy": 0.6987361546422615, |
| "eval_loss": 0.20032314956188202, |
| "eval_mean_token_accuracy": 0.9415211052191063, |
| "eval_num_tokens": 207751826.0, |
| "eval_runtime": 7.1237, |
| "eval_samples_per_second": 136.587, |
| "eval_steps_per_second": 8.563, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.700323451649059, |
| "epoch": 0.21119199272065514, |
| "grad_norm": 0.9274206161499023, |
| "learning_rate": 1.9879893238434163e-06, |
| "loss": 0.2499, |
| "mean_token_accuracy": 0.9347092021595348, |
| "num_tokens": 208886557.0, |
| "step": 1055 |
| }, |
| { |
| "entropy": 0.7475979534062472, |
| "epoch": 0.2121929026387625, |
| "grad_norm": 0.8458713293075562, |
| "learning_rate": 1.986877224199288e-06, |
| "loss": 0.261, |
| "mean_token_accuracy": 0.9306270117109472, |
| "num_tokens": 209999842.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.7634694963693619, |
| "epoch": 0.2131938125568699, |
| "grad_norm": 0.7438536882400513, |
| "learning_rate": 1.98576512455516e-06, |
| "loss": 0.2612, |
| "mean_token_accuracy": 0.9316813165491278, |
| "num_tokens": 211047482.0, |
| "step": 1065 |
| }, |
| { |
| "entropy": 0.7860465927557512, |
| "epoch": 0.21419472247497726, |
| "grad_norm": 0.6679530739784241, |
| "learning_rate": 1.984653024911032e-06, |
| "loss": 0.2616, |
| "mean_token_accuracy": 0.932481362061067, |
| "num_tokens": 211999890.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.7879262474450198, |
| "epoch": 0.21519563239308462, |
| "grad_norm": 1.5317449569702148, |
| "learning_rate": 1.9835409252669037e-06, |
| "loss": 0.256, |
| "mean_token_accuracy": 0.9341791461814534, |
| "num_tokens": 212724971.0, |
| "step": 1075 |
| }, |
| { |
| "entropy": 0.6914473251862959, |
| "epoch": 0.21619654231119198, |
| "grad_norm": 0.9009571671485901, |
| "learning_rate": 1.9824288256227756e-06, |
| "loss": 0.2469, |
| "mean_token_accuracy": 0.935233576189388, |
| "num_tokens": 213865483.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.21619654231119198, |
| "eval_entropy": 0.6992622926586964, |
| "eval_loss": 0.19818614423274994, |
| "eval_mean_token_accuracy": 0.9426181824480901, |
| "eval_num_tokens": 213865483.0, |
| "eval_runtime": 7.3835, |
| "eval_samples_per_second": 131.78, |
| "eval_steps_per_second": 8.262, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.7396956460042433, |
| "epoch": 0.21719745222929937, |
| "grad_norm": 0.7676311135292053, |
| "learning_rate": 1.9813167259786475e-06, |
| "loss": 0.2553, |
| "mean_token_accuracy": 0.9329301888292486, |
| "num_tokens": 214946018.0, |
| "step": 1085 |
| }, |
| { |
| "entropy": 0.7576209339228543, |
| "epoch": 0.21819836214740673, |
| "grad_norm": 0.9512864351272583, |
| "learning_rate": 1.9802046263345197e-06, |
| "loss": 0.2574, |
| "mean_token_accuracy": 0.931071363254027, |
| "num_tokens": 215999988.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.7767835638739846, |
| "epoch": 0.2191992720655141, |
| "grad_norm": 0.6882670521736145, |
| "learning_rate": 1.979092526690391e-06, |
| "loss": 0.2518, |
| "mean_token_accuracy": 0.9337078777226535, |
| "num_tokens": 216962447.0, |
| "step": 1095 |
| }, |
| { |
| "entropy": 0.7832509934902191, |
| "epoch": 0.22020018198362148, |
| "grad_norm": 1.6970500946044922, |
| "learning_rate": 1.977980427046263e-06, |
| "loss": 0.2583, |
| "mean_token_accuracy": 0.9332552210851149, |
| "num_tokens": 217692537.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.6820299370722337, |
| "epoch": 0.22120109190172885, |
| "grad_norm": 0.8949645757675171, |
| "learning_rate": 1.9768683274021353e-06, |
| "loss": 0.2445, |
| "mean_token_accuracy": 0.935930597782135, |
| "num_tokens": 218839476.0, |
| "step": 1105 |
| }, |
| { |
| "entropy": 0.72886228073727, |
| "epoch": 0.2222020018198362, |
| "grad_norm": 0.8621814846992493, |
| "learning_rate": 1.975756227758007e-06, |
| "loss": 0.2493, |
| "mean_token_accuracy": 0.9338542092930187, |
| "num_tokens": 219923390.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.2222020018198362, |
| "eval_entropy": 0.6884255741463333, |
| "eval_loss": 0.19926953315734863, |
| "eval_mean_token_accuracy": 0.9423930107570085, |
| "eval_num_tokens": 219923390.0, |
| "eval_runtime": 7.0927, |
| "eval_samples_per_second": 137.184, |
| "eval_steps_per_second": 8.6, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.7540641031482003, |
| "epoch": 0.22320291173794357, |
| "grad_norm": 0.971157431602478, |
| "learning_rate": 1.974644128113879e-06, |
| "loss": 0.2567, |
| "mean_token_accuracy": 0.932219631021673, |
| "num_tokens": 220957232.0, |
| "step": 1115 |
| }, |
| { |
| "entropy": 0.7798225131901828, |
| "epoch": 0.22420382165605096, |
| "grad_norm": 0.7949030995368958, |
| "learning_rate": 1.973532028469751e-06, |
| "loss": 0.2581, |
| "mean_token_accuracy": 0.9322475785558874, |
| "num_tokens": 221909237.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.7734460061246698, |
| "epoch": 0.22520473157415832, |
| "grad_norm": 1.671317219734192, |
| "learning_rate": 1.9724199288256227e-06, |
| "loss": 0.2532, |
| "mean_token_accuracy": 0.9343869902870872, |
| "num_tokens": 222629518.0, |
| "step": 1125 |
| }, |
| { |
| "entropy": 0.6769220758568156, |
| "epoch": 0.22620564149226569, |
| "grad_norm": 0.8417484164237976, |
| "learning_rate": 1.9713078291814946e-06, |
| "loss": 0.2432, |
| "mean_token_accuracy": 0.9365156341682781, |
| "num_tokens": 223771141.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.7289805867455222, |
| "epoch": 0.22720655141037308, |
| "grad_norm": 0.8334816694259644, |
| "learning_rate": 1.9701957295373665e-06, |
| "loss": 0.2564, |
| "mean_token_accuracy": 0.9321391544558785, |
| "num_tokens": 224858611.0, |
| "step": 1135 |
| }, |
| { |
| "entropy": 0.7575576175342906, |
| "epoch": 0.22820746132848044, |
| "grad_norm": 0.686861515045166, |
| "learning_rate": 1.9690836298932383e-06, |
| "loss": 0.2553, |
| "mean_token_accuracy": 0.932028527693315, |
| "num_tokens": 225904498.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.22820746132848044, |
| "eval_entropy": 0.687260666831595, |
| "eval_loss": 0.19723324477672577, |
| "eval_mean_token_accuracy": 0.9429298082336051, |
| "eval_num_tokens": 225904498.0, |
| "eval_runtime": 7.2193, |
| "eval_samples_per_second": 134.777, |
| "eval_steps_per_second": 8.45, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.7571648413484747, |
| "epoch": 0.2292083712465878, |
| "grad_norm": 0.6368003487586975, |
| "learning_rate": 1.96797153024911e-06, |
| "loss": 0.2484, |
| "mean_token_accuracy": 0.9342491680925543, |
| "num_tokens": 226858707.0, |
| "step": 1145 |
| }, |
| { |
| "entropy": 0.7685175494714217, |
| "epoch": 0.2302092811646952, |
| "grad_norm": 1.7895119190216064, |
| "learning_rate": 1.966859430604982e-06, |
| "loss": 0.2531, |
| "mean_token_accuracy": 0.9351052864031358, |
| "num_tokens": 227586735.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.6730130303989758, |
| "epoch": 0.23121019108280255, |
| "grad_norm": 0.8514677286148071, |
| "learning_rate": 1.9657473309608543e-06, |
| "loss": 0.2434, |
| "mean_token_accuracy": 0.9364338099956513, |
| "num_tokens": 228710792.0, |
| "step": 1155 |
| }, |
| { |
| "entropy": 0.7245557562871413, |
| "epoch": 0.23221110100090991, |
| "grad_norm": 0.7925510406494141, |
| "learning_rate": 1.9646352313167257e-06, |
| "loss": 0.2565, |
| "mean_token_accuracy": 0.9326732272451574, |
| "num_tokens": 229789807.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.7381821754303846, |
| "epoch": 0.23321201091901728, |
| "grad_norm": 0.7272951006889343, |
| "learning_rate": 1.9635231316725976e-06, |
| "loss": 0.2467, |
| "mean_token_accuracy": 0.9342716991901397, |
| "num_tokens": 230830474.0, |
| "step": 1165 |
| }, |
| { |
| "entropy": 0.7532747295769778, |
| "epoch": 0.23421292083712467, |
| "grad_norm": 0.6639147996902466, |
| "learning_rate": 1.96241103202847e-06, |
| "loss": 0.2521, |
| "mean_token_accuracy": 0.9335366579619321, |
| "num_tokens": 231790758.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.23421292083712467, |
| "eval_entropy": 0.6738434072400703, |
| "eval_loss": 0.19970019161701202, |
| "eval_mean_token_accuracy": 0.9427229283285923, |
| "eval_num_tokens": 231790758.0, |
| "eval_runtime": 7.0658, |
| "eval_samples_per_second": 137.705, |
| "eval_steps_per_second": 8.633, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.7472162235866894, |
| "epoch": 0.23521383075523203, |
| "grad_norm": 1.5396642684936523, |
| "learning_rate": 1.9612989323843417e-06, |
| "loss": 0.2494, |
| "mean_token_accuracy": 0.9352785722775893, |
| "num_tokens": 232530867.0, |
| "step": 1175 |
| }, |
| { |
| "entropy": 0.6697620332241059, |
| "epoch": 0.2362147406733394, |
| "grad_norm": 0.8647318482398987, |
| "learning_rate": 1.960186832740213e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.9363701712001454, |
| "num_tokens": 233651796.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.7114524765448137, |
| "epoch": 0.23721565059144678, |
| "grad_norm": 0.8350867629051208, |
| "learning_rate": 1.9590747330960855e-06, |
| "loss": 0.251, |
| "mean_token_accuracy": 0.9338924034075303, |
| "num_tokens": 234754552.0, |
| "step": 1185 |
| }, |
| { |
| "entropy": 0.7274992368438027, |
| "epoch": 0.23821656050955414, |
| "grad_norm": 0.6969212293624878, |
| "learning_rate": 1.9579626334519573e-06, |
| "loss": 0.2487, |
| "mean_token_accuracy": 0.9337175385518508, |
| "num_tokens": 235782960.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.7455267862840133, |
| "epoch": 0.2392174704276615, |
| "grad_norm": 0.624343752861023, |
| "learning_rate": 1.956850533807829e-06, |
| "loss": 0.2532, |
| "mean_token_accuracy": 0.9332292107018557, |
| "num_tokens": 236735963.0, |
| "step": 1195 |
| }, |
| { |
| "entropy": 0.7484802782535553, |
| "epoch": 0.24021838034576887, |
| "grad_norm": 1.5747654438018799, |
| "learning_rate": 1.955738434163701e-06, |
| "loss": 0.2506, |
| "mean_token_accuracy": 0.9349917281757701, |
| "num_tokens": 237476602.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.24021838034576887, |
| "eval_entropy": 0.6878872777594894, |
| "eval_loss": 0.19709168374538422, |
| "eval_mean_token_accuracy": 0.9429968146027111, |
| "eval_num_tokens": 237476602.0, |
| "eval_runtime": 7.0622, |
| "eval_samples_per_second": 137.775, |
| "eval_steps_per_second": 8.637, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.671830934827978, |
| "epoch": 0.24121929026387626, |
| "grad_norm": 0.8599943518638611, |
| "learning_rate": 1.954626334519573e-06, |
| "loss": 0.2366, |
| "mean_token_accuracy": 0.9383657791397788, |
| "num_tokens": 238617406.0, |
| "step": 1205 |
| }, |
| { |
| "entropy": 0.7293940170244737, |
| "epoch": 0.24222020018198362, |
| "grad_norm": 0.754350483417511, |
| "learning_rate": 1.9535142348754447e-06, |
| "loss": 0.2512, |
| "mean_token_accuracy": 0.9323639192364432, |
| "num_tokens": 239700088.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.7499282219193198, |
| "epoch": 0.24322111010009098, |
| "grad_norm": 0.7476288080215454, |
| "learning_rate": 1.9524021352313166e-06, |
| "loss": 0.2552, |
| "mean_token_accuracy": 0.9318941896611994, |
| "num_tokens": 240733335.0, |
| "step": 1215 |
| }, |
| { |
| "entropy": 0.7511982554739172, |
| "epoch": 0.24422202001819837, |
| "grad_norm": 0.6863506436347961, |
| "learning_rate": 1.9512900355871885e-06, |
| "loss": 0.243, |
| "mean_token_accuracy": 0.9356909887357192, |
| "num_tokens": 241687104.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.749161382154985, |
| "epoch": 0.24522292993630573, |
| "grad_norm": 1.631894826889038, |
| "learning_rate": 1.9501779359430603e-06, |
| "loss": 0.2514, |
| "mean_token_accuracy": 0.9346412853761152, |
| "num_tokens": 242426018.0, |
| "step": 1225 |
| }, |
| { |
| "entropy": 0.6726668021895669, |
| "epoch": 0.2462238398544131, |
| "grad_norm": 0.8596307635307312, |
| "learning_rate": 1.949065836298932e-06, |
| "loss": 0.2454, |
| "mean_token_accuracy": 0.9364431234923276, |
| "num_tokens": 243548718.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.2462238398544131, |
| "eval_entropy": 0.6812147097509416, |
| "eval_loss": 0.19748112559318542, |
| "eval_mean_token_accuracy": 0.943136929488573, |
| "eval_num_tokens": 243548718.0, |
| "eval_runtime": 7.0861, |
| "eval_samples_per_second": 137.311, |
| "eval_steps_per_second": 8.608, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.7223554464903745, |
| "epoch": 0.24722474977252049, |
| "grad_norm": 0.8182641863822937, |
| "learning_rate": 1.947953736654804e-06, |
| "loss": 0.2473, |
| "mean_token_accuracy": 0.9328928150913932, |
| "num_tokens": 244634262.0, |
| "step": 1235 |
| }, |
| { |
| "entropy": 0.7410072830590335, |
| "epoch": 0.24822565969062785, |
| "grad_norm": 0.831390380859375, |
| "learning_rate": 1.9468416370106763e-06, |
| "loss": 0.2458, |
| "mean_token_accuracy": 0.9338255047798156, |
| "num_tokens": 245677570.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.7599548085169359, |
| "epoch": 0.2492265696087352, |
| "grad_norm": 0.8275907635688782, |
| "learning_rate": 1.9457295373665477e-06, |
| "loss": 0.2424, |
| "mean_token_accuracy": 0.9356081453236667, |
| "num_tokens": 246647643.0, |
| "step": 1245 |
| }, |
| { |
| "entropy": 0.7624553501605987, |
| "epoch": 0.2502274795268426, |
| "grad_norm": 1.9468681812286377, |
| "learning_rate": 1.94461743772242e-06, |
| "loss": 0.2445, |
| "mean_token_accuracy": 0.9364055861126293, |
| "num_tokens": 247388979.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.6826613940975883, |
| "epoch": 0.25122838944494996, |
| "grad_norm": 0.8892253041267395, |
| "learning_rate": 1.943505338078292e-06, |
| "loss": 0.2377, |
| "mean_token_accuracy": 0.937986614487388, |
| "num_tokens": 248507582.0, |
| "step": 1255 |
| }, |
| { |
| "entropy": 0.7349318878217177, |
| "epoch": 0.2522292993630573, |
| "grad_norm": 0.7683637738227844, |
| "learning_rate": 1.9423932384341637e-06, |
| "loss": 0.2494, |
| "mean_token_accuracy": 0.9336408035321669, |
| "num_tokens": 249580005.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.2522292993630573, |
| "eval_entropy": 0.6866021556932418, |
| "eval_loss": 0.19629527628421783, |
| "eval_mean_token_accuracy": 0.9435793952863725, |
| "eval_num_tokens": 249580005.0, |
| "eval_runtime": 7.0601, |
| "eval_samples_per_second": 137.817, |
| "eval_steps_per_second": 8.64, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.7540321504527873, |
| "epoch": 0.2532302092811647, |
| "grad_norm": 0.7559732794761658, |
| "learning_rate": 1.9412811387900356e-06, |
| "loss": 0.2516, |
| "mean_token_accuracy": 0.9330432496287606, |
| "num_tokens": 250621468.0, |
| "step": 1265 |
| }, |
| { |
| "entropy": 0.7423492084849964, |
| "epoch": 0.25423111919927205, |
| "grad_norm": 0.7324007153511047, |
| "learning_rate": 1.9401690391459075e-06, |
| "loss": 0.2381, |
| "mean_token_accuracy": 0.9373159939592535, |
| "num_tokens": 251581237.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.7672662193124945, |
| "epoch": 0.2552320291173794, |
| "grad_norm": 1.4408397674560547, |
| "learning_rate": 1.9390569395017793e-06, |
| "loss": 0.2423, |
| "mean_token_accuracy": 0.9368164999918505, |
| "num_tokens": 252303125.0, |
| "step": 1275 |
| }, |
| { |
| "entropy": 0.6668464682318948, |
| "epoch": 0.25623293903548683, |
| "grad_norm": 0.9180498123168945, |
| "learning_rate": 1.937944839857651e-06, |
| "loss": 0.2387, |
| "mean_token_accuracy": 0.9375127759846774, |
| "num_tokens": 253437743.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 0.7173917884176427, |
| "epoch": 0.2572338489535942, |
| "grad_norm": 0.7993113994598389, |
| "learning_rate": 1.936832740213523e-06, |
| "loss": 0.2435, |
| "mean_token_accuracy": 0.9355504203926434, |
| "num_tokens": 254543380.0, |
| "step": 1285 |
| }, |
| { |
| "entropy": 0.7427029658447613, |
| "epoch": 0.25823475887170155, |
| "grad_norm": 0.7974119186401367, |
| "learning_rate": 1.935720640569395e-06, |
| "loss": 0.2404, |
| "mean_token_accuracy": 0.9355508500879461, |
| "num_tokens": 255569004.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.25823475887170155, |
| "eval_entropy": 0.683321903963558, |
| "eval_loss": 0.197197824716568, |
| "eval_mean_token_accuracy": 0.9433203554544293, |
| "eval_num_tokens": 255569004.0, |
| "eval_runtime": 7.0779, |
| "eval_samples_per_second": 137.47, |
| "eval_steps_per_second": 8.618, |
| "step": 1290 |
| }, |
| { |
| "entropy": 0.7673221891576594, |
| "epoch": 0.2592356687898089, |
| "grad_norm": 0.6773776412010193, |
| "learning_rate": 1.9346085409252667e-06, |
| "loss": 0.2522, |
| "mean_token_accuracy": 0.9333479886705225, |
| "num_tokens": 256521955.0, |
| "step": 1295 |
| }, |
| { |
| "entropy": 0.7722088591618972, |
| "epoch": 0.2602365787079163, |
| "grad_norm": 1.5807671546936035, |
| "learning_rate": 1.9334964412811386e-06, |
| "loss": 0.2445, |
| "mean_token_accuracy": 0.9367749100381678, |
| "num_tokens": 257261891.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.6817871857773173, |
| "epoch": 0.26123748862602364, |
| "grad_norm": 0.8420500159263611, |
| "learning_rate": 1.932384341637011e-06, |
| "loss": 0.2307, |
| "mean_token_accuracy": 0.9392897643826225, |
| "num_tokens": 258422670.0, |
| "step": 1305 |
| }, |
| { |
| "entropy": 0.7291848995468834, |
| "epoch": 0.262238398544131, |
| "grad_norm": 0.8453850746154785, |
| "learning_rate": 1.9312722419928823e-06, |
| "loss": 0.2367, |
| "mean_token_accuracy": 0.9367011297832836, |
| "num_tokens": 259498893.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 0.7557943192395297, |
| "epoch": 0.2632393084622384, |
| "grad_norm": 0.7049674391746521, |
| "learning_rate": 1.930160142348754e-06, |
| "loss": 0.2394, |
| "mean_token_accuracy": 0.9362640223719857, |
| "num_tokens": 260532718.0, |
| "step": 1315 |
| }, |
| { |
| "entropy": 0.7660176255486228, |
| "epoch": 0.2642402183803458, |
| "grad_norm": 0.7112149596214294, |
| "learning_rate": 1.9290480427046265e-06, |
| "loss": 0.2442, |
| "mean_token_accuracy": 0.9359012392434207, |
| "num_tokens": 261477169.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.2642402183803458, |
| "eval_entropy": 0.6886299956040304, |
| "eval_loss": 0.19393064081668854, |
| "eval_mean_token_accuracy": 0.9443301275128224, |
| "eval_num_tokens": 261477169.0, |
| "eval_runtime": 7.0513, |
| "eval_samples_per_second": 137.989, |
| "eval_steps_per_second": 8.651, |
| "step": 1320 |
| }, |
| { |
| "entropy": 0.7740896999835968, |
| "epoch": 0.26524112829845314, |
| "grad_norm": 1.7373411655426025, |
| "learning_rate": 1.9279359430604983e-06, |
| "loss": 0.2382, |
| "mean_token_accuracy": 0.937723603031852, |
| "num_tokens": 262203299.0, |
| "step": 1325 |
| }, |
| { |
| "entropy": 0.6813056788661264, |
| "epoch": 0.2662420382165605, |
| "grad_norm": 0.8700944185256958, |
| "learning_rate": 1.9268238434163697e-06, |
| "loss": 0.2344, |
| "mean_token_accuracy": 0.9385422473604029, |
| "num_tokens": 263358422.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 0.7247711669314991, |
| "epoch": 0.26724294813466787, |
| "grad_norm": 0.7497351169586182, |
| "learning_rate": 1.925711743772242e-06, |
| "loss": 0.2399, |
| "mean_token_accuracy": 0.9360633611679077, |
| "num_tokens": 264437282.0, |
| "step": 1335 |
| }, |
| { |
| "entropy": 0.7492102563381196, |
| "epoch": 0.26824385805277523, |
| "grad_norm": 0.712761402130127, |
| "learning_rate": 1.924599644128114e-06, |
| "loss": 0.2379, |
| "mean_token_accuracy": 0.9369383118369362, |
| "num_tokens": 265476998.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 0.7662336116487329, |
| "epoch": 0.26924476797088265, |
| "grad_norm": 1.0059868097305298, |
| "learning_rate": 1.9234875444839857e-06, |
| "loss": 0.235, |
| "mean_token_accuracy": 0.9384725668213584, |
| "num_tokens": 266433276.0, |
| "step": 1345 |
| }, |
| { |
| "entropy": 0.779201509735801, |
| "epoch": 0.27024567788899, |
| "grad_norm": 1.7948832511901855, |
| "learning_rate": 1.9223754448398576e-06, |
| "loss": 0.2454, |
| "mean_token_accuracy": 0.93651580973105, |
| "num_tokens": 267147370.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.27024567788899, |
| "eval_entropy": 0.6936582659111649, |
| "eval_loss": 0.18916112184524536, |
| "eval_mean_token_accuracy": 0.9457350138758049, |
| "eval_num_tokens": 267147370.0, |
| "eval_runtime": 7.0725, |
| "eval_samples_per_second": 137.575, |
| "eval_steps_per_second": 8.625, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.6807152347131209, |
| "epoch": 0.2712465878070974, |
| "grad_norm": 0.8464104533195496, |
| "learning_rate": 1.9212633451957295e-06, |
| "loss": 0.2364, |
| "mean_token_accuracy": 0.9381036953492599, |
| "num_tokens": 268288956.0, |
| "step": 1355 |
| }, |
| { |
| "entropy": 0.7280165471813895, |
| "epoch": 0.27224749772520473, |
| "grad_norm": 0.828230082988739, |
| "learning_rate": 1.9201512455516013e-06, |
| "loss": 0.2385, |
| "mean_token_accuracy": 0.9357607359235937, |
| "num_tokens": 269355784.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 0.7481856107711792, |
| "epoch": 0.2732484076433121, |
| "grad_norm": 0.7362084984779358, |
| "learning_rate": 1.919039145907473e-06, |
| "loss": 0.244, |
| "mean_token_accuracy": 0.9355522545901211, |
| "num_tokens": 270398456.0, |
| "step": 1365 |
| }, |
| { |
| "entropy": 0.7570679174228148, |
| "epoch": 0.27424931756141946, |
| "grad_norm": 0.6655718684196472, |
| "learning_rate": 1.917927046263345e-06, |
| "loss": 0.2337, |
| "mean_token_accuracy": 0.938739211992784, |
| "num_tokens": 271357374.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 0.7706907001408664, |
| "epoch": 0.2752502274795268, |
| "grad_norm": 1.7031316757202148, |
| "learning_rate": 1.916814946619217e-06, |
| "loss": 0.2383, |
| "mean_token_accuracy": 0.9374835350296714, |
| "num_tokens": 272091770.0, |
| "step": 1375 |
| }, |
| { |
| "entropy": 0.6735027275302193, |
| "epoch": 0.27625113739763424, |
| "grad_norm": 0.847005307674408, |
| "learning_rate": 1.9157028469750887e-06, |
| "loss": 0.2313, |
| "mean_token_accuracy": 0.9391226519237865, |
| "num_tokens": 273228350.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.27625113739763424, |
| "eval_entropy": 0.6851567674855716, |
| "eval_loss": 0.19188910722732544, |
| "eval_mean_token_accuracy": 0.9446489283295928, |
| "eval_num_tokens": 273228350.0, |
| "eval_runtime": 7.0137, |
| "eval_samples_per_second": 138.728, |
| "eval_steps_per_second": 8.697, |
| "step": 1380 |
| }, |
| { |
| "entropy": 0.7211063027381897, |
| "epoch": 0.2772520473157416, |
| "grad_norm": 0.7908993363380432, |
| "learning_rate": 1.914590747330961e-06, |
| "loss": 0.2372, |
| "mean_token_accuracy": 0.9372067868709564, |
| "num_tokens": 274295508.0, |
| "step": 1385 |
| }, |
| { |
| "entropy": 0.745275920087641, |
| "epoch": 0.27825295723384896, |
| "grad_norm": 0.7628899216651917, |
| "learning_rate": 1.913478647686833e-06, |
| "loss": 0.2376, |
| "mean_token_accuracy": 0.9368884086608886, |
| "num_tokens": 275339917.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 0.760984147678722, |
| "epoch": 0.2792538671519563, |
| "grad_norm": 0.6237201690673828, |
| "learning_rate": 1.9123665480427043e-06, |
| "loss": 0.2349, |
| "mean_token_accuracy": 0.9373009134422648, |
| "num_tokens": 276295274.0, |
| "step": 1395 |
| }, |
| { |
| "entropy": 0.7674431963400408, |
| "epoch": 0.2802547770700637, |
| "grad_norm": 1.5829390287399292, |
| "learning_rate": 1.9112544483985766e-06, |
| "loss": 0.2381, |
| "mean_token_accuracy": 0.9387489958242936, |
| "num_tokens": 277028591.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.6689763746478341, |
| "epoch": 0.28125568698817105, |
| "grad_norm": 0.9000157713890076, |
| "learning_rate": 1.9101423487544485e-06, |
| "loss": 0.2285, |
| "mean_token_accuracy": 0.9405502384359187, |
| "num_tokens": 278145836.0, |
| "step": 1405 |
| }, |
| { |
| "entropy": 0.7273861186070876, |
| "epoch": 0.2822565969062784, |
| "grad_norm": 0.7861266732215881, |
| "learning_rate": 1.9090302491103203e-06, |
| "loss": 0.2402, |
| "mean_token_accuracy": 0.9364809323440898, |
| "num_tokens": 279207214.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.2822565969062784, |
| "eval_entropy": 0.6891253717610093, |
| "eval_loss": 0.19434459507465363, |
| "eval_mean_token_accuracy": 0.9439800313261689, |
| "eval_num_tokens": 279207214.0, |
| "eval_runtime": 7.0731, |
| "eval_samples_per_second": 137.563, |
| "eval_steps_per_second": 8.624, |
| "step": 1410 |
| }, |
| { |
| "entropy": 0.7461361895908009, |
| "epoch": 0.28325750682438583, |
| "grad_norm": 0.7425960898399353, |
| "learning_rate": 1.907918149466192e-06, |
| "loss": 0.2326, |
| "mean_token_accuracy": 0.936630117893219, |
| "num_tokens": 280228528.0, |
| "step": 1415 |
| }, |
| { |
| "entropy": 0.7540929274125533, |
| "epoch": 0.2842584167424932, |
| "grad_norm": 0.6490366458892822, |
| "learning_rate": 1.906806049822064e-06, |
| "loss": 0.2303, |
| "mean_token_accuracy": 0.938701045513153, |
| "num_tokens": 281179143.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 0.7628308453343131, |
| "epoch": 0.28525932666060055, |
| "grad_norm": 1.7688848972320557, |
| "learning_rate": 1.9056939501779359e-06, |
| "loss": 0.2352, |
| "mean_token_accuracy": 0.938917446678335, |
| "num_tokens": 281909203.0, |
| "step": 1425 |
| }, |
| { |
| "entropy": 0.6694089114665985, |
| "epoch": 0.2862602365787079, |
| "grad_norm": 0.8820457458496094, |
| "learning_rate": 1.9045818505338077e-06, |
| "loss": 0.2298, |
| "mean_token_accuracy": 0.9397172857414592, |
| "num_tokens": 283042814.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 0.7282295411283319, |
| "epoch": 0.2872611464968153, |
| "grad_norm": 0.7975929379463196, |
| "learning_rate": 1.9034697508896796e-06, |
| "loss": 0.241, |
| "mean_token_accuracy": 0.9352296363223683, |
| "num_tokens": 284128852.0, |
| "step": 1435 |
| }, |
| { |
| "entropy": 0.7595264895395799, |
| "epoch": 0.28826205641492264, |
| "grad_norm": 0.734137773513794, |
| "learning_rate": 1.9023576512455515e-06, |
| "loss": 0.2438, |
| "mean_token_accuracy": 0.9358488120815971, |
| "num_tokens": 285148293.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.28826205641492264, |
| "eval_entropy": 0.6968571543693542, |
| "eval_loss": 0.19376739859580994, |
| "eval_mean_token_accuracy": 0.9440957708436934, |
| "eval_num_tokens": 285148293.0, |
| "eval_runtime": 7.0823, |
| "eval_samples_per_second": 137.384, |
| "eval_steps_per_second": 8.613, |
| "step": 1440 |
| }, |
| { |
| "entropy": 0.774249031868848, |
| "epoch": 0.28926296633303, |
| "grad_norm": 0.6629706025123596, |
| "learning_rate": 1.9012455516014233e-06, |
| "loss": 0.2369, |
| "mean_token_accuracy": 0.9372912900014357, |
| "num_tokens": 286110916.0, |
| "step": 1445 |
| }, |
| { |
| "entropy": 0.7704967883500186, |
| "epoch": 0.2902638762511374, |
| "grad_norm": 1.558838129043579, |
| "learning_rate": 1.9001334519572954e-06, |
| "loss": 0.2389, |
| "mean_token_accuracy": 0.9377495538104664, |
| "num_tokens": 286850014.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.6755463258786635, |
| "epoch": 0.2912647861692448, |
| "grad_norm": 0.8654264211654663, |
| "learning_rate": 1.899021352313167e-06, |
| "loss": 0.2256, |
| "mean_token_accuracy": 0.9411474087021567, |
| "num_tokens": 287979759.0, |
| "step": 1455 |
| }, |
| { |
| "entropy": 0.7257778595794331, |
| "epoch": 0.29226569608735214, |
| "grad_norm": 0.771135687828064, |
| "learning_rate": 1.897909252669039e-06, |
| "loss": 0.2318, |
| "mean_token_accuracy": 0.9382907439361919, |
| "num_tokens": 289095571.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 0.7421090098944577, |
| "epoch": 0.2932666060054595, |
| "grad_norm": 0.7648544907569885, |
| "learning_rate": 1.896797153024911e-06, |
| "loss": 0.2368, |
| "mean_token_accuracy": 0.9367521686987443, |
| "num_tokens": 290146846.0, |
| "step": 1465 |
| }, |
| { |
| "entropy": 0.7553750325332989, |
| "epoch": 0.29426751592356687, |
| "grad_norm": 0.7135232090950012, |
| "learning_rate": 1.8956850533807828e-06, |
| "loss": 0.2312, |
| "mean_token_accuracy": 0.9393588678403334, |
| "num_tokens": 291083010.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.29426751592356687, |
| "eval_entropy": 0.6849908721251566, |
| "eval_loss": 0.192779079079628, |
| "eval_mean_token_accuracy": 0.945117437448658, |
| "eval_num_tokens": 291083010.0, |
| "eval_runtime": 7.1063, |
| "eval_samples_per_second": 136.92, |
| "eval_steps_per_second": 8.584, |
| "step": 1470 |
| }, |
| { |
| "entropy": 0.764943554726514, |
| "epoch": 0.29526842584167423, |
| "grad_norm": 1.642973780632019, |
| "learning_rate": 1.8945729537366549e-06, |
| "loss": 0.2358, |
| "mean_token_accuracy": 0.9390517413616181, |
| "num_tokens": 291805745.0, |
| "step": 1475 |
| }, |
| { |
| "entropy": 0.6666032200509852, |
| "epoch": 0.2962693357597816, |
| "grad_norm": 0.9155055284500122, |
| "learning_rate": 1.8934608540925265e-06, |
| "loss": 0.2214, |
| "mean_token_accuracy": 0.9421197701584209, |
| "num_tokens": 292942990.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 0.7196945285255258, |
| "epoch": 0.297270245677889, |
| "grad_norm": 0.8412073254585266, |
| "learning_rate": 1.8923487544483984e-06, |
| "loss": 0.2319, |
| "mean_token_accuracy": 0.9379522231492129, |
| "num_tokens": 294035917.0, |
| "step": 1485 |
| }, |
| { |
| "entropy": 0.7461063379591162, |
| "epoch": 0.2982711555959964, |
| "grad_norm": 0.7782725095748901, |
| "learning_rate": 1.8912366548042705e-06, |
| "loss": 0.2237, |
| "mean_token_accuracy": 0.939490559426221, |
| "num_tokens": 295053863.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 0.7554086994041096, |
| "epoch": 0.29927206551410374, |
| "grad_norm": 0.6107756495475769, |
| "learning_rate": 1.8901245551601423e-06, |
| "loss": 0.2238, |
| "mean_token_accuracy": 0.9404414875940843, |
| "num_tokens": 296006683.0, |
| "step": 1495 |
| }, |
| { |
| "entropy": 0.7677417294545608, |
| "epoch": 0.3002729754322111, |
| "grad_norm": 1.474409580230713, |
| "learning_rate": 1.889012455516014e-06, |
| "loss": 0.2235, |
| "mean_token_accuracy": 0.9412395347248425, |
| "num_tokens": 296732879.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3002729754322111, |
| "eval_entropy": 0.689288352845145, |
| "eval_loss": 0.19140712916851044, |
| "eval_mean_token_accuracy": 0.9447863365783066, |
| "eval_num_tokens": 296732879.0, |
| "eval_runtime": 7.0341, |
| "eval_samples_per_second": 138.327, |
| "eval_steps_per_second": 8.672, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.6815881165591153, |
| "epoch": 0.30127388535031846, |
| "grad_norm": 0.898077130317688, |
| "learning_rate": 1.887900355871886e-06, |
| "loss": 0.2252, |
| "mean_token_accuracy": 0.9407751598141411, |
| "num_tokens": 297865911.0, |
| "step": 1505 |
| }, |
| { |
| "entropy": 0.7220306786623868, |
| "epoch": 0.3022747952684258, |
| "grad_norm": 0.7506076693534851, |
| "learning_rate": 1.8867882562277579e-06, |
| "loss": 0.2271, |
| "mean_token_accuracy": 0.9396520457484505, |
| "num_tokens": 298941429.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 0.7438318740237843, |
| "epoch": 0.3032757051865332, |
| "grad_norm": 0.6981909275054932, |
| "learning_rate": 1.88567615658363e-06, |
| "loss": 0.2306, |
| "mean_token_accuracy": 0.9382837609811263, |
| "num_tokens": 299985729.0, |
| "step": 1515 |
| }, |
| { |
| "entropy": 0.7525452472946861, |
| "epoch": 0.3042766151046406, |
| "grad_norm": 0.6050431728363037, |
| "learning_rate": 1.8845640569395016e-06, |
| "loss": 0.2281, |
| "mean_token_accuracy": 0.9401122011921622, |
| "num_tokens": 300946365.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 0.7545472253452647, |
| "epoch": 0.30527752502274796, |
| "grad_norm": 1.560426115989685, |
| "learning_rate": 1.8834519572953735e-06, |
| "loss": 0.2295, |
| "mean_token_accuracy": 0.9403627395629883, |
| "num_tokens": 301692068.0, |
| "step": 1525 |
| }, |
| { |
| "entropy": 0.678287308324467, |
| "epoch": 0.3062784349408553, |
| "grad_norm": 0.9419786334037781, |
| "learning_rate": 1.8823398576512455e-06, |
| "loss": 0.2251, |
| "mean_token_accuracy": 0.9410501371730458, |
| "num_tokens": 302836523.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.3062784349408553, |
| "eval_entropy": 0.6856205536693823, |
| "eval_loss": 0.19144752621650696, |
| "eval_mean_token_accuracy": 0.9452652296081918, |
| "eval_num_tokens": 302836523.0, |
| "eval_runtime": 7.0363, |
| "eval_samples_per_second": 138.283, |
| "eval_steps_per_second": 8.669, |
| "step": 1530 |
| }, |
| { |
| "entropy": 0.7217613477598537, |
| "epoch": 0.3072793448589627, |
| "grad_norm": 0.7879256010055542, |
| "learning_rate": 1.8812277580071174e-06, |
| "loss": 0.2274, |
| "mean_token_accuracy": 0.9390739977359772, |
| "num_tokens": 303914943.0, |
| "step": 1535 |
| }, |
| { |
| "entropy": 0.7384155148809607, |
| "epoch": 0.30828025477707005, |
| "grad_norm": 0.7203854918479919, |
| "learning_rate": 1.880115658362989e-06, |
| "loss": 0.2313, |
| "mean_token_accuracy": 0.9380901526321065, |
| "num_tokens": 304955992.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 0.7574896769090133, |
| "epoch": 0.3092811646951774, |
| "grad_norm": 0.6372812986373901, |
| "learning_rate": 1.8790035587188611e-06, |
| "loss": 0.2324, |
| "mean_token_accuracy": 0.9386772296645425, |
| "num_tokens": 305902295.0, |
| "step": 1545 |
| }, |
| { |
| "entropy": 0.7646282634951852, |
| "epoch": 0.31028207461328483, |
| "grad_norm": 1.6246287822723389, |
| "learning_rate": 1.877891459074733e-06, |
| "loss": 0.2342, |
| "mean_token_accuracy": 0.93938661867922, |
| "num_tokens": 306621753.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.6615015620535071, |
| "epoch": 0.3112829845313922, |
| "grad_norm": 0.8894542455673218, |
| "learning_rate": 1.876779359430605e-06, |
| "loss": 0.2159, |
| "mean_token_accuracy": 0.9429294396530498, |
| "num_tokens": 307776834.0, |
| "step": 1555 |
| }, |
| { |
| "entropy": 0.7109584380279887, |
| "epoch": 0.31228389444949956, |
| "grad_norm": 0.7467630505561829, |
| "learning_rate": 1.8756672597864769e-06, |
| "loss": 0.2262, |
| "mean_token_accuracy": 0.9396391229196028, |
| "num_tokens": 308870945.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.31228389444949956, |
| "eval_entropy": 0.6883383735281522, |
| "eval_loss": 0.18963798880577087, |
| "eval_mean_token_accuracy": 0.94547944967864, |
| "eval_num_tokens": 308870945.0, |
| "eval_runtime": 7.0626, |
| "eval_samples_per_second": 137.768, |
| "eval_steps_per_second": 8.637, |
| "step": 1560 |
| }, |
| { |
| "entropy": 0.7411357695406133, |
| "epoch": 0.3132848043676069, |
| "grad_norm": 0.7028961181640625, |
| "learning_rate": 1.8745551601423485e-06, |
| "loss": 0.2327, |
| "mean_token_accuracy": 0.9375578728589145, |
| "num_tokens": 309929138.0, |
| "step": 1565 |
| }, |
| { |
| "entropy": 0.7481155969879844, |
| "epoch": 0.3142857142857143, |
| "grad_norm": 0.6543077230453491, |
| "learning_rate": 1.8734430604982206e-06, |
| "loss": 0.2229, |
| "mean_token_accuracy": 0.9416344767267054, |
| "num_tokens": 310870701.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 0.7611248016357421, |
| "epoch": 0.31528662420382164, |
| "grad_norm": 1.6195554733276367, |
| "learning_rate": 1.8723309608540925e-06, |
| "loss": 0.226, |
| "mean_token_accuracy": 0.9410505023869601, |
| "num_tokens": 311585943.0, |
| "step": 1575 |
| }, |
| { |
| "entropy": 0.6666329188780351, |
| "epoch": 0.316287534121929, |
| "grad_norm": 0.9081742167472839, |
| "learning_rate": 1.8712188612099643e-06, |
| "loss": 0.2203, |
| "mean_token_accuracy": 0.9423610427162864, |
| "num_tokens": 312697721.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 0.7309835661541332, |
| "epoch": 0.3172884440400364, |
| "grad_norm": 0.7687853574752808, |
| "learning_rate": 1.8701067615658362e-06, |
| "loss": 0.2313, |
| "mean_token_accuracy": 0.9385158609260212, |
| "num_tokens": 313772951.0, |
| "step": 1585 |
| }, |
| { |
| "entropy": 0.747816955501383, |
| "epoch": 0.3182893539581438, |
| "grad_norm": 0.7106137871742249, |
| "learning_rate": 1.868994661921708e-06, |
| "loss": 0.2306, |
| "mean_token_accuracy": 0.9387970268726349, |
| "num_tokens": 314805910.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.3182893539581438, |
| "eval_entropy": 0.6863706659098141, |
| "eval_loss": 0.1875828355550766, |
| "eval_mean_token_accuracy": 0.9464720380110819, |
| "eval_num_tokens": 314805910.0, |
| "eval_runtime": 7.0006, |
| "eval_samples_per_second": 138.987, |
| "eval_steps_per_second": 8.713, |
| "step": 1590 |
| }, |
| { |
| "entropy": 0.7534966772252863, |
| "epoch": 0.31929026387625115, |
| "grad_norm": 0.6492555141448975, |
| "learning_rate": 1.86788256227758e-06, |
| "loss": 0.2231, |
| "mean_token_accuracy": 0.9415834470228716, |
| "num_tokens": 315758423.0, |
| "step": 1595 |
| }, |
| { |
| "entropy": 0.7610360833731564, |
| "epoch": 0.3202911737943585, |
| "grad_norm": 1.5349500179290771, |
| "learning_rate": 1.866770462633452e-06, |
| "loss": 0.2281, |
| "mean_token_accuracy": 0.9405443429946899, |
| "num_tokens": 316494038.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.6749826358123259, |
| "epoch": 0.32129208371246587, |
| "grad_norm": 0.8564639091491699, |
| "learning_rate": 1.8656583629893236e-06, |
| "loss": 0.2168, |
| "mean_token_accuracy": 0.9434851581400091, |
| "num_tokens": 317637128.0, |
| "step": 1605 |
| }, |
| { |
| "entropy": 0.7281479911370711, |
| "epoch": 0.32229299363057323, |
| "grad_norm": 0.7530900239944458, |
| "learning_rate": 1.8645462633451957e-06, |
| "loss": 0.2306, |
| "mean_token_accuracy": 0.9385505968874152, |
| "num_tokens": 318711437.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 0.7480952777645805, |
| "epoch": 0.3232939035486806, |
| "grad_norm": 0.7651330828666687, |
| "learning_rate": 1.8634341637010675e-06, |
| "loss": 0.2222, |
| "mean_token_accuracy": 0.9407213232733986, |
| "num_tokens": 319762432.0, |
| "step": 1615 |
| }, |
| { |
| "entropy": 0.7627674433318051, |
| "epoch": 0.324294813466788, |
| "grad_norm": 1.319263219833374, |
| "learning_rate": 1.8623220640569394e-06, |
| "loss": 0.2291, |
| "mean_token_accuracy": 0.9400727407498793, |
| "num_tokens": 320714756.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.324294813466788, |
| "eval_entropy": 0.6917840105588319, |
| "eval_loss": 0.1892538070678711, |
| "eval_mean_token_accuracy": 0.9453025753380823, |
| "eval_num_tokens": 320714756.0, |
| "eval_runtime": 7.0555, |
| "eval_samples_per_second": 137.907, |
| "eval_steps_per_second": 8.646, |
| "step": 1620 |
| }, |
| { |
| "entropy": 0.7565292894840241, |
| "epoch": 0.3252957233848954, |
| "grad_norm": 1.500351905822754, |
| "learning_rate": 1.8612099644128113e-06, |
| "loss": 0.2253, |
| "mean_token_accuracy": 0.9414621531963349, |
| "num_tokens": 321455218.0, |
| "step": 1625 |
| }, |
| { |
| "entropy": 0.6765736005522988, |
| "epoch": 0.32629663330300274, |
| "grad_norm": 0.8855953216552734, |
| "learning_rate": 1.8600978647686831e-06, |
| "loss": 0.2215, |
| "mean_token_accuracy": 0.9421357078985735, |
| "num_tokens": 322588730.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 0.731200877644799, |
| "epoch": 0.3272975432211101, |
| "grad_norm": 0.8593675494194031, |
| "learning_rate": 1.8589857651245552e-06, |
| "loss": 0.2211, |
| "mean_token_accuracy": 0.940478920394724, |
| "num_tokens": 323655165.0, |
| "step": 1635 |
| }, |
| { |
| "entropy": 0.7489201041785154, |
| "epoch": 0.32829845313921746, |
| "grad_norm": 0.6950727105140686, |
| "learning_rate": 1.857873665480427e-06, |
| "loss": 0.2267, |
| "mean_token_accuracy": 0.9399041311307387, |
| "num_tokens": 324696694.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 0.7550950695167888, |
| "epoch": 0.3292993630573248, |
| "grad_norm": 0.6508896946907043, |
| "learning_rate": 1.8567615658362989e-06, |
| "loss": 0.2215, |
| "mean_token_accuracy": 0.9421004755930467, |
| "num_tokens": 325659634.0, |
| "step": 1645 |
| }, |
| { |
| "entropy": 0.7663576700470665, |
| "epoch": 0.3303002729754322, |
| "grad_norm": 1.634458065032959, |
| "learning_rate": 1.8556494661921708e-06, |
| "loss": 0.2207, |
| "mean_token_accuracy": 0.9429123352874409, |
| "num_tokens": 326395974.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.3303002729754322, |
| "eval_entropy": 0.6862643673771718, |
| "eval_loss": 0.18751998245716095, |
| "eval_mean_token_accuracy": 0.9456404789549405, |
| "eval_num_tokens": 326395974.0, |
| "eval_runtime": 7.1493, |
| "eval_samples_per_second": 136.097, |
| "eval_steps_per_second": 8.532, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.6814083841713992, |
| "epoch": 0.3313011828935396, |
| "grad_norm": 0.8884172439575195, |
| "learning_rate": 1.8545373665480426e-06, |
| "loss": 0.2205, |
| "mean_token_accuracy": 0.9427086288278753, |
| "num_tokens": 327518250.0, |
| "step": 1655 |
| }, |
| { |
| "entropy": 0.7267371454022148, |
| "epoch": 0.33230209281164697, |
| "grad_norm": 0.864007294178009, |
| "learning_rate": 1.8534252669039145e-06, |
| "loss": 0.225, |
| "mean_token_accuracy": 0.9397233930501071, |
| "num_tokens": 328618461.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 0.7420350654558702, |
| "epoch": 0.3333030027297543, |
| "grad_norm": 0.7210493087768555, |
| "learning_rate": 1.8523131672597865e-06, |
| "loss": 0.2179, |
| "mean_token_accuracy": 0.9413225569508292, |
| "num_tokens": 329644537.0, |
| "step": 1665 |
| }, |
| { |
| "entropy": 0.7561123300682414, |
| "epoch": 0.3343039126478617, |
| "grad_norm": 0.6487271785736084, |
| "learning_rate": 1.8512010676156582e-06, |
| "loss": 0.2254, |
| "mean_token_accuracy": 0.9409215840426358, |
| "num_tokens": 330598048.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 0.7530049925500696, |
| "epoch": 0.33530482256596905, |
| "grad_norm": 1.4161484241485596, |
| "learning_rate": 1.85008896797153e-06, |
| "loss": 0.2262, |
| "mean_token_accuracy": 0.9407056949355386, |
| "num_tokens": 331335749.0, |
| "step": 1675 |
| }, |
| { |
| "entropy": 0.6709848523139954, |
| "epoch": 0.3363057324840764, |
| "grad_norm": 0.8709940314292908, |
| "learning_rate": 1.8489768683274021e-06, |
| "loss": 0.2126, |
| "mean_token_accuracy": 0.9443153614347631, |
| "num_tokens": 332446173.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.3363057324840764, |
| "eval_entropy": 0.6809069924667234, |
| "eval_loss": 0.18685181438922882, |
| "eval_mean_token_accuracy": 0.9458920027388901, |
| "eval_num_tokens": 332446173.0, |
| "eval_runtime": 7.0534, |
| "eval_samples_per_second": 137.948, |
| "eval_steps_per_second": 8.648, |
| "step": 1680 |
| }, |
| { |
| "entropy": 0.7158765223893252, |
| "epoch": 0.3373066424021838, |
| "grad_norm": 0.7924162745475769, |
| "learning_rate": 1.847864768683274e-06, |
| "loss": 0.2193, |
| "mean_token_accuracy": 0.9411095483736558, |
| "num_tokens": 333540296.0, |
| "step": 1685 |
| }, |
| { |
| "entropy": 0.7432989163832231, |
| "epoch": 0.3383075523202912, |
| "grad_norm": 0.7170067429542542, |
| "learning_rate": 1.8467526690391458e-06, |
| "loss": 0.2231, |
| "mean_token_accuracy": 0.9399533082138408, |
| "num_tokens": 334567842.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 0.7567171161825007, |
| "epoch": 0.33930846223839856, |
| "grad_norm": 0.8179503679275513, |
| "learning_rate": 1.8456405693950177e-06, |
| "loss": 0.2163, |
| "mean_token_accuracy": 0.9416104576804422, |
| "num_tokens": 335513564.0, |
| "step": 1695 |
| }, |
| { |
| "entropy": 0.7489974737167359, |
| "epoch": 0.3403093721565059, |
| "grad_norm": 1.53611421585083, |
| "learning_rate": 1.8445284697508895e-06, |
| "loss": 0.2204, |
| "mean_token_accuracy": 0.942312642661008, |
| "num_tokens": 336241111.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.654525652256879, |
| "epoch": 0.3413102820746133, |
| "grad_norm": 0.8707150816917419, |
| "learning_rate": 1.8434163701067616e-06, |
| "loss": 0.2149, |
| "mean_token_accuracy": 0.9436011032624678, |
| "num_tokens": 337388829.0, |
| "step": 1705 |
| }, |
| { |
| "entropy": 0.7030858914960515, |
| "epoch": 0.34231119199272064, |
| "grad_norm": 0.7638726830482483, |
| "learning_rate": 1.8423042704626333e-06, |
| "loss": 0.2166, |
| "mean_token_accuracy": 0.9418383235281164, |
| "num_tokens": 338487950.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.34231119199272064, |
| "eval_entropy": 0.6751082666584702, |
| "eval_loss": 0.18752196431159973, |
| "eval_mean_token_accuracy": 0.9461400889959491, |
| "eval_num_tokens": 338487950.0, |
| "eval_runtime": 7.0608, |
| "eval_samples_per_second": 137.804, |
| "eval_steps_per_second": 8.639, |
| "step": 1710 |
| }, |
| { |
| "entropy": 0.7331013533202084, |
| "epoch": 0.343312101910828, |
| "grad_norm": 0.7417324781417847, |
| "learning_rate": 1.8411921708185051e-06, |
| "loss": 0.2186, |
| "mean_token_accuracy": 0.9412295021794059, |
| "num_tokens": 339528555.0, |
| "step": 1715 |
| }, |
| { |
| "entropy": 0.7409337301145901, |
| "epoch": 0.3443130118289354, |
| "grad_norm": 0.5683432817459106, |
| "learning_rate": 1.8400800711743772e-06, |
| "loss": 0.2216, |
| "mean_token_accuracy": 0.9424649195237593, |
| "num_tokens": 340471062.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 0.7573754261840474, |
| "epoch": 0.3453139217470428, |
| "grad_norm": 1.6265780925750732, |
| "learning_rate": 1.838967971530249e-06, |
| "loss": 0.2232, |
| "mean_token_accuracy": 0.9421446426348252, |
| "num_tokens": 341200791.0, |
| "step": 1725 |
| }, |
| { |
| "entropy": 0.6634017543359236, |
| "epoch": 0.34631483166515015, |
| "grad_norm": 0.8331648111343384, |
| "learning_rate": 1.8378558718861211e-06, |
| "loss": 0.215, |
| "mean_token_accuracy": 0.9439348957755349, |
| "num_tokens": 342354243.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 0.7234609690579501, |
| "epoch": 0.3473157415832575, |
| "grad_norm": 0.7795122861862183, |
| "learning_rate": 1.8367437722419928e-06, |
| "loss": 0.2218, |
| "mean_token_accuracy": 0.9413307238708842, |
| "num_tokens": 343460007.0, |
| "step": 1735 |
| }, |
| { |
| "entropy": 0.7389553557742726, |
| "epoch": 0.34831665150136487, |
| "grad_norm": 0.7650998830795288, |
| "learning_rate": 1.8356316725978646e-06, |
| "loss": 0.2221, |
| "mean_token_accuracy": 0.9402573309161446, |
| "num_tokens": 344485585.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.34831665150136487, |
| "eval_entropy": 0.6805569617474665, |
| "eval_loss": 0.18589681386947632, |
| "eval_mean_token_accuracy": 0.9470884516590932, |
| "eval_num_tokens": 344485585.0, |
| "eval_runtime": 7.0844, |
| "eval_samples_per_second": 137.343, |
| "eval_steps_per_second": 8.61, |
| "step": 1740 |
| }, |
| { |
| "entropy": 0.7529873455112631, |
| "epoch": 0.34931756141947223, |
| "grad_norm": 0.6545958518981934, |
| "learning_rate": 1.8345195729537367e-06, |
| "loss": 0.2154, |
| "mean_token_accuracy": 0.9425264336846092, |
| "num_tokens": 345437559.0, |
| "step": 1745 |
| }, |
| { |
| "entropy": 0.761711223558946, |
| "epoch": 0.3503184713375796, |
| "grad_norm": 1.5173709392547607, |
| "learning_rate": 1.8334074733096085e-06, |
| "loss": 0.2199, |
| "mean_token_accuracy": 0.9430206764828075, |
| "num_tokens": 346169843.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.6663856636394154, |
| "epoch": 0.351319381255687, |
| "grad_norm": 0.9439927935600281, |
| "learning_rate": 1.8322953736654802e-06, |
| "loss": 0.2083, |
| "mean_token_accuracy": 0.9455912454561753, |
| "num_tokens": 347311478.0, |
| "step": 1755 |
| }, |
| { |
| "entropy": 0.7240512457760897, |
| "epoch": 0.3523202911737944, |
| "grad_norm": 0.8567253351211548, |
| "learning_rate": 1.8311832740213523e-06, |
| "loss": 0.2201, |
| "mean_token_accuracy": 0.94097445119511, |
| "num_tokens": 348391019.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 0.7536248537627134, |
| "epoch": 0.35332120109190174, |
| "grad_norm": 0.7871220707893372, |
| "learning_rate": 1.8300711743772241e-06, |
| "loss": 0.2221, |
| "mean_token_accuracy": 0.9408430018208244, |
| "num_tokens": 349427873.0, |
| "step": 1765 |
| }, |
| { |
| "entropy": 0.752256919037212, |
| "epoch": 0.3543221110100091, |
| "grad_norm": 0.7607414722442627, |
| "learning_rate": 1.8289590747330962e-06, |
| "loss": 0.2133, |
| "mean_token_accuracy": 0.9429961264133453, |
| "num_tokens": 350368302.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.3543221110100091, |
| "eval_entropy": 0.6793914603405311, |
| "eval_loss": 0.18565388023853302, |
| "eval_mean_token_accuracy": 0.9467462975470746, |
| "eval_num_tokens": 350368302.0, |
| "eval_runtime": 7.0807, |
| "eval_samples_per_second": 137.416, |
| "eval_steps_per_second": 8.615, |
| "step": 1770 |
| }, |
| { |
| "entropy": 0.7683494849638506, |
| "epoch": 0.35532302092811646, |
| "grad_norm": 1.6403712034225464, |
| "learning_rate": 1.8278469750889678e-06, |
| "loss": 0.2249, |
| "mean_token_accuracy": 0.9412687144496225, |
| "num_tokens": 351095025.0, |
| "step": 1775 |
| }, |
| { |
| "entropy": 0.6670152826742692, |
| "epoch": 0.3563239308462238, |
| "grad_norm": 0.8832131624221802, |
| "learning_rate": 1.8267348754448397e-06, |
| "loss": 0.205, |
| "mean_token_accuracy": 0.9458218796686693, |
| "num_tokens": 352193150.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 0.7185638297687877, |
| "epoch": 0.3573248407643312, |
| "grad_norm": 0.8331535458564758, |
| "learning_rate": 1.8256227758007118e-06, |
| "loss": 0.2204, |
| "mean_token_accuracy": 0.9414176198569211, |
| "num_tokens": 353272050.0, |
| "step": 1785 |
| }, |
| { |
| "entropy": 0.7420683860778808, |
| "epoch": 0.3583257506824386, |
| "grad_norm": 0.8582054376602173, |
| "learning_rate": 1.8245106761565836e-06, |
| "loss": 0.2194, |
| "mean_token_accuracy": 0.9416669065302069, |
| "num_tokens": 354301218.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 0.7551353487101469, |
| "epoch": 0.35932666060054597, |
| "grad_norm": 0.6729021668434143, |
| "learning_rate": 1.8233985765124555e-06, |
| "loss": 0.2176, |
| "mean_token_accuracy": 0.9424347899176858, |
| "num_tokens": 355250303.0, |
| "step": 1795 |
| }, |
| { |
| "entropy": 0.751355068250136, |
| "epoch": 0.36032757051865333, |
| "grad_norm": 1.6505812406539917, |
| "learning_rate": 1.8222864768683273e-06, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.9439191401004792, |
| "num_tokens": 355986825.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.36032757051865333, |
| "eval_entropy": 0.6828553637520212, |
| "eval_loss": 0.18496987223625183, |
| "eval_mean_token_accuracy": 0.946765600657854, |
| "eval_num_tokens": 355986825.0, |
| "eval_runtime": 7.196, |
| "eval_samples_per_second": 135.214, |
| "eval_steps_per_second": 8.477, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.671533118052916, |
| "epoch": 0.3613284804367607, |
| "grad_norm": 0.9078927040100098, |
| "learning_rate": 1.8211743772241992e-06, |
| "loss": 0.2158, |
| "mean_token_accuracy": 0.9436379351399161, |
| "num_tokens": 357130061.0, |
| "step": 1805 |
| }, |
| { |
| "entropy": 0.7248334830457513, |
| "epoch": 0.36232939035486805, |
| "grad_norm": 0.8133084177970886, |
| "learning_rate": 1.820062277580071e-06, |
| "loss": 0.2168, |
| "mean_token_accuracy": 0.9420726684006777, |
| "num_tokens": 358203695.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 0.749659313396974, |
| "epoch": 0.3633303002729754, |
| "grad_norm": 0.7274289727210999, |
| "learning_rate": 1.8189501779359431e-06, |
| "loss": 0.2183, |
| "mean_token_accuracy": 0.9410731163891879, |
| "num_tokens": 359255272.0, |
| "step": 1815 |
| }, |
| { |
| "entropy": 0.7492427452044054, |
| "epoch": 0.3643312101910828, |
| "grad_norm": 0.6970122456550598, |
| "learning_rate": 1.8178380782918148e-06, |
| "loss": 0.2122, |
| "mean_token_accuracy": 0.9437575806270946, |
| "num_tokens": 360205760.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 0.7630361009727825, |
| "epoch": 0.3653321201091902, |
| "grad_norm": 1.5553841590881348, |
| "learning_rate": 1.8167259786476868e-06, |
| "loss": 0.2171, |
| "mean_token_accuracy": 0.9435903998938474, |
| "num_tokens": 360930756.0, |
| "step": 1825 |
| }, |
| { |
| "entropy": 0.6604581854560159, |
| "epoch": 0.36633303002729756, |
| "grad_norm": 0.8930894136428833, |
| "learning_rate": 1.8156138790035587e-06, |
| "loss": 0.2064, |
| "mean_token_accuracy": 0.9453142903067849, |
| "num_tokens": 362073768.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.36633303002729756, |
| "eval_entropy": 0.6797945323537607, |
| "eval_loss": 0.18627671897411346, |
| "eval_mean_token_accuracy": 0.9462826427866201, |
| "eval_num_tokens": 362073768.0, |
| "eval_runtime": 7.0766, |
| "eval_samples_per_second": 137.496, |
| "eval_steps_per_second": 8.62, |
| "step": 1830 |
| }, |
| { |
| "entropy": 0.7122279877012426, |
| "epoch": 0.3673339399454049, |
| "grad_norm": 0.7932081818580627, |
| "learning_rate": 1.8145017793594305e-06, |
| "loss": 0.2163, |
| "mean_token_accuracy": 0.9423632816834884, |
| "num_tokens": 363174699.0, |
| "step": 1835 |
| }, |
| { |
| "entropy": 0.7323072322390296, |
| "epoch": 0.3683348498635123, |
| "grad_norm": 0.8120712637901306, |
| "learning_rate": 1.8133896797153024e-06, |
| "loss": 0.2135, |
| "mean_token_accuracy": 0.9426818517121401, |
| "num_tokens": 364220503.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 0.7379515593702143, |
| "epoch": 0.36933575978161964, |
| "grad_norm": 0.6464109420776367, |
| "learning_rate": 1.8122775800711743e-06, |
| "loss": 0.2155, |
| "mean_token_accuracy": 0.9436933148990978, |
| "num_tokens": 365183344.0, |
| "step": 1845 |
| }, |
| { |
| "entropy": 0.747268967736851, |
| "epoch": 0.370336669699727, |
| "grad_norm": 1.5971249341964722, |
| "learning_rate": 1.8111654804270461e-06, |
| "loss": 0.2143, |
| "mean_token_accuracy": 0.9438003106550736, |
| "num_tokens": 365919847.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.6582850561900573, |
| "epoch": 0.37133757961783437, |
| "grad_norm": 0.863985538482666, |
| "learning_rate": 1.8100533807829182e-06, |
| "loss": 0.2095, |
| "mean_token_accuracy": 0.9448756602677432, |
| "num_tokens": 367064732.0, |
| "step": 1855 |
| }, |
| { |
| "entropy": 0.7067332503470507, |
| "epoch": 0.3723384895359418, |
| "grad_norm": 0.8197696208953857, |
| "learning_rate": 1.8089412811387898e-06, |
| "loss": 0.2105, |
| "mean_token_accuracy": 0.9437748323787343, |
| "num_tokens": 368151926.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.3723384895359418, |
| "eval_entropy": 0.6697740437554531, |
| "eval_loss": 0.18541671335697174, |
| "eval_mean_token_accuracy": 0.9467641076103586, |
| "eval_num_tokens": 368151926.0, |
| "eval_runtime": 7.2316, |
| "eval_samples_per_second": 134.548, |
| "eval_steps_per_second": 8.435, |
| "step": 1860 |
| }, |
| { |
| "entropy": 0.7188923353498632, |
| "epoch": 0.37333939945404915, |
| "grad_norm": 0.7241514921188354, |
| "learning_rate": 1.807829181494662e-06, |
| "loss": 0.2159, |
| "mean_token_accuracy": 0.9426933927969499, |
| "num_tokens": 369193883.0, |
| "step": 1865 |
| }, |
| { |
| "entropy": 0.7283467411994934, |
| "epoch": 0.3743403093721565, |
| "grad_norm": 0.6511433720588684, |
| "learning_rate": 1.8067170818505338e-06, |
| "loss": 0.2046, |
| "mean_token_accuracy": 0.9453721604563973, |
| "num_tokens": 370160842.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 0.7400695074688305, |
| "epoch": 0.37534121929026387, |
| "grad_norm": 1.732633113861084, |
| "learning_rate": 1.8056049822064056e-06, |
| "loss": 0.2073, |
| "mean_token_accuracy": 0.9456492299383337, |
| "num_tokens": 370892250.0, |
| "step": 1875 |
| }, |
| { |
| "entropy": 0.6502942296591672, |
| "epoch": 0.37634212920837123, |
| "grad_norm": 0.9057416915893555, |
| "learning_rate": 1.8044928825622777e-06, |
| "loss": 0.2069, |
| "mean_token_accuracy": 0.9456757783889771, |
| "num_tokens": 372022804.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 0.6950947940349579, |
| "epoch": 0.3773430391264786, |
| "grad_norm": 0.7539053559303284, |
| "learning_rate": 1.8033807829181493e-06, |
| "loss": 0.2123, |
| "mean_token_accuracy": 0.9439136678522283, |
| "num_tokens": 373111663.0, |
| "step": 1885 |
| }, |
| { |
| "entropy": 0.7123507954857566, |
| "epoch": 0.378343949044586, |
| "grad_norm": 0.7048158645629883, |
| "learning_rate": 1.8022686832740212e-06, |
| "loss": 0.2048, |
| "mean_token_accuracy": 0.944564142552289, |
| "num_tokens": 374149694.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.378343949044586, |
| "eval_entropy": 0.6676561910598005, |
| "eval_loss": 0.1879517138004303, |
| "eval_mean_token_accuracy": 0.9460887654882962, |
| "eval_num_tokens": 374149694.0, |
| "eval_runtime": 7.0566, |
| "eval_samples_per_second": 137.886, |
| "eval_steps_per_second": 8.644, |
| "step": 1890 |
| }, |
| { |
| "entropy": 0.7297647855498574, |
| "epoch": 0.3793448589626934, |
| "grad_norm": 0.6239880323410034, |
| "learning_rate": 1.8011565836298933e-06, |
| "loss": 0.2072, |
| "mean_token_accuracy": 0.9445389471270821, |
| "num_tokens": 375098313.0, |
| "step": 1895 |
| }, |
| { |
| "entropy": 0.7302077791907571, |
| "epoch": 0.38034576888080074, |
| "grad_norm": 1.6145358085632324, |
| "learning_rate": 1.8000444839857651e-06, |
| "loss": 0.203, |
| "mean_token_accuracy": 0.9472867906093597, |
| "num_tokens": 375816250.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.651517802476883, |
| "epoch": 0.3813466787989081, |
| "grad_norm": 0.9150720834732056, |
| "learning_rate": 1.7989323843416368e-06, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.9464400454000993, |
| "num_tokens": 376972772.0, |
| "step": 1905 |
| }, |
| { |
| "entropy": 0.7013754468072545, |
| "epoch": 0.38234758871701546, |
| "grad_norm": 0.7586289048194885, |
| "learning_rate": 1.7978202846975088e-06, |
| "loss": 0.2136, |
| "mean_token_accuracy": 0.9431337069381367, |
| "num_tokens": 378053259.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 0.7290194546634501, |
| "epoch": 0.3833484986351228, |
| "grad_norm": 0.7516461610794067, |
| "learning_rate": 1.7967081850533807e-06, |
| "loss": 0.2104, |
| "mean_token_accuracy": 0.9425576074556871, |
| "num_tokens": 379103040.0, |
| "step": 1915 |
| }, |
| { |
| "entropy": 0.729288539019498, |
| "epoch": 0.3843494085532302, |
| "grad_norm": 0.620888352394104, |
| "learning_rate": 1.7955960854092528e-06, |
| "loss": 0.2161, |
| "mean_token_accuracy": 0.9428860014135187, |
| "num_tokens": 380068398.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.3843494085532302, |
| "eval_entropy": 0.6771206176671826, |
| "eval_loss": 0.18380054831504822, |
| "eval_mean_token_accuracy": 0.9472967024709358, |
| "eval_num_tokens": 380068398.0, |
| "eval_runtime": 7.0603, |
| "eval_samples_per_second": 137.813, |
| "eval_steps_per_second": 8.64, |
| "step": 1920 |
| }, |
| { |
| "entropy": 0.7446090530265461, |
| "epoch": 0.3853503184713376, |
| "grad_norm": 1.8233861923217773, |
| "learning_rate": 1.7944839857651244e-06, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.9449324960058386, |
| "num_tokens": 380798426.0, |
| "step": 1925 |
| }, |
| { |
| "entropy": 0.6486159508878534, |
| "epoch": 0.38635122838944497, |
| "grad_norm": 0.8591449856758118, |
| "learning_rate": 1.7933718861209963e-06, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.9478086097673937, |
| "num_tokens": 381921279.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 0.6935458053242076, |
| "epoch": 0.38735213830755233, |
| "grad_norm": 0.7733218669891357, |
| "learning_rate": 1.7922597864768683e-06, |
| "loss": 0.2105, |
| "mean_token_accuracy": 0.9436502153223211, |
| "num_tokens": 383034624.0, |
| "step": 1935 |
| }, |
| { |
| "entropy": 0.7152742207050323, |
| "epoch": 0.3883530482256597, |
| "grad_norm": 0.7645531296730042, |
| "learning_rate": 1.7911476868327402e-06, |
| "loss": 0.2141, |
| "mean_token_accuracy": 0.9426664742556485, |
| "num_tokens": 384082302.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 0.72131880142472, |
| "epoch": 0.38935395814376705, |
| "grad_norm": 0.6350061893463135, |
| "learning_rate": 1.7900355871886118e-06, |
| "loss": 0.2139, |
| "mean_token_accuracy": 0.9435091322118586, |
| "num_tokens": 385027404.0, |
| "step": 1945 |
| }, |
| { |
| "entropy": 0.7312435442751104, |
| "epoch": 0.3903548680618744, |
| "grad_norm": 1.5735912322998047, |
| "learning_rate": 1.788923487544484e-06, |
| "loss": 0.2147, |
| "mean_token_accuracy": 0.9442145147106864, |
| "num_tokens": 385747602.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.3903548680618744, |
| "eval_entropy": 0.6652554879423047, |
| "eval_loss": 0.18200834095478058, |
| "eval_mean_token_accuracy": 0.9481365035791867, |
| "eval_num_tokens": 385747602.0, |
| "eval_runtime": 7.0932, |
| "eval_samples_per_second": 137.173, |
| "eval_steps_per_second": 8.6, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.6431225692684001, |
| "epoch": 0.3913557779799818, |
| "grad_norm": 0.8696854710578918, |
| "learning_rate": 1.7878113879003558e-06, |
| "loss": 0.2025, |
| "mean_token_accuracy": 0.9468498847701333, |
| "num_tokens": 386887767.0, |
| "step": 1955 |
| }, |
| { |
| "entropy": 0.7034454665400766, |
| "epoch": 0.3923566878980892, |
| "grad_norm": 0.7643694877624512, |
| "learning_rate": 1.7866992882562278e-06, |
| "loss": 0.2126, |
| "mean_token_accuracy": 0.9431325663219798, |
| "num_tokens": 387964944.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 0.7246899883855473, |
| "epoch": 0.39335759781619656, |
| "grad_norm": 0.7786898016929626, |
| "learning_rate": 1.7855871886120997e-06, |
| "loss": 0.2089, |
| "mean_token_accuracy": 0.9432727011767301, |
| "num_tokens": 389004308.0, |
| "step": 1965 |
| }, |
| { |
| "entropy": 0.7278256719762629, |
| "epoch": 0.3943585077343039, |
| "grad_norm": 0.6474554538726807, |
| "learning_rate": 1.7844750889679713e-06, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.9444690823554993, |
| "num_tokens": 389960897.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 0.7353235575285825, |
| "epoch": 0.3953594176524113, |
| "grad_norm": 1.4766101837158203, |
| "learning_rate": 1.7833629893238434e-06, |
| "loss": 0.2122, |
| "mean_token_accuracy": 0.9448792641813105, |
| "num_tokens": 390690930.0, |
| "step": 1975 |
| }, |
| { |
| "entropy": 0.6459478435191242, |
| "epoch": 0.39636032757051864, |
| "grad_norm": 0.8893265128135681, |
| "learning_rate": 1.7822508896797153e-06, |
| "loss": 0.1952, |
| "mean_token_accuracy": 0.9480626046657562, |
| "num_tokens": 391824072.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.39636032757051864, |
| "eval_entropy": 0.6699851712242502, |
| "eval_loss": 0.18414077162742615, |
| "eval_mean_token_accuracy": 0.9468610022888809, |
| "eval_num_tokens": 391824072.0, |
| "eval_runtime": 7.0451, |
| "eval_samples_per_second": 138.11, |
| "eval_steps_per_second": 8.659, |
| "step": 1980 |
| }, |
| { |
| "entropy": 0.6953091193329204, |
| "epoch": 0.397361237488626, |
| "grad_norm": 0.7777345776557922, |
| "learning_rate": 1.7811387900355871e-06, |
| "loss": 0.1999, |
| "mean_token_accuracy": 0.9463782939043912, |
| "num_tokens": 392922191.0, |
| "step": 1985 |
| }, |
| { |
| "entropy": 0.7238586826757951, |
| "epoch": 0.39836214740673337, |
| "grad_norm": 0.7229942679405212, |
| "learning_rate": 1.780026690391459e-06, |
| "loss": 0.2091, |
| "mean_token_accuracy": 0.9436769247055053, |
| "num_tokens": 393951915.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 0.7331149523908441, |
| "epoch": 0.3993630573248408, |
| "grad_norm": 4.645083427429199, |
| "learning_rate": 1.7789145907473308e-06, |
| "loss": 0.1999, |
| "mean_token_accuracy": 0.9466928687962619, |
| "num_tokens": 394911168.0, |
| "step": 1995 |
| }, |
| { |
| "entropy": 0.7309780456803062, |
| "epoch": 0.40036396724294815, |
| "grad_norm": 1.4829082489013672, |
| "learning_rate": 1.777802491103203e-06, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.9472140420566906, |
| "num_tokens": 395641494.0, |
| "step": 2000 |
| }, |
| { |
| "entropy": 0.6425018229267814, |
| "epoch": 0.4013648771610555, |
| "grad_norm": 0.9035446047782898, |
| "learning_rate": 1.7766903914590748e-06, |
| "loss": 0.2018, |
| "mean_token_accuracy": 0.9465541931715878, |
| "num_tokens": 396806735.0, |
| "step": 2005 |
| }, |
| { |
| "entropy": 0.6907033795660192, |
| "epoch": 0.4023657870791629, |
| "grad_norm": 0.7838383913040161, |
| "learning_rate": 1.7755782918149464e-06, |
| "loss": 0.202, |
| "mean_token_accuracy": 0.9456824893301183, |
| "num_tokens": 397897808.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.4023657870791629, |
| "eval_entropy": 0.6626140348246841, |
| "eval_loss": 0.18410223722457886, |
| "eval_mean_token_accuracy": 0.9472184445037216, |
| "eval_num_tokens": 397897808.0, |
| "eval_runtime": 7.0819, |
| "eval_samples_per_second": 137.392, |
| "eval_steps_per_second": 8.613, |
| "step": 2010 |
| }, |
| { |
| "entropy": 0.726021606000987, |
| "epoch": 0.40336669699727024, |
| "grad_norm": 0.7114729881286621, |
| "learning_rate": 1.7744661921708185e-06, |
| "loss": 0.2095, |
| "mean_token_accuracy": 0.9433100288564509, |
| "num_tokens": 398940578.0, |
| "step": 2015 |
| }, |
| { |
| "entropy": 0.734506199576638, |
| "epoch": 0.4043676069153776, |
| "grad_norm": 0.6169213652610779, |
| "learning_rate": 1.7733540925266903e-06, |
| "loss": 0.2013, |
| "mean_token_accuracy": 0.9469331833449277, |
| "num_tokens": 399905944.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 0.7397109557281841, |
| "epoch": 0.40536851683348496, |
| "grad_norm": 1.60407292842865, |
| "learning_rate": 1.7722419928825622e-06, |
| "loss": 0.2062, |
| "mean_token_accuracy": 0.945603883266449, |
| "num_tokens": 400634292.0, |
| "step": 2025 |
| }, |
| { |
| "entropy": 0.6445516106757251, |
| "epoch": 0.4063694267515924, |
| "grad_norm": 0.8769928216934204, |
| "learning_rate": 1.771129893238434e-06, |
| "loss": 0.1977, |
| "mean_token_accuracy": 0.9475978217341683, |
| "num_tokens": 401779381.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 0.7098718599839644, |
| "epoch": 0.40737033666969974, |
| "grad_norm": 0.7846065163612366, |
| "learning_rate": 1.770017793594306e-06, |
| "loss": 0.2102, |
| "mean_token_accuracy": 0.9430582647973841, |
| "num_tokens": 402847416.0, |
| "step": 2035 |
| }, |
| { |
| "entropy": 0.7275996368039738, |
| "epoch": 0.4083712465878071, |
| "grad_norm": 0.7160109877586365, |
| "learning_rate": 1.7689056939501778e-06, |
| "loss": 0.2096, |
| "mean_token_accuracy": 0.9433997192166068, |
| "num_tokens": 403874433.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.4083712465878071, |
| "eval_entropy": 0.6705086065120385, |
| "eval_loss": 0.18461880087852478, |
| "eval_mean_token_accuracy": 0.9473435878753662, |
| "eval_num_tokens": 403874433.0, |
| "eval_runtime": 7.0408, |
| "eval_samples_per_second": 138.194, |
| "eval_steps_per_second": 8.664, |
| "step": 2040 |
| }, |
| { |
| "entropy": 0.7333347418091514, |
| "epoch": 0.40937215650591446, |
| "grad_norm": 0.6281505823135376, |
| "learning_rate": 1.7677935943060498e-06, |
| "loss": 0.2069, |
| "mean_token_accuracy": 0.9447664049538699, |
| "num_tokens": 404833962.0, |
| "step": 2045 |
| }, |
| { |
| "entropy": 0.7319487474181435, |
| "epoch": 0.4103730664240218, |
| "grad_norm": 1.4577454328536987, |
| "learning_rate": 1.7666814946619217e-06, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9480147020383315, |
| "num_tokens": 405569513.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 0.6517360118302432, |
| "epoch": 0.4113739763421292, |
| "grad_norm": 0.8996165990829468, |
| "learning_rate": 1.7655693950177935e-06, |
| "loss": 0.2004, |
| "mean_token_accuracy": 0.9475146136500618, |
| "num_tokens": 406712661.0, |
| "step": 2055 |
| }, |
| { |
| "entropy": 0.7000083235177127, |
| "epoch": 0.4123748862602366, |
| "grad_norm": 0.8524026274681091, |
| "learning_rate": 1.7644572953736654e-06, |
| "loss": 0.2011, |
| "mean_token_accuracy": 0.9456206717274406, |
| "num_tokens": 407800620.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 0.7201575975526463, |
| "epoch": 0.41337579617834397, |
| "grad_norm": 0.7635099291801453, |
| "learning_rate": 1.7633451957295373e-06, |
| "loss": 0.2058, |
| "mean_token_accuracy": 0.9447005716237155, |
| "num_tokens": 408832671.0, |
| "step": 2065 |
| }, |
| { |
| "entropy": 0.7325827403502031, |
| "epoch": 0.41437670609645133, |
| "grad_norm": 0.6240025758743286, |
| "learning_rate": 1.7622330960854093e-06, |
| "loss": 0.2053, |
| "mean_token_accuracy": 0.9453733026981354, |
| "num_tokens": 409777940.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.41437670609645133, |
| "eval_entropy": 0.6683724786414474, |
| "eval_loss": 0.18311864137649536, |
| "eval_mean_token_accuracy": 0.9476352060427431, |
| "eval_num_tokens": 409777940.0, |
| "eval_runtime": 7.0794, |
| "eval_samples_per_second": 137.44, |
| "eval_steps_per_second": 8.617, |
| "step": 2070 |
| }, |
| { |
| "entropy": 0.7300297211516987, |
| "epoch": 0.4153776160145587, |
| "grad_norm": 1.6698415279388428, |
| "learning_rate": 1.761120996441281e-06, |
| "loss": 0.2006, |
| "mean_token_accuracy": 0.9477990069172599, |
| "num_tokens": 410500764.0, |
| "step": 2075 |
| }, |
| { |
| "entropy": 0.6337598004124382, |
| "epoch": 0.41637852593266605, |
| "grad_norm": 0.881401002407074, |
| "learning_rate": 1.7600088967971528e-06, |
| "loss": 0.1969, |
| "mean_token_accuracy": 0.9473873138427734, |
| "num_tokens": 411638552.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 0.6980531494725835, |
| "epoch": 0.4173794358507734, |
| "grad_norm": 0.7880724668502808, |
| "learning_rate": 1.758896797153025e-06, |
| "loss": 0.2046, |
| "mean_token_accuracy": 0.9452372453429482, |
| "num_tokens": 412726553.0, |
| "step": 2085 |
| }, |
| { |
| "entropy": 0.721489062092521, |
| "epoch": 0.4183803457688808, |
| "grad_norm": 0.7589449882507324, |
| "learning_rate": 1.7577846975088968e-06, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.9432421592148867, |
| "num_tokens": 413743869.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 0.7336713259870355, |
| "epoch": 0.4193812556869882, |
| "grad_norm": 1.2119146585464478, |
| "learning_rate": 1.7566725978647686e-06, |
| "loss": 0.2028, |
| "mean_token_accuracy": 0.9461878337643364, |
| "num_tokens": 414696099.0, |
| "step": 2095 |
| }, |
| { |
| "entropy": 0.7307607759128917, |
| "epoch": 0.42038216560509556, |
| "grad_norm": 1.6642305850982666, |
| "learning_rate": 1.7555604982206405e-06, |
| "loss": 0.2065, |
| "mean_token_accuracy": 0.9463564807718451, |
| "num_tokens": 415430423.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.42038216560509556, |
| "eval_entropy": 0.6673324142323166, |
| "eval_loss": 0.1833495795726776, |
| "eval_mean_token_accuracy": 0.9478640595420462, |
| "eval_num_tokens": 415430423.0, |
| "eval_runtime": 7.0408, |
| "eval_samples_per_second": 138.194, |
| "eval_steps_per_second": 8.664, |
| "step": 2100 |
| }, |
| { |
| "entropy": 0.6451762475750663, |
| "epoch": 0.4213830755232029, |
| "grad_norm": 0.9465289115905762, |
| "learning_rate": 1.7544483985765123e-06, |
| "loss": 0.1989, |
| "mean_token_accuracy": 0.9476843329993161, |
| "num_tokens": 416551707.0, |
| "step": 2105 |
| }, |
| { |
| "entropy": 0.6898594834587791, |
| "epoch": 0.4223839854413103, |
| "grad_norm": 0.7581867575645447, |
| "learning_rate": 1.7533362989323844e-06, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.9459905033761805, |
| "num_tokens": 417645350.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 0.71252573186701, |
| "epoch": 0.42338489535941765, |
| "grad_norm": 0.7181347608566284, |
| "learning_rate": 1.752224199288256e-06, |
| "loss": 0.2085, |
| "mean_token_accuracy": 0.9445584345947612, |
| "num_tokens": 418716115.0, |
| "step": 2115 |
| }, |
| { |
| "entropy": 0.7133055407892573, |
| "epoch": 0.424385805277525, |
| "grad_norm": 0.6408241987228394, |
| "learning_rate": 1.751112099644128e-06, |
| "loss": 0.2027, |
| "mean_token_accuracy": 0.9469801886515183, |
| "num_tokens": 419663774.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 0.711867922002619, |
| "epoch": 0.42538671519563237, |
| "grad_norm": 1.6646242141723633, |
| "learning_rate": 1.75e-06, |
| "loss": 0.1944, |
| "mean_token_accuracy": 0.9488317923112349, |
| "num_tokens": 420383624.0, |
| "step": 2125 |
| }, |
| { |
| "entropy": 0.6277808471159502, |
| "epoch": 0.4263876251137398, |
| "grad_norm": 0.8758793473243713, |
| "learning_rate": 1.7488879003558718e-06, |
| "loss": 0.1946, |
| "mean_token_accuracy": 0.949054852398959, |
| "num_tokens": 421525651.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.4263876251137398, |
| "eval_entropy": 0.6551624673311828, |
| "eval_loss": 0.18077890574932098, |
| "eval_mean_token_accuracy": 0.9479863975868851, |
| "eval_num_tokens": 421525651.0, |
| "eval_runtime": 7.2475, |
| "eval_samples_per_second": 134.253, |
| "eval_steps_per_second": 8.417, |
| "step": 2130 |
| }, |
| { |
| "entropy": 0.6754489562728189, |
| "epoch": 0.42738853503184715, |
| "grad_norm": 0.8031311631202698, |
| "learning_rate": 1.747775800711744e-06, |
| "loss": 0.2031, |
| "mean_token_accuracy": 0.9457703324881467, |
| "num_tokens": 422612779.0, |
| "step": 2135 |
| }, |
| { |
| "entropy": 0.7074506429108707, |
| "epoch": 0.4283894449499545, |
| "grad_norm": 0.7394730448722839, |
| "learning_rate": 1.7466637010676155e-06, |
| "loss": 0.2102, |
| "mean_token_accuracy": 0.9433455748991533, |
| "num_tokens": 423647547.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 0.719589533589103, |
| "epoch": 0.4293903548680619, |
| "grad_norm": 0.7077816128730774, |
| "learning_rate": 1.7455516014234874e-06, |
| "loss": 0.2003, |
| "mean_token_accuracy": 0.9472663749348034, |
| "num_tokens": 424592674.0, |
| "step": 2145 |
| }, |
| { |
| "entropy": 0.7104438646273179, |
| "epoch": 0.43039126478616924, |
| "grad_norm": 1.5614880323410034, |
| "learning_rate": 1.7444395017793595e-06, |
| "loss": 0.2026, |
| "mean_token_accuracy": 0.9470372357151725, |
| "num_tokens": 425315977.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 0.6318686853755604, |
| "epoch": 0.4313921747042766, |
| "grad_norm": 0.9640966057777405, |
| "learning_rate": 1.7433274021352313e-06, |
| "loss": 0.1927, |
| "mean_token_accuracy": 0.9490994160825555, |
| "num_tokens": 426462515.0, |
| "step": 2155 |
| }, |
| { |
| "entropy": 0.6816817018118771, |
| "epoch": 0.43239308462238396, |
| "grad_norm": 0.7858216762542725, |
| "learning_rate": 1.742215302491103e-06, |
| "loss": 0.2074, |
| "mean_token_accuracy": 0.9445357848297465, |
| "num_tokens": 427573865.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.43239308462238396, |
| "eval_entropy": 0.6511259249976424, |
| "eval_loss": 0.1821490079164505, |
| "eval_mean_token_accuracy": 0.9482881309556179, |
| "eval_num_tokens": 427573865.0, |
| "eval_runtime": 7.0417, |
| "eval_samples_per_second": 138.178, |
| "eval_steps_per_second": 8.663, |
| "step": 2160 |
| }, |
| { |
| "entropy": 0.6951464016329159, |
| "epoch": 0.4333939945404914, |
| "grad_norm": 0.7225281000137329, |
| "learning_rate": 1.741103202846975e-06, |
| "loss": 0.2008, |
| "mean_token_accuracy": 0.9459953931244937, |
| "num_tokens": 428613823.0, |
| "step": 2165 |
| }, |
| { |
| "entropy": 0.7063021380792964, |
| "epoch": 0.43439490445859874, |
| "grad_norm": 0.6970705986022949, |
| "learning_rate": 1.739991103202847e-06, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9472221742976795, |
| "num_tokens": 429555485.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 0.7152685084126212, |
| "epoch": 0.4353958143767061, |
| "grad_norm": 1.6057738065719604, |
| "learning_rate": 1.7388790035587188e-06, |
| "loss": 0.1993, |
| "mean_token_accuracy": 0.9480575875802474, |
| "num_tokens": 430289824.0, |
| "step": 2175 |
| }, |
| { |
| "entropy": 0.6313129712234844, |
| "epoch": 0.43639672429481347, |
| "grad_norm": 0.8678532838821411, |
| "learning_rate": 1.7377669039145906e-06, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.9491723103956743, |
| "num_tokens": 431416002.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 0.6787468869577754, |
| "epoch": 0.4373976342129208, |
| "grad_norm": 0.8015258312225342, |
| "learning_rate": 1.7366548042704625e-06, |
| "loss": 0.2065, |
| "mean_token_accuracy": 0.9452338890595869, |
| "num_tokens": 432495759.0, |
| "step": 2185 |
| }, |
| { |
| "entropy": 0.6967111151326787, |
| "epoch": 0.4383985441310282, |
| "grad_norm": 0.7446882128715515, |
| "learning_rate": 1.7355427046263345e-06, |
| "loss": 0.2016, |
| "mean_token_accuracy": 0.9461214417761022, |
| "num_tokens": 433540979.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.4383985441310282, |
| "eval_entropy": 0.6525588817283755, |
| "eval_loss": 0.1822170466184616, |
| "eval_mean_token_accuracy": 0.9486938636811053, |
| "eval_num_tokens": 433540979.0, |
| "eval_runtime": 7.0989, |
| "eval_samples_per_second": 137.064, |
| "eval_steps_per_second": 8.593, |
| "step": 2190 |
| }, |
| { |
| "entropy": 0.7109386972405694, |
| "epoch": 0.43939945404913555, |
| "grad_norm": 0.6667978763580322, |
| "learning_rate": 1.7344306049822064e-06, |
| "loss": 0.197, |
| "mean_token_accuracy": 0.947896112095226, |
| "num_tokens": 434493365.0, |
| "step": 2195 |
| }, |
| { |
| "entropy": 0.7043106214566665, |
| "epoch": 0.44040036396724297, |
| "grad_norm": 1.5319359302520752, |
| "learning_rate": 1.733318505338078e-06, |
| "loss": 0.1958, |
| "mean_token_accuracy": 0.9489297693425959, |
| "num_tokens": 435226568.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 0.6357729142362422, |
| "epoch": 0.44140127388535033, |
| "grad_norm": 0.9023846983909607, |
| "learning_rate": 1.7322064056939501e-06, |
| "loss": 0.1931, |
| "mean_token_accuracy": 0.9492196435278112, |
| "num_tokens": 436329189.0, |
| "step": 2205 |
| }, |
| { |
| "entropy": 0.6714753104881807, |
| "epoch": 0.4424021838034577, |
| "grad_norm": 0.8319141864776611, |
| "learning_rate": 1.731094306049822e-06, |
| "loss": 0.1977, |
| "mean_token_accuracy": 0.9467895637858997, |
| "num_tokens": 437424508.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 0.6920918749137358, |
| "epoch": 0.44340309372156506, |
| "grad_norm": 0.7682778239250183, |
| "learning_rate": 1.7299822064056938e-06, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.946890014410019, |
| "num_tokens": 438454313.0, |
| "step": 2215 |
| }, |
| { |
| "entropy": 0.7164150400595232, |
| "epoch": 0.4444040036396724, |
| "grad_norm": 0.7394376397132874, |
| "learning_rate": 1.728870106761566e-06, |
| "loss": 0.203, |
| "mean_token_accuracy": 0.9466155610301278, |
| "num_tokens": 439405721.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.4444040036396724, |
| "eval_entropy": 0.66180199140408, |
| "eval_loss": 0.18257291615009308, |
| "eval_mean_token_accuracy": 0.9479424806891895, |
| "eval_num_tokens": 439405721.0, |
| "eval_runtime": 7.0937, |
| "eval_samples_per_second": 137.163, |
| "eval_steps_per_second": 8.599, |
| "step": 2220 |
| }, |
| { |
| "entropy": 0.7252338122237812, |
| "epoch": 0.4454049135577798, |
| "grad_norm": 1.6949454545974731, |
| "learning_rate": 1.7277580071174375e-06, |
| "loss": 0.1974, |
| "mean_token_accuracy": 0.9479389017278498, |
| "num_tokens": 440135915.0, |
| "step": 2225 |
| }, |
| { |
| "entropy": 0.639058459888805, |
| "epoch": 0.44640582347588714, |
| "grad_norm": 0.9639801383018494, |
| "learning_rate": 1.7266459074733096e-06, |
| "loss": 0.1946, |
| "mean_token_accuracy": 0.9484925963661888, |
| "num_tokens": 441271311.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 0.6790904402732849, |
| "epoch": 0.44740673339399456, |
| "grad_norm": 0.9020050168037415, |
| "learning_rate": 1.7255338078291815e-06, |
| "loss": 0.1974, |
| "mean_token_accuracy": 0.9472232580184936, |
| "num_tokens": 442357624.0, |
| "step": 2235 |
| }, |
| { |
| "entropy": 0.7040961652994155, |
| "epoch": 0.4484076433121019, |
| "grad_norm": 0.7891590595245361, |
| "learning_rate": 1.7244217081850533e-06, |
| "loss": 0.2016, |
| "mean_token_accuracy": 0.9471836420622739, |
| "num_tokens": 443388566.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 0.726311549002474, |
| "epoch": 0.4494085532302093, |
| "grad_norm": 0.7292259931564331, |
| "learning_rate": 1.7233096085409252e-06, |
| "loss": 0.1985, |
| "mean_token_accuracy": 0.9474069486964832, |
| "num_tokens": 444351363.0, |
| "step": 2245 |
| }, |
| { |
| "entropy": 0.7288991510868073, |
| "epoch": 0.45040946314831665, |
| "grad_norm": 1.6061440706253052, |
| "learning_rate": 1.722197508896797e-06, |
| "loss": 0.1958, |
| "mean_token_accuracy": 0.9497716730291194, |
| "num_tokens": 445084627.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.45040946314831665, |
| "eval_entropy": 0.6615917594706426, |
| "eval_loss": 0.18212804198265076, |
| "eval_mean_token_accuracy": 0.9479947060835167, |
| "eval_num_tokens": 445084627.0, |
| "eval_runtime": 7.1076, |
| "eval_samples_per_second": 136.895, |
| "eval_steps_per_second": 8.582, |
| "step": 2250 |
| }, |
| { |
| "entropy": 0.6405609087510542, |
| "epoch": 0.451410373066424, |
| "grad_norm": 0.9403535723686218, |
| "learning_rate": 1.721085409252669e-06, |
| "loss": 0.1883, |
| "mean_token_accuracy": 0.9504117098721591, |
| "num_tokens": 446228365.0, |
| "step": 2255 |
| }, |
| { |
| "entropy": 0.6863735209811818, |
| "epoch": 0.45241128298453137, |
| "grad_norm": 0.9006670117378235, |
| "learning_rate": 1.719973309608541e-06, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.9467043746601451, |
| "num_tokens": 447304171.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 0.7067891944538464, |
| "epoch": 0.4534121929026388, |
| "grad_norm": 0.7302769422531128, |
| "learning_rate": 1.7188612099644126e-06, |
| "loss": 0.2, |
| "mean_token_accuracy": 0.9461231302131307, |
| "num_tokens": 448339733.0, |
| "step": 2265 |
| }, |
| { |
| "entropy": 0.7219751954078675, |
| "epoch": 0.45441310282074615, |
| "grad_norm": 0.6865111589431763, |
| "learning_rate": 1.7177491103202845e-06, |
| "loss": 0.1941, |
| "mean_token_accuracy": 0.948776437477632, |
| "num_tokens": 449292470.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 0.7329446852207184, |
| "epoch": 0.4554140127388535, |
| "grad_norm": 1.658300757408142, |
| "learning_rate": 1.7166370106761565e-06, |
| "loss": 0.1954, |
| "mean_token_accuracy": 0.9491787666624243, |
| "num_tokens": 450008277.0, |
| "step": 2275 |
| }, |
| { |
| "entropy": 0.6389858687465841, |
| "epoch": 0.4564149226569609, |
| "grad_norm": 0.9273704290390015, |
| "learning_rate": 1.7155249110320284e-06, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9499964730306105, |
| "num_tokens": 451148916.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.4564149226569609, |
| "eval_entropy": 0.6680871458327184, |
| "eval_loss": 0.18033559620380402, |
| "eval_mean_token_accuracy": 0.9485022337710272, |
| "eval_num_tokens": 451148916.0, |
| "eval_runtime": 7.0712, |
| "eval_samples_per_second": 137.6, |
| "eval_steps_per_second": 8.627, |
| "step": 2280 |
| }, |
| { |
| "entropy": 0.698534585129131, |
| "epoch": 0.45741583257506824, |
| "grad_norm": 0.8281445503234863, |
| "learning_rate": 1.7144128113879003e-06, |
| "loss": 0.1988, |
| "mean_token_accuracy": 0.9470174924893813, |
| "num_tokens": 452233124.0, |
| "step": 2285 |
| }, |
| { |
| "entropy": 0.7227046912366694, |
| "epoch": 0.4584167424931756, |
| "grad_norm": 0.7393911480903625, |
| "learning_rate": 1.7133007117437721e-06, |
| "loss": 0.1935, |
| "mean_token_accuracy": 0.9472934657877142, |
| "num_tokens": 453277649.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 0.7179521495645697, |
| "epoch": 0.45941765241128296, |
| "grad_norm": 0.6617256999015808, |
| "learning_rate": 1.712188612099644e-06, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.9475378876382654, |
| "num_tokens": 454237241.0, |
| "step": 2295 |
| }, |
| { |
| "entropy": 0.7308073119683699, |
| "epoch": 0.4604185623293904, |
| "grad_norm": 1.585522174835205, |
| "learning_rate": 1.711076512455516e-06, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9490763951431621, |
| "num_tokens": 454963043.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 0.6407526796514338, |
| "epoch": 0.46141947224749774, |
| "grad_norm": 0.9008808732032776, |
| "learning_rate": 1.709964412811388e-06, |
| "loss": 0.1895, |
| "mean_token_accuracy": 0.9498597432266582, |
| "num_tokens": 456106735.0, |
| "step": 2305 |
| }, |
| { |
| "entropy": 0.6931318703022871, |
| "epoch": 0.4624203821656051, |
| "grad_norm": 0.8182635307312012, |
| "learning_rate": 1.7088523131672596e-06, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.9471178504553708, |
| "num_tokens": 457176055.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.4624203821656051, |
| "eval_entropy": 0.6684376890542078, |
| "eval_loss": 0.18013876676559448, |
| "eval_mean_token_accuracy": 0.9488128394377037, |
| "eval_num_tokens": 457176055.0, |
| "eval_runtime": 7.0798, |
| "eval_samples_per_second": 137.434, |
| "eval_steps_per_second": 8.616, |
| "step": 2310 |
| }, |
| { |
| "entropy": 0.7130835376002572, |
| "epoch": 0.46342129208371247, |
| "grad_norm": 0.699058473110199, |
| "learning_rate": 1.7077402135231316e-06, |
| "loss": 0.1902, |
| "mean_token_accuracy": 0.9488946026021784, |
| "num_tokens": 458210295.0, |
| "step": 2315 |
| }, |
| { |
| "entropy": 0.7252059096639807, |
| "epoch": 0.46442220200181983, |
| "grad_norm": 0.6226775646209717, |
| "learning_rate": 1.7066281138790035e-06, |
| "loss": 0.1908, |
| "mean_token_accuracy": 0.9491387551481073, |
| "num_tokens": 459167123.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 0.7374506094238975, |
| "epoch": 0.4654231119199272, |
| "grad_norm": 1.667640209197998, |
| "learning_rate": 1.7055160142348755e-06, |
| "loss": 0.1973, |
| "mean_token_accuracy": 0.9483266061002558, |
| "num_tokens": 459902189.0, |
| "step": 2325 |
| }, |
| { |
| "entropy": 0.647046386924657, |
| "epoch": 0.46642402183803455, |
| "grad_norm": 0.9029215574264526, |
| "learning_rate": 1.7044039145907472e-06, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.9509288961237127, |
| "num_tokens": 461035882.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 0.6890198341824791, |
| "epoch": 0.46742493175614197, |
| "grad_norm": 0.8259357213973999, |
| "learning_rate": 1.703291814946619e-06, |
| "loss": 0.1888, |
| "mean_token_accuracy": 0.9496154855598103, |
| "num_tokens": 462118087.0, |
| "step": 2335 |
| }, |
| { |
| "entropy": 0.7170812931927768, |
| "epoch": 0.46842584167424933, |
| "grad_norm": 0.7089883685112, |
| "learning_rate": 1.7021797153024911e-06, |
| "loss": 0.1937, |
| "mean_token_accuracy": 0.9480150136080655, |
| "num_tokens": 463164516.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.46842584167424933, |
| "eval_entropy": 0.662775921039894, |
| "eval_loss": 0.18436135351657867, |
| "eval_mean_token_accuracy": 0.9480926853711488, |
| "eval_num_tokens": 463164516.0, |
| "eval_runtime": 7.3033, |
| "eval_samples_per_second": 133.228, |
| "eval_steps_per_second": 8.352, |
| "step": 2340 |
| }, |
| { |
| "entropy": 0.7226597574624148, |
| "epoch": 0.4694267515923567, |
| "grad_norm": 0.6807421445846558, |
| "learning_rate": 1.701067615658363e-06, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.9482672154903412, |
| "num_tokens": 464120922.0, |
| "step": 2345 |
| }, |
| { |
| "entropy": 0.7318018051710996, |
| "epoch": 0.47042766151046406, |
| "grad_norm": 1.6226071119308472, |
| "learning_rate": 1.6999555160142346e-06, |
| "loss": 0.1978, |
| "mean_token_accuracy": 0.9488935736092654, |
| "num_tokens": 464841415.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 0.6446803179654208, |
| "epoch": 0.4714285714285714, |
| "grad_norm": 0.8962976932525635, |
| "learning_rate": 1.6988434163701067e-06, |
| "loss": 0.1801, |
| "mean_token_accuracy": 0.952340427311984, |
| "num_tokens": 465966413.0, |
| "step": 2355 |
| }, |
| { |
| "entropy": 0.6918975301764229, |
| "epoch": 0.4724294813466788, |
| "grad_norm": 0.7808786034584045, |
| "learning_rate": 1.6977313167259786e-06, |
| "loss": 0.1912, |
| "mean_token_accuracy": 0.9489104704423384, |
| "num_tokens": 467047557.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 0.713727774403312, |
| "epoch": 0.47343039126478614, |
| "grad_norm": 0.709165632724762, |
| "learning_rate": 1.6966192170818506e-06, |
| "loss": 0.1897, |
| "mean_token_accuracy": 0.948500750281594, |
| "num_tokens": 468103404.0, |
| "step": 2365 |
| }, |
| { |
| "entropy": 0.7288664657961238, |
| "epoch": 0.47443130118289356, |
| "grad_norm": 0.6595885753631592, |
| "learning_rate": 1.6955071174377223e-06, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9496505059979179, |
| "num_tokens": 469059545.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.47443130118289356, |
| "eval_entropy": 0.6668027595418399, |
| "eval_loss": 0.1822473108768463, |
| "eval_mean_token_accuracy": 0.9485983565205434, |
| "eval_num_tokens": 469059545.0, |
| "eval_runtime": 7.0789, |
| "eval_samples_per_second": 137.45, |
| "eval_steps_per_second": 8.617, |
| "step": 2370 |
| }, |
| { |
| "entropy": 0.7264184512875297, |
| "epoch": 0.4754322111010009, |
| "grad_norm": 1.5922738313674927, |
| "learning_rate": 1.6943950177935941e-06, |
| "loss": 0.1903, |
| "mean_token_accuracy": 0.9503626883029938, |
| "num_tokens": 469786008.0, |
| "step": 2375 |
| }, |
| { |
| "entropy": 0.6371802779761228, |
| "epoch": 0.4764331210191083, |
| "grad_norm": 0.9065341949462891, |
| "learning_rate": 1.6932829181494662e-06, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9514370045878671, |
| "num_tokens": 470929765.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 0.6833381346680901, |
| "epoch": 0.47743403093721565, |
| "grad_norm": 0.7842475175857544, |
| "learning_rate": 1.692170818505338e-06, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.9486998016184026, |
| "num_tokens": 472010349.0, |
| "step": 2385 |
| }, |
| { |
| "entropy": 0.7116641182791102, |
| "epoch": 0.478434940855323, |
| "grad_norm": 0.7250556349754333, |
| "learning_rate": 1.69105871886121e-06, |
| "loss": 0.1994, |
| "mean_token_accuracy": 0.9462671198628165, |
| "num_tokens": 473044662.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 0.7203712877902118, |
| "epoch": 0.47943585077343037, |
| "grad_norm": 0.6930083632469177, |
| "learning_rate": 1.6899466192170818e-06, |
| "loss": 0.1951, |
| "mean_token_accuracy": 0.9483953313394027, |
| "num_tokens": 473994468.0, |
| "step": 2395 |
| }, |
| { |
| "entropy": 0.7180362874811346, |
| "epoch": 0.48043676069153773, |
| "grad_norm": 1.6153886318206787, |
| "learning_rate": 1.6888345195729536e-06, |
| "loss": 0.1877, |
| "mean_token_accuracy": 0.9512313040820035, |
| "num_tokens": 474722958.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.48043676069153773, |
| "eval_entropy": 0.6513270115266081, |
| "eval_loss": 0.18145401775836945, |
| "eval_mean_token_accuracy": 0.9486565795101103, |
| "eval_num_tokens": 474722958.0, |
| "eval_runtime": 7.0749, |
| "eval_samples_per_second": 137.528, |
| "eval_steps_per_second": 8.622, |
| "step": 2400 |
| }, |
| { |
| "entropy": 0.631208659843965, |
| "epoch": 0.48143767060964515, |
| "grad_norm": 0.8330217003822327, |
| "learning_rate": 1.6877224199288255e-06, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9511312663555145, |
| "num_tokens": 475864637.0, |
| "step": 2405 |
| }, |
| { |
| "entropy": 0.6784997463226319, |
| "epoch": 0.4824385805277525, |
| "grad_norm": 0.8162450194358826, |
| "learning_rate": 1.6866103202846975e-06, |
| "loss": 0.1877, |
| "mean_token_accuracy": 0.948906400528821, |
| "num_tokens": 476949278.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 0.7088199880990115, |
| "epoch": 0.4834394904458599, |
| "grad_norm": 0.7941007614135742, |
| "learning_rate": 1.6854982206405692e-06, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9472430489280007, |
| "num_tokens": 477976989.0, |
| "step": 2415 |
| }, |
| { |
| "entropy": 0.7254412935538725, |
| "epoch": 0.48444040036396724, |
| "grad_norm": 0.6573625802993774, |
| "learning_rate": 1.6843861209964413e-06, |
| "loss": 0.1898, |
| "mean_token_accuracy": 0.9490737053481015, |
| "num_tokens": 478923190.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 0.7253351894291964, |
| "epoch": 0.4854413102820746, |
| "grad_norm": 1.5769827365875244, |
| "learning_rate": 1.6832740213523131e-06, |
| "loss": 0.1865, |
| "mean_token_accuracy": 0.9515808555212888, |
| "num_tokens": 479660032.0, |
| "step": 2425 |
| }, |
| { |
| "entropy": 0.6324506296352906, |
| "epoch": 0.48644222020018196, |
| "grad_norm": 0.8730674982070923, |
| "learning_rate": 1.682161921708185e-06, |
| "loss": 0.1722, |
| "mean_token_accuracy": 0.9541928350925446, |
| "num_tokens": 480777956.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.48644222020018196, |
| "eval_entropy": 0.6624198419148805, |
| "eval_loss": 0.18191221356391907, |
| "eval_mean_token_accuracy": 0.948937864577184, |
| "eval_num_tokens": 480777956.0, |
| "eval_runtime": 7.0984, |
| "eval_samples_per_second": 137.073, |
| "eval_steps_per_second": 8.594, |
| "step": 2430 |
| }, |
| { |
| "entropy": 0.6939558029174805, |
| "epoch": 0.4874431301182894, |
| "grad_norm": 0.8086444735527039, |
| "learning_rate": 1.6810498220640568e-06, |
| "loss": 0.1983, |
| "mean_token_accuracy": 0.9463418738408522, |
| "num_tokens": 481850511.0, |
| "step": 2435 |
| }, |
| { |
| "entropy": 0.7101692611520941, |
| "epoch": 0.48844404003639674, |
| "grad_norm": 0.7348644733428955, |
| "learning_rate": 1.6799377224199287e-06, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9481533868746324, |
| "num_tokens": 482899017.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 0.71939834383401, |
| "epoch": 0.4894449499545041, |
| "grad_norm": 0.6322587132453918, |
| "learning_rate": 1.6788256227758006e-06, |
| "loss": 0.1858, |
| "mean_token_accuracy": 0.9497276311570948, |
| "num_tokens": 483867488.0, |
| "step": 2445 |
| }, |
| { |
| "entropy": 0.7129947499795394, |
| "epoch": 0.49044585987261147, |
| "grad_norm": 1.5901869535446167, |
| "learning_rate": 1.6777135231316726e-06, |
| "loss": 0.1817, |
| "mean_token_accuracy": 0.9520208895206451, |
| "num_tokens": 484607972.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 0.6236480347134851, |
| "epoch": 0.49144676979071883, |
| "grad_norm": 0.9655255079269409, |
| "learning_rate": 1.6766014234875443e-06, |
| "loss": 0.1794, |
| "mean_token_accuracy": 0.9524587219411677, |
| "num_tokens": 485738406.0, |
| "step": 2455 |
| }, |
| { |
| "entropy": 0.6700492823665792, |
| "epoch": 0.4924476797088262, |
| "grad_norm": 0.7714277505874634, |
| "learning_rate": 1.6754893238434163e-06, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.9482547689567913, |
| "num_tokens": 486828631.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.4924476797088262, |
| "eval_entropy": 0.6517836798409946, |
| "eval_loss": 0.18481405079364777, |
| "eval_mean_token_accuracy": 0.9480296187713498, |
| "eval_num_tokens": 486828631.0, |
| "eval_runtime": 7.0759, |
| "eval_samples_per_second": 137.509, |
| "eval_steps_per_second": 8.621, |
| "step": 2460 |
| }, |
| { |
| "entropy": 0.7008583583615043, |
| "epoch": 0.49344858962693355, |
| "grad_norm": 0.7727804780006409, |
| "learning_rate": 1.6743772241992882e-06, |
| "loss": 0.1935, |
| "mean_token_accuracy": 0.9479362059723248, |
| "num_tokens": 487864440.0, |
| "step": 2465 |
| }, |
| { |
| "entropy": 0.703695898977193, |
| "epoch": 0.49444949954504097, |
| "grad_norm": 0.6300666928291321, |
| "learning_rate": 1.67326512455516e-06, |
| "loss": 0.1843, |
| "mean_token_accuracy": 0.950631813027642, |
| "num_tokens": 488824928.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 0.7131873472170396, |
| "epoch": 0.49545040946314833, |
| "grad_norm": 1.5668567419052124, |
| "learning_rate": 1.6721530249110321e-06, |
| "loss": 0.1891, |
| "mean_token_accuracy": 0.950605512749065, |
| "num_tokens": 489558417.0, |
| "step": 2475 |
| }, |
| { |
| "entropy": 0.6274211555719376, |
| "epoch": 0.4964513193812557, |
| "grad_norm": 0.8960671424865723, |
| "learning_rate": 1.6710409252669038e-06, |
| "loss": 0.1808, |
| "mean_token_accuracy": 0.9516859596425836, |
| "num_tokens": 490705705.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 0.6673310320485722, |
| "epoch": 0.49745222929936306, |
| "grad_norm": 0.7930068373680115, |
| "learning_rate": 1.6699288256227756e-06, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.9504123080860485, |
| "num_tokens": 491806156.0, |
| "step": 2485 |
| }, |
| { |
| "entropy": 0.6888049499555068, |
| "epoch": 0.4984531392174704, |
| "grad_norm": 0.7340999245643616, |
| "learning_rate": 1.6688167259786477e-06, |
| "loss": 0.1825, |
| "mean_token_accuracy": 0.9502738291567022, |
| "num_tokens": 492861174.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.4984531392174704, |
| "eval_entropy": 0.6484415804753538, |
| "eval_loss": 0.18193961679935455, |
| "eval_mean_token_accuracy": 0.9487551157591773, |
| "eval_num_tokens": 492861174.0, |
| "eval_runtime": 7.0365, |
| "eval_samples_per_second": 138.278, |
| "eval_steps_per_second": 8.669, |
| "step": 2490 |
| }, |
| { |
| "entropy": 0.7058779543096368, |
| "epoch": 0.4994540491355778, |
| "grad_norm": 0.6413493156433105, |
| "learning_rate": 1.6677046263345196e-06, |
| "loss": 0.1878, |
| "mean_token_accuracy": 0.9502467995340174, |
| "num_tokens": 493819119.0, |
| "step": 2495 |
| }, |
| { |
| "entropy": 0.7087403059005737, |
| "epoch": 0.5004549590536852, |
| "grad_norm": 1.787781834602356, |
| "learning_rate": 1.6665925266903912e-06, |
| "loss": 0.1787, |
| "mean_token_accuracy": 0.9526755853132768, |
| "num_tokens": 494543819.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 0.6390963711521842, |
| "epoch": 0.5014558689717925, |
| "grad_norm": 0.9745569825172424, |
| "learning_rate": 1.6654804270462633e-06, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.9514152586460114, |
| "num_tokens": 495662161.0, |
| "step": 2505 |
| }, |
| { |
| "entropy": 0.6857650220394135, |
| "epoch": 0.5024567788898999, |
| "grad_norm": 0.7823364734649658, |
| "learning_rate": 1.6643683274021351e-06, |
| "loss": 0.1904, |
| "mean_token_accuracy": 0.9487736772407185, |
| "num_tokens": 496756895.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 0.7007447817108848, |
| "epoch": 0.5034576888080072, |
| "grad_norm": 0.7864211201667786, |
| "learning_rate": 1.6632562277580072e-06, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.949703172120181, |
| "num_tokens": 497791268.0, |
| "step": 2515 |
| }, |
| { |
| "entropy": 0.7246824242851951, |
| "epoch": 0.5044585987261146, |
| "grad_norm": 0.7322613596916199, |
| "learning_rate": 1.6621441281138788e-06, |
| "loss": 0.1911, |
| "mean_token_accuracy": 0.9495543116872961, |
| "num_tokens": 498738442.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.5044585987261146, |
| "eval_entropy": 0.6589406611489468, |
| "eval_loss": 0.1833851933479309, |
| "eval_mean_token_accuracy": 0.9485713624563373, |
| "eval_num_tokens": 498738442.0, |
| "eval_runtime": 7.0805, |
| "eval_samples_per_second": 137.419, |
| "eval_steps_per_second": 8.615, |
| "step": 2520 |
| }, |
| { |
| "entropy": 0.7190561719916083, |
| "epoch": 0.5054595086442221, |
| "grad_norm": 1.6090604066848755, |
| "learning_rate": 1.6610320284697507e-06, |
| "loss": 0.1893, |
| "mean_token_accuracy": 0.9508534241806377, |
| "num_tokens": 499466730.0, |
| "step": 2525 |
| }, |
| { |
| "entropy": 0.6477170705795288, |
| "epoch": 0.5064604185623294, |
| "grad_norm": 0.9411379098892212, |
| "learning_rate": 1.6599199288256228e-06, |
| "loss": 0.1869, |
| "mean_token_accuracy": 0.950556813586842, |
| "num_tokens": 500573434.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 0.6901047151197087, |
| "epoch": 0.5074613284804368, |
| "grad_norm": 0.8624676465988159, |
| "learning_rate": 1.6588078291814946e-06, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9498747451738878, |
| "num_tokens": 501667610.0, |
| "step": 2535 |
| }, |
| { |
| "entropy": 0.6946311796253378, |
| "epoch": 0.5084622383985441, |
| "grad_norm": 0.7686476707458496, |
| "learning_rate": 1.6576957295373665e-06, |
| "loss": 0.1867, |
| "mean_token_accuracy": 0.9502540312030099, |
| "num_tokens": 502710197.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 0.7155399157242341, |
| "epoch": 0.5094631483166515, |
| "grad_norm": 0.6873733997344971, |
| "learning_rate": 1.6565836298932383e-06, |
| "loss": 0.1864, |
| "mean_token_accuracy": 0.9502958644520153, |
| "num_tokens": 503664692.0, |
| "step": 2545 |
| }, |
| { |
| "entropy": 0.7136711597442627, |
| "epoch": 0.5104640582347588, |
| "grad_norm": 1.6547688245773315, |
| "learning_rate": 1.6554715302491102e-06, |
| "loss": 0.1833, |
| "mean_token_accuracy": 0.9522597675973719, |
| "num_tokens": 504389185.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.5104640582347588, |
| "eval_entropy": 0.6552407546121566, |
| "eval_loss": 0.18142500519752502, |
| "eval_mean_token_accuracy": 0.9480548022223301, |
| "eval_num_tokens": 504389185.0, |
| "eval_runtime": 7.0471, |
| "eval_samples_per_second": 138.071, |
| "eval_steps_per_second": 8.656, |
| "step": 2550 |
| }, |
| { |
| "entropy": 0.6178887445818294, |
| "epoch": 0.5114649681528662, |
| "grad_norm": 0.913406491279602, |
| "learning_rate": 1.6543594306049823e-06, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9535938934846357, |
| "num_tokens": 505531871.0, |
| "step": 2555 |
| }, |
| { |
| "entropy": 0.6729612290859223, |
| "epoch": 0.5124658780709737, |
| "grad_norm": 0.944691002368927, |
| "learning_rate": 1.6532473309608541e-06, |
| "loss": 0.1858, |
| "mean_token_accuracy": 0.9497326493263245, |
| "num_tokens": 506624859.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 0.6915393555706197, |
| "epoch": 0.513466787989081, |
| "grad_norm": 0.7944353818893433, |
| "learning_rate": 1.6521352313167258e-06, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.951506213166497, |
| "num_tokens": 507654709.0, |
| "step": 2565 |
| }, |
| { |
| "entropy": 0.7096973836421967, |
| "epoch": 0.5144676979071884, |
| "grad_norm": 0.6219519972801208, |
| "learning_rate": 1.6510231316725978e-06, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9509352364323356, |
| "num_tokens": 508600841.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 0.7088993831114335, |
| "epoch": 0.5154686078252957, |
| "grad_norm": 1.649739384651184, |
| "learning_rate": 1.6499110320284697e-06, |
| "loss": 0.1774, |
| "mean_token_accuracy": 0.9526963141831485, |
| "num_tokens": 509326741.0, |
| "step": 2575 |
| }, |
| { |
| "entropy": 0.6317986461249265, |
| "epoch": 0.5164695177434031, |
| "grad_norm": 0.8715526461601257, |
| "learning_rate": 1.6487989323843416e-06, |
| "loss": 0.1724, |
| "mean_token_accuracy": 0.9545585063370792, |
| "num_tokens": 510445233.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.5164695177434031, |
| "eval_entropy": 0.65945937154723, |
| "eval_loss": 0.18114233016967773, |
| "eval_mean_token_accuracy": 0.9490967617660272, |
| "eval_num_tokens": 510445233.0, |
| "eval_runtime": 7.1179, |
| "eval_samples_per_second": 136.698, |
| "eval_steps_per_second": 8.57, |
| "step": 2580 |
| }, |
| { |
| "entropy": 0.6800177308646116, |
| "epoch": 0.5174704276615104, |
| "grad_norm": 0.8096470832824707, |
| "learning_rate": 1.6476868327402134e-06, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9498075154694644, |
| "num_tokens": 511535453.0, |
| "step": 2585 |
| }, |
| { |
| "entropy": 0.6936602221293883, |
| "epoch": 0.5184713375796178, |
| "grad_norm": 0.7464794516563416, |
| "learning_rate": 1.6465747330960853e-06, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.950649511272257, |
| "num_tokens": 512578283.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 0.7119996157559482, |
| "epoch": 0.5194722474977252, |
| "grad_norm": 0.6432804465293884, |
| "learning_rate": 1.6454626334519573e-06, |
| "loss": 0.1886, |
| "mean_token_accuracy": 0.9499502631750973, |
| "num_tokens": 513537315.0, |
| "step": 2595 |
| }, |
| { |
| "entropy": 0.7189283625646071, |
| "epoch": 0.5204731574158326, |
| "grad_norm": 1.5802396535873413, |
| "learning_rate": 1.6443505338078292e-06, |
| "loss": 0.1834, |
| "mean_token_accuracy": 0.9515245573087172, |
| "num_tokens": 514261680.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 0.6315281949260018, |
| "epoch": 0.52147406733394, |
| "grad_norm": 0.8882037401199341, |
| "learning_rate": 1.6432384341637008e-06, |
| "loss": 0.1771, |
| "mean_token_accuracy": 0.9534684544259852, |
| "num_tokens": 515406090.0, |
| "step": 2605 |
| }, |
| { |
| "entropy": 0.6785016363317317, |
| "epoch": 0.5224749772520473, |
| "grad_norm": 0.8530360460281372, |
| "learning_rate": 1.642126334519573e-06, |
| "loss": 0.1837, |
| "mean_token_accuracy": 0.9502465849572962, |
| "num_tokens": 516486142.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.5224749772520473, |
| "eval_entropy": 0.6628204916344315, |
| "eval_loss": 0.1812705546617508, |
| "eval_mean_token_accuracy": 0.948223913302187, |
| "eval_num_tokens": 516486142.0, |
| "eval_runtime": 7.0951, |
| "eval_samples_per_second": 137.138, |
| "eval_steps_per_second": 8.598, |
| "step": 2610 |
| }, |
| { |
| "entropy": 0.7054530333388935, |
| "epoch": 0.5234758871701547, |
| "grad_norm": 0.7850746512413025, |
| "learning_rate": 1.6410142348754448e-06, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9509090060537512, |
| "num_tokens": 517532763.0, |
| "step": 2615 |
| }, |
| { |
| "entropy": 0.7255071458491412, |
| "epoch": 0.524476797088262, |
| "grad_norm": 0.6404737234115601, |
| "learning_rate": 1.6399021352313166e-06, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9504103817723014, |
| "num_tokens": 518475304.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 0.7156178160147233, |
| "epoch": 0.5254777070063694, |
| "grad_norm": 1.8426103591918945, |
| "learning_rate": 1.6387900355871887e-06, |
| "loss": 0.1817, |
| "mean_token_accuracy": 0.9521985969760202, |
| "num_tokens": 519192476.0, |
| "step": 2625 |
| }, |
| { |
| "entropy": 0.6239164311777462, |
| "epoch": 0.5264786169244768, |
| "grad_norm": 0.9522146582603455, |
| "learning_rate": 1.6376779359430603e-06, |
| "loss": 0.1754, |
| "mean_token_accuracy": 0.9539390303871849, |
| "num_tokens": 520333033.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 0.6752398347312754, |
| "epoch": 0.5274795268425841, |
| "grad_norm": 0.809895396232605, |
| "learning_rate": 1.6365658362989322e-06, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.9511864114891398, |
| "num_tokens": 521436825.0, |
| "step": 2635 |
| }, |
| { |
| "entropy": 0.690356595678763, |
| "epoch": 0.5284804367606916, |
| "grad_norm": 0.7257580161094666, |
| "learning_rate": 1.6354537366548043e-06, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9509031973101876, |
| "num_tokens": 522480077.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.5284804367606916, |
| "eval_entropy": 0.6481362190402922, |
| "eval_loss": 0.18130838871002197, |
| "eval_mean_token_accuracy": 0.9484924390667775, |
| "eval_num_tokens": 522480077.0, |
| "eval_runtime": 7.2611, |
| "eval_samples_per_second": 134.002, |
| "eval_steps_per_second": 8.401, |
| "step": 2640 |
| }, |
| { |
| "entropy": 0.704322841221636, |
| "epoch": 0.5294813466787989, |
| "grad_norm": 0.6928062438964844, |
| "learning_rate": 1.6343416370106761e-06, |
| "loss": 0.1791, |
| "mean_token_accuracy": 0.9527296142144637, |
| "num_tokens": 523435044.0, |
| "step": 2645 |
| }, |
| { |
| "entropy": 0.7071298014033924, |
| "epoch": 0.5304822565969063, |
| "grad_norm": 1.618299961090088, |
| "learning_rate": 1.633229537366548e-06, |
| "loss": 0.1773, |
| "mean_token_accuracy": 0.9537473727356304, |
| "num_tokens": 524169591.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 0.6280927267941562, |
| "epoch": 0.5314831665150136, |
| "grad_norm": 0.8909013867378235, |
| "learning_rate": 1.6321174377224198e-06, |
| "loss": 0.1787, |
| "mean_token_accuracy": 0.9528289025480097, |
| "num_tokens": 525350154.0, |
| "step": 2655 |
| }, |
| { |
| "entropy": 0.6755202797326174, |
| "epoch": 0.532484076433121, |
| "grad_norm": 0.7887572646141052, |
| "learning_rate": 1.6310053380782917e-06, |
| "loss": 0.1805, |
| "mean_token_accuracy": 0.9506685668771917, |
| "num_tokens": 526444507.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 0.6938620551065965, |
| "epoch": 0.5334849863512284, |
| "grad_norm": 0.7199150919914246, |
| "learning_rate": 1.6298932384341638e-06, |
| "loss": 0.1816, |
| "mean_token_accuracy": 0.9513141361149875, |
| "num_tokens": 527503125.0, |
| "step": 2665 |
| }, |
| { |
| "entropy": 0.7077038569883867, |
| "epoch": 0.5344858962693357, |
| "grad_norm": 0.7154355645179749, |
| "learning_rate": 1.6287811387900354e-06, |
| "loss": 0.1768, |
| "mean_token_accuracy": 0.9529465122656389, |
| "num_tokens": 528449606.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.5344858962693357, |
| "eval_entropy": 0.6517275425254322, |
| "eval_loss": 0.17996351420879364, |
| "eval_mean_token_accuracy": 0.9491330697888234, |
| "eval_num_tokens": 528449606.0, |
| "eval_runtime": 7.0362, |
| "eval_samples_per_second": 138.285, |
| "eval_steps_per_second": 8.669, |
| "step": 2670 |
| }, |
| { |
| "entropy": 0.7160382747650147, |
| "epoch": 0.5354868061874432, |
| "grad_norm": 1.6408438682556152, |
| "learning_rate": 1.6276690391459073e-06, |
| "loss": 0.1817, |
| "mean_token_accuracy": 0.9525136871771379, |
| "num_tokens": 529188247.0, |
| "step": 2675 |
| }, |
| { |
| "entropy": 0.6233914153142409, |
| "epoch": 0.5364877161055505, |
| "grad_norm": 0.9246336221694946, |
| "learning_rate": 1.6265569395017793e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.954184738072482, |
| "num_tokens": 530332960.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 0.6778095136989247, |
| "epoch": 0.5374886260236579, |
| "grad_norm": 0.8488349914550781, |
| "learning_rate": 1.6254448398576512e-06, |
| "loss": 0.184, |
| "mean_token_accuracy": 0.950068386034532, |
| "num_tokens": 531420665.0, |
| "step": 2685 |
| }, |
| { |
| "entropy": 0.6974740269509229, |
| "epoch": 0.5384895359417653, |
| "grad_norm": 0.7441538572311401, |
| "learning_rate": 1.624332740213523e-06, |
| "loss": 0.1816, |
| "mean_token_accuracy": 0.9505076592618769, |
| "num_tokens": 532457212.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 0.7071244949644262, |
| "epoch": 0.5394904458598726, |
| "grad_norm": 0.6449328660964966, |
| "learning_rate": 1.623220640569395e-06, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.9523183091120286, |
| "num_tokens": 533421804.0, |
| "step": 2695 |
| }, |
| { |
| "entropy": 0.7078518325632269, |
| "epoch": 0.54049135577798, |
| "grad_norm": 1.4397194385528564, |
| "learning_rate": 1.6221085409252668e-06, |
| "loss": 0.1795, |
| "mean_token_accuracy": 0.9529074343768034, |
| "num_tokens": 534147596.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.54049135577798, |
| "eval_entropy": 0.6496536575379919, |
| "eval_loss": 0.18150362372398376, |
| "eval_mean_token_accuracy": 0.948933268179659, |
| "eval_num_tokens": 534147596.0, |
| "eval_runtime": 7.1231, |
| "eval_samples_per_second": 136.598, |
| "eval_steps_per_second": 8.564, |
| "step": 2700 |
| }, |
| { |
| "entropy": 0.6244452785361897, |
| "epoch": 0.5414922656960873, |
| "grad_norm": 0.8995323181152344, |
| "learning_rate": 1.6209964412811388e-06, |
| "loss": 0.173, |
| "mean_token_accuracy": 0.9548817623745312, |
| "num_tokens": 535275617.0, |
| "step": 2705 |
| }, |
| { |
| "entropy": 0.6707280232147736, |
| "epoch": 0.5424931756141947, |
| "grad_norm": 0.776010274887085, |
| "learning_rate": 1.6198843416370107e-06, |
| "loss": 0.182, |
| "mean_token_accuracy": 0.9512561657211998, |
| "num_tokens": 536378664.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 0.6926113960417835, |
| "epoch": 0.543494085532302, |
| "grad_norm": 0.7570468783378601, |
| "learning_rate": 1.6187722419928823e-06, |
| "loss": 0.1795, |
| "mean_token_accuracy": 0.9521033758466894, |
| "num_tokens": 537435601.0, |
| "step": 2715 |
| }, |
| { |
| "entropy": 0.7047263752330434, |
| "epoch": 0.5444949954504095, |
| "grad_norm": 0.726445198059082, |
| "learning_rate": 1.6176601423487544e-06, |
| "loss": 0.1784, |
| "mean_token_accuracy": 0.9523573181845925, |
| "num_tokens": 538399382.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 0.7051477047530088, |
| "epoch": 0.5454959053685169, |
| "grad_norm": 1.5214438438415527, |
| "learning_rate": 1.6165480427046263e-06, |
| "loss": 0.179, |
| "mean_token_accuracy": 0.9527212950316343, |
| "num_tokens": 539145348.0, |
| "step": 2725 |
| }, |
| { |
| "entropy": 0.6262573410164226, |
| "epoch": 0.5464968152866242, |
| "grad_norm": 0.8618422150611877, |
| "learning_rate": 1.6154359430604983e-06, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.9551108999685808, |
| "num_tokens": 540256367.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.5464968152866242, |
| "eval_entropy": 0.6563674035619517, |
| "eval_loss": 0.17830216884613037, |
| "eval_mean_token_accuracy": 0.9493675964777587, |
| "eval_num_tokens": 540256367.0, |
| "eval_runtime": 7.0929, |
| "eval_samples_per_second": 137.179, |
| "eval_steps_per_second": 8.6, |
| "step": 2730 |
| }, |
| { |
| "entropy": 0.6828876172954386, |
| "epoch": 0.5474977252047316, |
| "grad_norm": 0.819709062576294, |
| "learning_rate": 1.61432384341637e-06, |
| "loss": 0.1797, |
| "mean_token_accuracy": 0.9517435491085052, |
| "num_tokens": 541348519.0, |
| "step": 2735 |
| }, |
| { |
| "entropy": 0.7007702973755923, |
| "epoch": 0.5484986351228389, |
| "grad_norm": 0.8111042976379395, |
| "learning_rate": 1.6132117437722418e-06, |
| "loss": 0.1811, |
| "mean_token_accuracy": 0.9512450467456471, |
| "num_tokens": 542379850.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 0.720324994488196, |
| "epoch": 0.5494995450409463, |
| "grad_norm": 0.8011656403541565, |
| "learning_rate": 1.612099644128114e-06, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9518745259805159, |
| "num_tokens": 543352827.0, |
| "step": 2745 |
| }, |
| { |
| "entropy": 0.7194907562299209, |
| "epoch": 0.5505004549590536, |
| "grad_norm": 1.7628045082092285, |
| "learning_rate": 1.6109875444839858e-06, |
| "loss": 0.1727, |
| "mean_token_accuracy": 0.9550723487680609, |
| "num_tokens": 544083387.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 0.6324405716224151, |
| "epoch": 0.5515013648771611, |
| "grad_norm": 0.9303938746452332, |
| "learning_rate": 1.6098754448398574e-06, |
| "loss": 0.1765, |
| "mean_token_accuracy": 0.9533820531585, |
| "num_tokens": 545240701.0, |
| "step": 2755 |
| }, |
| { |
| "entropy": 0.6824088400060481, |
| "epoch": 0.5525022747952685, |
| "grad_norm": 0.7898913025856018, |
| "learning_rate": 1.6087633451957295e-06, |
| "loss": 0.1758, |
| "mean_token_accuracy": 0.9526371836662293, |
| "num_tokens": 546313405.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.5525022747952685, |
| "eval_entropy": 0.6607245241032272, |
| "eval_loss": 0.18013106286525726, |
| "eval_mean_token_accuracy": 0.9495335397173147, |
| "eval_num_tokens": 546313405.0, |
| "eval_runtime": 6.9999, |
| "eval_samples_per_second": 139.003, |
| "eval_steps_per_second": 8.714, |
| "step": 2760 |
| }, |
| { |
| "entropy": 0.7003410195762461, |
| "epoch": 0.5535031847133758, |
| "grad_norm": 0.8289366364479065, |
| "learning_rate": 1.6076512455516013e-06, |
| "loss": 0.1794, |
| "mean_token_accuracy": 0.9513268579136241, |
| "num_tokens": 547348958.0, |
| "step": 2765 |
| }, |
| { |
| "entropy": 0.7149239838123321, |
| "epoch": 0.5545040946314832, |
| "grad_norm": 0.6572934985160828, |
| "learning_rate": 1.6065391459074732e-06, |
| "loss": 0.1772, |
| "mean_token_accuracy": 0.9532779801975597, |
| "num_tokens": 548314371.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 0.7151028931140899, |
| "epoch": 0.5555050045495905, |
| "grad_norm": 1.5846747159957886, |
| "learning_rate": 1.605427046263345e-06, |
| "loss": 0.174, |
| "mean_token_accuracy": 0.9540536219423468, |
| "num_tokens": 549045682.0, |
| "step": 2775 |
| }, |
| { |
| "entropy": 0.6212464993650263, |
| "epoch": 0.5565059144676979, |
| "grad_norm": 0.9082188010215759, |
| "learning_rate": 1.604314946619217e-06, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.9555608651854776, |
| "num_tokens": 550218972.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 0.6785877959294753, |
| "epoch": 0.5575068243858052, |
| "grad_norm": 0.8211308121681213, |
| "learning_rate": 1.603202846975089e-06, |
| "loss": 0.178, |
| "mean_token_accuracy": 0.9523198788816278, |
| "num_tokens": 551311414.0, |
| "step": 2785 |
| }, |
| { |
| "entropy": 0.6972319600257006, |
| "epoch": 0.5585077343039127, |
| "grad_norm": 0.7803236246109009, |
| "learning_rate": 1.6020907473309608e-06, |
| "loss": 0.1847, |
| "mean_token_accuracy": 0.9511200400916013, |
| "num_tokens": 552341292.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.5585077343039127, |
| "eval_entropy": 0.6579079383709392, |
| "eval_loss": 0.17929911613464355, |
| "eval_mean_token_accuracy": 0.9495372889471836, |
| "eval_num_tokens": 552341292.0, |
| "eval_runtime": 7.0916, |
| "eval_samples_per_second": 137.205, |
| "eval_steps_per_second": 8.602, |
| "step": 2790 |
| }, |
| { |
| "entropy": 0.7150760515169664, |
| "epoch": 0.5595086442220201, |
| "grad_norm": 0.7405098080635071, |
| "learning_rate": 1.6009786476868327e-06, |
| "loss": 0.1755, |
| "mean_token_accuracy": 0.9534146623177961, |
| "num_tokens": 553288742.0, |
| "step": 2795 |
| }, |
| { |
| "entropy": 0.7124831140041351, |
| "epoch": 0.5605095541401274, |
| "grad_norm": 1.5424504280090332, |
| "learning_rate": 1.5998665480427046e-06, |
| "loss": 0.1741, |
| "mean_token_accuracy": 0.9544130991805684, |
| "num_tokens": 554020302.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 0.6266002264889804, |
| "epoch": 0.5615104640582348, |
| "grad_norm": 0.8750737309455872, |
| "learning_rate": 1.5987544483985764e-06, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.955253835699775, |
| "num_tokens": 555155713.0, |
| "step": 2805 |
| }, |
| { |
| "entropy": 0.6803346837108786, |
| "epoch": 0.5625113739763421, |
| "grad_norm": 0.8157915472984314, |
| "learning_rate": 1.5976423487544483e-06, |
| "loss": 0.1824, |
| "mean_token_accuracy": 0.950763221762397, |
| "num_tokens": 556265345.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 0.6968496783213182, |
| "epoch": 0.5635122838944495, |
| "grad_norm": 0.7749494910240173, |
| "learning_rate": 1.5965302491103203e-06, |
| "loss": 0.1783, |
| "mean_token_accuracy": 0.9526708814230832, |
| "num_tokens": 557289078.0, |
| "step": 2815 |
| }, |
| { |
| "entropy": 0.7060149173844944, |
| "epoch": 0.5645131938125568, |
| "grad_norm": 0.6400516033172607, |
| "learning_rate": 1.595418149466192e-06, |
| "loss": 0.1746, |
| "mean_token_accuracy": 0.9539100381461056, |
| "num_tokens": 558245587.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.5645131938125568, |
| "eval_entropy": 0.6540780389895204, |
| "eval_loss": 0.18272945284843445, |
| "eval_mean_token_accuracy": 0.9483291203858423, |
| "eval_num_tokens": 558245587.0, |
| "eval_runtime": 7.0762, |
| "eval_samples_per_second": 137.503, |
| "eval_steps_per_second": 8.62, |
| "step": 2820 |
| }, |
| { |
| "entropy": 0.7129332022233443, |
| "epoch": 0.5655141037306642, |
| "grad_norm": 1.6981102228164673, |
| "learning_rate": 1.594306049822064e-06, |
| "loss": 0.1727, |
| "mean_token_accuracy": 0.9546095479618419, |
| "num_tokens": 558972025.0, |
| "step": 2825 |
| }, |
| { |
| "entropy": 0.6247928223826669, |
| "epoch": 0.5665150136487717, |
| "grad_norm": 0.9023392796516418, |
| "learning_rate": 1.593193950177936e-06, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.9542244569821792, |
| "num_tokens": 560119210.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 0.6733576593073931, |
| "epoch": 0.567515923566879, |
| "grad_norm": 0.7821236252784729, |
| "learning_rate": 1.5920818505338078e-06, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.951957995783199, |
| "num_tokens": 561195725.0, |
| "step": 2835 |
| }, |
| { |
| "entropy": 0.7028111894022334, |
| "epoch": 0.5685168334849864, |
| "grad_norm": 0.745370626449585, |
| "learning_rate": 1.5909697508896796e-06, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9535439290783622, |
| "num_tokens": 562247195.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 0.7104091267694127, |
| "epoch": 0.5695177434030937, |
| "grad_norm": 0.606145977973938, |
| "learning_rate": 1.5898576512455515e-06, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9540031877431002, |
| "num_tokens": 563213222.0, |
| "step": 2845 |
| }, |
| { |
| "entropy": 0.7080640597776933, |
| "epoch": 0.5705186533212011, |
| "grad_norm": 1.5943220853805542, |
| "learning_rate": 1.5887455516014233e-06, |
| "loss": 0.1775, |
| "mean_token_accuracy": 0.9533238221298564, |
| "num_tokens": 563958366.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.5705186533212011, |
| "eval_entropy": 0.6547147309193846, |
| "eval_loss": 0.18087640404701233, |
| "eval_mean_token_accuracy": 0.9491668822335415, |
| "eval_num_tokens": 563958366.0, |
| "eval_runtime": 7.112, |
| "eval_samples_per_second": 136.811, |
| "eval_steps_per_second": 8.577, |
| "step": 2850 |
| }, |
| { |
| "entropy": 0.6236652623523365, |
| "epoch": 0.5715195632393084, |
| "grad_norm": 0.9184789061546326, |
| "learning_rate": 1.5876334519572954e-06, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9561629105697979, |
| "num_tokens": 565085789.0, |
| "step": 2855 |
| }, |
| { |
| "entropy": 0.677449183030562, |
| "epoch": 0.5725204731574158, |
| "grad_norm": 0.8137527704238892, |
| "learning_rate": 1.586521352313167e-06, |
| "loss": 0.1773, |
| "mean_token_accuracy": 0.9524840208617124, |
| "num_tokens": 566182679.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 0.6954392140561884, |
| "epoch": 0.5735213830755233, |
| "grad_norm": 0.7889710068702698, |
| "learning_rate": 1.585409252669039e-06, |
| "loss": 0.1776, |
| "mean_token_accuracy": 0.9529242997819727, |
| "num_tokens": 567225094.0, |
| "step": 2865 |
| }, |
| { |
| "entropy": 0.6996893541379409, |
| "epoch": 0.5745222929936306, |
| "grad_norm": 0.678756058216095, |
| "learning_rate": 1.584297153024911e-06, |
| "loss": 0.17, |
| "mean_token_accuracy": 0.9549407476728613, |
| "num_tokens": 568176148.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 0.701075277003375, |
| "epoch": 0.575523202911738, |
| "grad_norm": 1.6416276693344116, |
| "learning_rate": 1.5831850533807828e-06, |
| "loss": 0.1726, |
| "mean_token_accuracy": 0.9550453679128127, |
| "num_tokens": 568919638.0, |
| "step": 2875 |
| }, |
| { |
| "entropy": 0.6135414800860665, |
| "epoch": 0.5765241128298453, |
| "grad_norm": 0.9266515374183655, |
| "learning_rate": 1.582072953736655e-06, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.9551390783353285, |
| "num_tokens": 570088747.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.5765241128298453, |
| "eval_entropy": 0.6452143426801338, |
| "eval_loss": 0.17846497893333435, |
| "eval_mean_token_accuracy": 0.9499536039399319, |
| "eval_num_tokens": 570088747.0, |
| "eval_runtime": 7.1363, |
| "eval_samples_per_second": 136.344, |
| "eval_steps_per_second": 8.548, |
| "step": 2880 |
| }, |
| { |
| "entropy": 0.6611321086233313, |
| "epoch": 0.5775250227479527, |
| "grad_norm": 0.8491476774215698, |
| "learning_rate": 1.5809608540925266e-06, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.9547013607892123, |
| "num_tokens": 571174715.0, |
| "step": 2885 |
| }, |
| { |
| "entropy": 0.6863727271556854, |
| "epoch": 0.57852593266606, |
| "grad_norm": 0.7314031720161438, |
| "learning_rate": 1.5798487544483984e-06, |
| "loss": 0.1758, |
| "mean_token_accuracy": 0.9533765917474574, |
| "num_tokens": 572199578.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 0.6956823023882779, |
| "epoch": 0.5795268425841674, |
| "grad_norm": 0.6401289105415344, |
| "learning_rate": 1.5787366548042705e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9559003060514276, |
| "num_tokens": 573157944.0, |
| "step": 2895 |
| }, |
| { |
| "entropy": 0.7030810995535417, |
| "epoch": 0.5805277525022748, |
| "grad_norm": 1.6768332719802856, |
| "learning_rate": 1.5776245551601423e-06, |
| "loss": 0.1739, |
| "mean_token_accuracy": 0.954583527283235, |
| "num_tokens": 573890695.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 0.6071485982699828, |
| "epoch": 0.5815286624203821, |
| "grad_norm": 0.887630045413971, |
| "learning_rate": 1.576512455516014e-06, |
| "loss": 0.1616, |
| "mean_token_accuracy": 0.9568493794311177, |
| "num_tokens": 575040060.0, |
| "step": 2905 |
| }, |
| { |
| "entropy": 0.670040900869803, |
| "epoch": 0.5825295723384896, |
| "grad_norm": 0.8263089060783386, |
| "learning_rate": 1.575400355871886e-06, |
| "loss": 0.1751, |
| "mean_token_accuracy": 0.9534888061610135, |
| "num_tokens": 576133302.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.5825295723384896, |
| "eval_entropy": 0.6536685472629109, |
| "eval_loss": 0.18158204853534698, |
| "eval_mean_token_accuracy": 0.9491394347831851, |
| "eval_num_tokens": 576133302.0, |
| "eval_runtime": 7.0798, |
| "eval_samples_per_second": 137.433, |
| "eval_steps_per_second": 8.616, |
| "step": 2910 |
| }, |
| { |
| "entropy": 0.6866442734544927, |
| "epoch": 0.5835304822565969, |
| "grad_norm": 0.7574155926704407, |
| "learning_rate": 1.574288256227758e-06, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.9546013420278375, |
| "num_tokens": 577182306.0, |
| "step": 2915 |
| }, |
| { |
| "entropy": 0.6976790409196507, |
| "epoch": 0.5845313921747043, |
| "grad_norm": 0.7004702687263489, |
| "learning_rate": 1.57317615658363e-06, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.9545241507616911, |
| "num_tokens": 578143268.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 0.6992522610859437, |
| "epoch": 0.5855323020928116, |
| "grad_norm": 1.5859788656234741, |
| "learning_rate": 1.5720640569395016e-06, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.957004501061006, |
| "num_tokens": 578880587.0, |
| "step": 2925 |
| }, |
| { |
| "entropy": 0.6218387118794702, |
| "epoch": 0.586533212010919, |
| "grad_norm": 0.9949880838394165, |
| "learning_rate": 1.5709519572953735e-06, |
| "loss": 0.1693, |
| "mean_token_accuracy": 0.955338594046506, |
| "num_tokens": 580027281.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 0.6679159836335615, |
| "epoch": 0.5875341219290264, |
| "grad_norm": 0.7988151907920837, |
| "learning_rate": 1.5698398576512456e-06, |
| "loss": 0.175, |
| "mean_token_accuracy": 0.9529053601351651, |
| "num_tokens": 581093873.0, |
| "step": 2935 |
| }, |
| { |
| "entropy": 0.698068075559356, |
| "epoch": 0.5885350318471337, |
| "grad_norm": 0.7373477816581726, |
| "learning_rate": 1.5687277580071174e-06, |
| "loss": 0.17, |
| "mean_token_accuracy": 0.9542166119272059, |
| "num_tokens": 582126221.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.5885350318471337, |
| "eval_entropy": 0.6519819942654156, |
| "eval_loss": 0.1814005970954895, |
| "eval_mean_token_accuracy": 0.9493043364071455, |
| "eval_num_tokens": 582126221.0, |
| "eval_runtime": 7.1253, |
| "eval_samples_per_second": 136.555, |
| "eval_steps_per_second": 8.561, |
| "step": 2940 |
| }, |
| { |
| "entropy": 0.7105110601945357, |
| "epoch": 0.5895359417652412, |
| "grad_norm": 0.6741722822189331, |
| "learning_rate": 1.567615658362989e-06, |
| "loss": 0.1753, |
| "mean_token_accuracy": 0.9536024895581332, |
| "num_tokens": 583074496.0, |
| "step": 2945 |
| }, |
| { |
| "entropy": 0.7069875858046791, |
| "epoch": 0.5905368516833485, |
| "grad_norm": 1.6635076999664307, |
| "learning_rate": 1.5665035587188611e-06, |
| "loss": 0.1708, |
| "mean_token_accuracy": 0.9553401833230799, |
| "num_tokens": 583801034.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 0.6250316668640483, |
| "epoch": 0.5915377616014559, |
| "grad_norm": 0.9111377596855164, |
| "learning_rate": 1.565391459074733e-06, |
| "loss": 0.1675, |
| "mean_token_accuracy": 0.9554700141603296, |
| "num_tokens": 584923018.0, |
| "step": 2955 |
| }, |
| { |
| "entropy": 0.6771905362606049, |
| "epoch": 0.5925386715195632, |
| "grad_norm": 0.8384516835212708, |
| "learning_rate": 1.564279359430605e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.9523442192511125, |
| "num_tokens": 586002885.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 0.6946805049072612, |
| "epoch": 0.5935395814376706, |
| "grad_norm": 0.8512565493583679, |
| "learning_rate": 1.563167259786477e-06, |
| "loss": 0.1748, |
| "mean_token_accuracy": 0.9534324310042641, |
| "num_tokens": 587033715.0, |
| "step": 2965 |
| }, |
| { |
| "entropy": 0.7199535329233516, |
| "epoch": 0.594540491355778, |
| "grad_norm": 0.9583800435066223, |
| "learning_rate": 1.5620551601423486e-06, |
| "loss": 0.1718, |
| "mean_token_accuracy": 0.9540681860663675, |
| "num_tokens": 587973781.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.594540491355778, |
| "eval_entropy": 0.6598269665827516, |
| "eval_loss": 0.17958512902259827, |
| "eval_mean_token_accuracy": 0.9494309190843926, |
| "eval_num_tokens": 587973781.0, |
| "eval_runtime": 7.1961, |
| "eval_samples_per_second": 135.212, |
| "eval_steps_per_second": 8.477, |
| "step": 2970 |
| }, |
| { |
| "entropy": 0.7052282398397273, |
| "epoch": 0.5955414012738853, |
| "grad_norm": 1.6504220962524414, |
| "learning_rate": 1.5609430604982206e-06, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.9569182553074577, |
| "num_tokens": 588703254.0, |
| "step": 2975 |
| }, |
| { |
| "entropy": 0.6265523303638805, |
| "epoch": 0.5965423111919927, |
| "grad_norm": 0.9333195090293884, |
| "learning_rate": 1.5598309608540925e-06, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.95631789077412, |
| "num_tokens": 589845740.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 0.679840994423086, |
| "epoch": 0.5975432211101, |
| "grad_norm": 0.8418663740158081, |
| "learning_rate": 1.5587188612099643e-06, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.9520544881170446, |
| "num_tokens": 590927438.0, |
| "step": 2985 |
| }, |
| { |
| "entropy": 0.7017120361328125, |
| "epoch": 0.5985441310282075, |
| "grad_norm": 0.7685884833335876, |
| "learning_rate": 1.5576067615658362e-06, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.9537156251343814, |
| "num_tokens": 591971110.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 0.7107149928808212, |
| "epoch": 0.5995450409463148, |
| "grad_norm": 0.7431087493896484, |
| "learning_rate": 1.556494661921708e-06, |
| "loss": 0.1694, |
| "mean_token_accuracy": 0.954865367304195, |
| "num_tokens": 592933967.0, |
| "step": 2995 |
| }, |
| { |
| "entropy": 0.7013540771874515, |
| "epoch": 0.6005459508644222, |
| "grad_norm": 1.5792326927185059, |
| "learning_rate": 1.55538256227758e-06, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9574482348832217, |
| "num_tokens": 593674210.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.6005459508644222, |
| "eval_entropy": 0.6583689304648853, |
| "eval_loss": 0.18258357048034668, |
| "eval_mean_token_accuracy": 0.9492516068161511, |
| "eval_num_tokens": 593674210.0, |
| "eval_runtime": 7.2234, |
| "eval_samples_per_second": 134.702, |
| "eval_steps_per_second": 8.445, |
| "step": 3000 |
| }, |
| { |
| "entropy": 0.6216048316522078, |
| "epoch": 0.6015468607825296, |
| "grad_norm": 0.9363070130348206, |
| "learning_rate": 1.554270462633452e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9563505985520103, |
| "num_tokens": 594804599.0, |
| "step": 3005 |
| }, |
| { |
| "entropy": 0.6832993455908515, |
| "epoch": 0.6025477707006369, |
| "grad_norm": 0.8691864609718323, |
| "learning_rate": 1.5531583629893236e-06, |
| "loss": 0.1699, |
| "mean_token_accuracy": 0.9539537310600281, |
| "num_tokens": 595873162.0, |
| "step": 3010 |
| }, |
| { |
| "entropy": 0.7007348017259077, |
| "epoch": 0.6035486806187443, |
| "grad_norm": 0.7877841591835022, |
| "learning_rate": 1.5520462633451957e-06, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9534028855237093, |
| "num_tokens": 596910846.0, |
| "step": 3015 |
| }, |
| { |
| "entropy": 0.7141089824112978, |
| "epoch": 0.6045495905368516, |
| "grad_norm": 0.6574937105178833, |
| "learning_rate": 1.5509341637010676e-06, |
| "loss": 0.1731, |
| "mean_token_accuracy": 0.9544164771383459, |
| "num_tokens": 597853139.0, |
| "step": 3020 |
| }, |
| { |
| "entropy": 0.7040485745126551, |
| "epoch": 0.6055505004549591, |
| "grad_norm": 1.726340889930725, |
| "learning_rate": 1.5498220640569394e-06, |
| "loss": 0.1645, |
| "mean_token_accuracy": 0.9566915214061738, |
| "num_tokens": 598577635.0, |
| "step": 3025 |
| }, |
| { |
| "entropy": 0.6146221702749078, |
| "epoch": 0.6065514103730664, |
| "grad_norm": 0.9489019513130188, |
| "learning_rate": 1.5487099644128113e-06, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9564465994184668, |
| "num_tokens": 599712819.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.6065514103730664, |
| "eval_entropy": 0.6538055861582521, |
| "eval_loss": 0.18010924756526947, |
| "eval_mean_token_accuracy": 0.9496048923398628, |
| "eval_num_tokens": 599712819.0, |
| "eval_runtime": 6.9982, |
| "eval_samples_per_second": 139.035, |
| "eval_steps_per_second": 8.716, |
| "step": 3030 |
| }, |
| { |
| "entropy": 0.6649877328764309, |
| "epoch": 0.6075523202911738, |
| "grad_norm": 0.7716711163520813, |
| "learning_rate": 1.5475978647686831e-06, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9545083528215235, |
| "num_tokens": 600810295.0, |
| "step": 3035 |
| }, |
| { |
| "entropy": 0.6926416860385375, |
| "epoch": 0.6085532302092812, |
| "grad_norm": 0.7539160251617432, |
| "learning_rate": 1.546485765124555e-06, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.9541196048259735, |
| "num_tokens": 601845533.0, |
| "step": 3040 |
| }, |
| { |
| "entropy": 0.7097321336919611, |
| "epoch": 0.6095541401273885, |
| "grad_norm": 0.954349160194397, |
| "learning_rate": 1.545373665480427e-06, |
| "loss": 0.1657, |
| "mean_token_accuracy": 0.9558960502797907, |
| "num_tokens": 602800251.0, |
| "step": 3045 |
| }, |
| { |
| "entropy": 0.7092912332578138, |
| "epoch": 0.6105550500454959, |
| "grad_norm": 1.6350897550582886, |
| "learning_rate": 1.544261565836299e-06, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9553542906587774, |
| "num_tokens": 603536113.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 0.6223157600923018, |
| "epoch": 0.6115559599636032, |
| "grad_norm": 0.913377583026886, |
| "learning_rate": 1.5431494661921708e-06, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9570931732654572, |
| "num_tokens": 604670520.0, |
| "step": 3055 |
| }, |
| { |
| "entropy": 0.6658590446818958, |
| "epoch": 0.6125568698817107, |
| "grad_norm": 0.8451462984085083, |
| "learning_rate": 1.5420373665480426e-06, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9555998970161784, |
| "num_tokens": 605754295.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.6125568698817107, |
| "eval_entropy": 0.6519810190943421, |
| "eval_loss": 0.18205370008945465, |
| "eval_mean_token_accuracy": 0.9495203221430544, |
| "eval_num_tokens": 605754295.0, |
| "eval_runtime": 7.058, |
| "eval_samples_per_second": 137.859, |
| "eval_steps_per_second": 8.643, |
| "step": 3060 |
| }, |
| { |
| "entropy": 0.691884211789478, |
| "epoch": 0.6135577797998181, |
| "grad_norm": 0.7430902123451233, |
| "learning_rate": 1.5409252669039145e-06, |
| "loss": 0.1689, |
| "mean_token_accuracy": 0.9547361292622306, |
| "num_tokens": 606778333.0, |
| "step": 3065 |
| }, |
| { |
| "entropy": 0.7035522225228223, |
| "epoch": 0.6145586897179254, |
| "grad_norm": 0.6056403517723083, |
| "learning_rate": 1.5398131672597866e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9561255595900796, |
| "num_tokens": 607736004.0, |
| "step": 3070 |
| }, |
| { |
| "entropy": 0.7008915657346899, |
| "epoch": 0.6155595996360328, |
| "grad_norm": 1.5326563119888306, |
| "learning_rate": 1.5387010676156582e-06, |
| "loss": 0.1652, |
| "mean_token_accuracy": 0.9567563999782909, |
| "num_tokens": 608474829.0, |
| "step": 3075 |
| }, |
| { |
| "entropy": 0.6191458604552529, |
| "epoch": 0.6165605095541401, |
| "grad_norm": 0.9065665602684021, |
| "learning_rate": 1.53758896797153e-06, |
| "loss": 0.1582, |
| "mean_token_accuracy": 0.9579779603264549, |
| "num_tokens": 609604074.0, |
| "step": 3080 |
| }, |
| { |
| "entropy": 0.6666526832363823, |
| "epoch": 0.6175614194722475, |
| "grad_norm": 0.8556954860687256, |
| "learning_rate": 1.5364768683274021e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9543328859589316, |
| "num_tokens": 610710246.0, |
| "step": 3085 |
| }, |
| { |
| "entropy": 0.6837726238099011, |
| "epoch": 0.6185623293903548, |
| "grad_norm": 0.8487630486488342, |
| "learning_rate": 1.535364768683274e-06, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.955160356651653, |
| "num_tokens": 611758840.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.6185623293903548, |
| "eval_entropy": 0.6574973884176035, |
| "eval_loss": 0.179812490940094, |
| "eval_mean_token_accuracy": 0.949747121724926, |
| "eval_num_tokens": 611758840.0, |
| "eval_runtime": 7.067, |
| "eval_samples_per_second": 137.683, |
| "eval_steps_per_second": 8.632, |
| "step": 3090 |
| }, |
| { |
| "entropy": 0.7014839009805159, |
| "epoch": 0.6195632393084622, |
| "grad_norm": 0.6837453246116638, |
| "learning_rate": 1.5342526690391456e-06, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9557089160789143, |
| "num_tokens": 612699784.0, |
| "step": 3095 |
| }, |
| { |
| "entropy": 0.7075679730285298, |
| "epoch": 0.6205641492265697, |
| "grad_norm": 1.7436314821243286, |
| "learning_rate": 1.5331405693950177e-06, |
| "loss": 0.169, |
| "mean_token_accuracy": 0.9560685184868899, |
| "num_tokens": 613436633.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 0.6202307202599265, |
| "epoch": 0.621565059144677, |
| "grad_norm": 0.9475667476654053, |
| "learning_rate": 1.5320284697508896e-06, |
| "loss": 0.1597, |
| "mean_token_accuracy": 0.9571539521217346, |
| "num_tokens": 614596800.0, |
| "step": 3105 |
| }, |
| { |
| "entropy": 0.6672575666145845, |
| "epoch": 0.6225659690627844, |
| "grad_norm": 0.8185185194015503, |
| "learning_rate": 1.5309163701067616e-06, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9542947286909277, |
| "num_tokens": 615704382.0, |
| "step": 3110 |
| }, |
| { |
| "entropy": 0.6795628358017315, |
| "epoch": 0.6235668789808917, |
| "grad_norm": 0.7307755351066589, |
| "learning_rate": 1.5298042704626333e-06, |
| "loss": 0.1627, |
| "mean_token_accuracy": 0.9561434664509513, |
| "num_tokens": 616757442.0, |
| "step": 3115 |
| }, |
| { |
| "entropy": 0.6966196049343456, |
| "epoch": 0.6245677888989991, |
| "grad_norm": 0.6424974799156189, |
| "learning_rate": 1.5286921708185051e-06, |
| "loss": 0.167, |
| "mean_token_accuracy": 0.9560314021327279, |
| "num_tokens": 617714286.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.6245677888989991, |
| "eval_entropy": 0.6488942484386632, |
| "eval_loss": 0.18082934617996216, |
| "eval_mean_token_accuracy": 0.9497447346077591, |
| "eval_num_tokens": 617714286.0, |
| "eval_runtime": 7.0333, |
| "eval_samples_per_second": 138.342, |
| "eval_steps_per_second": 8.673, |
| "step": 3120 |
| }, |
| { |
| "entropy": 0.6996615810827775, |
| "epoch": 0.6255686988171064, |
| "grad_norm": 1.6187618970870972, |
| "learning_rate": 1.5275800711743772e-06, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9579841077327729, |
| "num_tokens": 618452310.0, |
| "step": 3125 |
| }, |
| { |
| "entropy": 0.6223932883956216, |
| "epoch": 0.6265696087352138, |
| "grad_norm": 0.9200411438941956, |
| "learning_rate": 1.526467971530249e-06, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9576196242462505, |
| "num_tokens": 619611422.0, |
| "step": 3130 |
| }, |
| { |
| "entropy": 0.6693487199870023, |
| "epoch": 0.6275705186533213, |
| "grad_norm": 0.8405710458755493, |
| "learning_rate": 1.525355871886121e-06, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9551270468668505, |
| "num_tokens": 620685183.0, |
| "step": 3135 |
| }, |
| { |
| "entropy": 0.6933458956805143, |
| "epoch": 0.6285714285714286, |
| "grad_norm": 0.9356978535652161, |
| "learning_rate": 1.5242437722419928e-06, |
| "loss": 0.1653, |
| "mean_token_accuracy": 0.9554397490891543, |
| "num_tokens": 621712160.0, |
| "step": 3140 |
| }, |
| { |
| "entropy": 0.6994579350406473, |
| "epoch": 0.629572338489536, |
| "grad_norm": 0.8684320449829102, |
| "learning_rate": 1.5231316725978646e-06, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.9550400712273338, |
| "num_tokens": 622669547.0, |
| "step": 3145 |
| }, |
| { |
| "entropy": 0.7034677063876932, |
| "epoch": 0.6305732484076433, |
| "grad_norm": 1.671410083770752, |
| "learning_rate": 1.5220195729537367e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9572344660758972, |
| "num_tokens": 623402046.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.6305732484076433, |
| "eval_entropy": 0.6508955388772683, |
| "eval_loss": 0.18214590847492218, |
| "eval_mean_token_accuracy": 0.9497071631619187, |
| "eval_num_tokens": 623402046.0, |
| "eval_runtime": 7.0379, |
| "eval_samples_per_second": 138.251, |
| "eval_steps_per_second": 8.667, |
| "step": 3150 |
| }, |
| { |
| "entropy": 0.6169803651896391, |
| "epoch": 0.6315741583257507, |
| "grad_norm": 0.9965471625328064, |
| "learning_rate": 1.5209074733096086e-06, |
| "loss": 0.159, |
| "mean_token_accuracy": 0.9576476606455716, |
| "num_tokens": 624557115.0, |
| "step": 3155 |
| }, |
| { |
| "entropy": 0.6633053736253218, |
| "epoch": 0.632575068243858, |
| "grad_norm": 0.8597959280014038, |
| "learning_rate": 1.5197953736654802e-06, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.9550956438888203, |
| "num_tokens": 625630354.0, |
| "step": 3160 |
| }, |
| { |
| "entropy": 0.6893996604464271, |
| "epoch": 0.6335759781619654, |
| "grad_norm": 0.7524270415306091, |
| "learning_rate": 1.5186832740213523e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9566006205298684, |
| "num_tokens": 626655720.0, |
| "step": 3165 |
| }, |
| { |
| "entropy": 0.7058270321650939, |
| "epoch": 0.6345768880800728, |
| "grad_norm": 0.6807648539543152, |
| "learning_rate": 1.5175711743772241e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9561190323396163, |
| "num_tokens": 627597028.0, |
| "step": 3170 |
| }, |
| { |
| "entropy": 0.703032106702978, |
| "epoch": 0.6355777979981801, |
| "grad_norm": 1.6004669666290283, |
| "learning_rate": 1.516459074733096e-06, |
| "loss": 0.158, |
| "mean_token_accuracy": 0.9586949603124099, |
| "num_tokens": 628310494.0, |
| "step": 3175 |
| }, |
| { |
| "entropy": 0.6183769884434613, |
| "epoch": 0.6365787079162876, |
| "grad_norm": 0.9058781862258911, |
| "learning_rate": 1.5153469750889679e-06, |
| "loss": 0.1559, |
| "mean_token_accuracy": 0.9587956016713922, |
| "num_tokens": 629439229.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.6365787079162876, |
| "eval_entropy": 0.646690167364527, |
| "eval_loss": 0.18040700256824493, |
| "eval_mean_token_accuracy": 0.9494497678318962, |
| "eval_num_tokens": 629439229.0, |
| "eval_runtime": 7.0218, |
| "eval_samples_per_second": 138.569, |
| "eval_steps_per_second": 8.687, |
| "step": 3180 |
| }, |
| { |
| "entropy": 0.6690399782224135, |
| "epoch": 0.6375796178343949, |
| "grad_norm": 0.8933009505271912, |
| "learning_rate": 1.5142348754448397e-06, |
| "loss": 0.1685, |
| "mean_token_accuracy": 0.955499666929245, |
| "num_tokens": 630526225.0, |
| "step": 3185 |
| }, |
| { |
| "entropy": 0.6908215509219603, |
| "epoch": 0.6385805277525023, |
| "grad_norm": 0.7805888056755066, |
| "learning_rate": 1.5131227758007118e-06, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.9543212841857563, |
| "num_tokens": 631544968.0, |
| "step": 3190 |
| }, |
| { |
| "entropy": 0.7041902406649156, |
| "epoch": 0.6395814376706096, |
| "grad_norm": 0.6156824827194214, |
| "learning_rate": 1.5120106761565836e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.957361562685533, |
| "num_tokens": 632498779.0, |
| "step": 3195 |
| }, |
| { |
| "entropy": 0.7018732940608805, |
| "epoch": 0.640582347588717, |
| "grad_norm": 1.7362315654754639, |
| "learning_rate": 1.5108985765124555e-06, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.9588197816501964, |
| "num_tokens": 633231622.0, |
| "step": 3200 |
| }, |
| { |
| "entropy": 0.6234849360856143, |
| "epoch": 0.6415832575068244, |
| "grad_norm": 0.9099482297897339, |
| "learning_rate": 1.5097864768683274e-06, |
| "loss": 0.1613, |
| "mean_token_accuracy": 0.9571101091124795, |
| "num_tokens": 634372922.0, |
| "step": 3205 |
| }, |
| { |
| "entropy": 0.6708677102218975, |
| "epoch": 0.6425841674249317, |
| "grad_norm": 0.8247345089912415, |
| "learning_rate": 1.5086743772241992e-06, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.9549422193657268, |
| "num_tokens": 635453965.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.6425841674249317, |
| "eval_entropy": 0.6494777632541344, |
| "eval_loss": 0.18078424036502838, |
| "eval_mean_token_accuracy": 0.9497652522853164, |
| "eval_num_tokens": 635453965.0, |
| "eval_runtime": 7.0204, |
| "eval_samples_per_second": 138.595, |
| "eval_steps_per_second": 8.689, |
| "step": 3210 |
| }, |
| { |
| "entropy": 0.6948196053504944, |
| "epoch": 0.6435850773430392, |
| "grad_norm": 0.7620670795440674, |
| "learning_rate": 1.507562277580071e-06, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.9559367472475225, |
| "num_tokens": 636472033.0, |
| "step": 3215 |
| }, |
| { |
| "entropy": 0.7079345066439021, |
| "epoch": 0.6445859872611465, |
| "grad_norm": 0.6674084663391113, |
| "learning_rate": 1.5064501779359431e-06, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.9555874754082073, |
| "num_tokens": 637412606.0, |
| "step": 3220 |
| }, |
| { |
| "entropy": 0.7107769147916274, |
| "epoch": 0.6455868971792539, |
| "grad_norm": 1.6964831352233887, |
| "learning_rate": 1.5053380782918148e-06, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.9574269002134149, |
| "num_tokens": 638133615.0, |
| "step": 3225 |
| }, |
| { |
| "entropy": 0.6168817777525295, |
| "epoch": 0.6465878070973612, |
| "grad_norm": 0.9298244118690491, |
| "learning_rate": 1.5042259786476866e-06, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9585052159699526, |
| "num_tokens": 639270319.0, |
| "step": 3230 |
| }, |
| { |
| "entropy": 0.6654906495050951, |
| "epoch": 0.6475887170154686, |
| "grad_norm": 0.8299368023872375, |
| "learning_rate": 1.5031138790035587e-06, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9553114105354655, |
| "num_tokens": 640343322.0, |
| "step": 3235 |
| }, |
| { |
| "entropy": 0.6908837380734357, |
| "epoch": 0.648589626933576, |
| "grad_norm": 0.7933794260025024, |
| "learning_rate": 1.5020017793594306e-06, |
| "loss": 0.1643, |
| "mean_token_accuracy": 0.9559738993644714, |
| "num_tokens": 641375050.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.648589626933576, |
| "eval_entropy": 0.6472069806739932, |
| "eval_loss": 0.18293221294879913, |
| "eval_mean_token_accuracy": 0.9496043148587962, |
| "eval_num_tokens": 641375050.0, |
| "eval_runtime": 7.0336, |
| "eval_samples_per_second": 138.335, |
| "eval_steps_per_second": 8.673, |
| "step": 3240 |
| }, |
| { |
| "entropy": 0.702154829827222, |
| "epoch": 0.6495905368516833, |
| "grad_norm": 0.6860081553459167, |
| "learning_rate": 1.5008896797153024e-06, |
| "loss": 0.1632, |
| "mean_token_accuracy": 0.9570187379013408, |
| "num_tokens": 642328426.0, |
| "step": 3245 |
| }, |
| { |
| "entropy": 0.6984548650004647, |
| "epoch": 0.6505914467697907, |
| "grad_norm": 1.5585992336273193, |
| "learning_rate": 1.4997775800711743e-06, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9584351718425751, |
| "num_tokens": 643060158.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 0.6129773226651278, |
| "epoch": 0.6515923566878981, |
| "grad_norm": 0.9925711750984192, |
| "learning_rate": 1.4986654804270461e-06, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.958180884881453, |
| "num_tokens": 644209016.0, |
| "step": 3255 |
| }, |
| { |
| "entropy": 0.6673231913284822, |
| "epoch": 0.6525932666060055, |
| "grad_norm": 0.8757086992263794, |
| "learning_rate": 1.4975533807829182e-06, |
| "loss": 0.1648, |
| "mean_token_accuracy": 0.9552432694218376, |
| "num_tokens": 645283312.0, |
| "step": 3260 |
| }, |
| { |
| "entropy": 0.6877216878262433, |
| "epoch": 0.6535941765241128, |
| "grad_norm": 0.7658048272132874, |
| "learning_rate": 1.4964412811387899e-06, |
| "loss": 0.1604, |
| "mean_token_accuracy": 0.9569750054316087, |
| "num_tokens": 646304671.0, |
| "step": 3265 |
| }, |
| { |
| "entropy": 0.7039268119768662, |
| "epoch": 0.6545950864422202, |
| "grad_norm": 0.6302322149276733, |
| "learning_rate": 1.4953291814946617e-06, |
| "loss": 0.1626, |
| "mean_token_accuracy": 0.9568539722399279, |
| "num_tokens": 647257247.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.6545950864422202, |
| "eval_entropy": 0.6477563601048266, |
| "eval_loss": 0.1822510063648224, |
| "eval_mean_token_accuracy": 0.9499242520723187, |
| "eval_num_tokens": 647257247.0, |
| "eval_runtime": 7.0229, |
| "eval_samples_per_second": 138.547, |
| "eval_steps_per_second": 8.686, |
| "step": 3270 |
| }, |
| { |
| "entropy": 0.6974148641933094, |
| "epoch": 0.6555959963603276, |
| "grad_norm": 1.5133696794509888, |
| "learning_rate": 1.4942170818505338e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9593302407047966, |
| "num_tokens": 647996410.0, |
| "step": 3275 |
| }, |
| { |
| "entropy": 0.6135740900581533, |
| "epoch": 0.6565969062784349, |
| "grad_norm": 0.9020703434944153, |
| "learning_rate": 1.4931049822064056e-06, |
| "loss": 0.1536, |
| "mean_token_accuracy": 0.9590423145077446, |
| "num_tokens": 649158135.0, |
| "step": 3280 |
| }, |
| { |
| "entropy": 0.6622402567755092, |
| "epoch": 0.6575978161965423, |
| "grad_norm": 0.8561988472938538, |
| "learning_rate": 1.4919928825622777e-06, |
| "loss": 0.1606, |
| "mean_token_accuracy": 0.9568440372293646, |
| "num_tokens": 650238501.0, |
| "step": 3285 |
| }, |
| { |
| "entropy": 0.6838861806826158, |
| "epoch": 0.6585987261146496, |
| "grad_norm": 0.8391448259353638, |
| "learning_rate": 1.4908807829181494e-06, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9573293057355013, |
| "num_tokens": 651266847.0, |
| "step": 3290 |
| }, |
| { |
| "entropy": 0.7013624326749281, |
| "epoch": 0.6595996360327571, |
| "grad_norm": 0.8588127493858337, |
| "learning_rate": 1.4897686832740212e-06, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9582955512133512, |
| "num_tokens": 652220456.0, |
| "step": 3295 |
| }, |
| { |
| "entropy": 0.7058557136492296, |
| "epoch": 0.6606005459508644, |
| "grad_norm": 1.6215109825134277, |
| "learning_rate": 1.4886565836298933e-06, |
| "loss": 0.1558, |
| "mean_token_accuracy": 0.9595519033345309, |
| "num_tokens": 652954302.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.6606005459508644, |
| "eval_entropy": 0.6499209511475484, |
| "eval_loss": 0.1842976212501526, |
| "eval_mean_token_accuracy": 0.948709644255091, |
| "eval_num_tokens": 652954302.0, |
| "eval_runtime": 7.0384, |
| "eval_samples_per_second": 138.241, |
| "eval_steps_per_second": 8.667, |
| "step": 3300 |
| }, |
| { |
| "entropy": 0.621334047480063, |
| "epoch": 0.6616014558689718, |
| "grad_norm": 0.9623603820800781, |
| "learning_rate": 1.4875444839857651e-06, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.9584027409553528, |
| "num_tokens": 654085373.0, |
| "step": 3305 |
| }, |
| { |
| "entropy": 0.6754447023976933, |
| "epoch": 0.6626023657870792, |
| "grad_norm": 0.8001890778541565, |
| "learning_rate": 1.4864323843416368e-06, |
| "loss": 0.1625, |
| "mean_token_accuracy": 0.9564595872705633, |
| "num_tokens": 655160148.0, |
| "step": 3310 |
| }, |
| { |
| "entropy": 0.6801682867787101, |
| "epoch": 0.6636032757051865, |
| "grad_norm": 0.7729578018188477, |
| "learning_rate": 1.4853202846975089e-06, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.956423576853492, |
| "num_tokens": 656214275.0, |
| "step": 3315 |
| }, |
| { |
| "entropy": 0.6868817286057906, |
| "epoch": 0.6646041856232939, |
| "grad_norm": 0.6810210943222046, |
| "learning_rate": 1.4842081850533807e-06, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.9585105110298503, |
| "num_tokens": 657178543.0, |
| "step": 3320 |
| }, |
| { |
| "entropy": 0.7001429920846766, |
| "epoch": 0.6656050955414012, |
| "grad_norm": 1.6506801843643188, |
| "learning_rate": 1.4830960854092528e-06, |
| "loss": 0.1543, |
| "mean_token_accuracy": 0.9600406169891358, |
| "num_tokens": 657908037.0, |
| "step": 3325 |
| }, |
| { |
| "entropy": 0.6132907515222376, |
| "epoch": 0.6666060054595087, |
| "grad_norm": 0.8838356733322144, |
| "learning_rate": 1.4819839857651244e-06, |
| "loss": 0.1537, |
| "mean_token_accuracy": 0.9591153843836351, |
| "num_tokens": 659055420.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.6666060054595087, |
| "eval_entropy": 0.64075055913847, |
| "eval_loss": 0.18135496973991394, |
| "eval_mean_token_accuracy": 0.949864628862162, |
| "eval_num_tokens": 659055420.0, |
| "eval_runtime": 7.0585, |
| "eval_samples_per_second": 137.848, |
| "eval_steps_per_second": 8.642, |
| "step": 3330 |
| }, |
| { |
| "entropy": 0.6668086566708304, |
| "epoch": 0.667606915377616, |
| "grad_norm": 0.8098176121711731, |
| "learning_rate": 1.4808718861209963e-06, |
| "loss": 0.1621, |
| "mean_token_accuracy": 0.9567366887222637, |
| "num_tokens": 660143889.0, |
| "step": 3335 |
| }, |
| { |
| "entropy": 0.6945254163308577, |
| "epoch": 0.6686078252957234, |
| "grad_norm": 0.8045607209205627, |
| "learning_rate": 1.4797597864768684e-06, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.9565726925026287, |
| "num_tokens": 661162373.0, |
| "step": 3340 |
| }, |
| { |
| "entropy": 0.7033212970603596, |
| "epoch": 0.6696087352138308, |
| "grad_norm": 0.6349719762802124, |
| "learning_rate": 1.4786476868327402e-06, |
| "loss": 0.1617, |
| "mean_token_accuracy": 0.9573416639458049, |
| "num_tokens": 662112855.0, |
| "step": 3345 |
| }, |
| { |
| "entropy": 0.7066628607836637, |
| "epoch": 0.6706096451319381, |
| "grad_norm": 1.7244939804077148, |
| "learning_rate": 1.4775355871886119e-06, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.9585809138688174, |
| "num_tokens": 662839231.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 0.6230565829710527, |
| "epoch": 0.6716105550500455, |
| "grad_norm": 0.9981881380081177, |
| "learning_rate": 1.476423487544484e-06, |
| "loss": 0.1544, |
| "mean_token_accuracy": 0.9593272588469766, |
| "num_tokens": 663977485.0, |
| "step": 3355 |
| }, |
| { |
| "entropy": 0.6668084019964392, |
| "epoch": 0.6726114649681528, |
| "grad_norm": 0.8376649618148804, |
| "learning_rate": 1.4753113879003558e-06, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.9574681758880615, |
| "num_tokens": 665089323.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.6726114649681528, |
| "eval_entropy": 0.6494350780229099, |
| "eval_loss": 0.18222181499004364, |
| "eval_mean_token_accuracy": 0.9497965378839461, |
| "eval_num_tokens": 665089323.0, |
| "eval_runtime": 7.0401, |
| "eval_samples_per_second": 138.209, |
| "eval_steps_per_second": 8.665, |
| "step": 3360 |
| }, |
| { |
| "entropy": 0.6925623609261079, |
| "epoch": 0.6736123748862602, |
| "grad_norm": 0.786201000213623, |
| "learning_rate": 1.4741992882562276e-06, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9576037005944685, |
| "num_tokens": 666117675.0, |
| "step": 3365 |
| }, |
| { |
| "entropy": 0.7104368925094604, |
| "epoch": 0.6746132848043676, |
| "grad_norm": 0.736659049987793, |
| "learning_rate": 1.4730871886120997e-06, |
| "loss": 0.1572, |
| "mean_token_accuracy": 0.9585932124744762, |
| "num_tokens": 667063051.0, |
| "step": 3370 |
| }, |
| { |
| "entropy": 0.7050894742662256, |
| "epoch": 0.675614194722475, |
| "grad_norm": 1.705169916152954, |
| "learning_rate": 1.4719750889679714e-06, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9589919149875641, |
| "num_tokens": 667787190.0, |
| "step": 3375 |
| }, |
| { |
| "entropy": 0.6144024074077606, |
| "epoch": 0.6766151046405824, |
| "grad_norm": 0.940698504447937, |
| "learning_rate": 1.4708629893238434e-06, |
| "loss": 0.1504, |
| "mean_token_accuracy": 0.9602103607221083, |
| "num_tokens": 668931786.0, |
| "step": 3380 |
| }, |
| { |
| "entropy": 0.6675797638568011, |
| "epoch": 0.6776160145586897, |
| "grad_norm": 0.859804093837738, |
| "learning_rate": 1.4697508896797153e-06, |
| "loss": 0.1538, |
| "mean_token_accuracy": 0.9588768585161729, |
| "num_tokens": 669987112.0, |
| "step": 3385 |
| }, |
| { |
| "entropy": 0.6947231168096716, |
| "epoch": 0.6786169244767971, |
| "grad_norm": 0.7688744068145752, |
| "learning_rate": 1.4686387900355871e-06, |
| "loss": 0.1608, |
| "mean_token_accuracy": 0.9564679145812989, |
| "num_tokens": 671037275.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.6786169244767971, |
| "eval_entropy": 0.6460012275664533, |
| "eval_loss": 0.18215857446193695, |
| "eval_mean_token_accuracy": 0.9496130151826827, |
| "eval_num_tokens": 671037275.0, |
| "eval_runtime": 7.0121, |
| "eval_samples_per_second": 138.76, |
| "eval_steps_per_second": 8.699, |
| "step": 3390 |
| }, |
| { |
| "entropy": 0.7101943942633542, |
| "epoch": 0.6796178343949044, |
| "grad_norm": 0.6788628101348877, |
| "learning_rate": 1.467526690391459e-06, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9584887022321874, |
| "num_tokens": 672001957.0, |
| "step": 3395 |
| }, |
| { |
| "entropy": 0.7138026226650585, |
| "epoch": 0.6806187443130118, |
| "grad_norm": 1.73914635181427, |
| "learning_rate": 1.4664145907473309e-06, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9598342695019462, |
| "num_tokens": 672725611.0, |
| "step": 3400 |
| }, |
| { |
| "entropy": 0.6170999803326347, |
| "epoch": 0.6816196542311193, |
| "grad_norm": 0.9073975682258606, |
| "learning_rate": 1.4653024911032027e-06, |
| "loss": 0.1485, |
| "mean_token_accuracy": 0.960613077878952, |
| "num_tokens": 673851656.0, |
| "step": 3405 |
| }, |
| { |
| "entropy": 0.6696691445328973, |
| "epoch": 0.6826205641492266, |
| "grad_norm": 0.8153337836265564, |
| "learning_rate": 1.4641903914590748e-06, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.957689621773633, |
| "num_tokens": 674934713.0, |
| "step": 3410 |
| }, |
| { |
| "entropy": 0.6971980799328197, |
| "epoch": 0.683621474067334, |
| "grad_norm": 0.7351928949356079, |
| "learning_rate": 1.4630782918149464e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9583224740895357, |
| "num_tokens": 675967339.0, |
| "step": 3415 |
| }, |
| { |
| "entropy": 0.6999681651592254, |
| "epoch": 0.6846223839854413, |
| "grad_norm": 0.9492703676223755, |
| "learning_rate": 1.4619661921708185e-06, |
| "loss": 0.1508, |
| "mean_token_accuracy": 0.9597205470908772, |
| "num_tokens": 676914390.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.6846223839854413, |
| "eval_entropy": 0.6426876177553271, |
| "eval_loss": 0.18153499066829681, |
| "eval_mean_token_accuracy": 0.9500644890988459, |
| "eval_num_tokens": 676914390.0, |
| "eval_runtime": 7.0075, |
| "eval_samples_per_second": 138.851, |
| "eval_steps_per_second": 8.705, |
| "step": 3420 |
| }, |
| { |
| "entropy": 0.7060933086005124, |
| "epoch": 0.6856232939035487, |
| "grad_norm": 1.5791497230529785, |
| "learning_rate": 1.4608540925266904e-06, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9589927136898041, |
| "num_tokens": 677638601.0, |
| "step": 3425 |
| }, |
| { |
| "entropy": 0.6128041752360084, |
| "epoch": 0.686624203821656, |
| "grad_norm": 0.9395958781242371, |
| "learning_rate": 1.4597419928825622e-06, |
| "loss": 0.1481, |
| "mean_token_accuracy": 0.9608120690692555, |
| "num_tokens": 678785268.0, |
| "step": 3430 |
| }, |
| { |
| "entropy": 0.6682237459854646, |
| "epoch": 0.6876251137397634, |
| "grad_norm": 0.8034733533859253, |
| "learning_rate": 1.458629893238434e-06, |
| "loss": 0.1563, |
| "mean_token_accuracy": 0.9580443588170138, |
| "num_tokens": 679846215.0, |
| "step": 3435 |
| }, |
| { |
| "entropy": 0.6912163682959297, |
| "epoch": 0.6886260236578708, |
| "grad_norm": 0.7852122187614441, |
| "learning_rate": 1.457517793594306e-06, |
| "loss": 0.1562, |
| "mean_token_accuracy": 0.957829516584223, |
| "num_tokens": 680888257.0, |
| "step": 3440 |
| }, |
| { |
| "entropy": 0.7082947004925121, |
| "epoch": 0.6896269335759782, |
| "grad_norm": 0.6746036410331726, |
| "learning_rate": 1.4564056939501778e-06, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.9582202104004947, |
| "num_tokens": 681859900.0, |
| "step": 3445 |
| }, |
| { |
| "entropy": 0.7029309023510326, |
| "epoch": 0.6906278434940856, |
| "grad_norm": 1.6336463689804077, |
| "learning_rate": 1.4552935943060499e-06, |
| "loss": 0.1518, |
| "mean_token_accuracy": 0.9600752207365904, |
| "num_tokens": 682593875.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.6906278434940856, |
| "eval_entropy": 0.650386592403787, |
| "eval_loss": 0.18218755722045898, |
| "eval_mean_token_accuracy": 0.9497824721649045, |
| "eval_num_tokens": 682593875.0, |
| "eval_runtime": 6.9925, |
| "eval_samples_per_second": 139.149, |
| "eval_steps_per_second": 8.724, |
| "step": 3450 |
| }, |
| { |
| "entropy": 0.6209653827277097, |
| "epoch": 0.6916287534121929, |
| "grad_norm": 0.9056838750839233, |
| "learning_rate": 1.4541814946619217e-06, |
| "loss": 0.1525, |
| "mean_token_accuracy": 0.9596792773766951, |
| "num_tokens": 683728552.0, |
| "step": 3455 |
| }, |
| { |
| "entropy": 0.6720722063021226, |
| "epoch": 0.6926296633303003, |
| "grad_norm": 0.848111093044281, |
| "learning_rate": 1.4530693950177934e-06, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9583455557172949, |
| "num_tokens": 684820291.0, |
| "step": 3460 |
| }, |
| { |
| "entropy": 0.6981573473323476, |
| "epoch": 0.6936305732484076, |
| "grad_norm": 0.8181219100952148, |
| "learning_rate": 1.4519572953736654e-06, |
| "loss": 0.1619, |
| "mean_token_accuracy": 0.9571636861020868, |
| "num_tokens": 685843222.0, |
| "step": 3465 |
| }, |
| { |
| "entropy": 0.6986917571587996, |
| "epoch": 0.694631483166515, |
| "grad_norm": 0.6309542059898376, |
| "learning_rate": 1.4508451957295373e-06, |
| "loss": 0.1487, |
| "mean_token_accuracy": 0.960549614646218, |
| "num_tokens": 686795079.0, |
| "step": 3470 |
| }, |
| { |
| "entropy": 0.6992609934373335, |
| "epoch": 0.6956323930846224, |
| "grad_norm": 1.6340988874435425, |
| "learning_rate": 1.4497330960854094e-06, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.9600770901549947, |
| "num_tokens": 687534250.0, |
| "step": 3475 |
| }, |
| { |
| "entropy": 0.6203337536616759, |
| "epoch": 0.6966333030027297, |
| "grad_norm": 0.9461851119995117, |
| "learning_rate": 1.448620996441281e-06, |
| "loss": 0.1462, |
| "mean_token_accuracy": 0.9612834345210682, |
| "num_tokens": 688659595.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.6966333030027297, |
| "eval_entropy": 0.6457425873787677, |
| "eval_loss": 0.18017198145389557, |
| "eval_mean_token_accuracy": 0.9507011683260809, |
| "eval_num_tokens": 688659595.0, |
| "eval_runtime": 7.1026, |
| "eval_samples_per_second": 136.992, |
| "eval_steps_per_second": 8.588, |
| "step": 3480 |
| }, |
| { |
| "entropy": 0.6704084252769297, |
| "epoch": 0.6976342129208372, |
| "grad_norm": 0.8556516766548157, |
| "learning_rate": 1.4475088967971529e-06, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9571811556816101, |
| "num_tokens": 689757561.0, |
| "step": 3485 |
| }, |
| { |
| "entropy": 0.6925178121436726, |
| "epoch": 0.6986351228389445, |
| "grad_norm": 0.7813107967376709, |
| "learning_rate": 1.446396797153025e-06, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.958230844410983, |
| "num_tokens": 690812177.0, |
| "step": 3490 |
| }, |
| { |
| "entropy": 0.716183881055225, |
| "epoch": 0.6996360327570519, |
| "grad_norm": 0.6608054637908936, |
| "learning_rate": 1.4452846975088968e-06, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9585723102092742, |
| "num_tokens": 691767285.0, |
| "step": 3495 |
| }, |
| { |
| "entropy": 0.7106182558970018, |
| "epoch": 0.7006369426751592, |
| "grad_norm": 1.6784389019012451, |
| "learning_rate": 1.4441725978647684e-06, |
| "loss": 0.1527, |
| "mean_token_accuracy": 0.959889015826312, |
| "num_tokens": 692508977.0, |
| "step": 3500 |
| }, |
| { |
| "entropy": 0.6119228395548734, |
| "epoch": 0.7016378525932666, |
| "grad_norm": 0.9040566086769104, |
| "learning_rate": 1.4430604982206405e-06, |
| "loss": 0.1463, |
| "mean_token_accuracy": 0.9611663525754756, |
| "num_tokens": 693677838.0, |
| "step": 3505 |
| }, |
| { |
| "entropy": 0.6678372830152511, |
| "epoch": 0.702638762511374, |
| "grad_norm": 0.8394715189933777, |
| "learning_rate": 1.4419483985765124e-06, |
| "loss": 0.1591, |
| "mean_token_accuracy": 0.9574539330872622, |
| "num_tokens": 694757367.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.702638762511374, |
| "eval_entropy": 0.6457846052333956, |
| "eval_loss": 0.18403349816799164, |
| "eval_mean_token_accuracy": 0.9498394170745474, |
| "eval_num_tokens": 694757367.0, |
| "eval_runtime": 7.199, |
| "eval_samples_per_second": 135.158, |
| "eval_steps_per_second": 8.473, |
| "step": 3510 |
| }, |
| { |
| "entropy": 0.6866304833780635, |
| "epoch": 0.7036396724294813, |
| "grad_norm": 0.7923183441162109, |
| "learning_rate": 1.4408362989323844e-06, |
| "loss": 0.1548, |
| "mean_token_accuracy": 0.9585600186478008, |
| "num_tokens": 695807809.0, |
| "step": 3515 |
| }, |
| { |
| "entropy": 0.698677041313865, |
| "epoch": 0.7046405823475888, |
| "grad_norm": 0.6395448446273804, |
| "learning_rate": 1.439724199288256e-06, |
| "loss": 0.1552, |
| "mean_token_accuracy": 0.9591563501141288, |
| "num_tokens": 696762341.0, |
| "step": 3520 |
| }, |
| { |
| "entropy": 0.7000049211762168, |
| "epoch": 0.7056414922656961, |
| "grad_norm": 1.704610824584961, |
| "learning_rate": 1.438612099644128e-06, |
| "loss": 0.1483, |
| "mean_token_accuracy": 0.9610144132917577, |
| "num_tokens": 697491178.0, |
| "step": 3525 |
| }, |
| { |
| "entropy": 0.616519127108834, |
| "epoch": 0.7066424021838035, |
| "grad_norm": 0.938034176826477, |
| "learning_rate": 1.4375e-06, |
| "loss": 0.1466, |
| "mean_token_accuracy": 0.961005428162488, |
| "num_tokens": 698636465.0, |
| "step": 3530 |
| }, |
| { |
| "entropy": 0.6671069808981636, |
| "epoch": 0.7076433121019108, |
| "grad_norm": 0.8367746472358704, |
| "learning_rate": 1.4363879003558719e-06, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.957984118569981, |
| "num_tokens": 699733654.0, |
| "step": 3535 |
| }, |
| { |
| "entropy": 0.686641216007146, |
| "epoch": 0.7086442220200182, |
| "grad_norm": 0.7497020959854126, |
| "learning_rate": 1.4352758007117437e-06, |
| "loss": 0.1528, |
| "mean_token_accuracy": 0.9585920035839081, |
| "num_tokens": 700769270.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.7086442220200182, |
| "eval_entropy": 0.6463534255496791, |
| "eval_loss": 0.1813378483057022, |
| "eval_mean_token_accuracy": 0.9505206639649438, |
| "eval_num_tokens": 700769270.0, |
| "eval_runtime": 7.0738, |
| "eval_samples_per_second": 137.55, |
| "eval_steps_per_second": 8.623, |
| "step": 3540 |
| }, |
| { |
| "entropy": 0.6957107446410439, |
| "epoch": 0.7096451319381256, |
| "grad_norm": 0.6717329025268555, |
| "learning_rate": 1.4341637010676156e-06, |
| "loss": 0.1505, |
| "mean_token_accuracy": 0.960306400602514, |
| "num_tokens": 701718494.0, |
| "step": 3545 |
| }, |
| { |
| "entropy": 0.7116701342842796, |
| "epoch": 0.7106460418562329, |
| "grad_norm": 1.768558144569397, |
| "learning_rate": 1.4330516014234874e-06, |
| "loss": 0.1514, |
| "mean_token_accuracy": 0.9600772873921828, |
| "num_tokens": 702436336.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 0.6127610867673701, |
| "epoch": 0.7116469517743403, |
| "grad_norm": 1.0162396430969238, |
| "learning_rate": 1.4319395017793595e-06, |
| "loss": 0.1489, |
| "mean_token_accuracy": 0.9602883994579315, |
| "num_tokens": 703581833.0, |
| "step": 3555 |
| }, |
| { |
| "entropy": 0.6664238596504385, |
| "epoch": 0.7126478616924476, |
| "grad_norm": 1.0451632738113403, |
| "learning_rate": 1.4308274021352314e-06, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9591329000212929, |
| "num_tokens": 704674589.0, |
| "step": 3560 |
| }, |
| { |
| "entropy": 0.6827213899655775, |
| "epoch": 0.7136487716105551, |
| "grad_norm": 1.1186326742172241, |
| "learning_rate": 1.429715302491103e-06, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.9586131052537398, |
| "num_tokens": 705709505.0, |
| "step": 3565 |
| }, |
| { |
| "entropy": 0.7037599785761399, |
| "epoch": 0.7146496815286624, |
| "grad_norm": 1.0326136350631714, |
| "learning_rate": 1.428603202846975e-06, |
| "loss": 0.1504, |
| "mean_token_accuracy": 0.9594932919198816, |
| "num_tokens": 706667750.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.7146496815286624, |
| "eval_entropy": 0.646297568180522, |
| "eval_loss": 0.18333254754543304, |
| "eval_mean_token_accuracy": 0.9501433626550143, |
| "eval_num_tokens": 706667750.0, |
| "eval_runtime": 7.0291, |
| "eval_samples_per_second": 138.425, |
| "eval_steps_per_second": 8.678, |
| "step": 3570 |
| }, |
| { |
| "entropy": 0.7007351406595924, |
| "epoch": 0.7156505914467698, |
| "grad_norm": 1.76486337184906, |
| "learning_rate": 1.427491103202847e-06, |
| "loss": 0.1477, |
| "mean_token_accuracy": 0.9616134372624484, |
| "num_tokens": 707404649.0, |
| "step": 3575 |
| }, |
| { |
| "entropy": 0.6178434678099373, |
| "epoch": 0.7166515013648772, |
| "grad_norm": 0.9774680137634277, |
| "learning_rate": 1.4263790035587188e-06, |
| "loss": 0.1466, |
| "mean_token_accuracy": 0.9608237499540503, |
| "num_tokens": 708535434.0, |
| "step": 3580 |
| }, |
| { |
| "entropy": 0.6685388267040253, |
| "epoch": 0.7176524112829845, |
| "grad_norm": 0.8269554972648621, |
| "learning_rate": 1.4252669039145906e-06, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.9589363054795699, |
| "num_tokens": 709623931.0, |
| "step": 3585 |
| }, |
| { |
| "entropy": 0.6837363061579791, |
| "epoch": 0.7186533212010919, |
| "grad_norm": 0.759075939655304, |
| "learning_rate": 1.4241548042704625e-06, |
| "loss": 0.1502, |
| "mean_token_accuracy": 0.9594019618901339, |
| "num_tokens": 710666538.0, |
| "step": 3590 |
| }, |
| { |
| "entropy": 0.6967312319712206, |
| "epoch": 0.7196542311191992, |
| "grad_norm": 0.7646484971046448, |
| "learning_rate": 1.4230427046263344e-06, |
| "loss": 0.1507, |
| "mean_token_accuracy": 0.9600866355679252, |
| "num_tokens": 711616703.0, |
| "step": 3595 |
| }, |
| { |
| "entropy": 0.7033360708843578, |
| "epoch": 0.7206551410373067, |
| "grad_norm": 1.6652473211288452, |
| "learning_rate": 1.4219306049822064e-06, |
| "loss": 0.1527, |
| "mean_token_accuracy": 0.9600472737442364, |
| "num_tokens": 712339902.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.7206551410373067, |
| "eval_entropy": 0.6492331389520989, |
| "eval_loss": 0.18077199161052704, |
| "eval_mean_token_accuracy": 0.9501383842014876, |
| "eval_num_tokens": 712339902.0, |
| "eval_runtime": 7.1169, |
| "eval_samples_per_second": 136.717, |
| "eval_steps_per_second": 8.571, |
| "step": 3600 |
| }, |
| { |
| "entropy": 0.6149807561527599, |
| "epoch": 0.721656050955414, |
| "grad_norm": 0.928352415561676, |
| "learning_rate": 1.420818505338078e-06, |
| "loss": 0.1451, |
| "mean_token_accuracy": 0.9617514366453345, |
| "num_tokens": 713469631.0, |
| "step": 3605 |
| }, |
| { |
| "entropy": 0.6708079273050481, |
| "epoch": 0.7226569608735214, |
| "grad_norm": 0.8746845722198486, |
| "learning_rate": 1.4197064056939501e-06, |
| "loss": 0.1553, |
| "mean_token_accuracy": 0.9591595194556496, |
| "num_tokens": 714568149.0, |
| "step": 3610 |
| }, |
| { |
| "entropy": 0.6869456586512652, |
| "epoch": 0.7236578707916288, |
| "grad_norm": 0.7715699076652527, |
| "learning_rate": 1.418594306049822e-06, |
| "loss": 0.1499, |
| "mean_token_accuracy": 0.9593700820749457, |
| "num_tokens": 715610113.0, |
| "step": 3615 |
| }, |
| { |
| "entropy": 0.6990539791909132, |
| "epoch": 0.7246587807097361, |
| "grad_norm": 0.7243727445602417, |
| "learning_rate": 1.4174822064056939e-06, |
| "loss": 0.1513, |
| "mean_token_accuracy": 0.9602306030013344, |
| "num_tokens": 716584071.0, |
| "step": 3620 |
| }, |
| { |
| "entropy": 0.7046049112623388, |
| "epoch": 0.7256596906278435, |
| "grad_norm": 1.6954907178878784, |
| "learning_rate": 1.416370106761566e-06, |
| "loss": 0.1451, |
| "mean_token_accuracy": 0.962337300452319, |
| "num_tokens": 717316967.0, |
| "step": 3625 |
| }, |
| { |
| "entropy": 0.6160386166789314, |
| "epoch": 0.7266606005459508, |
| "grad_norm": 0.9596716165542603, |
| "learning_rate": 1.4152580071174376e-06, |
| "loss": 0.1451, |
| "mean_token_accuracy": 0.9615418428724463, |
| "num_tokens": 718445509.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.7266606005459508, |
| "eval_entropy": 0.6465183843354709, |
| "eval_loss": 0.1834045797586441, |
| "eval_mean_token_accuracy": 0.94984726143665, |
| "eval_num_tokens": 718445509.0, |
| "eval_runtime": 7.0491, |
| "eval_samples_per_second": 138.031, |
| "eval_steps_per_second": 8.654, |
| "step": 3630 |
| }, |
| { |
| "entropy": 0.6670124200257388, |
| "epoch": 0.7276615104640582, |
| "grad_norm": 0.8411712646484375, |
| "learning_rate": 1.4141459074733094e-06, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.9583657335151325, |
| "num_tokens": 719527229.0, |
| "step": 3635 |
| }, |
| { |
| "entropy": 0.691483823819594, |
| "epoch": 0.7286624203821656, |
| "grad_norm": 0.8118385672569275, |
| "learning_rate": 1.4130338078291815e-06, |
| "loss": 0.152, |
| "mean_token_accuracy": 0.9590539748018438, |
| "num_tokens": 720562230.0, |
| "step": 3640 |
| }, |
| { |
| "entropy": 0.6978759061206471, |
| "epoch": 0.729663330300273, |
| "grad_norm": 0.6536839604377747, |
| "learning_rate": 1.4119217081850534e-06, |
| "loss": 0.1491, |
| "mean_token_accuracy": 0.9607249758460304, |
| "num_tokens": 721510665.0, |
| "step": 3645 |
| }, |
| { |
| "entropy": 0.692423168908466, |
| "epoch": 0.7306642402183804, |
| "grad_norm": 1.5654547214508057, |
| "learning_rate": 1.4108096085409252e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.9622391191395846, |
| "num_tokens": 722236574.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 0.6100410997867585, |
| "epoch": 0.7316651501364877, |
| "grad_norm": 0.9501330852508545, |
| "learning_rate": 1.409697508896797e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.9622214566577565, |
| "num_tokens": 723386101.0, |
| "step": 3655 |
| }, |
| { |
| "entropy": 0.6609509679404172, |
| "epoch": 0.7326660600545951, |
| "grad_norm": 0.8339934945106506, |
| "learning_rate": 1.408585409252669e-06, |
| "loss": 0.1517, |
| "mean_token_accuracy": 0.958848465572704, |
| "num_tokens": 724478073.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.7326660600545951, |
| "eval_entropy": 0.6399203392325855, |
| "eval_loss": 0.18328502774238586, |
| "eval_mean_token_accuracy": 0.9502141133683627, |
| "eval_num_tokens": 724478073.0, |
| "eval_runtime": 7.0325, |
| "eval_samples_per_second": 138.358, |
| "eval_steps_per_second": 8.674, |
| "step": 3660 |
| }, |
| { |
| "entropy": 0.6784357306632128, |
| "epoch": 0.7336669699727024, |
| "grad_norm": 0.7843255996704102, |
| "learning_rate": 1.407473309608541e-06, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.9594681257551366, |
| "num_tokens": 725530725.0, |
| "step": 3665 |
| }, |
| { |
| "entropy": 0.6910421122204173, |
| "epoch": 0.7346678798908098, |
| "grad_norm": 0.6633345484733582, |
| "learning_rate": 1.4063612099644126e-06, |
| "loss": 0.1475, |
| "mean_token_accuracy": 0.9606767145070163, |
| "num_tokens": 726494609.0, |
| "step": 3670 |
| }, |
| { |
| "entropy": 0.6835452020168304, |
| "epoch": 0.7356687898089171, |
| "grad_norm": 1.633773922920227, |
| "learning_rate": 1.4052491103202845e-06, |
| "loss": 0.1456, |
| "mean_token_accuracy": 0.9621837160804055, |
| "num_tokens": 727238125.0, |
| "step": 3675 |
| }, |
| { |
| "entropy": 0.603841777823188, |
| "epoch": 0.7366696997270246, |
| "grad_norm": 0.9435672760009766, |
| "learning_rate": 1.4041370106761566e-06, |
| "loss": 0.1412, |
| "mean_token_accuracy": 0.9622807643630288, |
| "num_tokens": 728383176.0, |
| "step": 3680 |
| }, |
| { |
| "entropy": 0.6598623080687089, |
| "epoch": 0.737670609645132, |
| "grad_norm": 0.9847542643547058, |
| "learning_rate": 1.4030249110320284e-06, |
| "loss": 0.1476, |
| "mean_token_accuracy": 0.960059937021949, |
| "num_tokens": 729471217.0, |
| "step": 3685 |
| }, |
| { |
| "entropy": 0.6746501594781875, |
| "epoch": 0.7386715195632393, |
| "grad_norm": 0.8633275628089905, |
| "learning_rate": 1.4019128113879003e-06, |
| "loss": 0.1535, |
| "mean_token_accuracy": 0.9588108068162745, |
| "num_tokens": 730506240.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.7386715195632393, |
| "eval_entropy": 0.6451824957230052, |
| "eval_loss": 0.1821545660495758, |
| "eval_mean_token_accuracy": 0.9503177375089927, |
| "eval_num_tokens": 730506240.0, |
| "eval_runtime": 7.0055, |
| "eval_samples_per_second": 138.891, |
| "eval_steps_per_second": 8.707, |
| "step": 3690 |
| }, |
| { |
| "entropy": 0.6850697804581035, |
| "epoch": 0.7396724294813467, |
| "grad_norm": 0.7276564836502075, |
| "learning_rate": 1.4008007117437721e-06, |
| "loss": 0.1467, |
| "mean_token_accuracy": 0.9607731239362196, |
| "num_tokens": 731447363.0, |
| "step": 3695 |
| }, |
| { |
| "entropy": 0.6955865442752838, |
| "epoch": 0.740673339399454, |
| "grad_norm": 1.6277596950531006, |
| "learning_rate": 1.399688612099644e-06, |
| "loss": 0.1486, |
| "mean_token_accuracy": 0.9611599418249998, |
| "num_tokens": 732166557.0, |
| "step": 3700 |
| }, |
| { |
| "entropy": 0.605426854707978, |
| "epoch": 0.7416742493175614, |
| "grad_norm": 0.9806519746780396, |
| "learning_rate": 1.398576512455516e-06, |
| "loss": 0.1463, |
| "mean_token_accuracy": 0.9612675450064919, |
| "num_tokens": 733291659.0, |
| "step": 3705 |
| }, |
| { |
| "entropy": 0.6565829529003664, |
| "epoch": 0.7426751592356687, |
| "grad_norm": 0.8315209746360779, |
| "learning_rate": 1.397464412811388e-06, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.9587733626365662, |
| "num_tokens": 734373176.0, |
| "step": 3710 |
| }, |
| { |
| "entropy": 0.6744102640585465, |
| "epoch": 0.7436760691537762, |
| "grad_norm": 0.7913417816162109, |
| "learning_rate": 1.3963523131672596e-06, |
| "loss": 0.1491, |
| "mean_token_accuracy": 0.9598020212216811, |
| "num_tokens": 735417403.0, |
| "step": 3715 |
| }, |
| { |
| "entropy": 0.6832040979103609, |
| "epoch": 0.7446769790718836, |
| "grad_norm": 0.6335570216178894, |
| "learning_rate": 1.3952402135231316e-06, |
| "loss": 0.1478, |
| "mean_token_accuracy": 0.9613436964425174, |
| "num_tokens": 736379877.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.7446769790718836, |
| "eval_entropy": 0.6417029988570292, |
| "eval_loss": 0.18389441072940826, |
| "eval_mean_token_accuracy": 0.9506415773610599, |
| "eval_num_tokens": 736379877.0, |
| "eval_runtime": 6.9931, |
| "eval_samples_per_second": 139.137, |
| "eval_steps_per_second": 8.723, |
| "step": 3720 |
| }, |
| { |
| "entropy": 0.6932811298153617, |
| "epoch": 0.7456778889899909, |
| "grad_norm": 1.6904598474502563, |
| "learning_rate": 1.3941281138790035e-06, |
| "loss": 0.1446, |
| "mean_token_accuracy": 0.9619651420549913, |
| "num_tokens": 737125840.0, |
| "step": 3725 |
| }, |
| { |
| "entropy": 0.6003106886690314, |
| "epoch": 0.7466787989080983, |
| "grad_norm": 0.9378305077552795, |
| "learning_rate": 1.3930160142348756e-06, |
| "loss": 0.14, |
| "mean_token_accuracy": 0.9627970738844438, |
| "num_tokens": 738254662.0, |
| "step": 3730 |
| }, |
| { |
| "entropy": 0.6617660430344668, |
| "epoch": 0.7476797088262056, |
| "grad_norm": 0.8425555229187012, |
| "learning_rate": 1.3919039145907472e-06, |
| "loss": 0.1504, |
| "mean_token_accuracy": 0.9594602817838842, |
| "num_tokens": 739351062.0, |
| "step": 3735 |
| }, |
| { |
| "entropy": 0.6762841075658799, |
| "epoch": 0.748680618744313, |
| "grad_norm": 0.7894054055213928, |
| "learning_rate": 1.390791814946619e-06, |
| "loss": 0.1473, |
| "mean_token_accuracy": 0.9604934166778217, |
| "num_tokens": 740390420.0, |
| "step": 3740 |
| }, |
| { |
| "entropy": 0.6912749702280218, |
| "epoch": 0.7496815286624203, |
| "grad_norm": 0.674893856048584, |
| "learning_rate": 1.3896797153024911e-06, |
| "loss": 0.1498, |
| "mean_token_accuracy": 0.9602149833332408, |
| "num_tokens": 741347590.0, |
| "step": 3745 |
| }, |
| { |
| "entropy": 0.6899059181863612, |
| "epoch": 0.7506824385805277, |
| "grad_norm": 1.6593986749649048, |
| "learning_rate": 1.388567615658363e-06, |
| "loss": 0.1446, |
| "mean_token_accuracy": 0.9622206286950545, |
| "num_tokens": 742079535.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.7506824385805277, |
| "eval_entropy": 0.6426626101869052, |
| "eval_loss": 0.18340618908405304, |
| "eval_mean_token_accuracy": 0.950067000310929, |
| "eval_num_tokens": 742079535.0, |
| "eval_runtime": 7.0509, |
| "eval_samples_per_second": 137.997, |
| "eval_steps_per_second": 8.651, |
| "step": 3750 |
| }, |
| { |
| "entropy": 0.6002561170946468, |
| "epoch": 0.7516833484986352, |
| "grad_norm": 0.9747891426086426, |
| "learning_rate": 1.3874555160142347e-06, |
| "loss": 0.1398, |
| "mean_token_accuracy": 0.9628812367265874, |
| "num_tokens": 743216959.0, |
| "step": 3755 |
| }, |
| { |
| "entropy": 0.6451197315346111, |
| "epoch": 0.7526842584167425, |
| "grad_norm": 0.8404658436775208, |
| "learning_rate": 1.3863434163701067e-06, |
| "loss": 0.1482, |
| "mean_token_accuracy": 0.9605841652913527, |
| "num_tokens": 744296247.0, |
| "step": 3760 |
| }, |
| { |
| "entropy": 0.6663664116100831, |
| "epoch": 0.7536851683348499, |
| "grad_norm": 0.8358835577964783, |
| "learning_rate": 1.3852313167259786e-06, |
| "loss": 0.1478, |
| "mean_token_accuracy": 0.9607979530637915, |
| "num_tokens": 745340381.0, |
| "step": 3765 |
| }, |
| { |
| "entropy": 0.6912969998338006, |
| "epoch": 0.7546860782529572, |
| "grad_norm": 0.6832510828971863, |
| "learning_rate": 1.3841192170818504e-06, |
| "loss": 0.1438, |
| "mean_token_accuracy": 0.9614791436628862, |
| "num_tokens": 746282199.0, |
| "step": 3770 |
| }, |
| { |
| "entropy": 0.6806035925041546, |
| "epoch": 0.7556869881710646, |
| "grad_norm": 1.5931488275527954, |
| "learning_rate": 1.3830071174377223e-06, |
| "loss": 0.1384, |
| "mean_token_accuracy": 0.9636109731414101, |
| "num_tokens": 747012358.0, |
| "step": 3775 |
| }, |
| { |
| "entropy": 0.599098454280333, |
| "epoch": 0.756687898089172, |
| "grad_norm": 0.974331796169281, |
| "learning_rate": 1.3818950177935942e-06, |
| "loss": 0.1401, |
| "mean_token_accuracy": 0.962726751240817, |
| "num_tokens": 748131245.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.756687898089172, |
| "eval_entropy": 0.6385055829267032, |
| "eval_loss": 0.18179599940776825, |
| "eval_mean_token_accuracy": 0.9507253433837265, |
| "eval_num_tokens": 748131245.0, |
| "eval_runtime": 7.076, |
| "eval_samples_per_second": 137.507, |
| "eval_steps_per_second": 8.621, |
| "step": 3780 |
| }, |
| { |
| "entropy": 0.6537848071618514, |
| "epoch": 0.7576888080072793, |
| "grad_norm": 0.834989607334137, |
| "learning_rate": 1.3807829181494662e-06, |
| "loss": 0.1487, |
| "mean_token_accuracy": 0.9598635267127644, |
| "num_tokens": 749231997.0, |
| "step": 3785 |
| }, |
| { |
| "entropy": 0.677148444273255, |
| "epoch": 0.7586897179253868, |
| "grad_norm": 0.8472671508789062, |
| "learning_rate": 1.379670818505338e-06, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.959491520578211, |
| "num_tokens": 750255932.0, |
| "step": 3790 |
| }, |
| { |
| "entropy": 0.6869501252066005, |
| "epoch": 0.7596906278434941, |
| "grad_norm": 0.8762997984886169, |
| "learning_rate": 1.37855871886121e-06, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.9598960773511367, |
| "num_tokens": 751208745.0, |
| "step": 3795 |
| }, |
| { |
| "entropy": 0.6860447016629305, |
| "epoch": 0.7606915377616015, |
| "grad_norm": 1.5384591817855835, |
| "learning_rate": 1.3774466192170818e-06, |
| "loss": 0.1446, |
| "mean_token_accuracy": 0.962565876678987, |
| "num_tokens": 751930047.0, |
| "step": 3800 |
| }, |
| { |
| "entropy": 0.6060644594105807, |
| "epoch": 0.7616924476797088, |
| "grad_norm": 1.1648614406585693, |
| "learning_rate": 1.3763345195729537e-06, |
| "loss": 0.1388, |
| "mean_token_accuracy": 0.9629409508271651, |
| "num_tokens": 753058832.0, |
| "step": 3805 |
| }, |
| { |
| "entropy": 0.6572167962789536, |
| "epoch": 0.7626933575978162, |
| "grad_norm": 0.8445199131965637, |
| "learning_rate": 1.3752224199288255e-06, |
| "loss": 0.1498, |
| "mean_token_accuracy": 0.9599610285325484, |
| "num_tokens": 754149822.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.7626933575978162, |
| "eval_entropy": 0.6403389221332112, |
| "eval_loss": 0.18195439875125885, |
| "eval_mean_token_accuracy": 0.9507655999699577, |
| "eval_num_tokens": 754149822.0, |
| "eval_runtime": 7.0319, |
| "eval_samples_per_second": 138.37, |
| "eval_steps_per_second": 8.675, |
| "step": 3810 |
| }, |
| { |
| "entropy": 0.6741142516786401, |
| "epoch": 0.7636942675159236, |
| "grad_norm": 0.7991525530815125, |
| "learning_rate": 1.3741103202846976e-06, |
| "loss": 0.1478, |
| "mean_token_accuracy": 0.9602746784687042, |
| "num_tokens": 755198711.0, |
| "step": 3815 |
| }, |
| { |
| "entropy": 0.6725743767890063, |
| "epoch": 0.7646951774340309, |
| "grad_norm": 0.7079398036003113, |
| "learning_rate": 1.3729982206405692e-06, |
| "loss": 0.1394, |
| "mean_token_accuracy": 0.9624946919354526, |
| "num_tokens": 756163341.0, |
| "step": 3820 |
| }, |
| { |
| "entropy": 0.6698882056908174, |
| "epoch": 0.7656960873521383, |
| "grad_norm": 1.74517822265625, |
| "learning_rate": 1.3718861209964413e-06, |
| "loss": 0.1407, |
| "mean_token_accuracy": 0.9628500087694688, |
| "num_tokens": 756905491.0, |
| "step": 3825 |
| }, |
| { |
| "entropy": 0.5904871030287309, |
| "epoch": 0.7666969972702457, |
| "grad_norm": 0.9414699077606201, |
| "learning_rate": 1.3707740213523132e-06, |
| "loss": 0.141, |
| "mean_token_accuracy": 0.9624408916993574, |
| "num_tokens": 758060283.0, |
| "step": 3830 |
| }, |
| { |
| "entropy": 0.6385924938050184, |
| "epoch": 0.7676979071883531, |
| "grad_norm": 0.8462045788764954, |
| "learning_rate": 1.369661921708185e-06, |
| "loss": 0.1453, |
| "mean_token_accuracy": 0.9610617979006334, |
| "num_tokens": 759147397.0, |
| "step": 3835 |
| }, |
| { |
| "entropy": 0.6599980823018334, |
| "epoch": 0.7686988171064604, |
| "grad_norm": 0.7839226126670837, |
| "learning_rate": 1.3685498220640569e-06, |
| "loss": 0.1471, |
| "mean_token_accuracy": 0.9601600359786641, |
| "num_tokens": 760195635.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.7686988171064604, |
| "eval_entropy": 0.6299580189048267, |
| "eval_loss": 0.18395261466503143, |
| "eval_mean_token_accuracy": 0.9503186227845364, |
| "eval_num_tokens": 760195635.0, |
| "eval_runtime": 7.0623, |
| "eval_samples_per_second": 137.773, |
| "eval_steps_per_second": 8.637, |
| "step": 3840 |
| }, |
| { |
| "entropy": 0.6697973516854373, |
| "epoch": 0.7696997270245678, |
| "grad_norm": 0.7063069939613342, |
| "learning_rate": 1.3674377224199287e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.962298633293672, |
| "num_tokens": 761156776.0, |
| "step": 3845 |
| }, |
| { |
| "entropy": 0.6719630772417242, |
| "epoch": 0.7707006369426752, |
| "grad_norm": 1.6459494829177856, |
| "learning_rate": 1.3663256227758006e-06, |
| "loss": 0.1417, |
| "mean_token_accuracy": 0.9631251654841683, |
| "num_tokens": 761884891.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 0.5801961367780512, |
| "epoch": 0.7717015468607825, |
| "grad_norm": 0.9611675143241882, |
| "learning_rate": 1.3652135231316726e-06, |
| "loss": 0.1371, |
| "mean_token_accuracy": 0.963437082008882, |
| "num_tokens": 763026278.0, |
| "step": 3855 |
| }, |
| { |
| "entropy": 0.6409909424456683, |
| "epoch": 0.7727024567788899, |
| "grad_norm": 0.904933750629425, |
| "learning_rate": 1.3641014234875443e-06, |
| "loss": 0.145, |
| "mean_token_accuracy": 0.9613065215674313, |
| "num_tokens": 764133161.0, |
| "step": 3860 |
| }, |
| { |
| "entropy": 0.6667505315758965, |
| "epoch": 0.7737033666969972, |
| "grad_norm": 0.8145974278450012, |
| "learning_rate": 1.3629893238434162e-06, |
| "loss": 0.1426, |
| "mean_token_accuracy": 0.9608027013865384, |
| "num_tokens": 765182864.0, |
| "step": 3865 |
| }, |
| { |
| "entropy": 0.6728833125396209, |
| "epoch": 0.7747042766151047, |
| "grad_norm": 0.6522021889686584, |
| "learning_rate": 1.3618772241992882e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.96253671429374, |
| "num_tokens": 766151392.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.7747042766151047, |
| "eval_entropy": 0.6360755171932158, |
| "eval_loss": 0.18470442295074463, |
| "eval_mean_token_accuracy": 0.9502559939368826, |
| "eval_num_tokens": 766151392.0, |
| "eval_runtime": 7.0452, |
| "eval_samples_per_second": 138.108, |
| "eval_steps_per_second": 8.658, |
| "step": 3870 |
| }, |
| { |
| "entropy": 0.6780615603381938, |
| "epoch": 0.775705186533212, |
| "grad_norm": 1.7449678182601929, |
| "learning_rate": 1.36076512455516e-06, |
| "loss": 0.1426, |
| "mean_token_accuracy": 0.9625177778980949, |
| "num_tokens": 766878686.0, |
| "step": 3875 |
| }, |
| { |
| "entropy": 0.5946085046638142, |
| "epoch": 0.7767060964513194, |
| "grad_norm": 0.929898202419281, |
| "learning_rate": 1.3596530249110321e-06, |
| "loss": 0.142, |
| "mean_token_accuracy": 0.9622558994726701, |
| "num_tokens": 768017789.0, |
| "step": 3880 |
| }, |
| { |
| "entropy": 0.6389347041195089, |
| "epoch": 0.7777070063694268, |
| "grad_norm": 0.8491219878196716, |
| "learning_rate": 1.3585409252669038e-06, |
| "loss": 0.1447, |
| "mean_token_accuracy": 0.9610752945596521, |
| "num_tokens": 769116436.0, |
| "step": 3885 |
| }, |
| { |
| "entropy": 0.652755316008221, |
| "epoch": 0.7787079162875341, |
| "grad_norm": 0.7734766006469727, |
| "learning_rate": 1.3574288256227757e-06, |
| "loss": 0.1448, |
| "mean_token_accuracy": 0.9613142360340465, |
| "num_tokens": 770163077.0, |
| "step": 3890 |
| }, |
| { |
| "entropy": 0.6724637372927232, |
| "epoch": 0.7797088262056415, |
| "grad_norm": 0.7407189607620239, |
| "learning_rate": 1.3563167259786477e-06, |
| "loss": 0.1449, |
| "mean_token_accuracy": 0.9617013438181443, |
| "num_tokens": 771128050.0, |
| "step": 3895 |
| }, |
| { |
| "entropy": 0.6701836732300845, |
| "epoch": 0.7807097361237488, |
| "grad_norm": 1.544957160949707, |
| "learning_rate": 1.3552046263345196e-06, |
| "loss": 0.1402, |
| "mean_token_accuracy": 0.9635271229527214, |
| "num_tokens": 771862832.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.7807097361237488, |
| "eval_entropy": 0.6336434786436987, |
| "eval_loss": 0.18497776985168457, |
| "eval_mean_token_accuracy": 0.9496565441616246, |
| "eval_num_tokens": 771862832.0, |
| "eval_runtime": 7.2324, |
| "eval_samples_per_second": 134.533, |
| "eval_steps_per_second": 8.434, |
| "step": 3900 |
| }, |
| { |
| "entropy": 0.6014582438902422, |
| "epoch": 0.7817106460418562, |
| "grad_norm": 0.9415779709815979, |
| "learning_rate": 1.3540925266903912e-06, |
| "loss": 0.1389, |
| "mean_token_accuracy": 0.9630598826841874, |
| "num_tokens": 772987977.0, |
| "step": 3905 |
| }, |
| { |
| "entropy": 0.6386843873695893, |
| "epoch": 0.7827115559599636, |
| "grad_norm": 0.8790847063064575, |
| "learning_rate": 1.3529804270462633e-06, |
| "loss": 0.1427, |
| "mean_token_accuracy": 0.9621189491315322, |
| "num_tokens": 774081270.0, |
| "step": 3910 |
| }, |
| { |
| "entropy": 0.6616236562078649, |
| "epoch": 0.783712465878071, |
| "grad_norm": 0.8104801177978516, |
| "learning_rate": 1.3518683274021352e-06, |
| "loss": 0.1497, |
| "mean_token_accuracy": 0.9597180469469591, |
| "num_tokens": 775149795.0, |
| "step": 3915 |
| }, |
| { |
| "entropy": 0.6556011256846515, |
| "epoch": 0.7847133757961784, |
| "grad_norm": 0.672337532043457, |
| "learning_rate": 1.3507562277580072e-06, |
| "loss": 0.1356, |
| "mean_token_accuracy": 0.9633873435583982, |
| "num_tokens": 776126187.0, |
| "step": 3920 |
| }, |
| { |
| "entropy": 0.6662999545986003, |
| "epoch": 0.7857142857142857, |
| "grad_norm": 1.6722906827926636, |
| "learning_rate": 1.3496441281138789e-06, |
| "loss": 0.1377, |
| "mean_token_accuracy": 0.9638949724760922, |
| "num_tokens": 776852493.0, |
| "step": 3925 |
| }, |
| { |
| "entropy": 0.5871605239131233, |
| "epoch": 0.7867151956323931, |
| "grad_norm": 0.9676291942596436, |
| "learning_rate": 1.3485320284697507e-06, |
| "loss": 0.1399, |
| "mean_token_accuracy": 0.9627551940354434, |
| "num_tokens": 777990863.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.7867151956323931, |
| "eval_entropy": 0.6308732477367901, |
| "eval_loss": 0.18097124993801117, |
| "eval_mean_token_accuracy": 0.9505264221644792, |
| "eval_num_tokens": 777990863.0, |
| "eval_runtime": 7.0433, |
| "eval_samples_per_second": 138.145, |
| "eval_steps_per_second": 8.661, |
| "step": 3930 |
| }, |
| { |
| "entropy": 0.6423692865805193, |
| "epoch": 0.7877161055505004, |
| "grad_norm": 0.8802201747894287, |
| "learning_rate": 1.3474199288256228e-06, |
| "loss": 0.1438, |
| "mean_token_accuracy": 0.961167681759054, |
| "num_tokens": 779081702.0, |
| "step": 3935 |
| }, |
| { |
| "entropy": 0.6667531590570103, |
| "epoch": 0.7887170154686078, |
| "grad_norm": 0.7931551933288574, |
| "learning_rate": 1.3463078291814947e-06, |
| "loss": 0.14, |
| "mean_token_accuracy": 0.9613506761464206, |
| "num_tokens": 780111893.0, |
| "step": 3940 |
| }, |
| { |
| "entropy": 0.6715754471041939, |
| "epoch": 0.7897179253867151, |
| "grad_norm": 0.7701306939125061, |
| "learning_rate": 1.3451957295373665e-06, |
| "loss": 0.1411, |
| "mean_token_accuracy": 0.9625847984444011, |
| "num_tokens": 781073403.0, |
| "step": 3945 |
| }, |
| { |
| "entropy": 0.6649499947374518, |
| "epoch": 0.7907188353048226, |
| "grad_norm": 1.5195958614349365, |
| "learning_rate": 1.3440836298932384e-06, |
| "loss": 0.1359, |
| "mean_token_accuracy": 0.9639710442586379, |
| "num_tokens": 781810124.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 0.583565341071649, |
| "epoch": 0.79171974522293, |
| "grad_norm": 0.8992461562156677, |
| "learning_rate": 1.3429715302491102e-06, |
| "loss": 0.1341, |
| "mean_token_accuracy": 0.9644046192819422, |
| "num_tokens": 782971983.0, |
| "step": 3955 |
| }, |
| { |
| "entropy": 0.640065108646046, |
| "epoch": 0.7927206551410373, |
| "grad_norm": 0.8525144457817078, |
| "learning_rate": 1.3418594306049823e-06, |
| "loss": 0.14, |
| "mean_token_accuracy": 0.9617968380451203, |
| "num_tokens": 784058621.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.7927206551410373, |
| "eval_entropy": 0.635074506529042, |
| "eval_loss": 0.18335242569446564, |
| "eval_mean_token_accuracy": 0.9508119587038384, |
| "eval_num_tokens": 784058621.0, |
| "eval_runtime": 7.1026, |
| "eval_samples_per_second": 136.992, |
| "eval_steps_per_second": 8.588, |
| "step": 3960 |
| }, |
| { |
| "entropy": 0.6650575345212763, |
| "epoch": 0.7937215650591447, |
| "grad_norm": 0.8279107213020325, |
| "learning_rate": 1.3407473309608542e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.9613479056141593, |
| "num_tokens": 785094205.0, |
| "step": 3965 |
| }, |
| { |
| "entropy": 0.6705412731929259, |
| "epoch": 0.794722474977252, |
| "grad_norm": 0.6302039623260498, |
| "learning_rate": 1.3396352313167258e-06, |
| "loss": 0.1407, |
| "mean_token_accuracy": 0.9632759739052166, |
| "num_tokens": 786054123.0, |
| "step": 3970 |
| }, |
| { |
| "entropy": 0.672145242582668, |
| "epoch": 0.7957233848953594, |
| "grad_norm": 1.680174708366394, |
| "learning_rate": 1.3385231316725979e-06, |
| "loss": 0.1349, |
| "mean_token_accuracy": 0.964633910222487, |
| "num_tokens": 786786537.0, |
| "step": 3975 |
| }, |
| { |
| "entropy": 0.5964198713952845, |
| "epoch": 0.7967242948134667, |
| "grad_norm": 0.9663762450218201, |
| "learning_rate": 1.3374110320284697e-06, |
| "loss": 0.1359, |
| "mean_token_accuracy": 0.963967761668292, |
| "num_tokens": 787899948.0, |
| "step": 3980 |
| }, |
| { |
| "entropy": 0.6433003907853907, |
| "epoch": 0.7977252047315742, |
| "grad_norm": 0.8532348275184631, |
| "learning_rate": 1.3362989323843416e-06, |
| "loss": 0.1421, |
| "mean_token_accuracy": 0.961438084190542, |
| "num_tokens": 788992170.0, |
| "step": 3985 |
| }, |
| { |
| "entropy": 0.6622095113450831, |
| "epoch": 0.7987261146496816, |
| "grad_norm": 0.762026309967041, |
| "learning_rate": 1.3351868327402134e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.9612802380865271, |
| "num_tokens": 790043605.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.7987261146496816, |
| "eval_entropy": 0.6395599700388361, |
| "eval_loss": 0.18441607058048248, |
| "eval_mean_token_accuracy": 0.9507672083182414, |
| "eval_num_tokens": 790043605.0, |
| "eval_runtime": 7.0864, |
| "eval_samples_per_second": 137.305, |
| "eval_steps_per_second": 8.608, |
| "step": 3990 |
| }, |
| { |
| "entropy": 0.6702088984576139, |
| "epoch": 0.7997270245677889, |
| "grad_norm": 0.6676604151725769, |
| "learning_rate": 1.3340747330960853e-06, |
| "loss": 0.1413, |
| "mean_token_accuracy": 0.9625859986652028, |
| "num_tokens": 791005452.0, |
| "step": 3995 |
| }, |
| { |
| "entropy": 0.672667917880145, |
| "epoch": 0.8007279344858963, |
| "grad_norm": 1.7214157581329346, |
| "learning_rate": 1.3329626334519572e-06, |
| "loss": 0.1364, |
| "mean_token_accuracy": 0.9645379922606728, |
| "num_tokens": 791732168.0, |
| "step": 4000 |
| }, |
| { |
| "entropy": 0.5873585137453946, |
| "epoch": 0.8017288444040036, |
| "grad_norm": 0.9482730031013489, |
| "learning_rate": 1.3318505338078292e-06, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9647230906919999, |
| "num_tokens": 792872336.0, |
| "step": 4005 |
| }, |
| { |
| "entropy": 0.6405410547148097, |
| "epoch": 0.802729754322111, |
| "grad_norm": 0.8936473727226257, |
| "learning_rate": 1.3307384341637009e-06, |
| "loss": 0.14, |
| "mean_token_accuracy": 0.9622243496504697, |
| "num_tokens": 793939516.0, |
| "step": 4010 |
| }, |
| { |
| "entropy": 0.6661894949999723, |
| "epoch": 0.8037306642402183, |
| "grad_norm": 0.7600812315940857, |
| "learning_rate": 1.329626334519573e-06, |
| "loss": 0.1397, |
| "mean_token_accuracy": 0.961715197021311, |
| "num_tokens": 794994800.0, |
| "step": 4015 |
| }, |
| { |
| "entropy": 0.6729139336130836, |
| "epoch": 0.8047315741583257, |
| "grad_norm": 0.6852670907974243, |
| "learning_rate": 1.3285142348754448e-06, |
| "loss": 0.1411, |
| "mean_token_accuracy": 0.9618587027896535, |
| "num_tokens": 795942393.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.8047315741583257, |
| "eval_entropy": 0.6385497704881137, |
| "eval_loss": 0.18298886716365814, |
| "eval_mean_token_accuracy": 0.95083493092021, |
| "eval_num_tokens": 795942393.0, |
| "eval_runtime": 7.22, |
| "eval_samples_per_second": 134.764, |
| "eval_steps_per_second": 8.449, |
| "step": 4020 |
| }, |
| { |
| "entropy": 0.6744548082351685, |
| "epoch": 0.8057324840764332, |
| "grad_norm": 1.6530650854110718, |
| "learning_rate": 1.3274021352313167e-06, |
| "loss": 0.1324, |
| "mean_token_accuracy": 0.965755269202319, |
| "num_tokens": 796668510.0, |
| "step": 4025 |
| }, |
| { |
| "entropy": 0.5910497521812266, |
| "epoch": 0.8067333939945405, |
| "grad_norm": 0.9345577359199524, |
| "learning_rate": 1.3262900355871887e-06, |
| "loss": 0.1355, |
| "mean_token_accuracy": 0.9639625316316431, |
| "num_tokens": 797807447.0, |
| "step": 4030 |
| }, |
| { |
| "entropy": 0.6318013099106875, |
| "epoch": 0.8077343039126479, |
| "grad_norm": 0.8773898482322693, |
| "learning_rate": 1.3251779359430604e-06, |
| "loss": 0.1404, |
| "mean_token_accuracy": 0.9624045025218617, |
| "num_tokens": 798900021.0, |
| "step": 4035 |
| }, |
| { |
| "entropy": 0.6586670068177309, |
| "epoch": 0.8087352138307552, |
| "grad_norm": 0.7887452244758606, |
| "learning_rate": 1.3240658362989322e-06, |
| "loss": 0.1415, |
| "mean_token_accuracy": 0.9622185707092286, |
| "num_tokens": 799936750.0, |
| "step": 4040 |
| }, |
| { |
| "entropy": 0.6714841157197953, |
| "epoch": 0.8097361237488626, |
| "grad_norm": 0.6408624649047852, |
| "learning_rate": 1.3229537366548043e-06, |
| "loss": 0.1395, |
| "mean_token_accuracy": 0.9631155973131006, |
| "num_tokens": 800890339.0, |
| "step": 4045 |
| }, |
| { |
| "entropy": 0.671166700666601, |
| "epoch": 0.8107370336669699, |
| "grad_norm": 1.5876646041870117, |
| "learning_rate": 1.3218416370106762e-06, |
| "loss": 0.1345, |
| "mean_token_accuracy": 0.9646015541120009, |
| "num_tokens": 801621520.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.8107370336669699, |
| "eval_entropy": 0.6287256723544636, |
| "eval_loss": 0.18470019102096558, |
| "eval_mean_token_accuracy": 0.950807981803769, |
| "eval_num_tokens": 801621520.0, |
| "eval_runtime": 7.0146, |
| "eval_samples_per_second": 138.711, |
| "eval_steps_per_second": 8.696, |
| "step": 4050 |
| }, |
| { |
| "entropy": 0.5836854978041215, |
| "epoch": 0.8117379435850773, |
| "grad_norm": 0.9563764929771423, |
| "learning_rate": 1.320729537366548e-06, |
| "loss": 0.137, |
| "mean_token_accuracy": 0.9636379382827065, |
| "num_tokens": 802760229.0, |
| "step": 4055 |
| }, |
| { |
| "entropy": 0.6356291532516479, |
| "epoch": 0.8127388535031848, |
| "grad_norm": 0.8574218153953552, |
| "learning_rate": 1.3196174377224199e-06, |
| "loss": 0.1469, |
| "mean_token_accuracy": 0.9603158907456831, |
| "num_tokens": 803856177.0, |
| "step": 4060 |
| }, |
| { |
| "entropy": 0.6495377407832579, |
| "epoch": 0.8137397634212921, |
| "grad_norm": 0.8074198365211487, |
| "learning_rate": 1.3185053380782917e-06, |
| "loss": 0.1386, |
| "mean_token_accuracy": 0.9626891878518191, |
| "num_tokens": 804911386.0, |
| "step": 4065 |
| }, |
| { |
| "entropy": 0.6640802432190288, |
| "epoch": 0.8147406733393995, |
| "grad_norm": 0.7216334342956543, |
| "learning_rate": 1.3173932384341638e-06, |
| "loss": 0.1376, |
| "mean_token_accuracy": 0.9628864521330053, |
| "num_tokens": 805866645.0, |
| "step": 4070 |
| }, |
| { |
| "entropy": 0.6685602345249869, |
| "epoch": 0.8157415832575068, |
| "grad_norm": 1.7459522485733032, |
| "learning_rate": 1.3162811387900354e-06, |
| "loss": 0.1392, |
| "mean_token_accuracy": 0.9641159864989194, |
| "num_tokens": 806595104.0, |
| "step": 4075 |
| }, |
| { |
| "entropy": 0.5927186499942433, |
| "epoch": 0.8167424931756142, |
| "grad_norm": 0.9222381114959717, |
| "learning_rate": 1.3151690391459073e-06, |
| "loss": 0.1358, |
| "mean_token_accuracy": 0.9640862822532654, |
| "num_tokens": 807727469.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.8167424931756142, |
| "eval_entropy": 0.6358805328118996, |
| "eval_loss": 0.18385587632656097, |
| "eval_mean_token_accuracy": 0.9509023574532055, |
| "eval_num_tokens": 807727469.0, |
| "eval_runtime": 7.0308, |
| "eval_samples_per_second": 138.392, |
| "eval_steps_per_second": 8.676, |
| "step": 4080 |
| }, |
| { |
| "entropy": 0.6393450959162279, |
| "epoch": 0.8177434030937215, |
| "grad_norm": 0.84666907787323, |
| "learning_rate": 1.3140569395017794e-06, |
| "loss": 0.1404, |
| "mean_token_accuracy": 0.961673707853664, |
| "num_tokens": 808810685.0, |
| "step": 4085 |
| }, |
| { |
| "entropy": 0.6551467876542698, |
| "epoch": 0.8187443130118289, |
| "grad_norm": 0.8185970783233643, |
| "learning_rate": 1.3129448398576512e-06, |
| "loss": 0.1404, |
| "mean_token_accuracy": 0.9627237601713701, |
| "num_tokens": 809853102.0, |
| "step": 4090 |
| }, |
| { |
| "entropy": 0.6675399541854858, |
| "epoch": 0.8197452229299363, |
| "grad_norm": 0.6775302886962891, |
| "learning_rate": 1.3118327402135229e-06, |
| "loss": 0.1372, |
| "mean_token_accuracy": 0.9637189258228649, |
| "num_tokens": 810804376.0, |
| "step": 4095 |
| }, |
| { |
| "entropy": 0.668658479235389, |
| "epoch": 0.8207461328480437, |
| "grad_norm": 1.633571743965149, |
| "learning_rate": 1.310720640569395e-06, |
| "loss": 0.1355, |
| "mean_token_accuracy": 0.9643102878873998, |
| "num_tokens": 811535129.0, |
| "step": 4100 |
| }, |
| { |
| "entropy": 0.5880933046340943, |
| "epoch": 0.8217470427661511, |
| "grad_norm": 0.9041305780410767, |
| "learning_rate": 1.3096085409252668e-06, |
| "loss": 0.1334, |
| "mean_token_accuracy": 0.9647295447913083, |
| "num_tokens": 812666591.0, |
| "step": 4105 |
| }, |
| { |
| "entropy": 0.6378572691570629, |
| "epoch": 0.8227479526842584, |
| "grad_norm": 0.8185557723045349, |
| "learning_rate": 1.3084964412811389e-06, |
| "loss": 0.1345, |
| "mean_token_accuracy": 0.963734941590916, |
| "num_tokens": 813752886.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.8227479526842584, |
| "eval_entropy": 0.6318896456820066, |
| "eval_loss": 0.18597018718719482, |
| "eval_mean_token_accuracy": 0.950390275384559, |
| "eval_num_tokens": 813752886.0, |
| "eval_runtime": 7.083, |
| "eval_samples_per_second": 137.37, |
| "eval_steps_per_second": 8.612, |
| "step": 4110 |
| }, |
| { |
| "entropy": 0.6565879512916911, |
| "epoch": 0.8237488626023658, |
| "grad_norm": 0.8251848816871643, |
| "learning_rate": 1.3073843416370107e-06, |
| "loss": 0.1373, |
| "mean_token_accuracy": 0.9621736976233396, |
| "num_tokens": 814780578.0, |
| "step": 4115 |
| }, |
| { |
| "entropy": 0.6591654620387337, |
| "epoch": 0.8247497725204732, |
| "grad_norm": 0.680801510810852, |
| "learning_rate": 1.3062722419928824e-06, |
| "loss": 0.134, |
| "mean_token_accuracy": 0.9642984504049474, |
| "num_tokens": 815727309.0, |
| "step": 4120 |
| }, |
| { |
| "entropy": 0.6689842855388468, |
| "epoch": 0.8257506824385805, |
| "grad_norm": 1.6310633420944214, |
| "learning_rate": 1.3051601423487544e-06, |
| "loss": 0.132, |
| "mean_token_accuracy": 0.9654631246219981, |
| "num_tokens": 816454050.0, |
| "step": 4125 |
| }, |
| { |
| "entropy": 0.5842076683586294, |
| "epoch": 0.8267515923566879, |
| "grad_norm": 1.0107406377792358, |
| "learning_rate": 1.3040480427046263e-06, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.965171016888185, |
| "num_tokens": 817616130.0, |
| "step": 4130 |
| }, |
| { |
| "entropy": 0.6371192531152206, |
| "epoch": 0.8277525022747952, |
| "grad_norm": 0.8968419432640076, |
| "learning_rate": 1.3029359430604982e-06, |
| "loss": 0.1369, |
| "mean_token_accuracy": 0.9628690156069669, |
| "num_tokens": 818690928.0, |
| "step": 4135 |
| }, |
| { |
| "entropy": 0.6599432755600322, |
| "epoch": 0.8287534121929027, |
| "grad_norm": 0.9083675146102905, |
| "learning_rate": 1.30182384341637e-06, |
| "loss": 0.1427, |
| "mean_token_accuracy": 0.9619063783775677, |
| "num_tokens": 819719290.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.8287534121929027, |
| "eval_entropy": 0.627330626620621, |
| "eval_loss": 0.18673530220985413, |
| "eval_mean_token_accuracy": 0.9505173554185962, |
| "eval_num_tokens": 819719290.0, |
| "eval_runtime": 7.0694, |
| "eval_samples_per_second": 137.636, |
| "eval_steps_per_second": 8.629, |
| "step": 4140 |
| }, |
| { |
| "entropy": 0.6650434326041829, |
| "epoch": 0.82975432211101, |
| "grad_norm": 0.7410187721252441, |
| "learning_rate": 1.3007117437722419e-06, |
| "loss": 0.1343, |
| "mean_token_accuracy": 0.9638854568654841, |
| "num_tokens": 820658717.0, |
| "step": 4145 |
| }, |
| { |
| "entropy": 0.6653602497144179, |
| "epoch": 0.8307552320291174, |
| "grad_norm": 1.657799243927002, |
| "learning_rate": 1.299599644128114e-06, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9655446870760485, |
| "num_tokens": 821387042.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 0.5887753250924024, |
| "epoch": 0.8317561419472248, |
| "grad_norm": 0.9166984558105469, |
| "learning_rate": 1.2984875444839858e-06, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.9651481601324948, |
| "num_tokens": 822524995.0, |
| "step": 4155 |
| }, |
| { |
| "entropy": 0.6301268073645505, |
| "epoch": 0.8327570518653321, |
| "grad_norm": 0.8409593105316162, |
| "learning_rate": 1.2973754448398574e-06, |
| "loss": 0.1316, |
| "mean_token_accuracy": 0.9642227351665497, |
| "num_tokens": 823595330.0, |
| "step": 4160 |
| }, |
| { |
| "entropy": 0.6482060646468942, |
| "epoch": 0.8337579617834395, |
| "grad_norm": 0.828996479511261, |
| "learning_rate": 1.2962633451957295e-06, |
| "loss": 0.141, |
| "mean_token_accuracy": 0.9620839937166734, |
| "num_tokens": 824625650.0, |
| "step": 4165 |
| }, |
| { |
| "entropy": 0.6599689732898365, |
| "epoch": 0.8347588717015468, |
| "grad_norm": 0.6699210405349731, |
| "learning_rate": 1.2951512455516014e-06, |
| "loss": 0.1399, |
| "mean_token_accuracy": 0.9629984969442541, |
| "num_tokens": 825583528.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 0.8347588717015468, |
| "eval_entropy": 0.629080096229178, |
| "eval_loss": 0.18436747789382935, |
| "eval_mean_token_accuracy": 0.9509243994462685, |
| "eval_num_tokens": 825583528.0, |
| "eval_runtime": 7.0802, |
| "eval_samples_per_second": 137.426, |
| "eval_steps_per_second": 8.616, |
| "step": 4170 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 9992, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 30, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3661070247602422e+19, |
| "train_batch_size": 3, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|