{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991673605328892, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011101859561476548, "grad_norm": 1.140625, "learning_rate": 2.222222222222222e-06, "loss": 1.0928, "mean_token_accuracy": 0.717196784696785, "step": 5 }, { "epoch": 0.022203719122953096, "grad_norm": 1.1015625, "learning_rate": 4.444444444444444e-06, "loss": 1.1024, "mean_token_accuracy": 0.7158291392437733, "step": 10 }, { "epoch": 0.03330557868442964, "grad_norm": 1.03125, "learning_rate": 6.666666666666667e-06, "loss": 1.0782, "mean_token_accuracy": 0.7194658119658119, "step": 15 }, { "epoch": 0.04440743824590619, "grad_norm": 0.96484375, "learning_rate": 8.888888888888888e-06, "loss": 1.0777, "mean_token_accuracy": 0.7197588522588524, "step": 20 }, { "epoch": 0.055509297807382736, "grad_norm": 0.7734375, "learning_rate": 1.1111111111111113e-05, "loss": 1.0614, "mean_token_accuracy": 0.7216748066748067, "step": 25 }, { "epoch": 0.06661115736885928, "grad_norm": 0.6796875, "learning_rate": 1.3333333333333333e-05, "loss": 1.059, "mean_token_accuracy": 0.719067969067969, "step": 30 }, { "epoch": 0.07771301693033583, "grad_norm": 0.6171875, "learning_rate": 1.555555555555556e-05, "loss": 1.0328, "mean_token_accuracy": 0.7237962962962966, "step": 35 }, { "epoch": 0.08881487649181238, "grad_norm": 0.55859375, "learning_rate": 1.7777777777777777e-05, "loss": 1.014, "mean_token_accuracy": 0.7259676434676435, "step": 40 }, { "epoch": 0.09991673605328892, "grad_norm": 0.46484375, "learning_rate": 2e-05, "loss": 1.0066, "mean_token_accuracy": 0.725960520960521, "step": 45 }, { "epoch": 0.11101859561476547, "grad_norm": 0.4140625, "learning_rate": 1.9992479525042305e-05, "loss": 0.9537, "mean_token_accuracy": 0.7386558811558809, "step": 50 }, { "epoch": 0.11101859561476547, "eval_loss": 0.9796226024627686, "eval_mean_token_accuracy": 0.7271781595311009, "eval_runtime": 9.7194, "eval_samples_per_second": 13.272, "eval_steps_per_second": 3.395, "step": 50 }, { "epoch": 0.12212045517624202, "grad_norm": 0.359375, "learning_rate": 1.996992941167792e-05, "loss": 0.9602, "mean_token_accuracy": 0.7342826617826618, "step": 55 }, { "epoch": 0.13322231473771856, "grad_norm": 0.326171875, "learning_rate": 1.9932383577419432e-05, "loss": 0.916, "mean_token_accuracy": 0.74437307661244, "step": 60 }, { "epoch": 0.1443241742991951, "grad_norm": 0.3515625, "learning_rate": 1.9879898494768093e-05, "loss": 0.909, "mean_token_accuracy": 0.7461548636548637, "step": 65 }, { "epoch": 0.15542603386067166, "grad_norm": 0.3125, "learning_rate": 1.9812553106273848e-05, "loss": 0.9079, "mean_token_accuracy": 0.74519129019129, "step": 70 }, { "epoch": 0.16652789342214822, "grad_norm": 0.333984375, "learning_rate": 1.973044870579824e-05, "loss": 0.9071, "mean_token_accuracy": 0.7446357346357346, "step": 75 }, { "epoch": 0.17762975298362477, "grad_norm": 0.30078125, "learning_rate": 1.9633708786158803e-05, "loss": 0.8953, "mean_token_accuracy": 0.746184371184371, "step": 80 }, { "epoch": 0.18873161254510132, "grad_norm": 0.275390625, "learning_rate": 1.9522478853384154e-05, "loss": 0.8582, "mean_token_accuracy": 0.7562728937728938, "step": 85 }, { "epoch": 0.19983347210657784, "grad_norm": 0.296875, "learning_rate": 1.9396926207859085e-05, "loss": 0.8719, "mean_token_accuracy": 0.7528663003663003, "step": 90 }, { "epoch": 0.2109353316680544, "grad_norm": 0.271484375, "learning_rate": 1.9257239692688907e-05, "loss": 0.8735, "mean_token_accuracy": 0.7512274542096703, "step": 95 }, { "epoch": 0.22203719122953094, "grad_norm": 0.2734375, "learning_rate": 1.9103629409661468e-05, "loss": 0.8656, "mean_token_accuracy": 0.7536752136752136, "step": 100 }, { "epoch": 0.22203719122953094, "eval_loss": 0.9016062617301941, "eval_mean_token_accuracy": 0.7405376758317936, "eval_runtime": 9.7188, "eval_samples_per_second": 13.273, "eval_steps_per_second": 3.395, "step": 100 }, { "epoch": 0.2331390507910075, "grad_norm": 0.2734375, "learning_rate": 1.8936326403234125e-05, "loss": 0.8687, "mean_token_accuracy": 0.7531064306064306, "step": 105 }, { "epoch": 0.24424091035248405, "grad_norm": 0.267578125, "learning_rate": 1.8755582313020912e-05, "loss": 0.8548, "mean_token_accuracy": 0.75515422674015, "step": 110 }, { "epoch": 0.2553427699139606, "grad_norm": 0.26953125, "learning_rate": 1.8561668995302668e-05, "loss": 0.8543, "mean_token_accuracy": 0.7542501017501018, "step": 115 }, { "epoch": 0.2664446294754371, "grad_norm": 0.26953125, "learning_rate": 1.8354878114129368e-05, "loss": 0.8373, "mean_token_accuracy": 0.7589896214896215, "step": 120 }, { "epoch": 0.2775464890369137, "grad_norm": 0.265625, "learning_rate": 1.8135520702629677e-05, "loss": 0.8489, "mean_token_accuracy": 0.7568091168091166, "step": 125 }, { "epoch": 0.2886483485983902, "grad_norm": 0.267578125, "learning_rate": 1.7903926695187595e-05, "loss": 0.8557, "mean_token_accuracy": 0.7542979242979242, "step": 130 }, { "epoch": 0.2997502081598668, "grad_norm": 0.26171875, "learning_rate": 1.766044443118978e-05, "loss": 0.8329, "mean_token_accuracy": 0.7595054945054944, "step": 135 }, { "epoch": 0.31085206772134333, "grad_norm": 0.2490234375, "learning_rate": 1.740544013109005e-05, "loss": 0.874, "mean_token_accuracy": 0.748472730972731, "step": 140 }, { "epoch": 0.32195392728281985, "grad_norm": 0.255859375, "learning_rate": 1.7139297345578992e-05, "loss": 0.8295, "mean_token_accuracy": 0.7612077737077737, "step": 145 }, { "epoch": 0.33305578684429643, "grad_norm": 0.251953125, "learning_rate": 1.686241637868734e-05, "loss": 0.8092, "mean_token_accuracy": 0.7669424094424093, "step": 150 }, { "epoch": 0.33305578684429643, "eval_loss": 0.8762778639793396, "eval_mean_token_accuracy": 0.7449022654905009, "eval_runtime": 9.7018, "eval_samples_per_second": 13.296, "eval_steps_per_second": 3.401, "step": 150 }, { "epoch": 0.34415764640577295, "grad_norm": 0.251953125, "learning_rate": 1.657521368569064e-05, "loss": 0.8355, "mean_token_accuracy": 0.7576159951159951, "step": 155 }, { "epoch": 0.35525950596724953, "grad_norm": 0.2578125, "learning_rate": 1.627812124672099e-05, "loss": 0.8398, "mean_token_accuracy": 0.7564778061483249, "step": 160 }, { "epoch": 0.36636136552872606, "grad_norm": 0.251953125, "learning_rate": 1.5971585917027864e-05, "loss": 0.8576, "mean_token_accuracy": 0.7524267399267397, "step": 165 }, { "epoch": 0.37746322509020264, "grad_norm": 0.2578125, "learning_rate": 1.5656068754865388e-05, "loss": 0.8254, "mean_token_accuracy": 0.760271918721826, "step": 170 }, { "epoch": 0.38856508465167916, "grad_norm": 0.255859375, "learning_rate": 1.5332044328016916e-05, "loss": 0.8502, "mean_token_accuracy": 0.7546072446072445, "step": 175 }, { "epoch": 0.3996669442131557, "grad_norm": 0.25390625, "learning_rate": 1.5000000000000002e-05, "loss": 0.8418, "mean_token_accuracy": 0.7569129019129018, "step": 180 }, { "epoch": 0.41076880377463226, "grad_norm": 0.255859375, "learning_rate": 1.4660435197025391e-05, "loss": 0.8346, "mean_token_accuracy": 0.7576149776149775, "step": 185 }, { "epoch": 0.4218706633361088, "grad_norm": 0.24609375, "learning_rate": 1.4313860656812537e-05, "loss": 0.8478, "mean_token_accuracy": 0.7535816035816034, "step": 190 }, { "epoch": 0.43297252289758537, "grad_norm": 0.259765625, "learning_rate": 1.396079766039157e-05, "loss": 0.8107, "mean_token_accuracy": 0.7654395604395604, "step": 195 }, { "epoch": 0.4440743824590619, "grad_norm": 0.251953125, "learning_rate": 1.3601777248047105e-05, "loss": 0.8124, "mean_token_accuracy": 0.763194953194953, "step": 200 }, { "epoch": 0.4440743824590619, "eval_loss": 0.8637130260467529, "eval_mean_token_accuracy": 0.7472996502408268, "eval_runtime": 9.6971, "eval_samples_per_second": 13.303, "eval_steps_per_second": 3.403, "step": 200 }, { "epoch": 0.4551762420205384, "grad_norm": 0.25390625, "learning_rate": 1.3237339420583213e-05, "loss": 0.8375, "mean_token_accuracy": 0.757132682132682, "step": 205 }, { "epoch": 0.466278101582015, "grad_norm": 0.251953125, "learning_rate": 1.2868032327110904e-05, "loss": 0.8577, "mean_token_accuracy": 0.750907610907611, "step": 210 }, { "epoch": 0.4773799611434915, "grad_norm": 0.2431640625, "learning_rate": 1.2494411440579814e-05, "loss": 0.82, "mean_token_accuracy": 0.7625814000814001, "step": 215 }, { "epoch": 0.4884818207049681, "grad_norm": 0.25390625, "learning_rate": 1.211703872229411e-05, "loss": 0.8099, "mean_token_accuracy": 0.7646743996743997, "step": 220 }, { "epoch": 0.4995836802664446, "grad_norm": 0.2490234375, "learning_rate": 1.1736481776669307e-05, "loss": 0.8195, "mean_token_accuracy": 0.7627518372346456, "step": 225 }, { "epoch": 0.5106855398279212, "grad_norm": 0.2490234375, "learning_rate": 1.1353312997501313e-05, "loss": 0.8248, "mean_token_accuracy": 0.7611874236874236, "step": 230 }, { "epoch": 0.5217873993893978, "grad_norm": 0.23828125, "learning_rate": 1.0968108707031792e-05, "loss": 0.8174, "mean_token_accuracy": 0.7629131054131055, "step": 235 }, { "epoch": 0.5328892589508742, "grad_norm": 0.244140625, "learning_rate": 1.0581448289104759e-05, "loss": 0.8105, "mean_token_accuracy": 0.7650734632542214, "step": 240 }, { "epoch": 0.5439911185123508, "grad_norm": 0.244140625, "learning_rate": 1.0193913317718245e-05, "loss": 0.81, "mean_token_accuracy": 0.7640618640618639, "step": 245 }, { "epoch": 0.5550929780738274, "grad_norm": 0.2421875, "learning_rate": 9.806086682281759e-06, "loss": 0.8205, "mean_token_accuracy": 0.7610907610907611, "step": 250 }, { "epoch": 0.5550929780738274, "eval_loss": 0.8572535514831543, "eval_mean_token_accuracy": 0.748435551376728, "eval_runtime": 9.7051, "eval_samples_per_second": 13.292, "eval_steps_per_second": 3.4, "step": 250 }, { "epoch": 0.5661948376353039, "grad_norm": 0.255859375, "learning_rate": 9.418551710895243e-06, "loss": 0.8235, "mean_token_accuracy": 0.7599084249084251, "step": 255 }, { "epoch": 0.5772966971967805, "grad_norm": 0.251953125, "learning_rate": 9.03189129296821e-06, "loss": 0.8286, "mean_token_accuracy": 0.7585877085877086, "step": 260 }, { "epoch": 0.588398556758257, "grad_norm": 0.259765625, "learning_rate": 8.646687002498692e-06, "loss": 0.8256, "mean_token_accuracy": 0.7592775742775744, "step": 265 }, { "epoch": 0.5995004163197336, "grad_norm": 0.251953125, "learning_rate": 8.263518223330698e-06, "loss": 0.8217, "mean_token_accuracy": 0.7614479039479042, "step": 270 }, { "epoch": 0.6106022758812101, "grad_norm": 0.2431640625, "learning_rate": 7.882961277705897e-06, "loss": 0.8354, "mean_token_accuracy": 0.7570135306548853, "step": 275 }, { "epoch": 0.6217041354426867, "grad_norm": 0.2490234375, "learning_rate": 7.505588559420188e-06, "loss": 0.8222, "mean_token_accuracy": 0.7614336589336589, "step": 280 }, { "epoch": 0.6328059950041632, "grad_norm": 0.244140625, "learning_rate": 7.131967672889101e-06, "loss": 0.8579, "mean_token_accuracy": 0.7512146494805758, "step": 285 }, { "epoch": 0.6439078545656397, "grad_norm": 0.25390625, "learning_rate": 6.762660579416791e-06, "loss": 0.8253, "mean_token_accuracy": 0.7587220187220187, "step": 290 }, { "epoch": 0.6550097141271163, "grad_norm": 0.2412109375, "learning_rate": 6.3982227519528986e-06, "loss": 0.8294, "mean_token_accuracy": 0.7600905599174735, "step": 295 }, { "epoch": 0.6661115736885929, "grad_norm": 0.2470703125, "learning_rate": 6.039202339608432e-06, "loss": 0.8129, "mean_token_accuracy": 0.763727106227106, "step": 300 }, { "epoch": 0.6661115736885929, "eval_loss": 0.854372501373291, "eval_mean_token_accuracy": 0.7488219841161018, "eval_runtime": 9.716, "eval_samples_per_second": 13.277, "eval_steps_per_second": 3.396, "step": 300 }, { "epoch": 0.6772134332500694, "grad_norm": 0.2431640625, "learning_rate": 5.686139343187468e-06, "loss": 0.8089, "mean_token_accuracy": 0.7635978835978834, "step": 305 }, { "epoch": 0.6883152928115459, "grad_norm": 0.2421875, "learning_rate": 5.339564802974615e-06, "loss": 0.8162, "mean_token_accuracy": 0.7625620675620676, "step": 310 }, { "epoch": 0.6994171523730225, "grad_norm": 0.25, "learning_rate": 5.000000000000003e-06, "loss": 0.818, "mean_token_accuracy": 0.7617165242165239, "step": 315 }, { "epoch": 0.7105190119344991, "grad_norm": 0.251953125, "learning_rate": 4.66795567198309e-06, "loss": 0.8164, "mean_token_accuracy": 0.7618395600557701, "step": 320 }, { "epoch": 0.7216208714959755, "grad_norm": 0.248046875, "learning_rate": 4.343931245134616e-06, "loss": 0.8391, "mean_token_accuracy": 0.7558740333740334, "step": 325 }, { "epoch": 0.7327227310574521, "grad_norm": 0.251953125, "learning_rate": 4.028414082972141e-06, "loss": 0.8288, "mean_token_accuracy": 0.7581603581603582, "step": 330 }, { "epoch": 0.7438245906189287, "grad_norm": 0.2470703125, "learning_rate": 3.7218787532790167e-06, "loss": 0.8266, "mean_token_accuracy": 0.758972323972324, "step": 335 }, { "epoch": 0.7549264501804053, "grad_norm": 0.23828125, "learning_rate": 3.424786314309365e-06, "loss": 0.8169, "mean_token_accuracy": 0.7615511468202194, "step": 340 }, { "epoch": 0.7660283097418817, "grad_norm": 0.248046875, "learning_rate": 3.1375836213126653e-06, "loss": 0.8207, "mean_token_accuracy": 0.7615923890923892, "step": 345 }, { "epoch": 0.7771301693033583, "grad_norm": 0.23828125, "learning_rate": 2.8607026544210115e-06, "loss": 0.8251, "mean_token_accuracy": 0.7595614570614571, "step": 350 }, { "epoch": 0.7771301693033583, "eval_loss": 0.8534859418869019, "eval_mean_token_accuracy": 0.7491665197547551, "eval_runtime": 9.7117, "eval_samples_per_second": 13.283, "eval_steps_per_second": 3.398, "step": 350 }, { "epoch": 0.7882320288648349, "grad_norm": 0.2314453125, "learning_rate": 2.594559868909956e-06, "loss": 0.789, "mean_token_accuracy": 0.7690638990638987, "step": 355 }, { "epoch": 0.7993338884263114, "grad_norm": 0.2490234375, "learning_rate": 2.339555568810221e-06, "loss": 0.8195, "mean_token_accuracy": 0.76007733007733, "step": 360 }, { "epoch": 0.810435747987788, "grad_norm": 0.2578125, "learning_rate": 2.0960733048124082e-06, "loss": 0.8358, "mean_token_accuracy": 0.7567429792429792, "step": 365 }, { "epoch": 0.8215376075492645, "grad_norm": 0.234375, "learning_rate": 1.8644792973703252e-06, "loss": 0.797, "mean_token_accuracy": 0.7672700447700449, "step": 370 }, { "epoch": 0.832639467110741, "grad_norm": 0.2333984375, "learning_rate": 1.6451218858706374e-06, "loss": 0.8269, "mean_token_accuracy": 0.7588715913715915, "step": 375 }, { "epoch": 0.8437413266722176, "grad_norm": 0.244140625, "learning_rate": 1.4383310046973365e-06, "loss": 0.8243, "mean_token_accuracy": 0.7594324261790015, "step": 380 }, { "epoch": 0.8548431862336942, "grad_norm": 0.2412109375, "learning_rate": 1.2444176869790925e-06, "loss": 0.8229, "mean_token_accuracy": 0.7609747659747659, "step": 385 }, { "epoch": 0.8659450457951707, "grad_norm": 0.24609375, "learning_rate": 1.0636735967658785e-06, "loss": 0.804, "mean_token_accuracy": 0.7651831501831503, "step": 390 }, { "epoch": 0.8770469053566472, "grad_norm": 0.2470703125, "learning_rate": 8.963705903385344e-07, "loss": 0.8512, "mean_token_accuracy": 0.7523127798127797, "step": 395 }, { "epoch": 0.8881487649181238, "grad_norm": 0.259765625, "learning_rate": 7.427603073110967e-07, "loss": 0.8437, "mean_token_accuracy": 0.7555453805453806, "step": 400 }, { "epoch": 0.8881487649181238, "eval_loss": 0.8532679080963135, "eval_mean_token_accuracy": 0.748890434184552, "eval_runtime": 9.7147, "eval_samples_per_second": 13.279, "eval_steps_per_second": 3.397, "step": 400 }, { "epoch": 0.8992506244796004, "grad_norm": 0.2490234375, "learning_rate": 6.030737921409169e-07, "loss": 0.8155, "mean_token_accuracy": 0.7616107041107041, "step": 405 }, { "epoch": 0.9103524840410768, "grad_norm": 0.25390625, "learning_rate": 4.775211466158469e-07, "loss": 0.8045, "mean_token_accuracy": 0.7646845746845747, "step": 410 }, { "epoch": 0.9214543436025534, "grad_norm": 0.2412109375, "learning_rate": 3.662912138411967e-07, "loss": 0.848, "mean_token_accuracy": 0.7535592185592186, "step": 415 }, { "epoch": 0.93255620316403, "grad_norm": 0.2412109375, "learning_rate": 2.6955129420176193e-07, "loss": 0.796, "mean_token_accuracy": 0.7675030525030523, "step": 420 }, { "epoch": 0.9436580627255066, "grad_norm": 0.2451171875, "learning_rate": 1.874468937261531e-07, "loss": 0.8322, "mean_token_accuracy": 0.7581685062193277, "step": 425 }, { "epoch": 0.954759922286983, "grad_norm": 0.265625, "learning_rate": 1.201015052319099e-07, "loss": 0.8023, "mean_token_accuracy": 0.7657336182336185, "step": 430 }, { "epoch": 0.9658617818484596, "grad_norm": 0.23828125, "learning_rate": 6.761642258056977e-08, "loss": 0.8094, "mean_token_accuracy": 0.7642338217338217, "step": 435 }, { "epoch": 0.9769636414099362, "grad_norm": 0.2392578125, "learning_rate": 3.0070588322079765e-08, "loss": 0.8206, "mean_token_accuracy": 0.7607519332519332, "step": 440 }, { "epoch": 0.9880655009714127, "grad_norm": 0.2490234375, "learning_rate": 7.520474957699586e-09, "loss": 0.8151, "mean_token_accuracy": 0.7614092389092388, "step": 445 }, { "epoch": 0.9991673605328892, "grad_norm": 0.23828125, "learning_rate": 0.0, "loss": 0.7954, "mean_token_accuracy": 0.7673087098087098, "step": 450 }, { "epoch": 0.9991673605328892, "eval_loss": 0.8533338904380798, "eval_mean_token_accuracy": 0.748884884179002, "eval_runtime": 9.7042, "eval_samples_per_second": 13.293, "eval_steps_per_second": 3.401, "step": 450 }, { "epoch": 0.9991673605328892, "step": 450, "total_flos": 6.955833048956928e+17, "train_loss": 0.8593901687198215, "train_runtime": 5693.7016, "train_samples_per_second": 3.797, "train_steps_per_second": 0.079 } ], "logging_steps": 5, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.955833048956928e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }