{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 612, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006535947712418301, "grad_norm": 11.154335021972656, "learning_rate": 8.064516129032259e-08, "loss": 1.1172, "step": 1 }, { "epoch": 0.06535947712418301, "grad_norm": 5.521430969238281, "learning_rate": 8.064516129032258e-07, "loss": 1.0538, "step": 10 }, { "epoch": 0.13071895424836602, "grad_norm": 2.362562417984009, "learning_rate": 1.6129032258064516e-06, "loss": 0.9643, "step": 20 }, { "epoch": 0.19607843137254902, "grad_norm": 2.3052921295166016, "learning_rate": 2.4193548387096776e-06, "loss": 0.9044, "step": 30 }, { "epoch": 0.26143790849673204, "grad_norm": 1.4959567785263062, "learning_rate": 3.225806451612903e-06, "loss": 0.8643, "step": 40 }, { "epoch": 0.32679738562091504, "grad_norm": 1.5943924188613892, "learning_rate": 4.032258064516129e-06, "loss": 0.8507, "step": 50 }, { "epoch": 0.39215686274509803, "grad_norm": 1.6096584796905518, "learning_rate": 4.838709677419355e-06, "loss": 0.8438, "step": 60 }, { "epoch": 0.45751633986928103, "grad_norm": 1.4947034120559692, "learning_rate": 4.997390310845578e-06, "loss": 0.8257, "step": 70 }, { "epoch": 0.5228758169934641, "grad_norm": 1.7357966899871826, "learning_rate": 4.986797785768296e-06, "loss": 0.832, "step": 80 }, { "epoch": 0.5882352941176471, "grad_norm": 2.4221644401550293, "learning_rate": 4.968093843200407e-06, "loss": 0.8168, "step": 90 }, { "epoch": 0.6535947712418301, "grad_norm": 1.916853427886963, "learning_rate": 4.9413394915149094e-06, "loss": 0.8077, "step": 100 }, { "epoch": 0.7189542483660131, "grad_norm": 1.7931920289993286, "learning_rate": 4.9066219978460485e-06, "loss": 0.7937, "step": 110 }, { "epoch": 0.7843137254901961, "grad_norm": 1.5507404804229736, "learning_rate": 4.864054603442063e-06, "loss": 0.784, "step": 120 }, { "epoch": 0.8496732026143791, "grad_norm": 1.6783676147460938, "learning_rate": 4.813776154295767e-06, "loss": 0.7874, "step": 130 }, { "epoch": 0.9150326797385621, "grad_norm": 1.4907172918319702, "learning_rate": 4.755950648257789e-06, "loss": 0.7858, "step": 140 }, { "epoch": 0.9803921568627451, "grad_norm": 1.4448139667510986, "learning_rate": 4.690766700109659e-06, "loss": 0.7849, "step": 150 }, { "epoch": 1.0457516339869282, "grad_norm": 1.9426017999649048, "learning_rate": 4.618436926341607e-06, "loss": 0.6916, "step": 160 }, { "epoch": 1.1111111111111112, "grad_norm": 1.858111023902893, "learning_rate": 4.5391972516417545e-06, "loss": 0.6377, "step": 170 }, { "epoch": 1.1764705882352942, "grad_norm": 1.7554138898849487, "learning_rate": 4.453306139358828e-06, "loss": 0.6431, "step": 180 }, { "epoch": 1.2418300653594772, "grad_norm": 1.6084789037704468, "learning_rate": 4.36104374844843e-06, "loss": 0.6474, "step": 190 }, { "epoch": 1.3071895424836601, "grad_norm": 1.7224164009094238, "learning_rate": 4.262711019652764e-06, "loss": 0.6372, "step": 200 }, { "epoch": 1.3725490196078431, "grad_norm": 1.5711984634399414, "learning_rate": 4.15862869389448e-06, "loss": 0.6379, "step": 210 }, { "epoch": 1.4379084967320261, "grad_norm": 1.7631185054779053, "learning_rate": 4.049136266086453e-06, "loss": 0.6302, "step": 220 }, { "epoch": 1.5032679738562091, "grad_norm": 1.8376095294952393, "learning_rate": 3.934590877769944e-06, "loss": 0.6378, "step": 230 }, { "epoch": 1.5686274509803921, "grad_norm": 2.0489087104797363, "learning_rate": 3.815366152193122e-06, "loss": 0.6164, "step": 240 }, { "epoch": 1.6339869281045751, "grad_norm": 1.8818341493606567, "learning_rate": 3.6918509756296876e-06, "loss": 0.6284, "step": 250 }, { "epoch": 1.6993464052287581, "grad_norm": 1.636940598487854, "learning_rate": 3.564448228912682e-06, "loss": 0.6223, "step": 260 }, { "epoch": 1.7647058823529411, "grad_norm": 1.699742078781128, "learning_rate": 3.4335734733209457e-06, "loss": 0.6212, "step": 270 }, { "epoch": 1.8300653594771243, "grad_norm": 1.6848982572555542, "learning_rate": 3.299653595104603e-06, "loss": 0.6241, "step": 280 }, { "epoch": 1.8954248366013071, "grad_norm": 1.8798364400863647, "learning_rate": 3.1631254130708446e-06, "loss": 0.6149, "step": 290 }, { "epoch": 1.9607843137254903, "grad_norm": 2.14373517036438, "learning_rate": 3.0244342537717735e-06, "loss": 0.6124, "step": 300 }, { "epoch": 2.026143790849673, "grad_norm": 2.7039153575897217, "learning_rate": 2.8840324989417488e-06, "loss": 0.5466, "step": 310 }, { "epoch": 2.0915032679738563, "grad_norm": 2.615293025970459, "learning_rate": 2.742378109922204e-06, "loss": 0.4731, "step": 320 }, { "epoch": 2.156862745098039, "grad_norm": 2.0649566650390625, "learning_rate": 2.599933133886934e-06, "loss": 0.4673, "step": 330 }, { "epoch": 2.2222222222222223, "grad_norm": 1.7854645252227783, "learning_rate": 2.457162196740252e-06, "loss": 0.4639, "step": 340 }, { "epoch": 2.287581699346405, "grad_norm": 1.954106330871582, "learning_rate": 2.31453098760387e-06, "loss": 0.4732, "step": 350 }, { "epoch": 2.3529411764705883, "grad_norm": 1.7365140914916992, "learning_rate": 2.1725047398357677e-06, "loss": 0.468, "step": 360 }, { "epoch": 2.418300653594771, "grad_norm": 1.9597340822219849, "learning_rate": 2.031546713535688e-06, "loss": 0.4646, "step": 370 }, { "epoch": 2.4836601307189543, "grad_norm": 1.8259657621383667, "learning_rate": 1.8921166844869762e-06, "loss": 0.4584, "step": 380 }, { "epoch": 2.549019607843137, "grad_norm": 1.9516103267669678, "learning_rate": 1.7546694444635394e-06, "loss": 0.4644, "step": 390 }, { "epoch": 2.6143790849673203, "grad_norm": 1.826661229133606, "learning_rate": 1.6196533177936132e-06, "loss": 0.4674, "step": 400 }, { "epoch": 2.6797385620915035, "grad_norm": 1.6825226545333862, "learning_rate": 1.487508699018987e-06, "loss": 0.4614, "step": 410 }, { "epoch": 2.7450980392156863, "grad_norm": 1.7974435091018677, "learning_rate": 1.358666616419544e-06, "loss": 0.4676, "step": 420 }, { "epoch": 2.810457516339869, "grad_norm": 1.6989048719406128, "learning_rate": 1.2335473260886046e-06, "loss": 0.4496, "step": 430 }, { "epoch": 2.8758169934640523, "grad_norm": 1.7996526956558228, "learning_rate": 1.1125589411448996e-06, "loss": 0.4597, "step": 440 }, { "epoch": 2.9411764705882355, "grad_norm": 1.746968388557434, "learning_rate": 9.960961005524033e-07, "loss": 0.4532, "step": 450 }, { "epoch": 3.0065359477124183, "grad_norm": 6.076014995574951, "learning_rate": 8.845386818900647e-07, "loss": 0.4454, "step": 460 }, { "epoch": 3.0718954248366015, "grad_norm": 2.6995885372161865, "learning_rate": 7.782505622700964e-07, "loss": 0.3719, "step": 470 }, { "epoch": 3.1372549019607843, "grad_norm": 2.025956869125366, "learning_rate": 6.775784314464717e-07, "loss": 0.3699, "step": 480 }, { "epoch": 3.2026143790849675, "grad_norm": 1.8959600925445557, "learning_rate": 5.828506609850054e-07, "loss": 0.3585, "step": 490 }, { "epoch": 3.2679738562091503, "grad_norm": 1.9729666709899902, "learning_rate": 4.943762331835622e-07, "loss": 0.3579, "step": 500 }, { "epoch": 3.3333333333333335, "grad_norm": 1.8463507890701294, "learning_rate": 4.1244373323601874e-07, "loss": 0.3572, "step": 510 }, { "epoch": 3.3986928104575163, "grad_norm": 1.870890498161316, "learning_rate": 3.3732040792734734e-07, "loss": 0.3609, "step": 520 }, { "epoch": 3.4640522875816995, "grad_norm": 1.8788135051727295, "learning_rate": 2.6925129393015196e-07, "loss": 0.3621, "step": 530 }, { "epoch": 3.5294117647058822, "grad_norm": 1.8110865354537964, "learning_rate": 2.0845841854597092e-07, "loss": 0.3544, "step": 540 }, { "epoch": 3.5947712418300655, "grad_norm": 1.8861949443817139, "learning_rate": 1.5514007549836979e-07, "loss": 0.3617, "step": 550 }, { "epoch": 3.6601307189542482, "grad_norm": 1.814979910850525, "learning_rate": 1.0947017814003258e-07, "loss": 0.3664, "step": 560 }, { "epoch": 3.7254901960784315, "grad_norm": 1.8255079984664917, "learning_rate": 7.159769218354873e-08, "loss": 0.3603, "step": 570 }, { "epoch": 3.7908496732026142, "grad_norm": 1.795508623123169, "learning_rate": 4.164614980622678e-08, "loss": 0.3604, "step": 580 }, { "epoch": 3.8562091503267975, "grad_norm": 1.8044642210006714, "learning_rate": 1.9713246713805588e-08, "loss": 0.3551, "step": 590 }, { "epoch": 3.9215686274509802, "grad_norm": 1.8935906887054443, "learning_rate": 5.87052347736844e-09, "loss": 0.3599, "step": 600 }, { "epoch": 3.9869281045751634, "grad_norm": 1.862278938293457, "learning_rate": 1.6313218287128396e-10, "loss": 0.3593, "step": 610 }, { "epoch": 4.0, "step": 612, "total_flos": 1.3695396597968404e+19, "train_loss": 0.5748976978406407, "train_runtime": 13335.1785, "train_samples_per_second": 11.743, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 612, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3695396597968404e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }