|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 504, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05952380952380952, |
|
"grad_norm": 11.633337718230726, |
|
"learning_rate": 2.631578947368421e-07, |
|
"loss": 0.8828, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 7.106262599818549, |
|
"learning_rate": 5.263157894736842e-07, |
|
"loss": 0.8087, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 2.0015322743055655, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.7295, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 1.4753694520182823, |
|
"learning_rate": 1.0526315789473683e-06, |
|
"loss": 0.6712, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 1.110295332689471, |
|
"learning_rate": 1.3157894736842106e-06, |
|
"loss": 0.6325, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 2.62423490064751, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.6019, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 2.320671081832736, |
|
"learning_rate": 1.8421052631578946e-06, |
|
"loss": 0.5851, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 3.572874828764162, |
|
"learning_rate": 1.9996767546702485e-06, |
|
"loss": 0.5748, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 2.387950128798658, |
|
"learning_rate": 1.996043443883064e-06, |
|
"loss": 0.5678, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 2.3954023739686083, |
|
"learning_rate": 1.988392397752233e-06, |
|
"loss": 0.5575, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6547619047619048, |
|
"grad_norm": 2.9223208257498885, |
|
"learning_rate": 1.9767648201496052e-06, |
|
"loss": 0.5551, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 2.3303161266900063, |
|
"learning_rate": 1.961223330122206e-06, |
|
"loss": 0.5468, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7738095238095238, |
|
"grad_norm": 2.5697587476278088, |
|
"learning_rate": 1.941851624664209e-06, |
|
"loss": 0.5452, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 2.7124137926277583, |
|
"learning_rate": 1.9187540279759314e-06, |
|
"loss": 0.5381, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 2.6892276446954377, |
|
"learning_rate": 1.8920549296372686e-06, |
|
"loss": 0.5341, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 2.078964942505403, |
|
"learning_rate": 1.861898114721218e-06, |
|
"loss": 0.5294, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.06584873795509338, |
|
"eval_runtime": 116.4553, |
|
"eval_samples_per_second": 155.45, |
|
"eval_steps_per_second": 0.61, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0119047619047619, |
|
"grad_norm": 1.8862054510010102, |
|
"learning_rate": 1.8284459894551025e-06, |
|
"loss": 0.5242, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 2.1399543444499467, |
|
"learning_rate": 1.7918787065996015e-06, |
|
"loss": 0.506, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.130952380952381, |
|
"grad_norm": 1.9062388412547935, |
|
"learning_rate": 1.7523931952557666e-06, |
|
"loss": 0.5037, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 1.2708808678811239, |
|
"learning_rate": 1.7102021003248955e-06, |
|
"loss": 0.5017, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.3542515524166345, |
|
"learning_rate": 1.6655326373326793e-06, |
|
"loss": 0.5023, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3095238095238095, |
|
"grad_norm": 2.1610871241580822, |
|
"learning_rate": 1.6186253687848507e-06, |
|
"loss": 0.497, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.369047619047619, |
|
"grad_norm": 2.0822374960801397, |
|
"learning_rate": 1.569732908644127e-06, |
|
"loss": 0.4962, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 2.0776279888799416, |
|
"learning_rate": 1.5191185619053519e-06, |
|
"loss": 0.4943, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4880952380952381, |
|
"grad_norm": 1.991564712073659, |
|
"learning_rate": 1.4670549065952552e-06, |
|
"loss": 0.4903, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5476190476190477, |
|
"grad_norm": 2.037009128388502, |
|
"learning_rate": 1.4138223258333096e-06, |
|
"loss": 0.4885, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 1.820207258608856, |
|
"learning_rate": 1.3597074978591206e-06, |
|
"loss": 0.4864, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 2.1241388881877183, |
|
"learning_rate": 1.3050018521581279e-06, |
|
"loss": 0.4871, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7261904761904763, |
|
"grad_norm": 1.78922859395332, |
|
"learning_rate": 1.2499999999999999e-06, |
|
"loss": 0.4864, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 1.7328904839729484, |
|
"learning_rate": 1.1949981478418721e-06, |
|
"loss": 0.4813, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8452380952380953, |
|
"grad_norm": 1.935272666911019, |
|
"learning_rate": 1.1402925021408796e-06, |
|
"loss": 0.4801, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 1.2142950270281057, |
|
"learning_rate": 1.0861776741666901e-06, |
|
"loss": 0.4795, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 1.3120184387970604, |
|
"learning_rate": 1.032945093404745e-06, |
|
"loss": 0.4791, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.061718959361314774, |
|
"eval_runtime": 116.2674, |
|
"eval_samples_per_second": 155.701, |
|
"eval_steps_per_second": 0.611, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.0238095238095237, |
|
"grad_norm": 1.4955203361832747, |
|
"learning_rate": 9.80881438094648e-07, |
|
"loss": 0.4693, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 1.1141424267445637, |
|
"learning_rate": 9.302670913558731e-07, |
|
"loss": 0.4599, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.8809215575567357, |
|
"learning_rate": 8.813746312151494e-07, |
|
"loss": 0.4552, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.2023809523809526, |
|
"grad_norm": 0.819257933682719, |
|
"learning_rate": 8.344673626673205e-07, |
|
"loss": 0.4546, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.261904761904762, |
|
"grad_norm": 0.9199338927704399, |
|
"learning_rate": 7.897978996751046e-07, |
|
"loss": 0.4574, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 0.9400263078511765, |
|
"learning_rate": 7.476068047442332e-07, |
|
"loss": 0.4527, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.8120316882785873, |
|
"learning_rate": 7.081212934003984e-07, |
|
"loss": 0.4517, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4404761904761907, |
|
"grad_norm": 0.926431884810278, |
|
"learning_rate": 6.715540105448972e-07, |
|
"loss": 0.4507, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.8615015270894472, |
|
"learning_rate": 6.381018852787821e-07, |
|
"loss": 0.4505, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.5595238095238093, |
|
"grad_norm": 0.794846867305112, |
|
"learning_rate": 6.079450703627314e-07, |
|
"loss": 0.4519, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.8659294487925853, |
|
"learning_rate": 5.812459720240681e-07, |
|
"loss": 0.4523, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 0.8078214310845254, |
|
"learning_rate": 5.581483753357905e-07, |
|
"loss": 0.4498, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.738095238095238, |
|
"grad_norm": 0.9616399301296082, |
|
"learning_rate": 5.387766698777935e-07, |
|
"loss": 0.451, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.7976190476190474, |
|
"grad_norm": 0.8093607714101151, |
|
"learning_rate": 5.232351798503945e-07, |
|
"loss": 0.4495, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.8508798798797214, |
|
"learning_rate": 5.116076022477671e-07, |
|
"loss": 0.4487, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 0.738511448943807, |
|
"learning_rate": 5.039565561169362e-07, |
|
"loss": 0.4454, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 0.7290876762277495, |
|
"learning_rate": 5.003232453297512e-07, |
|
"loss": 0.4473, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.06018054857850075, |
|
"eval_runtime": 118.2486, |
|
"eval_samples_per_second": 153.093, |
|
"eval_steps_per_second": 0.6, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 504, |
|
"total_flos": 3376037568184320.0, |
|
"train_loss": 0.518157976960379, |
|
"train_runtime": 17131.2553, |
|
"train_samples_per_second": 60.233, |
|
"train_steps_per_second": 0.029 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 504, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3376037568184320.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|