|
{ |
|
"best_metric": 1.1021808385849, |
|
"best_model_checkpoint": "./0.4b_finetuned_results/checkpoint-500", |
|
"epoch": 0.7485029940119761, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.014970059880239521, |
|
"grad_norm": 4.375, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 3.8882, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.029940119760479042, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 3.2257, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04491017964071856, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 0.0002, |
|
"loss": 2.92, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.059880239520958084, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 0.00019999938668382333, |
|
"loss": 2.3984, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0748502994011976, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.00019999754674281632, |
|
"loss": 2.1626, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08982035928143713, |
|
"grad_norm": 3.375, |
|
"learning_rate": 0.0001999944801995484, |
|
"loss": 2.0388, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10479041916167664, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 0.0001999901870916347, |
|
"loss": 2.0121, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11976047904191617, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00019998466747173592, |
|
"loss": 1.8579, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1347305389221557, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00019997792140755746, |
|
"loss": 1.8254, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1497005988023952, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.0001999699489818488, |
|
"loss": 1.7037, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16467065868263472, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00019996075029240219, |
|
"loss": 1.6647, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.17964071856287425, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0001999503254520518, |
|
"loss": 1.5988, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19461077844311378, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00019993867458867207, |
|
"loss": 1.6197, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.20958083832335328, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00019992579784517626, |
|
"loss": 1.5954, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2245508982035928, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019991169537951468, |
|
"loss": 1.5666, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23952095808383234, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00019989636736467278, |
|
"loss": 1.5227, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.25449101796407186, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019987981398866887, |
|
"loss": 1.5048, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2694610778443114, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00019986203545455203, |
|
"loss": 1.4755, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2844311377245509, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001998430319803996, |
|
"loss": 1.4505, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2994011976047904, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00019982280379931422, |
|
"loss": 1.4295, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3143712574850299, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019980135115942136, |
|
"loss": 1.4683, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.32934131736526945, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019977867432386604, |
|
"loss": 1.4427, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.344311377245509, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00019975477357080966, |
|
"loss": 1.3852, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3592814371257485, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019972964919342663, |
|
"loss": 1.427, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.37425149700598803, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019970330149990062, |
|
"loss": 1.3759, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.38922155688622756, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019967573081342103, |
|
"loss": 1.3559, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4041916167664671, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019964693747217874, |
|
"loss": 1.3715, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.41916167664670656, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019961692182936225, |
|
"loss": 1.2932, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4341317365269461, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019958568425315314, |
|
"loss": 1.3086, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4491017964071856, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019955322512672162, |
|
"loss": 1.3091, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.46407185628742514, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.00019951954484822182, |
|
"loss": 1.3196, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.47904191616766467, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00019948464383078696, |
|
"loss": 1.2944, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4940119760479042, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00019944852250252418, |
|
"loss": 1.3461, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5089820359281437, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00019941118130650942, |
|
"loss": 1.3221, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5239520958083832, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.00019937262070078183, |
|
"loss": 1.3111, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5389221556886228, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0001993328411583383, |
|
"loss": 1.3128, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5538922155688623, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00019929184316712758, |
|
"loss": 1.2618, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5688622754491018, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019924962723004425, |
|
"loss": 1.2893, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5838323353293413, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001992061938649227, |
|
"loss": 1.2727, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5988023952095808, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0001991615436045306, |
|
"loss": 1.293, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6137724550898204, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001991156769965625, |
|
"loss": 1.2692, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6287425149700598, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019906859460363307, |
|
"loss": 1.2588, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6437125748502994, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00019902029700327018, |
|
"loss": 1.2576, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6586826347305389, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001989707847879078, |
|
"loss": 1.2595, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6736526946107785, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00019892005856487878, |
|
"loss": 1.2331, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.688622754491018, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001988681189564074, |
|
"loss": 1.2161, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7035928143712575, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0001988149665996017, |
|
"loss": 1.2675, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.718562874251497, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00019876060214644566, |
|
"loss": 1.269, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7335329341317365, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00019870502626379127, |
|
"loss": 1.2342, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7485029940119761, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019864823963335033, |
|
"loss": 1.2351, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7485029940119761, |
|
"eval_loss": 1.1021808385849, |
|
"eval_runtime": 109.4058, |
|
"eval_samples_per_second": 9.14, |
|
"eval_steps_per_second": 1.143, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 9000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 14, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.479612424192e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|