{ "best_metric": 1.1021808385849, "best_model_checkpoint": "./0.4b_finetuned_results/checkpoint-500", "epoch": 0.7485029940119761, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014970059880239521, "grad_norm": 4.375, "learning_rate": 6.666666666666667e-05, "loss": 3.8882, "step": 10 }, { "epoch": 0.029940119760479042, "grad_norm": 6.0625, "learning_rate": 0.00013333333333333334, "loss": 3.2257, "step": 20 }, { "epoch": 0.04491017964071856, "grad_norm": 6.28125, "learning_rate": 0.0002, "loss": 2.92, "step": 30 }, { "epoch": 0.059880239520958084, "grad_norm": 1.9765625, "learning_rate": 0.00019999938668382333, "loss": 2.3984, "step": 40 }, { "epoch": 0.0748502994011976, "grad_norm": 1.484375, "learning_rate": 0.00019999754674281632, "loss": 2.1626, "step": 50 }, { "epoch": 0.08982035928143713, "grad_norm": 3.375, "learning_rate": 0.0001999944801995484, "loss": 2.0388, "step": 60 }, { "epoch": 0.10479041916167664, "grad_norm": 1.7890625, "learning_rate": 0.0001999901870916347, "loss": 2.0121, "step": 70 }, { "epoch": 0.11976047904191617, "grad_norm": 1.609375, "learning_rate": 0.00019998466747173592, "loss": 1.8579, "step": 80 }, { "epoch": 0.1347305389221557, "grad_norm": 0.81640625, "learning_rate": 0.00019997792140755746, "loss": 1.8254, "step": 90 }, { "epoch": 0.1497005988023952, "grad_norm": 1.515625, "learning_rate": 0.0001999699489818488, "loss": 1.7037, "step": 100 }, { "epoch": 0.16467065868263472, "grad_norm": 0.94140625, "learning_rate": 0.00019996075029240219, "loss": 1.6647, "step": 110 }, { "epoch": 0.17964071856287425, "grad_norm": 0.61328125, "learning_rate": 0.0001999503254520518, "loss": 1.5988, "step": 120 }, { "epoch": 0.19461077844311378, "grad_norm": 0.337890625, "learning_rate": 0.00019993867458867207, "loss": 1.6197, "step": 130 }, { "epoch": 0.20958083832335328, "grad_norm": 0.47265625, "learning_rate": 0.00019992579784517626, "loss": 1.5954, "step": 140 }, { "epoch": 0.2245508982035928, "grad_norm": 0.33203125, "learning_rate": 0.00019991169537951468, "loss": 1.5666, "step": 150 }, { "epoch": 0.23952095808383234, "grad_norm": 0.52734375, "learning_rate": 0.00019989636736467278, "loss": 1.5227, "step": 160 }, { "epoch": 0.25449101796407186, "grad_norm": 0.34375, "learning_rate": 0.00019987981398866887, "loss": 1.5048, "step": 170 }, { "epoch": 0.2694610778443114, "grad_norm": 0.46875, "learning_rate": 0.00019986203545455203, "loss": 1.4755, "step": 180 }, { "epoch": 0.2844311377245509, "grad_norm": 0.51953125, "learning_rate": 0.0001998430319803996, "loss": 1.4505, "step": 190 }, { "epoch": 0.2994011976047904, "grad_norm": 0.38671875, "learning_rate": 0.00019982280379931422, "loss": 1.4295, "step": 200 }, { "epoch": 0.3143712574850299, "grad_norm": 0.34765625, "learning_rate": 0.00019980135115942136, "loss": 1.4683, "step": 210 }, { "epoch": 0.32934131736526945, "grad_norm": 0.306640625, "learning_rate": 0.00019977867432386604, "loss": 1.4427, "step": 220 }, { "epoch": 0.344311377245509, "grad_norm": 0.357421875, "learning_rate": 0.00019975477357080966, "loss": 1.3852, "step": 230 }, { "epoch": 0.3592814371257485, "grad_norm": 0.361328125, "learning_rate": 0.00019972964919342663, "loss": 1.427, "step": 240 }, { "epoch": 0.37425149700598803, "grad_norm": 0.306640625, "learning_rate": 0.00019970330149990062, "loss": 1.3759, "step": 250 }, { "epoch": 0.38922155688622756, "grad_norm": 0.3515625, "learning_rate": 0.00019967573081342103, "loss": 1.3559, 
"step": 260 }, { "epoch": 0.4041916167664671, "grad_norm": 0.28515625, "learning_rate": 0.00019964693747217874, "loss": 1.3715, "step": 270 }, { "epoch": 0.41916167664670656, "grad_norm": 0.30859375, "learning_rate": 0.00019961692182936225, "loss": 1.2932, "step": 280 }, { "epoch": 0.4341317365269461, "grad_norm": 0.306640625, "learning_rate": 0.00019958568425315314, "loss": 1.3086, "step": 290 }, { "epoch": 0.4491017964071856, "grad_norm": 0.291015625, "learning_rate": 0.00019955322512672162, "loss": 1.3091, "step": 300 }, { "epoch": 0.46407185628742514, "grad_norm": 0.248046875, "learning_rate": 0.00019951954484822182, "loss": 1.3196, "step": 310 }, { "epoch": 0.47904191616766467, "grad_norm": 0.267578125, "learning_rate": 0.00019948464383078696, "loss": 1.2944, "step": 320 }, { "epoch": 0.4940119760479042, "grad_norm": 0.375, "learning_rate": 0.00019944852250252418, "loss": 1.3461, "step": 330 }, { "epoch": 0.5089820359281437, "grad_norm": 0.275390625, "learning_rate": 0.00019941118130650942, "loss": 1.3221, "step": 340 }, { "epoch": 0.5239520958083832, "grad_norm": 0.23828125, "learning_rate": 0.00019937262070078183, "loss": 1.3111, "step": 350 }, { "epoch": 0.5389221556886228, "grad_norm": 0.2578125, "learning_rate": 0.0001993328411583383, "loss": 1.3128, "step": 360 }, { "epoch": 0.5538922155688623, "grad_norm": 0.2578125, "learning_rate": 0.00019929184316712758, "loss": 1.2618, "step": 370 }, { "epoch": 0.5688622754491018, "grad_norm": 0.29296875, "learning_rate": 0.00019924962723004425, "loss": 1.2893, "step": 380 }, { "epoch": 0.5838323353293413, "grad_norm": 0.30859375, "learning_rate": 0.0001992061938649227, "loss": 1.2727, "step": 390 }, { "epoch": 0.5988023952095808, "grad_norm": 0.3359375, "learning_rate": 0.0001991615436045306, "loss": 1.293, "step": 400 }, { "epoch": 0.6137724550898204, "grad_norm": 0.314453125, "learning_rate": 0.0001991156769965625, "loss": 1.2692, "step": 410 }, { "epoch": 0.6287425149700598, "grad_norm": 0.326171875, "learning_rate": 0.00019906859460363307, "loss": 1.2588, "step": 420 }, { "epoch": 0.6437125748502994, "grad_norm": 0.26953125, "learning_rate": 0.00019902029700327018, "loss": 1.2576, "step": 430 }, { "epoch": 0.6586826347305389, "grad_norm": 0.2890625, "learning_rate": 0.0001989707847879078, "loss": 1.2595, "step": 440 }, { "epoch": 0.6736526946107785, "grad_norm": 0.337890625, "learning_rate": 0.00019892005856487878, "loss": 1.2331, "step": 450 }, { "epoch": 0.688622754491018, "grad_norm": 0.28515625, "learning_rate": 0.0001988681189564074, "loss": 1.2161, "step": 460 }, { "epoch": 0.7035928143712575, "grad_norm": 0.25390625, "learning_rate": 0.0001988149665996017, "loss": 1.2675, "step": 470 }, { "epoch": 0.718562874251497, "grad_norm": 0.26953125, "learning_rate": 0.00019876060214644566, "loss": 1.269, "step": 480 }, { "epoch": 0.7335329341317365, "grad_norm": 0.40625, "learning_rate": 0.00019870502626379127, "loss": 1.2342, "step": 490 }, { "epoch": 0.7485029940119761, "grad_norm": 0.298828125, "learning_rate": 0.00019864823963335033, "loss": 1.2351, "step": 500 }, { "epoch": 0.7485029940119761, "eval_loss": 1.1021808385849, "eval_runtime": 109.4058, "eval_samples_per_second": 9.14, "eval_steps_per_second": 1.143, "step": 500 } ], "logging_steps": 10, "max_steps": 9000, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": 
{} } }, "total_flos": 3.479612424192e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }