{ "best_metric": 1.1021808385849, "best_model_checkpoint": "./0.4b_finetuned_results/checkpoint-500", "epoch": 0.7485029940119761, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014970059880239521, "grad_norm": 4.375, "learning_rate": 6.666666666666667e-05, "loss": 3.8882, "step": 10 }, { "epoch": 0.029940119760479042, "grad_norm": 6.0625, "learning_rate": 0.00013333333333333334, "loss": 3.2257, "step": 20 }, { "epoch": 0.04491017964071856, "grad_norm": 6.28125, "learning_rate": 0.0002, "loss": 2.92, "step": 30 }, { "epoch": 0.059880239520958084, "grad_norm": 1.9765625, "learning_rate": 0.00019999938668382333, "loss": 2.3984, "step": 40 }, { "epoch": 0.0748502994011976, "grad_norm": 1.484375, "learning_rate": 0.00019999754674281632, "loss": 2.1626, "step": 50 }, { "epoch": 0.08982035928143713, "grad_norm": 3.375, "learning_rate": 0.0001999944801995484, "loss": 2.0388, "step": 60 }, { "epoch": 0.10479041916167664, "grad_norm": 1.7890625, "learning_rate": 0.0001999901870916347, "loss": 2.0121, "step": 70 }, { "epoch": 0.11976047904191617, "grad_norm": 1.609375, "learning_rate": 0.00019998466747173592, "loss": 1.8579, "step": 80 }, { "epoch": 0.1347305389221557, "grad_norm": 0.81640625, "learning_rate": 0.00019997792140755746, "loss": 1.8254, "step": 90 }, { "epoch": 0.1497005988023952, "grad_norm": 1.515625, "learning_rate": 0.0001999699489818488, "loss": 1.7037, "step": 100 }, { "epoch": 0.16467065868263472, "grad_norm": 0.94140625, "learning_rate": 0.00019996075029240219, "loss": 1.6647, "step": 110 }, { "epoch": 0.17964071856287425, "grad_norm": 0.61328125, "learning_rate": 0.0001999503254520518, "loss": 1.5988, "step": 120 }, { "epoch": 0.19461077844311378, "grad_norm": 0.337890625, "learning_rate": 0.00019993867458867207, "loss": 1.6197, "step": 130 }, { "epoch": 0.20958083832335328, "grad_norm": 0.47265625, "learning_rate": 0.00019992579784517626, "loss": 1.5954, "step": 140 }, { "epoch": 0.2245508982035928, "grad_norm": 0.33203125, "learning_rate": 0.00019991169537951468, "loss": 1.5666, "step": 150 }, { "epoch": 0.23952095808383234, "grad_norm": 0.52734375, "learning_rate": 0.00019989636736467278, "loss": 1.5227, "step": 160 }, { "epoch": 0.25449101796407186, "grad_norm": 0.34375, "learning_rate": 0.00019987981398866887, "loss": 1.5048, "step": 170 }, { "epoch": 0.2694610778443114, "grad_norm": 0.46875, "learning_rate": 0.00019986203545455203, "loss": 1.4755, "step": 180 }, { "epoch": 0.2844311377245509, "grad_norm": 0.51953125, "learning_rate": 0.0001998430319803996, "loss": 1.4505, "step": 190 }, { "epoch": 0.2994011976047904, "grad_norm": 0.38671875, "learning_rate": 0.00019982280379931422, "loss": 1.4295, "step": 200 }, { "epoch": 0.3143712574850299, "grad_norm": 0.34765625, "learning_rate": 0.00019980135115942136, "loss": 1.4683, "step": 210 }, { "epoch": 0.32934131736526945, "grad_norm": 0.306640625, "learning_rate": 0.00019977867432386604, "loss": 1.4427, "step": 220 }, { "epoch": 0.344311377245509, "grad_norm": 0.357421875, "learning_rate": 0.00019975477357080966, "loss": 1.3852, "step": 230 }, { "epoch": 0.3592814371257485, "grad_norm": 0.361328125, "learning_rate": 0.00019972964919342663, "loss": 1.427, "step": 240 }, { "epoch": 0.37425149700598803, "grad_norm": 0.306640625, "learning_rate": 0.00019970330149990062, "loss": 1.3759, "step": 250 }, { "epoch": 0.38922155688622756, "grad_norm": 0.3515625, "learning_rate": 0.00019967573081342103, "loss": 1.3559, 
"step": 260 }, { "epoch": 0.4041916167664671, "grad_norm": 0.28515625, "learning_rate": 0.00019964693747217874, "loss": 1.3715, "step": 270 }, { "epoch": 0.41916167664670656, "grad_norm": 0.30859375, "learning_rate": 0.00019961692182936225, "loss": 1.2932, "step": 280 }, { "epoch": 0.4341317365269461, "grad_norm": 0.306640625, "learning_rate": 0.00019958568425315314, "loss": 1.3086, "step": 290 }, { "epoch": 0.4491017964071856, "grad_norm": 0.291015625, "learning_rate": 0.00019955322512672162, "loss": 1.3091, "step": 300 }, { "epoch": 0.46407185628742514, "grad_norm": 0.248046875, "learning_rate": 0.00019951954484822182, "loss": 1.3196, "step": 310 }, { "epoch": 0.47904191616766467, "grad_norm": 0.267578125, "learning_rate": 0.00019948464383078696, "loss": 1.2944, "step": 320 }, { "epoch": 0.4940119760479042, "grad_norm": 0.375, "learning_rate": 0.00019944852250252418, "loss": 1.3461, "step": 330 }, { "epoch": 0.5089820359281437, "grad_norm": 0.275390625, "learning_rate": 0.00019941118130650942, "loss": 1.3221, "step": 340 }, { "epoch": 0.5239520958083832, "grad_norm": 0.23828125, "learning_rate": 0.00019937262070078183, "loss": 1.3111, "step": 350 }, { "epoch": 0.5389221556886228, "grad_norm": 0.2578125, "learning_rate": 0.0001993328411583383, "loss": 1.3128, "step": 360 }, { "epoch": 0.5538922155688623, "grad_norm": 0.2578125, "learning_rate": 0.00019929184316712758, "loss": 1.2618, "step": 370 }, { "epoch": 0.5688622754491018, "grad_norm": 0.29296875, "learning_rate": 0.00019924962723004425, "loss": 1.2893, "step": 380 }, { "epoch": 0.5838323353293413, "grad_norm": 0.30859375, "learning_rate": 0.0001992061938649227, "loss": 1.2727, "step": 390 }, { "epoch": 0.5988023952095808, "grad_norm": 0.3359375, "learning_rate": 0.0001991615436045306, "loss": 1.293, "step": 400 }, { "epoch": 0.6137724550898204, "grad_norm": 0.314453125, "learning_rate": 0.0001991156769965625, "loss": 1.2692, "step": 410 }, { "epoch": 0.6287425149700598, "grad_norm": 0.326171875, "learning_rate": 0.00019906859460363307, "loss": 1.2588, "step": 420 }, { "epoch": 0.6437125748502994, "grad_norm": 0.26953125, "learning_rate": 0.00019902029700327018, "loss": 1.2576, "step": 430 }, { "epoch": 0.6586826347305389, "grad_norm": 0.2890625, "learning_rate": 0.0001989707847879078, "loss": 1.2595, "step": 440 }, { "epoch": 0.6736526946107785, "grad_norm": 0.337890625, "learning_rate": 0.00019892005856487878, "loss": 1.2331, "step": 450 }, { "epoch": 0.688622754491018, "grad_norm": 0.28515625, "learning_rate": 0.0001988681189564074, "loss": 1.2161, "step": 460 }, { "epoch": 0.7035928143712575, "grad_norm": 0.25390625, "learning_rate": 0.0001988149665996017, "loss": 1.2675, "step": 470 }, { "epoch": 0.718562874251497, "grad_norm": 0.26953125, "learning_rate": 0.00019876060214644566, "loss": 1.269, "step": 480 }, { "epoch": 0.7335329341317365, "grad_norm": 0.40625, "learning_rate": 0.00019870502626379127, "loss": 1.2342, "step": 490 }, { "epoch": 0.7485029940119761, "grad_norm": 0.298828125, "learning_rate": 0.00019864823963335033, "loss": 1.2351, "step": 500 }, { "epoch": 0.7485029940119761, "eval_loss": 1.1021808385849, "eval_runtime": 109.4058, "eval_samples_per_second": 9.14, "eval_steps_per_second": 1.143, "step": 500 } ], "logging_steps": 10, "max_steps": 9000, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": 
{} } }, "total_flos": 3.479612424192e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }