{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9384615384615387,
  "eval_steps": 500,
  "global_step": 96,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08205128205128205,
      "grad_norm": 8.876763343811035,
      "learning_rate": 0.00019583333333333334,
      "loss": 9.5477,
      "step": 2
    },
    {
      "epoch": 0.1641025641025641,
      "grad_norm": 14.638631820678711,
      "learning_rate": 0.00019166666666666667,
      "loss": 9.1151,
      "step": 4
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 13.725289344787598,
      "learning_rate": 0.0001875,
      "loss": 8.4628,
      "step": 6
    },
    {
      "epoch": 0.3282051282051282,
      "grad_norm": 18.06307601928711,
      "learning_rate": 0.00018333333333333334,
      "loss": 7.935,
      "step": 8
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 17.5152587890625,
      "learning_rate": 0.0001791666666666667,
      "loss": 7.6244,
      "step": 10
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 10.584898948669434,
      "learning_rate": 0.000175,
      "loss": 7.3313,
      "step": 12
    },
    {
      "epoch": 0.5743589743589743,
      "grad_norm": 14.065106391906738,
      "learning_rate": 0.00017083333333333333,
      "loss": 7.1131,
      "step": 14
    },
    {
      "epoch": 0.6564102564102564,
      "grad_norm": 9.009326934814453,
      "learning_rate": 0.0001666666666666667,
      "loss": 7.1038,
      "step": 16
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 6.479495525360107,
      "learning_rate": 0.00016250000000000002,
      "loss": 6.9424,
      "step": 18
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 5.650105953216553,
      "learning_rate": 0.00015833333333333332,
      "loss": 6.9944,
      "step": 20
    },
    {
      "epoch": 0.9025641025641026,
      "grad_norm": 8.514379501342773,
      "learning_rate": 0.00015416666666666668,
      "loss": 6.8316,
      "step": 22
    },
    {
      "epoch": 0.9846153846153847,
      "grad_norm": 8.760089874267578,
      "learning_rate": 0.00015000000000000001,
      "loss": 6.9918,
      "step": 24
    },
    {
      "epoch": 1.0666666666666667,
      "grad_norm": 6.9720869064331055,
      "learning_rate": 0.00014583333333333335,
      "loss": 6.7947,
      "step": 26
    },
    {
      "epoch": 1.1487179487179486,
      "grad_norm": 7.793176174163818,
      "learning_rate": 0.00014166666666666668,
      "loss": 6.8732,
      "step": 28
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 4.848390102386475,
      "learning_rate": 0.0001375,
      "loss": 6.8486,
      "step": 30
    },
    {
      "epoch": 1.3128205128205128,
      "grad_norm": 4.971543788909912,
      "learning_rate": 0.00013333333333333334,
      "loss": 6.7634,
      "step": 32
    },
    {
      "epoch": 1.3948717948717948,
      "grad_norm": 8.622535705566406,
      "learning_rate": 0.00012916666666666667,
      "loss": 6.927,
      "step": 34
    },
    {
      "epoch": 1.476923076923077,
      "grad_norm": 5.5455780029296875,
      "learning_rate": 0.000125,
      "loss": 6.8502,
      "step": 36
    },
    {
      "epoch": 1.558974358974359,
      "grad_norm": 8.031872749328613,
      "learning_rate": 0.00012083333333333333,
      "loss": 6.6609,
      "step": 38
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 4.131848335266113,
      "learning_rate": 0.00011666666666666668,
      "loss": 6.7818,
      "step": 40
    },
    {
      "epoch": 1.7230769230769232,
      "grad_norm": 4.26460599899292,
      "learning_rate": 0.00011250000000000001,
      "loss": 6.8535,
      "step": 42
    },
    {
      "epoch": 1.8051282051282052,
      "grad_norm": 6.2603583335876465,
      "learning_rate": 0.00010833333333333333,
      "loss": 6.7652,
      "step": 44
    },
    {
      "epoch": 1.8871794871794871,
      "grad_norm": 6.9018964767456055,
      "learning_rate": 0.00010416666666666667,
      "loss": 6.8772,
      "step": 46
    },
    {
      "epoch": 1.9692307692307693,
      "grad_norm": 5.591540813446045,
      "learning_rate": 0.0001,
      "loss": 6.8769,
      "step": 48
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 8.05819320678711,
      "learning_rate": 9.583333333333334e-05,
      "loss": 6.6865,
      "step": 50
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 5.486148834228516,
      "learning_rate": 9.166666666666667e-05,
      "loss": 6.8715,
      "step": 52
    },
    {
      "epoch": 2.2153846153846155,
      "grad_norm": 7.930551528930664,
      "learning_rate": 8.75e-05,
      "loss": 6.8346,
      "step": 54
    },
    {
      "epoch": 2.2974358974358973,
      "grad_norm": 4.514185905456543,
      "learning_rate": 8.333333333333334e-05,
      "loss": 6.7591,
      "step": 56
    },
    {
      "epoch": 2.3794871794871795,
      "grad_norm": 6.322687149047852,
      "learning_rate": 7.916666666666666e-05,
      "loss": 6.7531,
      "step": 58
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 4.071617603302002,
      "learning_rate": 7.500000000000001e-05,
      "loss": 6.8126,
      "step": 60
    },
    {
      "epoch": 2.5435897435897434,
      "grad_norm": 6.385562419891357,
      "learning_rate": 7.083333333333334e-05,
      "loss": 6.6341,
      "step": 62
    },
    {
      "epoch": 2.6256410256410256,
      "grad_norm": 5.388179779052734,
      "learning_rate": 6.666666666666667e-05,
      "loss": 6.7812,
      "step": 64
    },
    {
      "epoch": 2.707692307692308,
      "grad_norm": 3.850942850112915,
      "learning_rate": 6.25e-05,
      "loss": 6.7739,
      "step": 66
    },
    {
      "epoch": 2.7897435897435896,
      "grad_norm": 6.017411231994629,
      "learning_rate": 5.833333333333334e-05,
      "loss": 6.6202,
      "step": 68
    },
    {
      "epoch": 2.871794871794872,
      "grad_norm": 4.091090679168701,
      "learning_rate": 5.4166666666666664e-05,
      "loss": 6.7726,
      "step": 70
    },
    {
      "epoch": 2.953846153846154,
      "grad_norm": 5.818283557891846,
      "learning_rate": 5e-05,
      "loss": 6.7549,
      "step": 72
    },
    {
      "epoch": 3.0358974358974358,
      "grad_norm": 6.205195903778076,
      "learning_rate": 4.5833333333333334e-05,
      "loss": 6.7979,
      "step": 74
    },
    {
      "epoch": 3.117948717948718,
      "grad_norm": 3.819091320037842,
      "learning_rate": 4.166666666666667e-05,
      "loss": 6.7187,
      "step": 76
    },
    {
      "epoch": 3.2,
      "grad_norm": 3.546342611312866,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 6.8922,
      "step": 78
    },
    {
      "epoch": 3.282051282051282,
      "grad_norm": 3.590898275375366,
      "learning_rate": 3.541666666666667e-05,
      "loss": 6.6631,
      "step": 80
    },
    {
      "epoch": 3.364102564102564,
      "grad_norm": 5.197542667388916,
      "learning_rate": 3.125e-05,
      "loss": 6.8214,
      "step": 82
    },
    {
      "epoch": 3.4461538461538463,
      "grad_norm": 8.915237426757812,
      "learning_rate": 2.7083333333333332e-05,
      "loss": 6.6971,
      "step": 84
    },
    {
      "epoch": 3.528205128205128,
      "grad_norm": 7.203217029571533,
      "learning_rate": 2.2916666666666667e-05,
      "loss": 6.6676,
      "step": 86
    },
    {
      "epoch": 3.6102564102564103,
      "grad_norm": 4.247269630432129,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 6.7764,
      "step": 88
    },
    {
      "epoch": 3.6923076923076925,
      "grad_norm": 4.6499176025390625,
      "learning_rate": 1.4583333333333335e-05,
      "loss": 6.8644,
      "step": 90
    },
    {
      "epoch": 3.7743589743589743,
      "grad_norm": 5.579225063323975,
      "learning_rate": 1.0416666666666668e-05,
      "loss": 6.6546,
      "step": 92
    },
    {
      "epoch": 3.8564102564102565,
      "grad_norm": 3.8157854080200195,
      "learning_rate": 6.25e-06,
      "loss": 6.6051,
      "step": 94
    },
    {
      "epoch": 3.9384615384615387,
      "grad_norm": 4.765449523925781,
      "learning_rate": 2.0833333333333334e-06,
      "loss": 6.6474,
      "step": 96
    },
    {
      "epoch": 3.9384615384615387,
      "step": 96,
      "total_flos": 963923732562624.0,
      "train_loss": 6.994301348924637,
      "train_runtime": 2104.8364,
      "train_samples_per_second": 0.741,
      "train_steps_per_second": 0.046
    }
  ],
  "logging_steps": 2,
  "max_steps": 96,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 963923732562624.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}