{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.00031130100472399273,
  "eval_steps": 500,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 2.555440902709961,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 3.6397,
      "step": 20
    },
    {
      "epoch": 0.0,
      "grad_norm": 2.218903064727783,
      "learning_rate": 6.666666666666667e-06,
      "loss": 3.6917,
      "step": 40
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.9262466430664062,
      "learning_rate": 1e-05,
      "loss": 3.5828,
      "step": 60
    },
    {
      "epoch": 0.0,
      "grad_norm": 2.782036542892456,
      "learning_rate": 1.3166666666666665e-05,
      "loss": 3.5865,
      "step": 80
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.9482054710388184,
      "learning_rate": 1.65e-05,
      "loss": 3.3337,
      "step": 100
    },
    {
      "epoch": 0.0,
      "grad_norm": 4.047863006591797,
      "learning_rate": 1.9833333333333335e-05,
      "loss": 3.1903,
      "step": 120
    },
    {
      "epoch": 0.0,
      "grad_norm": 3.08722186088562,
      "learning_rate": 2.3166666666666666e-05,
      "loss": 3.5379,
      "step": 140
    },
    {
      "epoch": 0.0,
      "grad_norm": 3.540940046310425,
      "learning_rate": 2.6500000000000004e-05,
      "loss": 3.16,
      "step": 160
    },
    {
      "epoch": 0.0,
      "grad_norm": 5.391817092895508,
      "learning_rate": 2.9833333333333335e-05,
      "loss": 3.2489,
      "step": 180
    },
    {
      "epoch": 0.0,
      "grad_norm": 5.890682220458984,
      "learning_rate": 3.316666666666667e-05,
      "loss": 3.0499,
      "step": 200
    },
    {
      "epoch": 0.0,
      "grad_norm": 6.314597129821777,
      "learning_rate": 3.65e-05,
      "loss": 2.8568,
      "step": 220
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.0859078168869019,
      "learning_rate": 3.983333333333333e-05,
      "loss": 2.8566,
      "step": 240
    },
    {
      "epoch": 0.0,
      "grad_norm": 4.688353538513184,
      "learning_rate": 4.316666666666667e-05,
      "loss": 3.0079,
      "step": 260
    },
    {
      "epoch": 0.0,
      "grad_norm": 4.502331256866455,
      "learning_rate": 4.6500000000000005e-05,
      "loss": 2.6839,
      "step": 280
    },
    {
      "epoch": 0.0,
      "grad_norm": 8.951983451843262,
      "learning_rate": 4.9833333333333336e-05,
      "loss": 2.7932,
      "step": 300
    },
    {
      "epoch": 0.0,
      "grad_norm": 4.788575172424316,
      "learning_rate": 4.9999526661182696e-05,
      "loss": 2.9341,
      "step": 320
    },
    {
      "epoch": 0.0,
      "grad_norm": 7.716049671173096,
      "learning_rate": 4.999800570348766e-05,
      "loss": 2.5987,
      "step": 340
    },
    {
      "epoch": 0.0,
      "grad_norm": 4.9223952293396,
      "learning_rate": 4.9995435879539254e-05,
      "loss": 2.7863,
      "step": 360
    },
    {
      "epoch": 0.0,
      "grad_norm": 7.647037506103516,
      "learning_rate": 4.999181729716214e-05,
      "loss": 2.6197,
      "step": 380
    },
    {
      "epoch": 0.0,
      "grad_norm": 1.073474407196045,
      "learning_rate": 4.998715010818479e-05,
      "loss": 2.6627,
      "step": 400
    }
  ],
  "logging_steps": 20,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 1824387808739328.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}