Full initial commit

Browse files

Files changed (10) hide show

config.json +53 -0
generation_config.json +6 -0
optimizer.pt +3 -0
pytorch_model-00001-of-00002.bin +3 -0
pytorch_model-00002-of-00002.bin +3 -0
pytorch_model.bin.index.json +201 -0
rng_state.pth +0 -0
scheduler.pt +0 -0
trainer_state.json +302 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "_name_or_path": "mosaicml/mpt-7b-instruct",
+  "architectures": [
+    "MPTForCausalLM"
+  ],
+  "attn_config": {
+    "alibi": true,
+    "alibi_bias_max": 8,
+    "attn_impl": "torch",
+    "attn_pdrop": 0,
+    "attn_type": "multihead_attention",
+    "attn_uses_sequence_id": false,
+    "clip_qkv": null,
+    "prefix_lm": false,
+    "qk_ln": false,
+    "softmax_scale": null
+  },
+  "auto_map": {
+    "AutoConfig": "mosaicml/mpt-7b-instruct--configuration_mpt.MPTConfig",
+    "AutoModelForCausalLM": "mosaicml/mpt-7b-instruct--modeling_mpt.MPTForCausalLM"
+  },
+  "d_model": 4096,
+  "emb_pdrop": 0,
+  "embedding_fraction": 1.0,
+  "expansion_ratio": 4,
+  "init_config": {
+    "emb_init_std": null,
+    "emb_init_uniform_lim": null,
+    "fan_mode": "fan_in",
+    "init_div_is_residual": true,
+    "init_gain": 0,
+    "init_nonlinearity": "relu",
+    "init_std": 0.02,
+    "name": "kaiming_normal_",
+    "verbose": 0
+  },
+  "init_device": "cpu",
+  "learned_pos_emb": true,
+  "logit_scale": null,
+  "max_seq_len": 2048,
+  "model_type": "mpt",
+  "n_heads": 32,
+  "n_layers": 32,
+  "no_bias": true,
+  "norm_type": "low_precision_layernorm",
+  "resid_pdrop": 0,
+  "tokenizer_name": "EleutherAI/gpt-neox-20b",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.31.0",
+  "use_cache": false,
+  "verbose": 0,
+  "vocab_size": 50432
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": 0,
+  "transformers_version": "4.31.0",
+  "use_cache": false
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6743c11f72935d5f55b467d6c75e5a83a4c049eb646c38e65dfec13daf768ecf
+size 8053427513

pytorch_model-00001-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:569bca6f58c5b8d4d0e506accb0646d98935edf567c1d7dcd0c373539a6e598c
+size 9943042259

pytorch_model-00002-of-00002.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02e994e0821bf6b55cc61e5ceefa09c5d62ba18dc0deb736aecaf9d22d843d56
+size 3355599827

pytorch_model.bin.index.json ADDED Viewed

	@@ -0,0 +1,201 @@

+{
+  "metadata": {
+    "total_size": 13298573312
+  },
+  "weight_map": {
+    "transformer.blocks.0.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.0.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.0.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.0.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.0.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.0.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.1.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.1.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.1.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.1.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.1.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.1.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.10.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.10.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.10.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.10.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.10.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.10.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.11.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.11.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.11.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.11.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.11.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.11.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.12.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.12.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.12.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.12.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.12.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.12.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.13.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.13.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.13.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.13.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.13.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.13.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.14.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.14.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.14.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.14.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.14.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.14.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.15.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.15.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.15.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.15.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.15.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.15.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.16.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.16.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.16.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.16.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.16.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.16.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.17.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.17.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.17.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.17.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.17.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.17.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.18.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.18.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.18.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.18.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.18.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.18.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.19.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.19.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.19.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.19.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.19.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.19.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.2.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.2.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.2.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.2.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.2.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.2.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.20.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.20.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.20.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.20.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.20.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.20.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.21.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.21.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.21.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.21.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.21.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.21.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.22.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.22.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.22.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.22.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.22.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.22.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.23.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.23.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.23.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.23.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.23.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.23.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.24.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.24.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.24.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.24.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.24.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.24.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.25.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.25.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.25.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.25.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.25.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.25.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.26.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.26.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.26.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.26.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.26.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.26.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.27.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.27.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.27.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.27.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.27.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.27.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.28.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.28.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.28.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.28.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.28.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.28.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.29.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.29.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.29.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.29.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.29.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.29.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.3.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.3.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.3.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.3.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.3.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.3.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.30.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.30.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.30.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.30.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.30.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.30.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.31.attn.Wqkv.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.31.attn.out_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.31.ffn.down_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.31.ffn.up_proj.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.31.norm_1.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.31.norm_2.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.blocks.4.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.4.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.4.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.4.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.4.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.4.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.5.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.5.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.5.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.5.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.5.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.5.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.6.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.6.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.6.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.6.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.6.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.6.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.7.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.7.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.7.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.7.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.7.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.7.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.8.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.8.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.8.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.8.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.8.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.8.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.9.attn.Wqkv.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.9.attn.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.9.ffn.down_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.9.ffn.up_proj.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.9.norm_1.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.blocks.9.norm_2.weight": "pytorch_model-00001-of-00002.bin",
+    "transformer.norm_f.weight": "pytorch_model-00002-of-00002.bin",
+    "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
+  }
+}

rng_state.pth ADDED Viewed

Binary file (14.6 kB). View file

scheduler.pt ADDED Viewed

Binary file (627 Bytes). View file

trainer_state.json ADDED Viewed

	@@ -0,0 +1,302 @@

+{
+  "best_metric": 0.18517187237739563,
+  "best_model_checkpoint": "./results/checkpoint-16500",
+  "epoch": 2.9333333333333336,
+  "global_step": 16500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.09,
+      "learning_rate": 5e-05,
+      "loss": 0.3388,
+      "step": 500
+    },
+    {
+      "epoch": 0.18,
+      "learning_rate": 4.9248120300751884e-05,
+      "loss": 0.2803,
+      "step": 1000
+    },
+    {
+      "epoch": 0.27,
+      "learning_rate": 4.849624060150376e-05,
+      "loss": 0.2595,
+      "step": 1500
+    },
+    {
+      "epoch": 0.27,
+      "eval_loss": 0.24845312535762787,
+      "eval_runtime": 152.0435,
+      "eval_samples_per_second": 59.194,
+      "eval_steps_per_second": 0.927,
+      "step": 1500
+    },
+    {
+      "epoch": 0.36,
+      "learning_rate": 4.774436090225564e-05,
+      "loss": 0.2445,
+      "step": 2000
+    },
+    {
+      "epoch": 0.44,
+      "learning_rate": 4.699248120300752e-05,
+      "loss": 0.2357,
+      "step": 2500
+    },
+    {
+      "epoch": 0.53,
+      "learning_rate": 4.62406015037594e-05,
+      "loss": 0.2308,
+      "step": 3000
+    },
+    {
+      "epoch": 0.53,
+      "eval_loss": 0.2250121533870697,
+      "eval_runtime": 151.8556,
+      "eval_samples_per_second": 59.267,
+      "eval_steps_per_second": 0.929,
+      "step": 3000
+    },
+    {
+      "epoch": 0.62,
+      "learning_rate": 4.548872180451128e-05,
+      "loss": 0.2239,
+      "step": 3500
+    },
+    {
+      "epoch": 0.71,
+      "learning_rate": 4.473684210526316e-05,
+      "loss": 0.2204,
+      "step": 4000
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 4.398496240601504e-05,
+      "loss": 0.2152,
+      "step": 4500
+    },
+    {
+      "epoch": 0.8,
+      "eval_loss": 0.21273697912693024,
+      "eval_runtime": 152.3413,
+      "eval_samples_per_second": 59.078,
+      "eval_steps_per_second": 0.926,
+      "step": 4500
+    },
+    {
+      "epoch": 0.89,
+      "learning_rate": 4.323308270676692e-05,
+      "loss": 0.213,
+      "step": 5000
+    },
+    {
+      "epoch": 0.98,
+      "learning_rate": 4.24812030075188e-05,
+      "loss": 0.2085,
+      "step": 5500
+    },
+    {
+      "epoch": 1.07,
+      "learning_rate": 4.172932330827068e-05,
+      "loss": 0.1893,
+      "step": 6000
+    },
+    {
+      "epoch": 1.07,
+      "eval_loss": 0.20545659959316254,
+      "eval_runtime": 151.7962,
+      "eval_samples_per_second": 59.29,
+      "eval_steps_per_second": 0.929,
+      "step": 6000
+    },
+    {
+      "epoch": 1.16,
+      "learning_rate": 4.097744360902256e-05,
+      "loss": 0.1851,
+      "step": 6500
+    },
+    {
+      "epoch": 1.24,
+      "learning_rate": 4.022556390977444e-05,
+      "loss": 0.1827,
+      "step": 7000
+    },
+    {
+      "epoch": 1.33,
+      "learning_rate": 3.9473684210526316e-05,
+      "loss": 0.1823,
+      "step": 7500
+    },
+    {
+      "epoch": 1.33,
+      "eval_loss": 0.20052604377269745,
+      "eval_runtime": 151.8901,
+      "eval_samples_per_second": 59.253,
+      "eval_steps_per_second": 0.928,
+      "step": 7500
+    },
+    {
+      "epoch": 1.42,
+      "learning_rate": 3.87218045112782e-05,
+      "loss": 0.1791,
+      "step": 8000
+    },
+    {
+      "epoch": 1.51,
+      "learning_rate": 3.796992481203008e-05,
+      "loss": 0.1771,
+      "step": 8500
+    },
+    {
+      "epoch": 1.6,
+      "learning_rate": 3.721804511278196e-05,
+      "loss": 0.1759,
+      "step": 9000
+    },
+    {
+      "epoch": 1.6,
+      "eval_loss": 0.19474565982818604,
+      "eval_runtime": 151.9186,
+      "eval_samples_per_second": 59.242,
+      "eval_steps_per_second": 0.928,
+      "step": 9000
+    },
+    {
+      "epoch": 1.69,
+      "learning_rate": 3.6466165413533835e-05,
+      "loss": 0.1761,
+      "step": 9500
+    },
+    {
+      "epoch": 1.78,
+      "learning_rate": 3.571428571428572e-05,
+      "loss": 0.1759,
+      "step": 10000
+    },
+    {
+      "epoch": 1.87,
+      "learning_rate": 3.49624060150376e-05,
+      "loss": 0.1727,
+      "step": 10500
+    },
+    {
+      "epoch": 1.87,
+      "eval_loss": 0.19019705057144165,
+      "eval_runtime": 151.9085,
+      "eval_samples_per_second": 59.246,
+      "eval_steps_per_second": 0.928,
+      "step": 10500
+    },
+    {
+      "epoch": 1.96,
+      "learning_rate": 3.421052631578947e-05,
+      "loss": 0.1724,
+      "step": 11000
+    },
+    {
+      "epoch": 2.04,
+      "learning_rate": 3.3458646616541355e-05,
+      "loss": 0.1592,
+      "step": 11500
+    },
+    {
+      "epoch": 2.13,
+      "learning_rate": 3.2706766917293236e-05,
+      "loss": 0.148,
+      "step": 12000
+    },
+    {
+      "epoch": 2.13,
+      "eval_loss": 0.19001474976539612,
+      "eval_runtime": 152.6517,
+      "eval_samples_per_second": 58.958,
+      "eval_steps_per_second": 0.924,
+      "step": 12000
+    },
+    {
+      "epoch": 2.22,
+      "learning_rate": 3.195488721804512e-05,
+      "loss": 0.1477,
+      "step": 12500
+    },
+    {
+      "epoch": 2.31,
+      "learning_rate": 3.120300751879699e-05,
+      "loss": 0.1469,
+      "step": 13000
+    },
+    {
+      "epoch": 2.4,
+      "learning_rate": 3.0451127819548874e-05,
+      "loss": 0.1488,
+      "step": 13500
+    },
+    {
+      "epoch": 2.4,
+      "eval_loss": 0.18920138478279114,
+      "eval_runtime": 151.8531,
+      "eval_samples_per_second": 59.268,
+      "eval_steps_per_second": 0.929,
+      "step": 13500
+    },
+    {
+      "epoch": 2.49,
+      "learning_rate": 2.9699248120300755e-05,
+      "loss": 0.1486,
+      "step": 14000
+    },
+    {
+      "epoch": 2.58,
+      "learning_rate": 2.8947368421052634e-05,
+      "loss": 0.1471,
+      "step": 14500
+    },
+    {
+      "epoch": 2.67,
+      "learning_rate": 2.8195488721804515e-05,
+      "loss": 0.147,
+      "step": 15000
+    },
+    {
+      "epoch": 2.67,
+      "eval_loss": 0.186552956700325,
+      "eval_runtime": 151.8712,
+      "eval_samples_per_second": 59.261,
+      "eval_steps_per_second": 0.928,
+      "step": 15000
+    },
+    {
+      "epoch": 2.76,
+      "learning_rate": 2.7443609022556393e-05,
+      "loss": 0.147,
+      "step": 15500
+    },
+    {
+      "epoch": 2.84,
+      "learning_rate": 2.6691729323308275e-05,
+      "loss": 0.1461,
+      "step": 16000
+    },
+    {
+      "epoch": 2.93,
+      "learning_rate": 2.5939849624060153e-05,
+      "loss": 0.1453,
+      "step": 16500
+    },
+    {
+      "epoch": 2.93,
+      "eval_loss": 0.18517187237739563,
+      "eval_runtime": 151.8569,
+      "eval_samples_per_second": 59.266,
+      "eval_steps_per_second": 0.929,
+      "step": 16500
+    }
+  ],
+  "max_steps": 33750,
+  "num_train_epochs": 6,
+  "total_flos": 2.612547588980736e+18,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cbbb34eb40c2535f27700f80a4660bd7cf5773069de4ccb5c830b1de913ad87e
+size 3899