diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7f7ad23765a709637c43e20c5e71464010e888ff --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 97.96, + "train_loss": 0.302445507645607, + "train_runtime": 8265.7464, + "train_samples": 98, + "train_samples_per_second": 1.161, + "train_steps_per_second": 0.073 +} \ No newline at end of file diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-100/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-100/generation_config.json b/checkpoint-100/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-100/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..52c801a2f2ecae27572b00c1a0ba0b9451c2fa19 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b9322b7c791be0283cae33d01cb2e6c40786a9c9fab7fc421715ba39faa314 +size 14681892 diff --git a/checkpoint-100/pytorch_model.bin b/checkpoint-100/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..918f41cbd30359828fd3c6f019f56566e69ef98b --- /dev/null +++ b/checkpoint-100/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119f3aee6155af1456cc129b0eab064a91fd3a95f864e8b1a4985d7e10381988 +size 7341306 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fbcc351fdb5edd176fbbfa420eeee3e70e240743 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:723f496b9d8d8e776f11531f4652ca1ce47b825b86325d6f5aba8841ca36f1a0 +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..97f63f3eb812c004e226c4d5a316e628e46c3532 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97f84b2e9cf03e34106548e8fd72d9181e088fcbe9b5747b6e8466de9610724 +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f5d4dbed2ebfe7bb2ce69aaa8b495dd232943ba --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,76 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.3265306122449, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 1.1757481562734592e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ 
b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-200/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-200/generation_config.json b/checkpoint-200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-200/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4cb54f42ee02783f24d1c226ea5b5f7745b56db --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:933b7b82708ba6a23d949d7b05fcb8644b9ab8b06ecf625f35c30aeba85b3ba2 +size 14681892 diff --git a/checkpoint-200/pytorch_model.bin b/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8e4d6030a564ce573337f3b67a7184e2a88e2234 --- /dev/null +++ b/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e939cf8e3ee1c58646595ea0e7748202c1e1b85f82aeb536a388bbe8d36e86 +size 7341306 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cf9dd992ca86cb2bb62e9edb968c887e9aff1e2 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51523eedac643c13a3a71297ac9e347331249d1d4cc19f9738a182bae3585fb2 +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..03859582101c93c48f703cbc6d8bdff86b097783 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db1c4819d8e7a76f34cf5f8f4aa0bf9497992cd0862dbd9ba3fc68b9886b79e +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a8095cc1f70332f738abddb0364d128f780b03d1 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,136 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 32.6530612244898, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, 
+ { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 2.3514963125469184e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-300/config.json b/checkpoint-300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-300/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-300/generation_config.json b/checkpoint-300/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-300/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..10f90948789c8412689667bdc8a8e42676881c62 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb80ab0b61a192373221d205400431f1f9db5591d3be1fcdb9051924f1b410d2 +size 14681892 diff --git a/checkpoint-300/pytorch_model.bin b/checkpoint-300/pytorch_model.bin new file mode 100644 index 
0000000000000000000000000000000000000000..7c2c568a282353a4b55bed9c7e8ba3d33487a168 --- /dev/null +++ b/checkpoint-300/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:187b159480029f605a8ec08a6da076afe43110d3c1ae18d10931f2ac9e5793ec +size 7341306 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e479fc067e718f8fbaaa35204bb72b4bac22d994 --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5035452976c183913e118a486015c4dbd9cf61159f30c79ac9dd02dbf2cd81c +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..de11f4c03d3a770ff4ef03641588dc58053cc791 --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5ff897392aa57ce97759b435acfdb4ee39aef21d4a4a68095c3294c513f6c0 +size 1064 diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-300/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-300/tokenizer.model b/checkpoint-300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..408c74767e379b67512325d3df703334e4ba1ed3 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 48.97959183673469, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + 
"step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 3.5272444688203776e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-400/config.json b/checkpoint-400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-400/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + 
"AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-400/generation_config.json b/checkpoint-400/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-400/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1a3ae2ea3b3ca7d394452bf6c9cd94e5019038d --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ddda63cbe968668b459a73f0a54c34fc36c007f9f202063794ded2a8814a37a +size 14681892 diff --git a/checkpoint-400/pytorch_model.bin b/checkpoint-400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c3d90644c07522d549839852fdb549785208bc3 --- /dev/null +++ b/checkpoint-400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b954c8f23337c53ad1c86bafb2969338878db3b96c2bc2459aa04e1198a2141 +size 7341306 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bae703a4e27ab853ddff95c141d5555cb75c5980 --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11204a688e287bc0c7409fba921f7fd490e9471d91d738932d045851e4742a4e +size 14244 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..04caa6d4a5322a24b77c5b3ecdd6355359355de7 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32c17fb8a573adc159285286f456bfb53c7e2d80664d0c2cce541b6013ed8d7 +size 1064 diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-400/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5292f7ce7abbead9d2162808fe7a4a51f7868e79 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,256 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 65.3061224489796, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 
0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + "step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 4.702992625093837e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-500/config.json b/checkpoint-500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-500/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git 
a/checkpoint-500/generation_config.json b/checkpoint-500/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-500/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..19707d809180dc84de01e068b7fbfc2e17a66941 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9df4c877e409ae83e7bc7c0f1205d623699a931f44d97cbd852d2946c9fa1c96 +size 14681892 diff --git a/checkpoint-500/pytorch_model.bin b/checkpoint-500/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..41de8dd2b81817bbcfa5cb6dd53c67fbee19e67a --- /dev/null +++ b/checkpoint-500/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a87e680f1db9957f77578eb4f8c6df8112d5951619472ae6cfe33f88f3f54e +size 7341306 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..60907f63b4ea28b9ff4c68ac70fb085bd735b02f --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88ac4017435c2ca3872f675a493a2f3116de05fe3fa16f5cc26289716e59698 +size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6c0f3762c3d1136503933403160db438fadd448 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f977c9b1c32e778511ae551c40f7b1714188f32f52b3ae781ebb01b99519c875 +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-500/tokenizer.model b/checkpoint-500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d72bf5958b9e074f210d5d2739afd384f952dae5 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,316 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 81.63265306122449, + "global_step": 500, + 
"is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + "step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 
0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + }, + { + "epoch": 66.94, + "learning_rate": 0.0031666666666666666, + "loss": 0.0306, + "step": 410 + }, + { + "epoch": 68.57, + "learning_rate": 0.003, + "loss": 0.0296, + "step": 420 + }, + { + "epoch": 70.2, + "learning_rate": 0.002833333333333333, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 71.84, + "learning_rate": 0.0026666666666666666, + "loss": 0.0302, + "step": 440 + }, + { + "epoch": 73.47, + "learning_rate": 0.0025, + "loss": 0.0288, + "step": 450 + }, + { + "epoch": 75.1, + "learning_rate": 0.0023333333333333335, + "loss": 0.0292, + "step": 460 + }, + { + "epoch": 76.73, + "learning_rate": 0.002166666666666667, + "loss": 0.0285, + "step": 470 + }, + { + "epoch": 78.37, + "learning_rate": 0.002, + "loss": 0.0309, + "step": 480 + }, + { + "epoch": 80.0, + "learning_rate": 0.0018333333333333333, + "loss": 0.0291, + "step": 490 + }, + { + "epoch": 81.63, + "learning_rate": 0.0016666666666666666, + "loss": 0.0305, + "step": 500 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 5.878740781367296e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-600/config.json b/checkpoint-600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-600/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + 
"transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-600/generation_config.json b/checkpoint-600/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-600/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..49165c57d87ebcc5be39a09ad3101aa06f0ce48d --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f846354f6111553252e1e024ead83ce86f2e7013a3f4bf820d2e4ed5899bf33 +size 14681892 diff --git a/checkpoint-600/pytorch_model.bin b/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d97c2a152fe6ae9c04970d5f0c379431d0d1b60f --- /dev/null +++ b/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb3d3d7c639eb28900ada66408548977ec5f467d2f47ea105727c6bb7593924 +size 7341306 diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6f478d94147d001e3ab74e0b16fc8ef0c8eda51 --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06dc51ff93f048a9d0e62293091fe86d17b057af111c26d07c68c9032c09764 +size 14244 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce070b66204b5892a9e615c42a9311b8b3e8d8c2 --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:374a1275027169bbb158e812d9992a46eddd14d03a3c85a854057bdc507c957e +size 1064 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91afe194dac09688ab580692238f7d2a8f1b09af --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,376 @@ +{ + "best_metric": null, + 
"best_model_checkpoint": null, + "epoch": 97.95918367346938, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + "step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 
0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + }, + { + "epoch": 66.94, + "learning_rate": 0.0031666666666666666, + "loss": 0.0306, + "step": 410 + }, + { + "epoch": 68.57, + "learning_rate": 0.003, + "loss": 0.0296, + "step": 420 + }, + { + "epoch": 70.2, + "learning_rate": 0.002833333333333333, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 71.84, + "learning_rate": 0.0026666666666666666, + "loss": 0.0302, + "step": 440 + }, + { + "epoch": 73.47, + "learning_rate": 0.0025, + "loss": 0.0288, + "step": 450 + }, + { + "epoch": 75.1, + "learning_rate": 0.0023333333333333335, + "loss": 0.0292, + "step": 460 + }, + { + "epoch": 76.73, + "learning_rate": 0.002166666666666667, + "loss": 0.0285, + "step": 470 + }, + { + "epoch": 78.37, + "learning_rate": 0.002, + "loss": 0.0309, + "step": 480 + }, + { + "epoch": 80.0, + "learning_rate": 0.0018333333333333333, + "loss": 0.0291, + "step": 490 + }, + { + "epoch": 81.63, + "learning_rate": 0.0016666666666666666, + "loss": 0.0305, + "step": 500 + }, + { + "epoch": 83.27, + "learning_rate": 0.0015, + "loss": 0.0302, + "step": 510 + }, + { + "epoch": 84.9, + "learning_rate": 0.0013333333333333333, + "loss": 0.0294, + "step": 520 + }, + { + "epoch": 86.53, + "learning_rate": 0.0011666666666666668, + "loss": 0.0295, + "step": 530 + }, + { + "epoch": 88.16, + "learning_rate": 0.001, + "loss": 0.0283, + "step": 540 + }, + { + "epoch": 89.8, + "learning_rate": 0.0008333333333333333, + "loss": 0.0305, + "step": 550 + }, + { + "epoch": 91.43, + "learning_rate": 0.0006666666666666666, + "loss": 0.0288, + "step": 560 + }, + { + "epoch": 93.06, + "learning_rate": 0.0005, + "loss": 0.0309, + "step": 570 + }, + { + "epoch": 94.69, + "learning_rate": 0.0003333333333333333, + "loss": 0.0286, + "step": 580 + }, + { + "epoch": 96.33, + "learning_rate": 0.00016666666666666666, + "loss": 0.0309, + "step": 590 + }, + { + "epoch": 97.96, + "learning_rate": 0.0, + "loss": 0.0294, + "step": 600 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 7.054488937640755e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7f7ad23765a709637c43e20c5e71464010e888ff --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 97.96, + "train_loss": 0.302445507645607, + "train_runtime": 8265.7464, + "train_samples": 98, + "train_samples_per_second": 1.161, + "train_steps_per_second": 0.073 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..dff7c783d0b3f0f018c46d665ec95cfa14783257 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,385 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 97.95918367346938, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + 
"step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + }, + { + "epoch": 66.94, + "learning_rate": 0.0031666666666666666, + "loss": 0.0306, + "step": 410 + }, + { + "epoch": 68.57, + "learning_rate": 0.003, + "loss": 0.0296, + "step": 420 + }, + { + "epoch": 70.2, + "learning_rate": 0.002833333333333333, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 71.84, + "learning_rate": 0.0026666666666666666, + "loss": 0.0302, + "step": 440 + }, + { + "epoch": 73.47, + "learning_rate": 0.0025, + "loss": 0.0288, + "step": 450 + }, + { + "epoch": 75.1, + "learning_rate": 0.0023333333333333335, + "loss": 0.0292, + "step": 460 + }, + { + "epoch": 76.73, + "learning_rate": 0.002166666666666667, + "loss": 0.0285, + "step": 470 + }, + { + "epoch": 78.37, + "learning_rate": 0.002, + "loss": 0.0309, + "step": 480 + }, + { + "epoch": 80.0, + "learning_rate": 0.0018333333333333333, + "loss": 0.0291, + "step": 490 + }, + { + "epoch": 81.63, + "learning_rate": 0.0016666666666666666, + "loss": 0.0305, + "step": 500 + }, + { + "epoch": 83.27, + "learning_rate": 0.0015, + "loss": 0.0302, + "step": 510 + }, + { + "epoch": 84.9, + "learning_rate": 0.0013333333333333333, + "loss": 0.0294, + "step": 520 + }, + { + "epoch": 86.53, + "learning_rate": 0.0011666666666666668, + "loss": 0.0295, + "step": 530 + }, + { + "epoch": 88.16, + "learning_rate": 0.001, + "loss": 0.0283, + "step": 540 + }, + { + "epoch": 89.8, + "learning_rate": 0.0008333333333333333, + "loss": 0.0305, + "step": 550 + }, + { + "epoch": 91.43, + "learning_rate": 0.0006666666666666666, + "loss": 0.0288, + "step": 560 + }, + { + "epoch": 93.06, + "learning_rate": 0.0005, + "loss": 0.0309, + "step": 570 + }, + { + "epoch": 94.69, + "learning_rate": 0.0003333333333333333, + "loss": 0.0286, + "step": 580 + }, + { + "epoch": 96.33, + "learning_rate": 0.00016666666666666666, + "loss": 0.0309, + "step": 590 + }, + { + "epoch": 97.96, + "learning_rate": 0.0, + "loss": 0.0294, + "step": 600 + }, + { + "epoch": 97.96, + "step": 600, + "total_flos": 7.054488937640755e+17, + "train_loss": 0.302445507645607, + "train_runtime": 8265.7464, + "train_samples_per_second": 1.161, + "train_steps_per_second": 0.073 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 7.054488937640755e+17, + "trial_name": null, + "trial_params": null +}