diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7f7ad23765a709637c43e20c5e71464010e888ff --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 97.96, + "train_loss": 0.302445507645607, + "train_runtime": 8265.7464, + "train_samples": 98, + "train_samples_per_second": 1.161, + "train_steps_per_second": 0.073 +} \ No newline at end of file diff --git a/checkpoint-100/config.json b/checkpoint-100/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-100/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-100/generation_config.json b/checkpoint-100/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-100/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..52c801a2f2ecae27572b00c1a0ba0b9451c2fa19 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b9322b7c791be0283cae33d01cb2e6c40786a9c9fab7fc421715ba39faa314 +size 14681892 diff --git a/checkpoint-100/pytorch_model.bin b/checkpoint-100/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..918f41cbd30359828fd3c6f019f56566e69ef98b --- /dev/null +++ b/checkpoint-100/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119f3aee6155af1456cc129b0eab064a91fd3a95f864e8b1a4985d7e10381988 +size 7341306 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fbcc351fdb5edd176fbbfa420eeee3e70e240743 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:723f496b9d8d8e776f11531f4652ca1ce47b825b86325d6f5aba8841ca36f1a0 +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..97f63f3eb812c004e226c4d5a316e628e46c3532 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e97f84b2e9cf03e34106548e8fd72d9181e088fcbe9b5747b6e8466de9610724 +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f5d4dbed2ebfe7bb2ce69aaa8b495dd232943ba --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,76 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.3265306122449, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 1.1757481562734592e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ 
b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-200/config.json b/checkpoint-200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-200/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-200/generation_config.json b/checkpoint-200/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-200/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4cb54f42ee02783f24d1c226ea5b5f7745b56db --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:933b7b82708ba6a23d949d7b05fcb8644b9ab8b06ecf625f35c30aeba85b3ba2 +size 14681892 diff --git a/checkpoint-200/pytorch_model.bin b/checkpoint-200/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8e4d6030a564ce573337f3b67a7184e2a88e2234 --- /dev/null +++ b/checkpoint-200/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54e939cf8e3ee1c58646595ea0e7748202c1e1b85f82aeb536a388bbe8d36e86 +size 7341306 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cf9dd992ca86cb2bb62e9edb968c887e9aff1e2 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51523eedac643c13a3a71297ac9e347331249d1d4cc19f9738a182bae3585fb2 +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 
0000000000000000000000000000000000000000..03859582101c93c48f703cbc6d8bdff86b097783 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3db1c4819d8e7a76f34cf5f8f4aa0bf9497992cd0862dbd9ba3fc68b9886b79e +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a8095cc1f70332f738abddb0364d128f780b03d1 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,136 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 32.6530612244898, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, 
+ { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 2.3514963125469184e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-300/config.json b/checkpoint-300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-300/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-300/generation_config.json b/checkpoint-300/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-300/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..10f90948789c8412689667bdc8a8e42676881c62 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb80ab0b61a192373221d205400431f1f9db5591d3be1fcdb9051924f1b410d2 +size 14681892 diff --git a/checkpoint-300/pytorch_model.bin b/checkpoint-300/pytorch_model.bin new file mode 100644 index 
0000000000000000000000000000000000000000..7c2c568a282353a4b55bed9c7e8ba3d33487a168 --- /dev/null +++ b/checkpoint-300/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:187b159480029f605a8ec08a6da076afe43110d3c1ae18d10931f2ac9e5793ec +size 7341306 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e479fc067e718f8fbaaa35204bb72b4bac22d994 --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5035452976c183913e118a486015c4dbd9cf61159f30c79ac9dd02dbf2cd81c +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..de11f4c03d3a770ff4ef03641588dc58053cc791 --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b5ff897392aa57ce97759b435acfdb4ee39aef21d4a4a68095c3294c513f6c0 +size 1064 diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-300/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-300/tokenizer.model b/checkpoint-300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..408c74767e379b67512325d3df703334e4ba1ed3 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,196 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 48.97959183673469, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + 
"step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 3.5272444688203776e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-400/config.json b/checkpoint-400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-400/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + 
"AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-400/generation_config.json b/checkpoint-400/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-400/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1a3ae2ea3b3ca7d394452bf6c9cd94e5019038d --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ddda63cbe968668b459a73f0a54c34fc36c007f9f202063794ded2a8814a37a +size 14681892 diff --git a/checkpoint-400/pytorch_model.bin b/checkpoint-400/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c3d90644c07522d549839852fdb549785208bc3 --- /dev/null +++ b/checkpoint-400/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b954c8f23337c53ad1c86bafb2969338878db3b96c2bc2459aa04e1198a2141 +size 7341306 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bae703a4e27ab853ddff95c141d5555cb75c5980 --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11204a688e287bc0c7409fba921f7fd490e9471d91d738932d045851e4742a4e +size 14244 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..04caa6d4a5322a24b77c5b3ecdd6355359355de7 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32c17fb8a573adc159285286f456bfb53c7e2d80664d0c2cce541b6013ed8d7 +size 1064 diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-400/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5292f7ce7abbead9d2162808fe7a4a51f7868e79 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,256 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 65.3061224489796, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 
0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + "step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 4.702992625093837e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-500/config.json b/checkpoint-500/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-500/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git 
a/checkpoint-500/generation_config.json b/checkpoint-500/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-500/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..19707d809180dc84de01e068b7fbfc2e17a66941 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9df4c877e409ae83e7bc7c0f1205d623699a931f44d97cbd852d2946c9fa1c96 +size 14681892 diff --git a/checkpoint-500/pytorch_model.bin b/checkpoint-500/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..41de8dd2b81817bbcfa5cb6dd53c67fbee19e67a --- /dev/null +++ b/checkpoint-500/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a87e680f1db9957f77578eb4f8c6df8112d5951619472ae6cfe33f88f3f54e +size 7341306 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..60907f63b4ea28b9ff4c68ac70fb085bd735b02f --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88ac4017435c2ca3872f675a493a2f3116de05fe3fa16f5cc26289716e59698 +size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6c0f3762c3d1136503933403160db438fadd448 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f977c9b1c32e778511ae551c40f7b1714188f32f52b3ae781ebb01b99519c875 +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-500/tokenizer.model b/checkpoint-500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d72bf5958b9e074f210d5d2739afd384f952dae5 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,316 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 81.63265306122449, + "global_step": 500, + 
"is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + "step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 
0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + }, + { + "epoch": 66.94, + "learning_rate": 0.0031666666666666666, + "loss": 0.0306, + "step": 410 + }, + { + "epoch": 68.57, + "learning_rate": 0.003, + "loss": 0.0296, + "step": 420 + }, + { + "epoch": 70.2, + "learning_rate": 0.002833333333333333, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 71.84, + "learning_rate": 0.0026666666666666666, + "loss": 0.0302, + "step": 440 + }, + { + "epoch": 73.47, + "learning_rate": 0.0025, + "loss": 0.0288, + "step": 450 + }, + { + "epoch": 75.1, + "learning_rate": 0.0023333333333333335, + "loss": 0.0292, + "step": 460 + }, + { + "epoch": 76.73, + "learning_rate": 0.002166666666666667, + "loss": 0.0285, + "step": 470 + }, + { + "epoch": 78.37, + "learning_rate": 0.002, + "loss": 0.0309, + "step": 480 + }, + { + "epoch": 80.0, + "learning_rate": 0.0018333333333333333, + "loss": 0.0291, + "step": 490 + }, + { + "epoch": 81.63, + "learning_rate": 0.0016666666666666666, + "loss": 0.0305, + "step": 500 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 5.878740781367296e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/checkpoint-600/config.json b/checkpoint-600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bbce657c9324dce8d26fbda65427f220ad32d504 --- /dev/null +++ b/checkpoint-600/config.json @@ -0,0 +1,47 @@ +{ + "_name_or_path": "chatglm2-6b", + "add_bias_linear": false, + "add_qkv_bias": true, + "apply_query_key_layer_scaling": true, + "apply_residual_connection_post_layernorm": false, + "architectures": [ + "ChatGLMForConditionalGeneration" + ], + "attention_dropout": 0.0, + "attention_softmax_in_fp32": true, + "auto_map": { + "AutoConfig": "configuration_chatglm.ChatGLMConfig", + "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", + "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" + }, + "bias_dropout_fusion": true, + "classifier_dropout": null, + "eos_token_id": 2, + "ffn_hidden_size": 13696, + "fp32_residual_connection": false, + "hidden_dropout": 0.0, + "hidden_size": 4096, + "kv_channels": 128, + "layernorm_epsilon": 1e-05, + "model_type": "chatglm", + "multi_query_attention": true, + "multi_query_group_num": 2, + "num_attention_heads": 32, + "num_layers": 28, + "original_rope": true, + "pad_token_id": 0, + "padded_vocab_size": 65024, + "post_layer_norm": true, + "pre_seq_len": 128, + "prefix_projection": false, + "quantization_bit": 0, + "rmsnorm": true, + "seq_length": 32768, + "tie_word_embeddings": false, + "torch_dtype": "float16", + 
"transformers_version": "4.30.2", + "use_cache": true, + "vocab_size": 65024 +} diff --git a/checkpoint-600/generation_config.json b/checkpoint-600/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a005e89abe7e18f683d0e247c9b15103e4ab0c59 --- /dev/null +++ b/checkpoint-600/generation_config.json @@ -0,0 +1,6 @@ +{ + "_from_model_config": true, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.30.2" +} diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..49165c57d87ebcc5be39a09ad3101aa06f0ce48d --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f846354f6111553252e1e024ead83ce86f2e7013a3f4bf820d2e4ed5899bf33 +size 14681892 diff --git a/checkpoint-600/pytorch_model.bin b/checkpoint-600/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d97c2a152fe6ae9c04970d5f0c379431d0d1b60f --- /dev/null +++ b/checkpoint-600/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb3d3d7c639eb28900ada66408548977ec5f467d2f47ea105727c6bb7593924 +size 7341306 diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b6f478d94147d001e3ab74e0b16fc8ef0c8eda51 --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06dc51ff93f048a9d0e62293091fe86d17b057af111c26d07c68c9032c09764 +size 14244 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce070b66204b5892a9e615c42a9311b8b3e8d8c2 --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:374a1275027169bbb158e812d9992a46eddd14d03a3c85a854057bdc507c957e +size 1064 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..8a8007697b7cc3d3868dcffbbebf8c1f2bd690ba --- /dev/null +++ b/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2 +size 1018370 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef22cfefc75d5926e955e1e419b35de39eb8415e --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "model_max_length": 1000000000000000019884624838656, + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "ChatGLMTokenizer" +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91afe194dac09688ab580692238f7d2a8f1b09af --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,376 @@ +{ + "best_metric": null, + 
"best_model_checkpoint": null, + "epoch": 97.95918367346938, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + "step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 
0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + }, + { + "epoch": 66.94, + "learning_rate": 0.0031666666666666666, + "loss": 0.0306, + "step": 410 + }, + { + "epoch": 68.57, + "learning_rate": 0.003, + "loss": 0.0296, + "step": 420 + }, + { + "epoch": 70.2, + "learning_rate": 0.002833333333333333, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 71.84, + "learning_rate": 0.0026666666666666666, + "loss": 0.0302, + "step": 440 + }, + { + "epoch": 73.47, + "learning_rate": 0.0025, + "loss": 0.0288, + "step": 450 + }, + { + "epoch": 75.1, + "learning_rate": 0.0023333333333333335, + "loss": 0.0292, + "step": 460 + }, + { + "epoch": 76.73, + "learning_rate": 0.002166666666666667, + "loss": 0.0285, + "step": 470 + }, + { + "epoch": 78.37, + "learning_rate": 0.002, + "loss": 0.0309, + "step": 480 + }, + { + "epoch": 80.0, + "learning_rate": 0.0018333333333333333, + "loss": 0.0291, + "step": 490 + }, + { + "epoch": 81.63, + "learning_rate": 0.0016666666666666666, + "loss": 0.0305, + "step": 500 + }, + { + "epoch": 83.27, + "learning_rate": 0.0015, + "loss": 0.0302, + "step": 510 + }, + { + "epoch": 84.9, + "learning_rate": 0.0013333333333333333, + "loss": 0.0294, + "step": 520 + }, + { + "epoch": 86.53, + "learning_rate": 0.0011666666666666668, + "loss": 0.0295, + "step": 530 + }, + { + "epoch": 88.16, + "learning_rate": 0.001, + "loss": 0.0283, + "step": 540 + }, + { + "epoch": 89.8, + "learning_rate": 0.0008333333333333333, + "loss": 0.0305, + "step": 550 + }, + { + "epoch": 91.43, + "learning_rate": 0.0006666666666666666, + "loss": 0.0288, + "step": 560 + }, + { + "epoch": 93.06, + "learning_rate": 0.0005, + "loss": 0.0309, + "step": 570 + }, + { + "epoch": 94.69, + "learning_rate": 0.0003333333333333333, + "loss": 0.0286, + "step": 580 + }, + { + "epoch": 96.33, + "learning_rate": 0.00016666666666666666, + "loss": 0.0309, + "step": 590 + }, + { + "epoch": 97.96, + "learning_rate": 0.0, + "loss": 0.0294, + "step": 600 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 7.054488937640755e+17, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9ce0a49fc5b353df0eeeeacba0c6b1cb2bfd86ad --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af +size 4472 diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7f7ad23765a709637c43e20c5e71464010e888ff --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 97.96, + "train_loss": 0.302445507645607, + "train_runtime": 8265.7464, + "train_samples": 98, + "train_samples_per_second": 1.161, + "train_steps_per_second": 0.073 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..dff7c783d0b3f0f018c46d665ec95cfa14783257 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,385 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 97.95918367346938, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 1.63, + "learning_rate": 0.009833333333333333, + "loss": 2.53, + "step": 10 + }, + { + "epoch": 3.27, + "learning_rate": 0.009666666666666667, + "loss": 2.0016, + "step": 20 + }, + { + "epoch": 4.9, + "learning_rate": 0.0095, + "loss": 1.7775, + "step": 30 + }, + { + "epoch": 6.53, + "learning_rate": 0.009333333333333334, + "loss": 1.6576, + "step": 40 + }, + { + "epoch": 8.16, + "learning_rate": 0.009166666666666667, + "loss": 1.5048, + "step": 50 + }, + { + "epoch": 9.8, + "learning_rate": 0.009000000000000001, + "loss": 1.3572, + "step": 60 + }, + { + "epoch": 11.43, + "learning_rate": 0.008833333333333334, + "loss": 1.2067, + "step": 70 + }, + { + "epoch": 13.06, + "learning_rate": 0.008666666666666668, + "loss": 1.0777, + "step": 80 + }, + { + "epoch": 14.69, + "learning_rate": 0.0085, + "loss": 0.9188, + "step": 90 + }, + { + "epoch": 16.33, + "learning_rate": 0.008333333333333333, + "loss": 0.7241, + "step": 100 + }, + { + "epoch": 17.96, + "learning_rate": 0.008166666666666666, + "loss": 0.5775, + "step": 110 + }, + { + "epoch": 19.59, + "learning_rate": 0.008, + "loss": 0.4235, + "step": 120 + }, + { + "epoch": 21.22, + "learning_rate": 0.007833333333333333, + "loss": 0.3182, + "step": 130 + }, + { + "epoch": 22.86, + "learning_rate": 0.007666666666666667, + "loss": 0.2155, + "step": 140 + }, + { + "epoch": 24.49, + "learning_rate": 0.0075, + "loss": 0.1633, + "step": 150 + }, + { + "epoch": 26.12, + "learning_rate": 0.007333333333333333, + "loss": 0.1234, + "step": 160 + }, + { + "epoch": 27.76, + "learning_rate": 0.007166666666666667, + "loss": 0.0911, + "step": 170 + }, + { + "epoch": 29.39, + "learning_rate": 0.006999999999999999, + "loss": 0.0738, + "step": 180 + }, + { + "epoch": 31.02, + "learning_rate": 0.006833333333333334, + "loss": 0.0673, + "step": 190 + }, + { + "epoch": 32.65, + "learning_rate": 0.006666666666666666, + "loss": 0.0544, + "step": 200 + }, + { + "epoch": 34.29, + "learning_rate": 0.006500000000000001, + "loss": 0.0492, + "step": 210 + }, + { + "epoch": 35.92, + "learning_rate": 0.006333333333333333, + "loss": 0.0458, + "step": 220 + }, + { + "epoch": 37.55, + "learning_rate": 0.0061666666666666675, + "loss": 0.0434, + "step": 230 + }, + { + "epoch": 39.18, + "learning_rate": 0.006, + "loss": 0.0387, + "step": 240 + }, + { + "epoch": 40.82, + "learning_rate": 0.005833333333333334, + "loss": 0.0375, + "step": 250 + }, + { + "epoch": 42.45, + "learning_rate": 0.005666666666666666, + "loss": 0.0363, + "step": 260 + }, + { + "epoch": 44.08, + "learning_rate": 0.0055000000000000005, + "loss": 0.0347, + "step": 270 + }, + { + "epoch": 45.71, + "learning_rate": 0.005333333333333333, + "loss": 0.0341, + "step": 280 + }, + { + "epoch": 47.35, + "learning_rate": 0.0051666666666666675, + "loss": 0.0327, + "step": 290 + }, + { + "epoch": 48.98, + "learning_rate": 0.005, + "loss": 0.0307, + "step": 300 + }, + { + "epoch": 50.61, + "learning_rate": 0.004833333333333334, + "loss": 0.031, + "step": 310 + }, + { + "epoch": 52.24, + "learning_rate": 0.004666666666666667, + "loss": 0.0312, + "step": 320 + }, + { + "epoch": 53.88, + "learning_rate": 0.0045000000000000005, + "loss": 0.033, + 
"step": 330 + }, + { + "epoch": 55.51, + "learning_rate": 0.004333333333333334, + "loss": 0.0294, + "step": 340 + }, + { + "epoch": 57.14, + "learning_rate": 0.004166666666666667, + "loss": 0.0308, + "step": 350 + }, + { + "epoch": 58.78, + "learning_rate": 0.004, + "loss": 0.0301, + "step": 360 + }, + { + "epoch": 60.41, + "learning_rate": 0.0038333333333333336, + "loss": 0.0292, + "step": 370 + }, + { + "epoch": 62.04, + "learning_rate": 0.0036666666666666666, + "loss": 0.0316, + "step": 380 + }, + { + "epoch": 63.67, + "learning_rate": 0.0034999999999999996, + "loss": 0.0302, + "step": 390 + }, + { + "epoch": 65.31, + "learning_rate": 0.003333333333333333, + "loss": 0.0295, + "step": 400 + }, + { + "epoch": 66.94, + "learning_rate": 0.0031666666666666666, + "loss": 0.0306, + "step": 410 + }, + { + "epoch": 68.57, + "learning_rate": 0.003, + "loss": 0.0296, + "step": 420 + }, + { + "epoch": 70.2, + "learning_rate": 0.002833333333333333, + "loss": 0.0293, + "step": 430 + }, + { + "epoch": 71.84, + "learning_rate": 0.0026666666666666666, + "loss": 0.0302, + "step": 440 + }, + { + "epoch": 73.47, + "learning_rate": 0.0025, + "loss": 0.0288, + "step": 450 + }, + { + "epoch": 75.1, + "learning_rate": 0.0023333333333333335, + "loss": 0.0292, + "step": 460 + }, + { + "epoch": 76.73, + "learning_rate": 0.002166666666666667, + "loss": 0.0285, + "step": 470 + }, + { + "epoch": 78.37, + "learning_rate": 0.002, + "loss": 0.0309, + "step": 480 + }, + { + "epoch": 80.0, + "learning_rate": 0.0018333333333333333, + "loss": 0.0291, + "step": 490 + }, + { + "epoch": 81.63, + "learning_rate": 0.0016666666666666666, + "loss": 0.0305, + "step": 500 + }, + { + "epoch": 83.27, + "learning_rate": 0.0015, + "loss": 0.0302, + "step": 510 + }, + { + "epoch": 84.9, + "learning_rate": 0.0013333333333333333, + "loss": 0.0294, + "step": 520 + }, + { + "epoch": 86.53, + "learning_rate": 0.0011666666666666668, + "loss": 0.0295, + "step": 530 + }, + { + "epoch": 88.16, + "learning_rate": 0.001, + "loss": 0.0283, + "step": 540 + }, + { + "epoch": 89.8, + "learning_rate": 0.0008333333333333333, + "loss": 0.0305, + "step": 550 + }, + { + "epoch": 91.43, + "learning_rate": 0.0006666666666666666, + "loss": 0.0288, + "step": 560 + }, + { + "epoch": 93.06, + "learning_rate": 0.0005, + "loss": 0.0309, + "step": 570 + }, + { + "epoch": 94.69, + "learning_rate": 0.0003333333333333333, + "loss": 0.0286, + "step": 580 + }, + { + "epoch": 96.33, + "learning_rate": 0.00016666666666666666, + "loss": 0.0309, + "step": 590 + }, + { + "epoch": 97.96, + "learning_rate": 0.0, + "loss": 0.0294, + "step": 600 + }, + { + "epoch": 97.96, + "step": 600, + "total_flos": 7.054488937640755e+17, + "train_loss": 0.302445507645607, + "train_runtime": 8265.7464, + "train_samples_per_second": 1.161, + "train_steps_per_second": 0.073 + } + ], + "max_steps": 600, + "num_train_epochs": 100, + "total_flos": 7.054488937640755e+17, + "trial_name": null, + "trial_params": null +}