kevinwang676 commited on
Commit
390c787
1 Parent(s): 97813ba

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. all_results.json +8 -0
  2. checkpoint-100/config.json +47 -0
  3. checkpoint-100/generation_config.json +6 -0
  4. checkpoint-100/optimizer.pt +3 -0
  5. checkpoint-100/pytorch_model.bin +3 -0
  6. checkpoint-100/rng_state.pth +3 -0
  7. checkpoint-100/scheduler.pt +3 -0
  8. checkpoint-100/special_tokens_map.json +1 -0
  9. checkpoint-100/tokenizer.model +3 -0
  10. checkpoint-100/tokenizer_config.json +14 -0
  11. checkpoint-100/trainer_state.json +76 -0
  12. checkpoint-100/training_args.bin +3 -0
  13. checkpoint-200/config.json +47 -0
  14. checkpoint-200/generation_config.json +6 -0
  15. checkpoint-200/optimizer.pt +3 -0
  16. checkpoint-200/pytorch_model.bin +3 -0
  17. checkpoint-200/rng_state.pth +3 -0
  18. checkpoint-200/scheduler.pt +3 -0
  19. checkpoint-200/special_tokens_map.json +1 -0
  20. checkpoint-200/tokenizer.model +3 -0
  21. checkpoint-200/tokenizer_config.json +14 -0
  22. checkpoint-200/trainer_state.json +136 -0
  23. checkpoint-200/training_args.bin +3 -0
  24. checkpoint-300/config.json +47 -0
  25. checkpoint-300/generation_config.json +6 -0
  26. checkpoint-300/optimizer.pt +3 -0
  27. checkpoint-300/pytorch_model.bin +3 -0
  28. checkpoint-300/rng_state.pth +3 -0
  29. checkpoint-300/scheduler.pt +3 -0
  30. checkpoint-300/special_tokens_map.json +1 -0
  31. checkpoint-300/tokenizer.model +3 -0
  32. checkpoint-300/tokenizer_config.json +14 -0
  33. checkpoint-300/trainer_state.json +196 -0
  34. checkpoint-300/training_args.bin +3 -0
  35. checkpoint-400/config.json +47 -0
  36. checkpoint-400/generation_config.json +6 -0
  37. checkpoint-400/optimizer.pt +3 -0
  38. checkpoint-400/pytorch_model.bin +3 -0
  39. checkpoint-400/rng_state.pth +3 -0
  40. checkpoint-400/scheduler.pt +3 -0
  41. checkpoint-400/special_tokens_map.json +1 -0
  42. checkpoint-400/tokenizer.model +3 -0
  43. checkpoint-400/tokenizer_config.json +14 -0
  44. checkpoint-400/trainer_state.json +256 -0
  45. checkpoint-400/training_args.bin +3 -0
  46. checkpoint-500/config.json +47 -0
  47. checkpoint-500/generation_config.json +6 -0
  48. checkpoint-500/optimizer.pt +3 -0
  49. checkpoint-500/pytorch_model.bin +3 -0
  50. checkpoint-500/rng_state.pth +3 -0
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 97.96,
3
+ "train_loss": 0.302445507645607,
4
+ "train_runtime": 8265.7464,
5
+ "train_samples": 98,
6
+ "train_samples_per_second": 1.161,
7
+ "train_steps_per_second": 0.073
8
+ }
checkpoint-100/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "chatglm2-6b",
3
+ "add_bias_linear": false,
4
+ "add_qkv_bias": true,
5
+ "apply_query_key_layer_scaling": true,
6
+ "apply_residual_connection_post_layernorm": false,
7
+ "architectures": [
8
+ "ChatGLMForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
14
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
15
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
16
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
17
+ "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
18
+ },
19
+ "bias_dropout_fusion": true,
20
+ "classifier_dropout": null,
21
+ "eos_token_id": 2,
22
+ "ffn_hidden_size": 13696,
23
+ "fp32_residual_connection": false,
24
+ "hidden_dropout": 0.0,
25
+ "hidden_size": 4096,
26
+ "kv_channels": 128,
27
+ "layernorm_epsilon": 1e-05,
28
+ "model_type": "chatglm",
29
+ "multi_query_attention": true,
30
+ "multi_query_group_num": 2,
31
+ "num_attention_heads": 32,
32
+ "num_layers": 28,
33
+ "original_rope": true,
34
+ "pad_token_id": 0,
35
+ "padded_vocab_size": 65024,
36
+ "post_layer_norm": true,
37
+ "pre_seq_len": 128,
38
+ "prefix_projection": false,
39
+ "quantization_bit": 0,
40
+ "rmsnorm": true,
41
+ "seq_length": 32768,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float16",
44
+ "transformers_version": "4.30.2",
45
+ "use_cache": true,
46
+ "vocab_size": 65024
47
+ }
checkpoint-100/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.30.2"
6
+ }
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5b9322b7c791be0283cae33d01cb2e6c40786a9c9fab7fc421715ba39faa314
3
+ size 14681892
checkpoint-100/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:119f3aee6155af1456cc129b0eab064a91fd3a95f864e8b1a4985d7e10381988
3
+ size 7341306
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:723f496b9d8d8e776f11531f4652ca1ce47b825b86325d6f5aba8841ca36f1a0
3
+ size 14244
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e97f84b2e9cf03e34106548e8fd72d9181e088fcbe9b5747b6e8466de9610724
3
+ size 1064
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
3
+ size 1018370
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_chatglm.ChatGLMTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": false,
9
+ "do_lower_case": false,
10
+ "model_max_length": 1000000000000000019884624838656,
11
+ "padding_side": "left",
12
+ "remove_space": false,
13
+ "tokenizer_class": "ChatGLMTokenizer"
14
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 16.3265306122449,
5
+ "global_step": 100,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.63,
12
+ "learning_rate": 0.009833333333333333,
13
+ "loss": 2.53,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 3.27,
18
+ "learning_rate": 0.009666666666666667,
19
+ "loss": 2.0016,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 4.9,
24
+ "learning_rate": 0.0095,
25
+ "loss": 1.7775,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 6.53,
30
+ "learning_rate": 0.009333333333333334,
31
+ "loss": 1.6576,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 8.16,
36
+ "learning_rate": 0.009166666666666667,
37
+ "loss": 1.5048,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 9.8,
42
+ "learning_rate": 0.009000000000000001,
43
+ "loss": 1.3572,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 11.43,
48
+ "learning_rate": 0.008833333333333334,
49
+ "loss": 1.2067,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 13.06,
54
+ "learning_rate": 0.008666666666666668,
55
+ "loss": 1.0777,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 14.69,
60
+ "learning_rate": 0.0085,
61
+ "loss": 0.9188,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 16.33,
66
+ "learning_rate": 0.008333333333333333,
67
+ "loss": 0.7241,
68
+ "step": 100
69
+ }
70
+ ],
71
+ "max_steps": 600,
72
+ "num_train_epochs": 100,
73
+ "total_flos": 1.1757481562734592e+17,
74
+ "trial_name": null,
75
+ "trial_params": null
76
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af
3
+ size 4472
checkpoint-200/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "chatglm2-6b",
3
+ "add_bias_linear": false,
4
+ "add_qkv_bias": true,
5
+ "apply_query_key_layer_scaling": true,
6
+ "apply_residual_connection_post_layernorm": false,
7
+ "architectures": [
8
+ "ChatGLMForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
14
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
15
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
16
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
17
+ "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
18
+ },
19
+ "bias_dropout_fusion": true,
20
+ "classifier_dropout": null,
21
+ "eos_token_id": 2,
22
+ "ffn_hidden_size": 13696,
23
+ "fp32_residual_connection": false,
24
+ "hidden_dropout": 0.0,
25
+ "hidden_size": 4096,
26
+ "kv_channels": 128,
27
+ "layernorm_epsilon": 1e-05,
28
+ "model_type": "chatglm",
29
+ "multi_query_attention": true,
30
+ "multi_query_group_num": 2,
31
+ "num_attention_heads": 32,
32
+ "num_layers": 28,
33
+ "original_rope": true,
34
+ "pad_token_id": 0,
35
+ "padded_vocab_size": 65024,
36
+ "post_layer_norm": true,
37
+ "pre_seq_len": 128,
38
+ "prefix_projection": false,
39
+ "quantization_bit": 0,
40
+ "rmsnorm": true,
41
+ "seq_length": 32768,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float16",
44
+ "transformers_version": "4.30.2",
45
+ "use_cache": true,
46
+ "vocab_size": 65024
47
+ }
checkpoint-200/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.30.2"
6
+ }
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933b7b82708ba6a23d949d7b05fcb8644b9ab8b06ecf625f35c30aeba85b3ba2
3
+ size 14681892
checkpoint-200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54e939cf8e3ee1c58646595ea0e7748202c1e1b85f82aeb536a388bbe8d36e86
3
+ size 7341306
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51523eedac643c13a3a71297ac9e347331249d1d4cc19f9738a182bae3585fb2
3
+ size 14244
checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db1c4819d8e7a76f34cf5f8f4aa0bf9497992cd0862dbd9ba3fc68b9886b79e
3
+ size 1064
checkpoint-200/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
checkpoint-200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
3
+ size 1018370
checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_chatglm.ChatGLMTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": false,
9
+ "do_lower_case": false,
10
+ "model_max_length": 1000000000000000019884624838656,
11
+ "padding_side": "left",
12
+ "remove_space": false,
13
+ "tokenizer_class": "ChatGLMTokenizer"
14
+ }
checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 32.6530612244898,
5
+ "global_step": 200,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.63,
12
+ "learning_rate": 0.009833333333333333,
13
+ "loss": 2.53,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 3.27,
18
+ "learning_rate": 0.009666666666666667,
19
+ "loss": 2.0016,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 4.9,
24
+ "learning_rate": 0.0095,
25
+ "loss": 1.7775,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 6.53,
30
+ "learning_rate": 0.009333333333333334,
31
+ "loss": 1.6576,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 8.16,
36
+ "learning_rate": 0.009166666666666667,
37
+ "loss": 1.5048,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 9.8,
42
+ "learning_rate": 0.009000000000000001,
43
+ "loss": 1.3572,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 11.43,
48
+ "learning_rate": 0.008833333333333334,
49
+ "loss": 1.2067,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 13.06,
54
+ "learning_rate": 0.008666666666666668,
55
+ "loss": 1.0777,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 14.69,
60
+ "learning_rate": 0.0085,
61
+ "loss": 0.9188,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 16.33,
66
+ "learning_rate": 0.008333333333333333,
67
+ "loss": 0.7241,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 17.96,
72
+ "learning_rate": 0.008166666666666666,
73
+ "loss": 0.5775,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 19.59,
78
+ "learning_rate": 0.008,
79
+ "loss": 0.4235,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 21.22,
84
+ "learning_rate": 0.007833333333333333,
85
+ "loss": 0.3182,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 22.86,
90
+ "learning_rate": 0.007666666666666667,
91
+ "loss": 0.2155,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 24.49,
96
+ "learning_rate": 0.0075,
97
+ "loss": 0.1633,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 26.12,
102
+ "learning_rate": 0.007333333333333333,
103
+ "loss": 0.1234,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 27.76,
108
+ "learning_rate": 0.007166666666666667,
109
+ "loss": 0.0911,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 29.39,
114
+ "learning_rate": 0.006999999999999999,
115
+ "loss": 0.0738,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 31.02,
120
+ "learning_rate": 0.006833333333333334,
121
+ "loss": 0.0673,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 32.65,
126
+ "learning_rate": 0.006666666666666666,
127
+ "loss": 0.0544,
128
+ "step": 200
129
+ }
130
+ ],
131
+ "max_steps": 600,
132
+ "num_train_epochs": 100,
133
+ "total_flos": 2.3514963125469184e+17,
134
+ "trial_name": null,
135
+ "trial_params": null
136
+ }
checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af
3
+ size 4472
checkpoint-300/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "chatglm2-6b",
3
+ "add_bias_linear": false,
4
+ "add_qkv_bias": true,
5
+ "apply_query_key_layer_scaling": true,
6
+ "apply_residual_connection_post_layernorm": false,
7
+ "architectures": [
8
+ "ChatGLMForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
14
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
15
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
16
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
17
+ "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
18
+ },
19
+ "bias_dropout_fusion": true,
20
+ "classifier_dropout": null,
21
+ "eos_token_id": 2,
22
+ "ffn_hidden_size": 13696,
23
+ "fp32_residual_connection": false,
24
+ "hidden_dropout": 0.0,
25
+ "hidden_size": 4096,
26
+ "kv_channels": 128,
27
+ "layernorm_epsilon": 1e-05,
28
+ "model_type": "chatglm",
29
+ "multi_query_attention": true,
30
+ "multi_query_group_num": 2,
31
+ "num_attention_heads": 32,
32
+ "num_layers": 28,
33
+ "original_rope": true,
34
+ "pad_token_id": 0,
35
+ "padded_vocab_size": 65024,
36
+ "post_layer_norm": true,
37
+ "pre_seq_len": 128,
38
+ "prefix_projection": false,
39
+ "quantization_bit": 0,
40
+ "rmsnorm": true,
41
+ "seq_length": 32768,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float16",
44
+ "transformers_version": "4.30.2",
45
+ "use_cache": true,
46
+ "vocab_size": 65024
47
+ }
checkpoint-300/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.30.2"
6
+ }
checkpoint-300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb80ab0b61a192373221d205400431f1f9db5591d3be1fcdb9051924f1b410d2
3
+ size 14681892
checkpoint-300/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:187b159480029f605a8ec08a6da076afe43110d3c1ae18d10931f2ac9e5793ec
3
+ size 7341306
checkpoint-300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5035452976c183913e118a486015c4dbd9cf61159f30c79ac9dd02dbf2cd81c
3
+ size 14244
checkpoint-300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b5ff897392aa57ce97759b435acfdb4ee39aef21d4a4a68095c3294c513f6c0
3
+ size 1064
checkpoint-300/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
checkpoint-300/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
3
+ size 1018370
checkpoint-300/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_chatglm.ChatGLMTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": false,
9
+ "do_lower_case": false,
10
+ "model_max_length": 1000000000000000019884624838656,
11
+ "padding_side": "left",
12
+ "remove_space": false,
13
+ "tokenizer_class": "ChatGLMTokenizer"
14
+ }
checkpoint-300/trainer_state.json ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 48.97959183673469,
5
+ "global_step": 300,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.63,
12
+ "learning_rate": 0.009833333333333333,
13
+ "loss": 2.53,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 3.27,
18
+ "learning_rate": 0.009666666666666667,
19
+ "loss": 2.0016,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 4.9,
24
+ "learning_rate": 0.0095,
25
+ "loss": 1.7775,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 6.53,
30
+ "learning_rate": 0.009333333333333334,
31
+ "loss": 1.6576,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 8.16,
36
+ "learning_rate": 0.009166666666666667,
37
+ "loss": 1.5048,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 9.8,
42
+ "learning_rate": 0.009000000000000001,
43
+ "loss": 1.3572,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 11.43,
48
+ "learning_rate": 0.008833333333333334,
49
+ "loss": 1.2067,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 13.06,
54
+ "learning_rate": 0.008666666666666668,
55
+ "loss": 1.0777,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 14.69,
60
+ "learning_rate": 0.0085,
61
+ "loss": 0.9188,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 16.33,
66
+ "learning_rate": 0.008333333333333333,
67
+ "loss": 0.7241,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 17.96,
72
+ "learning_rate": 0.008166666666666666,
73
+ "loss": 0.5775,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 19.59,
78
+ "learning_rate": 0.008,
79
+ "loss": 0.4235,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 21.22,
84
+ "learning_rate": 0.007833333333333333,
85
+ "loss": 0.3182,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 22.86,
90
+ "learning_rate": 0.007666666666666667,
91
+ "loss": 0.2155,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 24.49,
96
+ "learning_rate": 0.0075,
97
+ "loss": 0.1633,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 26.12,
102
+ "learning_rate": 0.007333333333333333,
103
+ "loss": 0.1234,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 27.76,
108
+ "learning_rate": 0.007166666666666667,
109
+ "loss": 0.0911,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 29.39,
114
+ "learning_rate": 0.006999999999999999,
115
+ "loss": 0.0738,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 31.02,
120
+ "learning_rate": 0.006833333333333334,
121
+ "loss": 0.0673,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 32.65,
126
+ "learning_rate": 0.006666666666666666,
127
+ "loss": 0.0544,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 34.29,
132
+ "learning_rate": 0.006500000000000001,
133
+ "loss": 0.0492,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 35.92,
138
+ "learning_rate": 0.006333333333333333,
139
+ "loss": 0.0458,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 37.55,
144
+ "learning_rate": 0.0061666666666666675,
145
+ "loss": 0.0434,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 39.18,
150
+ "learning_rate": 0.006,
151
+ "loss": 0.0387,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 40.82,
156
+ "learning_rate": 0.005833333333333334,
157
+ "loss": 0.0375,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 42.45,
162
+ "learning_rate": 0.005666666666666666,
163
+ "loss": 0.0363,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 44.08,
168
+ "learning_rate": 0.0055000000000000005,
169
+ "loss": 0.0347,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 45.71,
174
+ "learning_rate": 0.005333333333333333,
175
+ "loss": 0.0341,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 47.35,
180
+ "learning_rate": 0.0051666666666666675,
181
+ "loss": 0.0327,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 48.98,
186
+ "learning_rate": 0.005,
187
+ "loss": 0.0307,
188
+ "step": 300
189
+ }
190
+ ],
191
+ "max_steps": 600,
192
+ "num_train_epochs": 100,
193
+ "total_flos": 3.5272444688203776e+17,
194
+ "trial_name": null,
195
+ "trial_params": null
196
+ }
checkpoint-300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af
3
+ size 4472
checkpoint-400/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "chatglm2-6b",
3
+ "add_bias_linear": false,
4
+ "add_qkv_bias": true,
5
+ "apply_query_key_layer_scaling": true,
6
+ "apply_residual_connection_post_layernorm": false,
7
+ "architectures": [
8
+ "ChatGLMForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
14
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
15
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
16
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
17
+ "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
18
+ },
19
+ "bias_dropout_fusion": true,
20
+ "classifier_dropout": null,
21
+ "eos_token_id": 2,
22
+ "ffn_hidden_size": 13696,
23
+ "fp32_residual_connection": false,
24
+ "hidden_dropout": 0.0,
25
+ "hidden_size": 4096,
26
+ "kv_channels": 128,
27
+ "layernorm_epsilon": 1e-05,
28
+ "model_type": "chatglm",
29
+ "multi_query_attention": true,
30
+ "multi_query_group_num": 2,
31
+ "num_attention_heads": 32,
32
+ "num_layers": 28,
33
+ "original_rope": true,
34
+ "pad_token_id": 0,
35
+ "padded_vocab_size": 65024,
36
+ "post_layer_norm": true,
37
+ "pre_seq_len": 128,
38
+ "prefix_projection": false,
39
+ "quantization_bit": 0,
40
+ "rmsnorm": true,
41
+ "seq_length": 32768,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float16",
44
+ "transformers_version": "4.30.2",
45
+ "use_cache": true,
46
+ "vocab_size": 65024
47
+ }
checkpoint-400/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.30.2"
6
+ }
checkpoint-400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ddda63cbe968668b459a73f0a54c34fc36c007f9f202063794ded2a8814a37a
3
+ size 14681892
checkpoint-400/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b954c8f23337c53ad1c86bafb2969338878db3b96c2bc2459aa04e1198a2141
3
+ size 7341306
checkpoint-400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11204a688e287bc0c7409fba921f7fd490e9471d91d738932d045851e4742a4e
3
+ size 14244
checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c32c17fb8a573adc159285286f456bfb53c7e2d80664d0c2cce541b6013ed8d7
3
+ size 1064
checkpoint-400/special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
checkpoint-400/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
3
+ size 1018370
checkpoint-400/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_chatglm.ChatGLMTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": false,
9
+ "do_lower_case": false,
10
+ "model_max_length": 1000000000000000019884624838656,
11
+ "padding_side": "left",
12
+ "remove_space": false,
13
+ "tokenizer_class": "ChatGLMTokenizer"
14
+ }
checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 65.3061224489796,
5
+ "global_step": 400,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 1.63,
12
+ "learning_rate": 0.009833333333333333,
13
+ "loss": 2.53,
14
+ "step": 10
15
+ },
16
+ {
17
+ "epoch": 3.27,
18
+ "learning_rate": 0.009666666666666667,
19
+ "loss": 2.0016,
20
+ "step": 20
21
+ },
22
+ {
23
+ "epoch": 4.9,
24
+ "learning_rate": 0.0095,
25
+ "loss": 1.7775,
26
+ "step": 30
27
+ },
28
+ {
29
+ "epoch": 6.53,
30
+ "learning_rate": 0.009333333333333334,
31
+ "loss": 1.6576,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 8.16,
36
+ "learning_rate": 0.009166666666666667,
37
+ "loss": 1.5048,
38
+ "step": 50
39
+ },
40
+ {
41
+ "epoch": 9.8,
42
+ "learning_rate": 0.009000000000000001,
43
+ "loss": 1.3572,
44
+ "step": 60
45
+ },
46
+ {
47
+ "epoch": 11.43,
48
+ "learning_rate": 0.008833333333333334,
49
+ "loss": 1.2067,
50
+ "step": 70
51
+ },
52
+ {
53
+ "epoch": 13.06,
54
+ "learning_rate": 0.008666666666666668,
55
+ "loss": 1.0777,
56
+ "step": 80
57
+ },
58
+ {
59
+ "epoch": 14.69,
60
+ "learning_rate": 0.0085,
61
+ "loss": 0.9188,
62
+ "step": 90
63
+ },
64
+ {
65
+ "epoch": 16.33,
66
+ "learning_rate": 0.008333333333333333,
67
+ "loss": 0.7241,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 17.96,
72
+ "learning_rate": 0.008166666666666666,
73
+ "loss": 0.5775,
74
+ "step": 110
75
+ },
76
+ {
77
+ "epoch": 19.59,
78
+ "learning_rate": 0.008,
79
+ "loss": 0.4235,
80
+ "step": 120
81
+ },
82
+ {
83
+ "epoch": 21.22,
84
+ "learning_rate": 0.007833333333333333,
85
+ "loss": 0.3182,
86
+ "step": 130
87
+ },
88
+ {
89
+ "epoch": 22.86,
90
+ "learning_rate": 0.007666666666666667,
91
+ "loss": 0.2155,
92
+ "step": 140
93
+ },
94
+ {
95
+ "epoch": 24.49,
96
+ "learning_rate": 0.0075,
97
+ "loss": 0.1633,
98
+ "step": 150
99
+ },
100
+ {
101
+ "epoch": 26.12,
102
+ "learning_rate": 0.007333333333333333,
103
+ "loss": 0.1234,
104
+ "step": 160
105
+ },
106
+ {
107
+ "epoch": 27.76,
108
+ "learning_rate": 0.007166666666666667,
109
+ "loss": 0.0911,
110
+ "step": 170
111
+ },
112
+ {
113
+ "epoch": 29.39,
114
+ "learning_rate": 0.006999999999999999,
115
+ "loss": 0.0738,
116
+ "step": 180
117
+ },
118
+ {
119
+ "epoch": 31.02,
120
+ "learning_rate": 0.006833333333333334,
121
+ "loss": 0.0673,
122
+ "step": 190
123
+ },
124
+ {
125
+ "epoch": 32.65,
126
+ "learning_rate": 0.006666666666666666,
127
+ "loss": 0.0544,
128
+ "step": 200
129
+ },
130
+ {
131
+ "epoch": 34.29,
132
+ "learning_rate": 0.006500000000000001,
133
+ "loss": 0.0492,
134
+ "step": 210
135
+ },
136
+ {
137
+ "epoch": 35.92,
138
+ "learning_rate": 0.006333333333333333,
139
+ "loss": 0.0458,
140
+ "step": 220
141
+ },
142
+ {
143
+ "epoch": 37.55,
144
+ "learning_rate": 0.0061666666666666675,
145
+ "loss": 0.0434,
146
+ "step": 230
147
+ },
148
+ {
149
+ "epoch": 39.18,
150
+ "learning_rate": 0.006,
151
+ "loss": 0.0387,
152
+ "step": 240
153
+ },
154
+ {
155
+ "epoch": 40.82,
156
+ "learning_rate": 0.005833333333333334,
157
+ "loss": 0.0375,
158
+ "step": 250
159
+ },
160
+ {
161
+ "epoch": 42.45,
162
+ "learning_rate": 0.005666666666666666,
163
+ "loss": 0.0363,
164
+ "step": 260
165
+ },
166
+ {
167
+ "epoch": 44.08,
168
+ "learning_rate": 0.0055000000000000005,
169
+ "loss": 0.0347,
170
+ "step": 270
171
+ },
172
+ {
173
+ "epoch": 45.71,
174
+ "learning_rate": 0.005333333333333333,
175
+ "loss": 0.0341,
176
+ "step": 280
177
+ },
178
+ {
179
+ "epoch": 47.35,
180
+ "learning_rate": 0.0051666666666666675,
181
+ "loss": 0.0327,
182
+ "step": 290
183
+ },
184
+ {
185
+ "epoch": 48.98,
186
+ "learning_rate": 0.005,
187
+ "loss": 0.0307,
188
+ "step": 300
189
+ },
190
+ {
191
+ "epoch": 50.61,
192
+ "learning_rate": 0.004833333333333334,
193
+ "loss": 0.031,
194
+ "step": 310
195
+ },
196
+ {
197
+ "epoch": 52.24,
198
+ "learning_rate": 0.004666666666666667,
199
+ "loss": 0.0312,
200
+ "step": 320
201
+ },
202
+ {
203
+ "epoch": 53.88,
204
+ "learning_rate": 0.0045000000000000005,
205
+ "loss": 0.033,
206
+ "step": 330
207
+ },
208
+ {
209
+ "epoch": 55.51,
210
+ "learning_rate": 0.004333333333333334,
211
+ "loss": 0.0294,
212
+ "step": 340
213
+ },
214
+ {
215
+ "epoch": 57.14,
216
+ "learning_rate": 0.004166666666666667,
217
+ "loss": 0.0308,
218
+ "step": 350
219
+ },
220
+ {
221
+ "epoch": 58.78,
222
+ "learning_rate": 0.004,
223
+ "loss": 0.0301,
224
+ "step": 360
225
+ },
226
+ {
227
+ "epoch": 60.41,
228
+ "learning_rate": 0.0038333333333333336,
229
+ "loss": 0.0292,
230
+ "step": 370
231
+ },
232
+ {
233
+ "epoch": 62.04,
234
+ "learning_rate": 0.0036666666666666666,
235
+ "loss": 0.0316,
236
+ "step": 380
237
+ },
238
+ {
239
+ "epoch": 63.67,
240
+ "learning_rate": 0.0034999999999999996,
241
+ "loss": 0.0302,
242
+ "step": 390
243
+ },
244
+ {
245
+ "epoch": 65.31,
246
+ "learning_rate": 0.003333333333333333,
247
+ "loss": 0.0295,
248
+ "step": 400
249
+ }
250
+ ],
251
+ "max_steps": 600,
252
+ "num_train_epochs": 100,
253
+ "total_flos": 4.702992625093837e+17,
254
+ "trial_name": null,
255
+ "trial_params": null
256
+ }
checkpoint-400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df0a343e1f2ccb38a19082ba999546089030c0e15418471a24d346cbb68fa7af
3
+ size 4472
checkpoint-500/config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "chatglm2-6b",
3
+ "add_bias_linear": false,
4
+ "add_qkv_bias": true,
5
+ "apply_query_key_layer_scaling": true,
6
+ "apply_residual_connection_post_layernorm": false,
7
+ "architectures": [
8
+ "ChatGLMForConditionalGeneration"
9
+ ],
10
+ "attention_dropout": 0.0,
11
+ "attention_softmax_in_fp32": true,
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_chatglm.ChatGLMConfig",
14
+ "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
15
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
16
+ "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
17
+ "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
18
+ },
19
+ "bias_dropout_fusion": true,
20
+ "classifier_dropout": null,
21
+ "eos_token_id": 2,
22
+ "ffn_hidden_size": 13696,
23
+ "fp32_residual_connection": false,
24
+ "hidden_dropout": 0.0,
25
+ "hidden_size": 4096,
26
+ "kv_channels": 128,
27
+ "layernorm_epsilon": 1e-05,
28
+ "model_type": "chatglm",
29
+ "multi_query_attention": true,
30
+ "multi_query_group_num": 2,
31
+ "num_attention_heads": 32,
32
+ "num_layers": 28,
33
+ "original_rope": true,
34
+ "pad_token_id": 0,
35
+ "padded_vocab_size": 65024,
36
+ "post_layer_norm": true,
37
+ "pre_seq_len": 128,
38
+ "prefix_projection": false,
39
+ "quantization_bit": 0,
40
+ "rmsnorm": true,
41
+ "seq_length": 32768,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "float16",
44
+ "transformers_version": "4.30.2",
45
+ "use_cache": true,
46
+ "vocab_size": 65024
47
+ }
checkpoint-500/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 0,
5
+ "transformers_version": "4.30.2"
6
+ }
checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9df4c877e409ae83e7bc7c0f1205d623699a931f44d97cbd852d2946c9fa1c96
3
+ size 14681892
checkpoint-500/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a87e680f1db9957f77578eb4f8c6df8112d5951619472ae6cfe33f88f3f54e
3
+ size 7341306
checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e88ac4017435c2ca3872f675a493a2f3116de05fe3fa16f5cc26289716e59698
3
+ size 14244