ctlin committed on
Commit
5147945
1 Parent(s): 9ea2847

upload model files

Browse files
README.md CHANGED
@@ -1,3 +1,28 @@
1
  ---
 
 
 
 
 
 
 
 
2
  license: gpl-3.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - zh
4
+ thumbnail: https://ckip.iis.sinica.edu.tw/files/ckip_logo.png
5
+ tags:
6
+ - pytorch
7
+ - token-classification
8
+ - bert
9
+ - zh
10
  license: gpl-3.0
11
  ---
12
+
13
+ # CKIP BERT Base Han Chinese POS
14
+
15
+ This model provides part-of-speech (POS) tagging for ancient Chinese. Our training dataset covers four eras of the Chinese language.
16
+
17
+ ## Homepage
18
+ * [ckiplab/han-transformers](https://github.com/ckiplab/han-transformers)
19
+
20
+ ## Training Datasets
21
+ The copyright of the datasets belongs to the Institute of Linguistics, Academia Sinica.
22
+ * [中央研究院上古漢語標記語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/akiwi/kiwi.sh)
23
+ * [中央研究院中古漢語語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/dkiwi/kiwi.sh)
24
+ * [中央研究院近代漢語語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/pkiwi/kiwi.sh)
25
+ * [中央研究院現代漢語語料庫](http://asbc.iis.sinica.edu.tw)
26
+
27
+ ## Contributors
28
+ * Chin-Tung Lin at [CKIP](https://ckip.iis.sinica.edu.tw/)
config.json ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "hub/ckiplab/bert-base-chinese-20210817-001848",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "directionality": "bidi",
8
+ "finetuning_task": "ner",
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "label2id": {
14
+ "571": 53,
15
+ "A": 41,
16
+ "C": 86,
17
+ "COLONCATEGORY": 49,
18
+ "COMMACATEGORY": 32,
19
+ "Caa": 34,
20
+ "Cbb": 20,
21
+ "D": 74,
22
+ "DASHCATEGORY": 56,
23
+ "DAb": 92,
24
+ "DC": 97,
25
+ "DE": 29,
26
+ "DFa": 80,
27
+ "DH": 88,
28
+ "DJ": 91,
29
+ "Da": 89,
30
+ "Daa": 40,
31
+ "Dab": 28,
32
+ "Db": 95,
33
+ "Dba": 45,
34
+ "Dbb": 43,
35
+ "Dc": 12,
36
+ "Dd": 19,
37
+ "Df": 94,
38
+ "Dfa": 37,
39
+ "Dfb": 59,
40
+ "Dg": 61,
41
+ "Dh": 27,
42
+ "Di": 81,
43
+ "Dj": 39,
44
+ "Dk": 52,
45
+ "Dl": 18,
46
+ "EXCLAMATIONCATEGORY": 50,
47
+ "EXCLANATIONCATEGORY": 79,
48
+ "FW": 64,
49
+ "I": 57,
50
+ "N": 78,
51
+ "NA": 93,
52
+ "NB": 77,
53
+ "ND": 76,
54
+ "NH": 70,
55
+ "Na": 0,
56
+ "Nb": 9,
57
+ "Nc": 7,
58
+ "Nd": 11,
59
+ "Ne": 85,
60
+ "Neqa": 31,
61
+ "Nes": 25,
62
+ "Neu": 8,
63
+ "Nf": 33,
64
+ "Ng": 22,
65
+ "Nh": 4,
66
+ "P": 10,
67
+ "PARENTHES7ISCATEGORY": 90,
68
+ "PARENTHESISCATEGORY": 51,
69
+ "PAUSECATEGORY": 58,
70
+ "PERIODCATEGORY": 1,
71
+ "Q": 62,
72
+ "QUESTIONCATEGORY": 48,
73
+ "R": 47,
74
+ "SEMICOLONCATEGORY": 55,
75
+ "SHI": 38,
76
+ "T": 26,
77
+ "T3": 82,
78
+ "T4": 66,
79
+ "T5": 46,
80
+ "T6": 42,
81
+ "T7": 54,
82
+ "T8": 15,
83
+ "U": 60,
84
+ "V": 72,
85
+ "VA": 23,
86
+ "VAC": 30,
87
+ "VB": 63,
88
+ "VC": 3,
89
+ "VCL": 13,
90
+ "VD": 35,
91
+ "VE": 5,
92
+ "VF": 36,
93
+ "VG": 21,
94
+ "VH": 2,
95
+ "VHC": 14,
96
+ "VI": 87,
97
+ "VJ": 16,
98
+ "VK": 6,
99
+ "VL": 17,
100
+ "V_2": 24,
101
+ "Vc": 65,
102
+ "Ve": 71,
103
+ "Vf": 84,
104
+ "Vg": 67,
105
+ "V\uff3f\uff12": 69,
106
+ "b": 44,
107
+ "na": 75,
108
+ "nc": 83,
109
+ "neu": 96,
110
+ "r": 73,
111
+ "\u7b26\uff0c\u5c1a\u7121\u8cc7\u6599": 68
112
+ },
113
+ "initializer_range": 0.02,
114
+ "intermediate_size": 3072,
115
+ "id2label": {
116
+ "0": "Na",
117
+ "1": "PERIODCATEGORY",
118
+ "2": "VH",
119
+ "3": "VC",
120
+ "4": "Nh",
121
+ "5": "VE",
122
+ "6": "VK",
123
+ "7": "Nc",
124
+ "8": "Neu",
125
+ "9": "Nb",
126
+ "10": "P",
127
+ "11": "Nd",
128
+ "12": "Dc",
129
+ "13": "VCL",
130
+ "14": "VHC",
131
+ "15": "T8",
132
+ "16": "VJ",
133
+ "17": "VL",
134
+ "18": "Dl",
135
+ "19": "Dd",
136
+ "20": "Cbb",
137
+ "21": "VG",
138
+ "22": "Ng",
139
+ "23": "VA",
140
+ "24": "V_2",
141
+ "25": "Nes",
142
+ "26": "T",
143
+ "27": "Dh",
144
+ "28": "Dab",
145
+ "29": "DE",
146
+ "30": "VAC",
147
+ "31": "Neqa",
148
+ "32": "COMMACATEGORY",
149
+ "33": "Nf",
150
+ "34": "Caa",
151
+ "35": "VD",
152
+ "36": "VF",
153
+ "37": "Dfa",
154
+ "38": "SHI",
155
+ "39": "Dj",
156
+ "40": "Daa",
157
+ "41": "A",
158
+ "42": "T6",
159
+ "43": "Dbb",
160
+ "44": "b",
161
+ "45": "Dba",
162
+ "46": "T5",
163
+ "47": "R",
164
+ "48": "QUESTIONCATEGORY",
165
+ "49": "COLONCATEGORY",
166
+ "50": "EXCLAMATIONCATEGORY",
167
+ "51": "PARENTHESISCATEGORY",
168
+ "52": "Dk",
169
+ "53": "571",
170
+ "54": "T7",
171
+ "55": "SEMICOLONCATEGORY",
172
+ "56": "DASHCATEGORY",
173
+ "57": "I",
174
+ "58": "PAUSECATEGORY",
175
+ "59": "Dfb",
176
+ "60": "U",
177
+ "61": "Dg",
178
+ "62": "Q",
179
+ "63": "VB",
180
+ "64": "FW",
181
+ "65": "Vc",
182
+ "66": "T4",
183
+ "67": "Vg",
184
+ "68": "\u7b26\uff0c\u5c1a\u7121\u8cc7\u6599",
185
+ "69": "V\uff3f\uff12",
186
+ "70": "NH",
187
+ "71": "Ve",
188
+ "72": "V",
189
+ "73": "r",
190
+ "74": "D",
191
+ "75": "na",
192
+ "76": "ND",
193
+ "77": "NB",
194
+ "78": "N",
195
+ "79": "EXCLANATIONCATEGORY",
196
+ "80": "DFa",
197
+ "81": "Di",
198
+ "82": "T3",
199
+ "83": "nc",
200
+ "84": "Vf",
201
+ "85": "Ne",
202
+ "86": "C",
203
+ "87": "VI",
204
+ "88": "DH",
205
+ "89": "Da",
206
+ "90": "PARENTHES7ISCATEGORY",
207
+ "91": "DJ",
208
+ "92": "DAb",
209
+ "93": "NA",
210
+ "94": "Df",
211
+ "95": "Db",
212
+ "96": "neu",
213
+ "97": "DC"
214
+ },
215
+ "layer_norm_eps": 1e-12,
216
+ "max_position_embeddings": 512,
217
+ "model_type": "bert",
218
+ "num_attention_heads": 12,
219
+ "num_hidden_layers": 12,
220
+ "pad_token_id": 0,
221
+ "pooler_fc_size": 768,
222
+ "pooler_num_attention_heads": 12,
223
+ "pooler_num_fc_layers": 3,
224
+ "pooler_size_per_head": 128,
225
+ "pooler_type": "first_token_transform",
226
+ "position_embedding_type": "absolute",
227
+ "tokenizer_class": "BertTokenizerFast",
228
+ "transformers_version": "4.7.0",
229
+ "type_vocab_size": 2,
230
+ "use_cache": true,
231
+ "vocab_size": 26140
232
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e46de4be93e99b3e3068b00906adc50e6c7c3f5aeb742ab282ca9730bd7349
3
+ size 422458267
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "hub/ckiplab/bert-base-chinese-20210817-001848", "special_tokens_map_file": "/home/cindy666/.cache/huggingface/transformers/d8a1a1b7a3de221ae53bf9d55154b9df9c4cda18409b393ee0fda4bce4ca7818.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "do_basic_tokenize": true, "never_split": null}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff