ctlin committed on
Commit
65a407e
1 Parent(s): fe3bd10

upload model files

Browse files
README.md CHANGED
@@ -1,3 +1,28 @@
1
  ---
 
 
 
 
 
 
 
 
2
  license: gpl-3.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - zh
4
+ thumbnail: https://ckip.iis.sinica.edu.tw/files/ckip_logo.png
5
+ tags:
6
+ - pytorch
7
+ - token-classification
8
+ - bert
9
+ - zh
10
  license: gpl-3.0
11
  ---
12
+
13
+ # CKIP BERT Base Han Chinese POS
14
+
15
+ This model provides part-of-speech (POS) tagging for ancient (Han) Chinese. The training data covers four eras of the Chinese language: Old (上古), Middle (中古), Early Modern (近代), and Modern (現代) Chinese.
16
+
17
+ ## Homepage
18
+ * [ckiplab/han-transformers](https://github.com/ckiplab/han-transformers)
19
+
20
+ ## Training Datasets
21
+ The copyright of the datasets belongs to the Institute of Linguistics, Academia Sinica.
22
+ * [中央研究院上古漢語標記語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/akiwi/kiwi.sh)
23
+ * [中央研究院中古漢語語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/dkiwi/kiwi.sh)
24
+ * [中央研究院近代漢語語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/pkiwi/kiwi.sh)
25
+ * [中央研究院現代漢語語料庫](http://asbc.iis.sinica.edu.tw)
26
+
27
+ ## Contributors
28
+ * Chin-Tung Lin at [CKIP](https://ckip.iis.sinica.edu.tw/)
config.json ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "hub/ckiplab/bert-base-chinese-20210817-001848",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "directionality": "bidi",
8
+ "finetuning_task": "ner",
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "label2id": {
14
+ "/Na": 88,
15
+ "3": 72,
16
+ "A": 46,
17
+ "CE": 82,
18
+ "COLONCATEGORY": 11,
19
+ "COMMACATEGORY": 0,
20
+ "Caa": 56,
21
+ "Cbb": 33,
22
+ "D": 96,
23
+ "DASHCATEGORY": 63,
24
+ "DE": 24,
25
+ "Daa": 38,
26
+ "Dab": 39,
27
+ "Dba": 54,
28
+ "Dbb": 35,
29
+ "Dc": 18,
30
+ "Dd": 22,
31
+ "Df": 95,
32
+ "Dfa": 48,
33
+ "Dfb": 65,
34
+ "Dg": 67,
35
+ "Dh": 32,
36
+ "Dha": 94,
37
+ "Di": 21,
38
+ "Dj": 43,
39
+ "Dk": 60,
40
+ "Dl": 14,
41
+ "ETCCATEGORY": 62,
42
+ "EXCLAMATIONCATEGORY": 58,
43
+ "EXCLANATIONCATEGORY": 28,
44
+ "FW": 52,
45
+ "I": 64,
46
+ "N": 80,
47
+ "NA": 81,
48
+ "Na": 1,
49
+ "Nb": 7,
50
+ "Nc": 10,
51
+ "Nd": 19,
52
+ "Neqa": 40,
53
+ "Nes": 41,
54
+ "Neu": 12,
55
+ "Nf": 15,
56
+ "Ng": 17,
57
+ "Nh": 4,
58
+ "Nha": 87,
59
+ "P": 16,
60
+ "PARENTHESISCATEGOR": 74,
61
+ "PARENTHESISCATEGORY": 8,
62
+ "PAUSECATEGORY": 42,
63
+ "PERIODCATEGORY": 2,
64
+ "Q": 70,
65
+ "QUESTIONCATEGORY": 26,
66
+ "R": 66,
67
+ "SEMICOLONCATEGORY": 44,
68
+ "SHI": 25,
69
+ "T": 23,
70
+ "T3": 45,
71
+ "T4": 49,
72
+ "T5": 55,
73
+ "T6": 50,
74
+ "T7": 57,
75
+ "T8": 51,
76
+ "U": 53,
77
+ "V": 77,
78
+ "V-2": 76,
79
+ "VA": 20,
80
+ "VAC": 47,
81
+ "VAL": 89,
82
+ "VB": 61,
83
+ "VC": 3,
84
+ "VCL": 9,
85
+ "VCl": 79,
86
+ "VD": 37,
87
+ "VE": 5,
88
+ "VF": 30,
89
+ "VG": 36,
90
+ "VH": 6,
91
+ "VHC": 29,
92
+ "VHL": 71,
93
+ "VI": 68,
94
+ "VJ": 27,
95
+ "VK": 13,
96
+ "VL": 31,
97
+ "VU": 97,
98
+ "V_": 90,
99
+ "V_2": 34,
100
+ "Va": 91,
101
+ "Vf": 78,
102
+ "Vh": 83,
103
+ "Vk": 93,
104
+ "X": 75,
105
+ "b": 59,
106
+ "cr": 92,
107
+ "q": 73,
108
+ "u": 84,
109
+ "x": 69,
110
+ "\u5750": 85,
111
+ "\u7c59": 86
112
+ },
113
+ "initializer_range": 0.02,
114
+ "intermediate_size": 3072,
115
+ "id2label": {
116
+ "0": "COMMACATEGORY",
117
+ "1": "Na",
118
+ "2": "PERIODCATEGORY",
119
+ "3": "VC",
120
+ "4": "Nh",
121
+ "5": "VE",
122
+ "6": "VH",
123
+ "7": "Nb",
124
+ "8": "PARENTHESISCATEGORY",
125
+ "9": "VCL",
126
+ "10": "Nc",
127
+ "11": "COLONCATEGORY",
128
+ "12": "Neu",
129
+ "13": "VK",
130
+ "14": "Dl",
131
+ "15": "Nf",
132
+ "16": "P",
133
+ "17": "Ng",
134
+ "18": "Dc",
135
+ "19": "Nd",
136
+ "20": "VA",
137
+ "21": "Di",
138
+ "22": "Dd",
139
+ "23": "T",
140
+ "24": "DE",
141
+ "25": "SHI",
142
+ "26": "QUESTIONCATEGORY",
143
+ "27": "VJ",
144
+ "28": "EXCLANATIONCATEGORY",
145
+ "29": "VHC",
146
+ "30": "VF",
147
+ "31": "VL",
148
+ "32": "Dh",
149
+ "33": "Cbb",
150
+ "34": "V_2",
151
+ "35": "Dbb",
152
+ "36": "VG",
153
+ "37": "VD",
154
+ "38": "Daa",
155
+ "39": "Dab",
156
+ "40": "Neqa",
157
+ "41": "Nes",
158
+ "42": "PAUSECATEGORY",
159
+ "43": "Dj",
160
+ "44": "SEMICOLONCATEGORY",
161
+ "45": "T3",
162
+ "46": "A",
163
+ "47": "VAC",
164
+ "48": "Dfa",
165
+ "49": "T4",
166
+ "50": "T6",
167
+ "51": "T8",
168
+ "52": "FW",
169
+ "53": "U",
170
+ "54": "Dba",
171
+ "55": "T5",
172
+ "56": "Caa",
173
+ "57": "T7",
174
+ "58": "EXCLAMATIONCATEGORY",
175
+ "59": "b",
176
+ "60": "Dk",
177
+ "61": "VB",
178
+ "62": "ETCCATEGORY",
179
+ "63": "DASHCATEGORY",
180
+ "64": "I",
181
+ "65": "Dfb",
182
+ "66": "R",
183
+ "67": "Dg",
184
+ "68": "VI",
185
+ "69": "x",
186
+ "70": "Q",
187
+ "71": "VHL",
188
+ "72": "3",
189
+ "73": "q",
190
+ "74": "PARENTHESISCATEGOR",
191
+ "75": "X",
192
+ "76": "V-2",
193
+ "77": "V",
194
+ "78": "Vf",
195
+ "79": "VCl",
196
+ "80": "N",
197
+ "81": "NA",
198
+ "82": "CE",
199
+ "83": "Vh",
200
+ "84": "u",
201
+ "85": "\u5750",
202
+ "86": "\u7c59",
203
+ "87": "Nha",
204
+ "88": "/Na",
205
+ "89": "VAL",
206
+ "90": "V_",
207
+ "91": "Va",
208
+ "92": "cr",
209
+ "93": "Vk",
210
+ "94": "Dha",
211
+ "95": "Df",
212
+ "96": "D",
213
+ "97": "VU"
214
+ },
215
+ "layer_norm_eps": 1e-12,
216
+ "max_position_embeddings": 512,
217
+ "model_type": "bert",
218
+ "num_attention_heads": 12,
219
+ "num_hidden_layers": 12,
220
+ "pad_token_id": 0,
221
+ "pooler_fc_size": 768,
222
+ "pooler_num_attention_heads": 12,
223
+ "pooler_num_fc_layers": 3,
224
+ "pooler_size_per_head": 128,
225
+ "pooler_type": "first_token_transform",
226
+ "position_embedding_type": "absolute",
227
+ "tokenizer_class": "BertTokenizerFast",
228
+ "transformers_version": "4.7.0",
229
+ "type_vocab_size": 2,
230
+ "use_cache": true,
231
+ "vocab_size": 26140
232
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c18e15377f8c615c95b2c095f5d064e5ae8ab2a30d996dca100b7644b3a44c70
3
+ size 422458643
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "hub/ckiplab/bert-base-chinese-20210817-001848", "special_tokens_map_file": "/home/cindy666/.cache/huggingface/transformers/d8a1a1b7a3de221ae53bf9d55154b9df9c4cda18409b393ee0fda4bce4ca7818.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "do_basic_tokenize": true, "never_split": null}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff