ctlin committed
Commit 1aacd2a
Parent: 9c6470a

upload model files

README.md CHANGED
@@ -1,3 +1,28 @@
  ---
+ language:
+ - zh
+ thumbnail: https://ckip.iis.sinica.edu.tw/files/ckip_logo.png
+ tags:
+ - pytorch
+ - token-classification
+ - bert
+ - zh
  license: gpl-3.0
  ---
+
+ # CKIP BERT Base Han Chinese POS
+
+ This model provides part-of-speech (POS) tagging for the ancient Chinese language. Our training dataset covers four eras of the Chinese language.
+
+ ## Homepage
+ * [ckiplab/han-transformers](https://github.com/ckiplab/han-transformers)
+
+ ## Training Datasets
+ The copyright of the datasets belongs to the Institute of Linguistics, Academia Sinica.
+ * [中央研究院上古漢語標記語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/akiwi/kiwi.sh)
+ * [中央研究院中古漢語語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/dkiwi/kiwi.sh)
+ * [中央研究院近代漢語語料庫](http://lingcorpus.iis.sinica.edu.tw/cgi-bin/kiwi/pkiwi/kiwi.sh)
+ * [中央研究院現代漢語語料庫](http://asbc.iis.sinica.edu.tw)
+
+ ## Contributors
+ * Chin-Tung Lin at [CKIP](https://ckip.iis.sinica.edu.tw/)
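
A minimal usage sketch for the model described in the README above. It assumes the model is published on the Hugging Face Hub under an id such as `ckiplab/bert-base-han-chinese-pos` (assumed identifier; substitute the actual repository name) and uses the standard `transformers` token-classification pipeline.

```python
# Minimal POS-tagging sketch for this model (Hub id is an assumption; adjust as needed).
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

model_id = "ckiplab/bert-base-han-chinese-pos"  # assumption: the actual repo id may differ
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# The pipeline returns one entry per token with its predicted POS tag
# (tag names come from id2label in config.json below).
pos_tagger = pipeline("token-classification", model=model, tokenizer=tokenizer)
print(pos_tagger("帝高陽之苗裔兮"))
```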
config.json ADDED
@@ -0,0 +1,188 @@
+ {
+ "_name_or_path": "hub/ckiplab/bert-base-chinese-20210817-001848",
+ "architectures": [
+ "BertForTokenClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "directionality": "bidi",
+ "finetuning_task": "ner",
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "label2id": {
+ "A": 33,
+ "COLONCATEGORY": 40,
+ "COMMACATEGORY": 1,
+ "Caa": 21,
+ "Cab": 47,
+ "Cba": 57,
+ "Cbb": 13,
+ "D": 2,
+ "DASHCATEGORY": 53,
+ "DE": 3,
+ "DK": 64,
+ "DM": 59,
+ "DOTCATEGORY": 58,
+ "Da": 38,
+ "De": 63,
+ "Dfa": 27,
+ "Dfb": 55,
+ "Di": 26,
+ "Dk": 49,
+ "ETCCATEGORY": 51,
+ "EXCLAMATIONCATEGORY": 44,
+ "FW": 37,
+ "I": 52,
+ "N": 73,
+ "ND": 68,
+ "NG": 61,
+ "Na": 0,
+ "Nb": 15,
+ "Nc": 6,
+ "Ncd": 29,
+ "Nd": 17,
+ "Nep": 23,
+ "Neqa": 25,
+ "Neqb": 56,
+ "Nes": 35,
+ "Neu": 11,
+ "Nf": 10,
+ "Ng": 20,
+ "Nh": 9,
+ "Nv": 28,
+ "P": 7,
+ "PARENTHESISCATEGORY": 12,
+ "PAUSECATEGORY": 22,
+ "PERIODCATEGORY": 8,
+ "QUESTIONCATEGORY": 41,
+ "SEMICOLONCATEGORY": 46,
+ "SHI": 19,
+ "SPCHANGECATEGORY": 48,
+ "T": 32,
+ "V": 66,
+ "VA": 16,
+ "VAC": 54,
+ "VB": 45,
+ "VC": 4,
+ "VCL": 34,
+ "VD": 42,
+ "VE": 14,
+ "VF": 43,
+ "VG": 30,
+ "VH": 5,
+ "VHC": 39,
+ "VI": 50,
+ "VJ": 18,
+ "VK": 24,
+ "VL": 36,
+ "V_2": 31,
+ "Vc": 65,
+ "cbb": 67,
+ "dI": 75,
+ "na": 69,
+ "nf": 72,
+ "p": 71,
+ "sHI": 60,
+ "vA": 62,
+ "vC": 70,
+ "vH": 74
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "id2label": {
+ "0": "Na",
+ "1": "COMMACATEGORY",
+ "2": "D",
+ "3": "DE",
+ "4": "VC",
+ "5": "VH",
+ "6": "Nc",
+ "7": "P",
+ "8": "PERIODCATEGORY",
+ "9": "Nh",
+ "10": "Nf",
+ "11": "Neu",
+ "12": "PARENTHESISCATEGORY",
+ "13": "Cbb",
+ "14": "VE",
+ "15": "Nb",
+ "16": "VA",
+ "17": "Nd",
+ "18": "VJ",
+ "19": "SHI",
+ "20": "Ng",
+ "21": "Caa",
+ "22": "PAUSECATEGORY",
+ "23": "Nep",
+ "24": "VK",
+ "25": "Neqa",
+ "26": "Di",
+ "27": "Dfa",
+ "28": "Nv",
+ "29": "Ncd",
+ "30": "VG",
+ "31": "V_2",
+ "32": "T",
+ "33": "A",
+ "34": "VCL",
+ "35": "Nes",
+ "36": "VL",
+ "37": "FW",
+ "38": "Da",
+ "39": "VHC",
+ "40": "COLONCATEGORY",
+ "41": "QUESTIONCATEGORY",
+ "42": "VD",
+ "43": "VF",
+ "44": "EXCLAMATIONCATEGORY",
+ "45": "VB",
+ "46": "SEMICOLONCATEGORY",
+ "47": "Cab",
+ "48": "SPCHANGECATEGORY",
+ "49": "Dk",
+ "50": "VI",
+ "51": "ETCCATEGORY",
+ "52": "I",
+ "53": "DASHCATEGORY",
+ "54": "VAC",
+ "55": "Dfb",
+ "56": "Neqb",
+ "57": "Cba",
+ "58": "DOTCATEGORY",
+ "59": "DM",
+ "60": "sHI",
+ "61": "NG",
+ "62": "vA",
+ "63": "De",
+ "64": "DK",
+ "65": "Vc",
+ "66": "V",
+ "67": "cbb",
+ "68": "ND",
+ "69": "na",
+ "70": "vC",
+ "71": "p",
+ "72": "nf",
+ "73": "N",
+ "74": "vH",
+ "75": "dI"
+ },
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "pooler_fc_size": 768,
+ "pooler_num_attention_heads": 12,
+ "pooler_num_fc_layers": 3,
+ "pooler_size_per_head": 128,
+ "pooler_type": "first_token_transform",
+ "position_embedding_type": "absolute",
+ "tokenizer_class": "BertTokenizerFast",
+ "transformers_version": "4.7.0",
+ "type_vocab_size": 2,
+ "use_cache": true,
+ "vocab_size": 26140
+ }
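
The `label2id`/`id2label` maps in this config define the 76 POS tags the token-classification head predicts. A minimal sketch of decoding per-token predictions with this map, assuming the model files in this commit are available in a local directory (the path is hypothetical):

```python
# Decode per-token argmax indices into POS tag names via config.id2label.
import torch
from transformers import AutoConfig, AutoModelForTokenClassification, AutoTokenizer

model_dir = "./bert-base-han-chinese-pos"  # hypothetical local checkout of these files
config = AutoConfig.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)

inputs = tokenizer("學而時習之", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits            # shape: (1, seq_len, num_labels)
pred_ids = logits.argmax(dim=-1)[0].tolist()   # one label index per token
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, label_id in zip(tokens, pred_ids):
    print(token, config.id2label[label_id])    # tag names come from id2label above
```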
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a64721b8cce84399ba6259e968d1629bec858d596ef6fcaa043b894faf8ac5a0
+ size 422390967
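
`pytorch_model.bin` is stored as a Git LFS pointer; the `oid` and `size` above can be used to check that a downloaded weight file is intact. A minimal verification sketch (the local filename is an assumption):

```python
# Verify a downloaded pytorch_model.bin against the LFS pointer's sha256 and size.
import hashlib
import os

path = "pytorch_model.bin"  # assumed local filename
expected_sha256 = "a64721b8cce84399ba6259e968d1629bec858d596ef6fcaa043b894faf8ac5a0"
expected_size = 422390967

sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha256.update(chunk)

print("size ok:  ", os.path.getsize(path) == expected_size)
print("sha256 ok:", sha256.hexdigest() == expected_sha256)
```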
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "name_or_path": "hub/ckiplab/bert-base-chinese-20210817-001848", "special_tokens_map_file": "/home/cindy666/.cache/huggingface/transformers/d8a1a1b7a3de221ae53bf9d55154b9df9c4cda18409b393ee0fda4bce4ca7818.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d", "do_basic_tokenize": true, "never_split": null}
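
The tokenizer settings above (e.g. `do_lower_case`, `model_max_length: 512`, `tokenize_chinese_chars`) determine how input text is split before tagging. A minimal sketch of loading the tokenizer from these files (the local path is an assumption):

```python
# Load the tokenizer defined by tokenizer_config.json / tokenizer.json / vocab.txt.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("./bert-base-han-chinese-pos")  # assumed local path
print(tokenizer.model_max_length)  # 512, from tokenizer_config.json

# Chinese characters are split individually (tokenize_chinese_chars: true);
# longer inputs are truncated to model_max_length when truncation is enabled.
enc = tokenizer("天下之民皆引領而望之", truncation=True)
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
```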
vocab.txt ADDED
The diff for this file is too large to render. See raw diff