KoichiYasuoka committed on
Commit
02ba126
1 Parent(s): 777bf8e

initial release

Browse files
Files changed (7) hide show
  1. README.md +41 -1
  2. config.json +203 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +1 -0
  5. supar.model +3 -0
  6. tokenizer_config.json +1 -0
  7. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,43 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - "lzh"
4
+ tags:
5
+ - "classical chinese"
6
+ - "literary chinese"
7
+ - "ancient chinese"
8
+ - "token-classification"
9
+ - "pos"
10
+ - "dependency-parsing"
11
+ datasets:
12
+ - "universal_dependencies"
13
+ license: "apache-2.0"
14
+ pipeline_tag: "token-classification"
15
+ widget:
16
+ - text: "子曰學而時習之不亦説乎有朋自遠方來不亦樂乎人不知而不慍不亦君子乎"
17
  ---
18
+
19
+ # bert-ancient-chinese-base-upos
20
+
21
+ ## Model Description
22
+
23
+ This is a BERT model pre-trained on Classical Chinese texts for POS-tagging and dependency-parsing, derived from [bert-ancient-chinese](https://huggingface.co/Jihuai/bert-ancient-chinese). Every word is tagged with [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) and [FEATS](https://universaldependencies.org/u/feat/) (morphological features).
24
+
25
+ ## How to Use
26
+
27
+ ```py
28
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
29
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/bert-ancient-chinese-base-upos")
30
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/bert-ancient-chinese-base-upos")
31
+ ```
32
+
33
+ or
34
+
35
+ ```py
36
+ import esupar
37
+ nlp=esupar.load("KoichiYasuoka/bert-ancient-chinese-base-upos")
38
+ ```
39
+
40
+ ## See Also
41
+
42
+ [esupar](https://github.com/KoichiYasuoka/esupar): Tokenizer, POS-tagger, and dependency-parser with BERT/RoBERTa/DeBERTa models
43
+
config.json ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "directionality": "bidi",
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "ADP",
13
+ "1": "ADP|Degree=Equ",
14
+ "2": "ADV",
15
+ "3": "ADV|AdvType=Cau",
16
+ "4": "ADV|AdvType=Deg|Degree=Cmp",
17
+ "5": "ADV|AdvType=Deg|Degree=Pos",
18
+ "6": "ADV|AdvType=Deg|Degree=Sup",
19
+ "7": "ADV|AdvType=Tim",
20
+ "8": "ADV|AdvType=Tim|Aspect=Perf",
21
+ "9": "ADV|AdvType=Tim|Tense=Fut",
22
+ "10": "ADV|AdvType=Tim|Tense=Past",
23
+ "11": "ADV|AdvType=Tim|Tense=Pres",
24
+ "12": "ADV|Degree=Equ|VerbForm=Conv",
25
+ "13": "ADV|Degree=Pos|VerbForm=Conv",
26
+ "14": "ADV|Polarity=Neg",
27
+ "15": "ADV|Polarity=Neg|VerbForm=Conv",
28
+ "16": "ADV|VerbForm=Conv",
29
+ "17": "AUX|Mood=Des",
30
+ "18": "AUX|Mood=Nec",
31
+ "19": "AUX|Mood=Pot",
32
+ "20": "AUX|VerbType=Cop",
33
+ "21": "AUX|Voice=Pass",
34
+ "22": "B-ADV|VerbForm=Conv",
35
+ "23": "B-NOUN",
36
+ "24": "B-NOUN|Case=Loc",
37
+ "25": "B-NOUN|Case=Tem",
38
+ "26": "B-NUM",
39
+ "27": "B-NUM|NumType=Ord",
40
+ "28": "B-PROPN|Case=Loc|NameType=Geo",
41
+ "29": "B-PROPN|Case=Loc|NameType=Nat",
42
+ "30": "B-PROPN|NameType=Giv",
43
+ "31": "B-PROPN|NameType=Prs",
44
+ "32": "B-PROPN|NameType=Sur",
45
+ "33": "B-VERB",
46
+ "34": "B-VERB|Degree=Equ",
47
+ "35": "B-VERB|Degree=Pos",
48
+ "36": "B-VERB|VerbForm=Part",
49
+ "37": "CCONJ",
50
+ "38": "I-ADV|VerbForm=Conv",
51
+ "39": "I-NOUN",
52
+ "40": "I-NOUN|Case=Loc",
53
+ "41": "I-NOUN|Case=Tem",
54
+ "42": "I-NUM",
55
+ "43": "I-NUM|NumType=Ord",
56
+ "44": "I-PROPN|Case=Loc|NameType=Geo",
57
+ "45": "I-PROPN|Case=Loc|NameType=Nat",
58
+ "46": "I-PROPN|NameType=Giv",
59
+ "47": "I-PROPN|NameType=Prs",
60
+ "48": "I-PROPN|NameType=Sur",
61
+ "49": "I-VERB",
62
+ "50": "I-VERB|Degree=Equ",
63
+ "51": "I-VERB|Degree=Pos",
64
+ "52": "I-VERB|VerbForm=Part",
65
+ "53": "INTJ",
66
+ "54": "NOUN",
67
+ "55": "NOUN|Case=Loc",
68
+ "56": "NOUN|Case=Tem",
69
+ "57": "NOUN|NounType=Clf",
70
+ "58": "NUM",
71
+ "59": "NUM|NumType=Ord",
72
+ "60": "PART",
73
+ "61": "PRON|Person=1|PronType=Prs",
74
+ "62": "PRON|Person=2|PronType=Prs",
75
+ "63": "PRON|Person=3|PronType=Prs",
76
+ "64": "PRON|PronType=Dem",
77
+ "65": "PRON|PronType=Int",
78
+ "66": "PRON|PronType=Prs",
79
+ "67": "PRON|PronType=Prs|Reflex=Yes",
80
+ "68": "PROPN|Case=Loc|NameType=Geo",
81
+ "69": "PROPN|Case=Loc|NameType=Nat",
82
+ "70": "PROPN|NameType=Giv",
83
+ "71": "PROPN|NameType=Prs",
84
+ "72": "PROPN|NameType=Sur",
85
+ "73": "SCONJ",
86
+ "74": "SYM",
87
+ "75": "VERB",
88
+ "76": "VERB|Degree=Equ",
89
+ "77": "VERB|Degree=Equ|VerbForm=Part",
90
+ "78": "VERB|Degree=Pos",
91
+ "79": "VERB|Degree=Pos|VerbForm=Part",
92
+ "80": "VERB|Polarity=Neg",
93
+ "81": "VERB|Polarity=Neg|VerbForm=Part",
94
+ "82": "VERB|VerbForm=Part"
95
+ },
96
+ "initializer_range": 0.02,
97
+ "intermediate_size": 3072,
98
+ "label2id": {
99
+ "ADP": 0,
100
+ "ADP|Degree=Equ": 1,
101
+ "ADV": 2,
102
+ "ADV|AdvType=Cau": 3,
103
+ "ADV|AdvType=Deg|Degree=Cmp": 4,
104
+ "ADV|AdvType=Deg|Degree=Pos": 5,
105
+ "ADV|AdvType=Deg|Degree=Sup": 6,
106
+ "ADV|AdvType=Tim": 7,
107
+ "ADV|AdvType=Tim|Aspect=Perf": 8,
108
+ "ADV|AdvType=Tim|Tense=Fut": 9,
109
+ "ADV|AdvType=Tim|Tense=Past": 10,
110
+ "ADV|AdvType=Tim|Tense=Pres": 11,
111
+ "ADV|Degree=Equ|VerbForm=Conv": 12,
112
+ "ADV|Degree=Pos|VerbForm=Conv": 13,
113
+ "ADV|Polarity=Neg": 14,
114
+ "ADV|Polarity=Neg|VerbForm=Conv": 15,
115
+ "ADV|VerbForm=Conv": 16,
116
+ "AUX|Mood=Des": 17,
117
+ "AUX|Mood=Nec": 18,
118
+ "AUX|Mood=Pot": 19,
119
+ "AUX|VerbType=Cop": 20,
120
+ "AUX|Voice=Pass": 21,
121
+ "B-ADV|VerbForm=Conv": 22,
122
+ "B-NOUN": 23,
123
+ "B-NOUN|Case=Loc": 24,
124
+ "B-NOUN|Case=Tem": 25,
125
+ "B-NUM": 26,
126
+ "B-NUM|NumType=Ord": 27,
127
+ "B-PROPN|Case=Loc|NameType=Geo": 28,
128
+ "B-PROPN|Case=Loc|NameType=Nat": 29,
129
+ "B-PROPN|NameType=Giv": 30,
130
+ "B-PROPN|NameType=Prs": 31,
131
+ "B-PROPN|NameType=Sur": 32,
132
+ "B-VERB": 33,
133
+ "B-VERB|Degree=Equ": 34,
134
+ "B-VERB|Degree=Pos": 35,
135
+ "B-VERB|VerbForm=Part": 36,
136
+ "CCONJ": 37,
137
+ "I-ADV|VerbForm=Conv": 38,
138
+ "I-NOUN": 39,
139
+ "I-NOUN|Case=Loc": 40,
140
+ "I-NOUN|Case=Tem": 41,
141
+ "I-NUM": 42,
142
+ "I-NUM|NumType=Ord": 43,
143
+ "I-PROPN|Case=Loc|NameType=Geo": 44,
144
+ "I-PROPN|Case=Loc|NameType=Nat": 45,
145
+ "I-PROPN|NameType=Giv": 46,
146
+ "I-PROPN|NameType=Prs": 47,
147
+ "I-PROPN|NameType=Sur": 48,
148
+ "I-VERB": 49,
149
+ "I-VERB|Degree=Equ": 50,
150
+ "I-VERB|Degree=Pos": 51,
151
+ "I-VERB|VerbForm=Part": 52,
152
+ "INTJ": 53,
153
+ "NOUN": 54,
154
+ "NOUN|Case=Loc": 55,
155
+ "NOUN|Case=Tem": 56,
156
+ "NOUN|NounType=Clf": 57,
157
+ "NUM": 58,
158
+ "NUM|NumType=Ord": 59,
159
+ "PART": 60,
160
+ "PRON|Person=1|PronType=Prs": 61,
161
+ "PRON|Person=2|PronType=Prs": 62,
162
+ "PRON|Person=3|PronType=Prs": 63,
163
+ "PRON|PronType=Dem": 64,
164
+ "PRON|PronType=Int": 65,
165
+ "PRON|PronType=Prs": 66,
166
+ "PRON|PronType=Prs|Reflex=Yes": 67,
167
+ "PROPN|Case=Loc|NameType=Geo": 68,
168
+ "PROPN|Case=Loc|NameType=Nat": 69,
169
+ "PROPN|NameType=Giv": 70,
170
+ "PROPN|NameType=Prs": 71,
171
+ "PROPN|NameType=Sur": 72,
172
+ "SCONJ": 73,
173
+ "SYM": 74,
174
+ "VERB": 75,
175
+ "VERB|Degree=Equ": 76,
176
+ "VERB|Degree=Equ|VerbForm=Part": 77,
177
+ "VERB|Degree=Pos": 78,
178
+ "VERB|Degree=Pos|VerbForm=Part": 79,
179
+ "VERB|Polarity=Neg": 80,
180
+ "VERB|Polarity=Neg|VerbForm=Part": 81,
181
+ "VERB|VerbForm=Part": 82
182
+ },
183
+ "layer_norm_eps": 1e-12,
184
+ "lstm_dropout_prob": 0.5,
185
+ "lstm_embedding_size": 768,
186
+ "max_position_embeddings": 512,
187
+ "model_type": "bert",
188
+ "num_attention_heads": 12,
189
+ "num_hidden_layers": 12,
190
+ "pad_token_id": 0,
191
+ "pooler_fc_size": 768,
192
+ "pooler_num_attention_heads": 12,
193
+ "pooler_num_fc_layers": 3,
194
+ "pooler_size_per_head": 128,
195
+ "pooler_type": "first_token_transform",
196
+ "position_embedding_type": "absolute",
197
+ "tokenizer_class": "BertTokenizer",
198
+ "torch_dtype": "float32",
199
+ "transformers_version": "4.19.4",
200
+ "type_vocab_size": 2,
201
+ "use_cache": true,
202
+ "vocab_size": 38208
203
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef7f89faff627b146dcee894d57c7b42eff4b9c39148913f29ac82a593e940ff
3
+ size 459503729
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
supar.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e1401f9d855b637616a6fb098ac9725b0174cdc6b5f8a9d65a4eeb97143aef
3
+ size 509755365
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff