KoichiYasuoka commited on
Commit
bd1517b
1 Parent(s): 47d0e94

initial release

Browse files
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - "ko"
4
+ tags:
5
+ - "korean"
6
+ - "token-classification"
7
+ - "pos"
8
+ - "dependency-parsing"
9
+ datasets:
10
+ - "universal_dependencies"
11
+ license: "cc-by-sa-4.0"
12
+ pipeline_tag: "token-classification"
13
+ widget:
14
+ - text: "홍시 맛이 나서 홍시라 생각한다."
15
+ ---
16
+
17
+ # roberta-large-korean-upos
18
+
19
+ ## Model Description
20
+
21
+ This is a RoBERTa model for POS-tagging and dependency-parsing, derived from [klue/roberta-large](https://huggingface.co/klue/roberta-large).
22
+
23
+ ## How to Use
24
+
25
+ ```py
26
+ from transformers import AutoTokenizer,AutoModelForTokenClassification,TokenClassificationPipeline
27
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-large-korean-upos")
28
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/roberta-large-korean-upos")
29
+ pipeline=TokenClassificationPipeline(tokenizer=tokenizer,model=model,aggregation_strategy="simple")
30
+ nlp=lambda x:[(x[t["start"]:t["end"]],t["entity_group"]) for t in pipeline(x)]
31
+ print(nlp("홍시 맛이 나서 홍시라 생각한다."))
32
+ ```
33
+
34
+ or
35
+
36
+ ```py
37
+ import esupar
38
+ nlp=esupar.load("KoichiYasuoka/roberta-large-korean-upos")
39
+ print(nlp("홍시 맛이 나서 홍시라 생각한다."))
40
+ ```
41
+
42
+ ## See Also
43
+
44
+ [esupar](https://github.com/KoichiYasuoka/esupar): Tokenizer, POS-tagger, and Dependency-parser with BERT/RoBERTa/DeBERTa models
config.json ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "id2label": {
14
+ "0": "ADJ",
15
+ "1": "ADJ+ADJ",
16
+ "2": "ADJ+NOUN",
17
+ "3": "ADJ+VERB",
18
+ "4": "ADP",
19
+ "5": "ADP+PRON",
20
+ "6": "ADV",
21
+ "7": "ADV+ADJ",
22
+ "8": "ADV+CCONJ",
23
+ "9": "ADV+NOUN",
24
+ "10": "ADV+PROPN",
25
+ "11": "ADV+SCONJ",
26
+ "12": "ADV+VERB",
27
+ "13": "AUX",
28
+ "14": "AUX+NOUN",
29
+ "15": "B-ADJ",
30
+ "16": "B-ADP",
31
+ "17": "B-ADV",
32
+ "18": "B-AUX",
33
+ "19": "B-CCONJ",
34
+ "20": "B-DET",
35
+ "21": "B-INTJ",
36
+ "22": "B-NOUN",
37
+ "23": "B-NUM",
38
+ "24": "B-NUM+PUNCT+NUM",
39
+ "25": "B-PART",
40
+ "26": "B-PRON",
41
+ "27": "B-PROPN",
42
+ "28": "B-PUNCT",
43
+ "29": "B-SCONJ",
44
+ "30": "B-SYM",
45
+ "31": "B-VERB",
46
+ "32": "B-X",
47
+ "33": "CCONJ",
48
+ "34": "DET",
49
+ "35": "DET+NOUN",
50
+ "36": "I-ADJ",
51
+ "37": "I-ADP",
52
+ "38": "I-ADV",
53
+ "39": "I-AUX",
54
+ "40": "I-CCONJ",
55
+ "41": "I-DET",
56
+ "42": "I-INTJ",
57
+ "43": "I-NOUN",
58
+ "44": "I-NUM",
59
+ "45": "I-NUM+PUNCT+NUM",
60
+ "46": "I-PART",
61
+ "47": "I-PRON",
62
+ "48": "I-PROPN",
63
+ "49": "I-PUNCT",
64
+ "50": "I-SCONJ",
65
+ "51": "I-SYM",
66
+ "52": "I-VERB",
67
+ "53": "I-X",
68
+ "54": "INTJ",
69
+ "55": "NOUN",
70
+ "56": "NOUN+ADJ",
71
+ "57": "NOUN+ADV",
72
+ "58": "NOUN+CCONJ",
73
+ "59": "NOUN+NOUN",
74
+ "60": "NOUN+SCONJ",
75
+ "61": "NOUN+VERB",
76
+ "62": "NUM",
77
+ "63": "PART",
78
+ "64": "PRON",
79
+ "65": "PRON+ADV",
80
+ "66": "PRON+CCONJ",
81
+ "67": "PROPN",
82
+ "68": "PROPN+PROPN",
83
+ "69": "PROPN+VERB",
84
+ "70": "PUNCT",
85
+ "71": "SCONJ",
86
+ "72": "SCONJ+NOUN",
87
+ "73": "SCONJ+PROPN",
88
+ "74": "SCONJ+SCONJ",
89
+ "75": "SYM",
90
+ "76": "VERB",
91
+ "77": "VERB+ADV",
92
+ "78": "VERB+NOUN",
93
+ "79": "VERB+PROPN",
94
+ "80": "X"
95
+ },
96
+ "initializer_range": 0.02,
97
+ "intermediate_size": 4096,
98
+ "label2id": {
99
+ "ADJ": 0,
100
+ "ADJ+ADJ": 1,
101
+ "ADJ+NOUN": 2,
102
+ "ADJ+VERB": 3,
103
+ "ADP": 4,
104
+ "ADP+PRON": 5,
105
+ "ADV": 6,
106
+ "ADV+ADJ": 7,
107
+ "ADV+CCONJ": 8,
108
+ "ADV+NOUN": 9,
109
+ "ADV+PROPN": 10,
110
+ "ADV+SCONJ": 11,
111
+ "ADV+VERB": 12,
112
+ "AUX": 13,
113
+ "AUX+NOUN": 14,
114
+ "B-ADJ": 15,
115
+ "B-ADP": 16,
116
+ "B-ADV": 17,
117
+ "B-AUX": 18,
118
+ "B-CCONJ": 19,
119
+ "B-DET": 20,
120
+ "B-INTJ": 21,
121
+ "B-NOUN": 22,
122
+ "B-NUM": 23,
123
+ "B-NUM+PUNCT+NUM": 24,
124
+ "B-PART": 25,
125
+ "B-PRON": 26,
126
+ "B-PROPN": 27,
127
+ "B-PUNCT": 28,
128
+ "B-SCONJ": 29,
129
+ "B-SYM": 30,
130
+ "B-VERB": 31,
131
+ "B-X": 32,
132
+ "CCONJ": 33,
133
+ "DET": 34,
134
+ "DET+NOUN": 35,
135
+ "I-ADJ": 36,
136
+ "I-ADP": 37,
137
+ "I-ADV": 38,
138
+ "I-AUX": 39,
139
+ "I-CCONJ": 40,
140
+ "I-DET": 41,
141
+ "I-INTJ": 42,
142
+ "I-NOUN": 43,
143
+ "I-NUM": 44,
144
+ "I-NUM+PUNCT+NUM": 45,
145
+ "I-PART": 46,
146
+ "I-PRON": 47,
147
+ "I-PROPN": 48,
148
+ "I-PUNCT": 49,
149
+ "I-SCONJ": 50,
150
+ "I-SYM": 51,
151
+ "I-VERB": 52,
152
+ "I-X": 53,
153
+ "INTJ": 54,
154
+ "NOUN": 55,
155
+ "NOUN+ADJ": 56,
156
+ "NOUN+ADV": 57,
157
+ "NOUN+CCONJ": 58,
158
+ "NOUN+NOUN": 59,
159
+ "NOUN+SCONJ": 60,
160
+ "NOUN+VERB": 61,
161
+ "NUM": 62,
162
+ "PART": 63,
163
+ "PRON": 64,
164
+ "PRON+ADV": 65,
165
+ "PRON+CCONJ": 66,
166
+ "PROPN": 67,
167
+ "PROPN+PROPN": 68,
168
+ "PROPN+VERB": 69,
169
+ "PUNCT": 70,
170
+ "SCONJ": 71,
171
+ "SCONJ+NOUN": 72,
172
+ "SCONJ+PROPN": 73,
173
+ "SCONJ+SCONJ": 74,
174
+ "SYM": 75,
175
+ "VERB": 76,
176
+ "VERB+ADV": 77,
177
+ "VERB+NOUN": 78,
178
+ "VERB+PROPN": 79,
179
+ "X": 80
180
+ },
181
+ "layer_norm_eps": 1e-05,
182
+ "max_position_embeddings": 514,
183
+ "model_type": "roberta",
184
+ "num_attention_heads": 16,
185
+ "num_hidden_layers": 24,
186
+ "pad_token_id": 1,
187
+ "position_embedding_type": "absolute",
188
+ "task_specific_params": {
189
+ "upos_multiword": {
190
+ "ADJ+ADJ": {
191
+ "\uc5c6\ub294\uc5f7\uc740": [
192
+ "\uc5c6\ub294",
193
+ "\uc5f7\uc740"
194
+ ]
195
+ },
196
+ "ADJ+VERB": {
197
+ "\uc544\ub2cc\ud2a4\ub9bd\uc774\ub780": [
198
+ "\uc544\ub2cc",
199
+ "\ud2a4\ub9bd\uc774\ub780"
200
+ ]
201
+ },
202
+ "ADP+PRON": {
203
+ "\uac00\uc167\ub2e4\uc6b4\uc81c\uac00": [
204
+ "\uac00",
205
+ "\uc167\ub2e4\uc6b4\uc81c\uac00"
206
+ ]
207
+ },
208
+ "ADV+CCONJ": {
209
+ "\uc11c\uc11c\ud788\uc5f7\uc5b4\uc9c0\uace0": [
210
+ "\uc11c\uc11c\ud788",
211
+ "\uc5f7\uc5b4\uc9c0\uace0"
212
+ ]
213
+ },
214
+ "ADV+PROPN": {
215
+ "\uc18d\uc5d0\ubeec\ub808\uc2a4\ud2b8\ub85c\uc774\uce74\uac00": [
216
+ "\uc18d\uc5d0",
217
+ "\ubeec\ub808\uc2a4\ud2b8\ub85c\uc774\uce74\uac00"
218
+ ]
219
+ },
220
+ "ADV+SCONJ": {
221
+ "\ub0c7\ubb3c\uc5d0\ud5f9\uad6c\uc5b4\uc11c": [
222
+ "\ub0c7\ubb3c\uc5d0",
223
+ "\ud5f9\uad6c\uc5b4\uc11c"
224
+ ],
225
+ "\uc815\ub3c4\ub85c\ucad1\uc54c\uac70\ub9ac\uace0\ub294": [
226
+ "\uc815\ub3c4\ub85c",
227
+ "\ucad1\uc54c\uac70\ub9ac\uace0\ub294"
228
+ ]
229
+ },
230
+ "ADV+VERB": {
231
+ "\uc55e\uc5d0\uc11c\uca54\uca54\ub9f8\ub2e4": [
232
+ "\uc55e\uc5d0\uc11c",
233
+ "\uca54\uca54\ub9f8\ub2e4"
234
+ ]
235
+ },
236
+ "NOUN+ADV": {
237
+ "\ub0a0\ud03c\ub978\ubc29\uc1a1\uad6d\ub3c4": [
238
+ "\ub0a0",
239
+ "\ud03c\ub978\ubc29\uc1a1\uad6d\ub3c4"
240
+ ]
241
+ },
242
+ "NOUN+CCONJ": {
243
+ "\uc815\uce58\ud615\ud0dc\ub97c\ucd9c\ud604\uc2dc\ucf2f\uace0": [
244
+ "\uc815\uce58\ud615\ud0dc\ub97c",
245
+ "\ucd9c\ud604\uc2dc\ucf2f\uace0"
246
+ ]
247
+ },
248
+ "NOUN+NOUN": {
249
+ "\uad70\uc911\uc774\ud288\ub974\ub9ac": [
250
+ "\uad70\uc911\uc774",
251
+ "\ud288\ub974\ub9ac"
252
+ ],
253
+ "\ud558\ub8e8\ud488\uc0af\uc774": [
254
+ "\ud558\ub8e8",
255
+ "\ud488\uc0af\uc774"
256
+ ]
257
+ },
258
+ "NOUN+SCONJ": {
259
+ "\ud615\ud0dc\ub97c\ub768\uc9c0\ub77c\ub3c4": [
260
+ "\ud615\ud0dc\ub97c",
261
+ "\ub768\uc9c0\ub77c\ub3c4"
262
+ ]
263
+ },
264
+ "NOUN+VERB": {
265
+ "\ub048\uc744\ub9ec": [
266
+ "\ub048\uc744",
267
+ "\ub9ec"
268
+ ],
269
+ "\ud480\ubc2d\uc5d0\uc11c\ub294\ube73\ube73\ud55c": [
270
+ "\ud480\ubc2d\uc5d0\uc11c\ub294",
271
+ "\ube73\ube73\ud55c"
272
+ ],
273
+ "\ud669\uae08\uc744\uac70\uba38\uc958": [
274
+ "\ud669\uae08\uc744",
275
+ "\uac70\uba38\uc958"
276
+ ]
277
+ },
278
+ "NUM+PUNCT+NUM": {
279
+ "5157\uc5b5": [
280
+ "5",
281
+ "15",
282
+ "7\uc5b5"
283
+ ]
284
+ },
285
+ "PRON+ADV": {
286
+ "\uadf8\ub4e4\uc740\uaf3c\ubba8\uc5d0\uc11c": [
287
+ "\uadf8\ub4e4\uc740",
288
+ "\uaf3c\ubba8\uc5d0\uc11c"
289
+ ]
290
+ },
291
+ "PRON+CCONJ": {
292
+ "\uadf8\ub294\ud06c\ub808\ubbc8\ub9b0\uacfc": [
293
+ "\uadf8\ub294",
294
+ "\ud06c\ub808\ubbc8\ub9b0\uacfc"
295
+ ]
296
+ },
297
+ "PROPN+PROPN": {
298
+ "\ubfcc\uce58\uac00\ubeec\ub808\uc2a4\ud2b8\ub85c\uc774\uce74\uc758": [
299
+ "\ubfcc\uce58\uac00",
300
+ "\ubeec\ub808\uc2a4\ud2b8\ub85c\uc774\uce74\uc758"
301
+ ]
302
+ },
303
+ "PROPN+VERB": {
304
+ "\uc0bc\uc131\uc804\uc790\uac00\uc774\ub055\ub2c8\ub2e4": [
305
+ "\uc0bc\uc131\uc804\uc790\uac00",
306
+ "\uc774\ub055\ub2c8\ub2e4"
307
+ ]
308
+ },
309
+ "SCONJ+NOUN": {
310
+ "\uc788\uac8c\ud154\ub808\ube44\uc83c\uc740": [
311
+ "\uc788\uac8c",
312
+ "\ud154\ub808\ube44\uc83c\uc740"
313
+ ]
314
+ },
315
+ "SCONJ+PROPN": {
316
+ "\uc81c\uc678\ud55c\ub2e4\uba74\uaf3c\ubba8\uc81c\ub3c4\ub294": [
317
+ "\uc81c\uc678\ud55c\ub2e4\uba74",
318
+ "\uaf3c\ubba8\uc81c\ub3c4\ub294"
319
+ ]
320
+ },
321
+ "VERB+ADV": {
322
+ "\uad00\ud55c\ud5d9\ubc95\uc815\uc2e0\uc5d0": [
323
+ "\uad00\ud55c",
324
+ "\ud5d9\ubc95\uc815\uc2e0\uc5d0"
325
+ ]
326
+ },
327
+ "VERB+NOUN": {
328
+ "\ub450\uc5b4\uc84c\uace0\ud64b\uce74\uc774\ub3c4": [
329
+ "\ub450\uc5b4\uc84c\uace0",
330
+ "\ud64b\uce74\uc774\ub3c4"
331
+ ],
332
+ "\ub9db\uc788\uace0\ucf00\uc78c\ub3c4": [
333
+ "\ub9db\uc788\uace0",
334
+ "\ucf00\uc78c\ub3c4"
335
+ ],
336
+ "\uc5f0\ud569\uccb4\uc778\uacbd\uc81c\uaf3c\ubba8": [
337
+ "\uc5f0\ud569\uccb4\uc778",
338
+ "\uacbd\uc81c\uaf3c\ubba8"
339
+ ]
340
+ },
341
+ "VERB+PROPN": {
342
+ "\ub290\ub080\ubd10\uc3ed\uc740": [
343
+ "\ub290\ub080",
344
+ "\ubd10\uc3ed\uc740"
345
+ ],
346
+ "\uba38\ubb38\uc138\uac9c\uc740": [
347
+ "\uba38\ubb38",
348
+ "\uc138\uac9c\uc740"
349
+ ],
350
+ "\uc2e4\uba85\ud558\uac8c\ub41c\ubabd\ub5bc\uc2a4\ub028\ub294": [
351
+ "\uc2e4\uba85\ud558\uac8c\ub41c",
352
+ "\ubabd\ub5bc\uc2a4\ub028\ub294"
353
+ ]
354
+ }
355
+ }
356
+ },
357
+ "tokenizer_class": "BertTokenizerFast",
358
+ "torch_dtype": "float32",
359
+ "transformers_version": "4.22.1",
360
+ "type_vocab_size": 1,
361
+ "use_cache": true,
362
+ "vocab_size": 32000
363
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27bdc0d1693c020ef429c024312507112e375241c256029ae32551344eb8dbc2
3
+ size 1342895793
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "eos_token": "[SEP]",
5
+ "mask_token": "[MASK]",
6
+ "pad_token": "[PAD]",
7
+ "sep_token": "[SEP]",
8
+ "unk_token": "[UNK]"
9
+ }
supar.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0ac3914f1d7eff573f44ce139349649bbb0b3f29af4d84cb4bc298e94e520a4
3
+ size 1407899365
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "[CLS]",
3
+ "cls_token": "[CLS]",
4
+ "do_basic_tokenize": true,
5
+ "do_lower_case": false,
6
+ "eos_token": "[SEP]",
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "never_split": null,
10
+ "pad_token": "[PAD]",
11
+ "sep_token": "[SEP]",
12
+ "strip_accents": null,
13
+ "tokenize_chinese_chars": true,
14
+ "tokenizer_class": "BertTokenizerFast",
15
+ "unk_token": "[UNK]"
16
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff