KoichiYasuoka committed on
Commit
1e70cce
1 Parent(s): 6056cc2

initial release

Browse files
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - "ja"
4
+ tags:
5
+ - "japanese"
6
+ - "token-classification"
7
+ - "pos"
8
+ - "dependency-parsing"
9
+ datasets:
10
+ - "universal_dependencies"
11
+ license: "cc-by-sa-4.0"
12
+ pipeline_tag: "token-classification"
13
+ widget:
14
+ - text: "国境の長いトンネルを抜けると雪国であった。"
15
+ ---
16
+
17
+ # deberta-large-japanese-upos
18
+
19
+ ## Model Description
20
+
21
+ This is a DeBERTa(V2) model pre-trained on 青空文庫 texts for POS-tagging and dependency-parsing, derived from [deberta-large-japanese-aozora](https://huggingface.co/KoichiYasuoka/deberta-large-japanese-aozora). Every short-unit-word is tagged with its [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) label.
22
+
23
+ ## How to Use
24
+
25
+ ```py
26
+ import torch
27
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
28
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/deberta-large-japanese-upos")
29
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/deberta-large-japanese-upos")
30
+ s="国境の長いトンネルを抜けると雪国であった。"
31
+ t=tokenizer.tokenize(s)
32
+ p=[model.config.id2label[q] for q in torch.argmax(model(tokenizer.encode(s,return_tensors="pt"))["logits"],dim=2)[0].tolist()[1:-1]]
33
+ print(list(zip(t,p)))
34
+ ```
35
+
36
+ or
37
+
38
+ ```py
39
+ import esupar
40
+ nlp=esupar.load("KoichiYasuoka/deberta-large-japanese-upos")
41
+ print(nlp("国境の長いトンネルを抜けると雪国であった。"))
42
+ ```
43
+
44
+ ## See Also
45
+
46
+ [esupar](https://github.com/KoichiYasuoka/esupar): Tokenizer, POS-tagger, and Dependency-parser with BERT/RoBERTa/DeBERTa models
47
+
config.json ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DebertaV2ForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "id2label": {
12
+ "0": "ADJ",
13
+ "1": "ADP",
14
+ "2": "ADV",
15
+ "3": "AUX",
16
+ "4": "AUX+AUX",
17
+ "5": "B-ADJ",
18
+ "6": "B-ADJ+AUX",
19
+ "7": "B-ADP",
20
+ "8": "B-ADV",
21
+ "9": "B-ADV+ADV",
22
+ "10": "B-AUX",
23
+ "11": "B-CCONJ",
24
+ "12": "B-DET",
25
+ "13": "B-INTJ",
26
+ "14": "B-NOUN",
27
+ "15": "B-NOUN+VERB",
28
+ "16": "B-NUM",
29
+ "17": "B-PART",
30
+ "18": "B-PRON",
31
+ "19": "B-PRON+ADP",
32
+ "20": "B-PROPN",
33
+ "21": "B-VERB",
34
+ "22": "B-VERB+AUX",
35
+ "23": "B-X",
36
+ "24": "CCONJ",
37
+ "25": "DET",
38
+ "26": "I-ADJ",
39
+ "27": "I-ADJ+AUX",
40
+ "28": "I-ADP",
41
+ "29": "I-ADV",
42
+ "30": "I-ADV+ADV",
43
+ "31": "I-AUX",
44
+ "32": "I-CCONJ",
45
+ "33": "I-DET",
46
+ "34": "I-INTJ",
47
+ "35": "I-NOUN",
48
+ "36": "I-NOUN+VERB",
49
+ "37": "I-NUM",
50
+ "38": "I-PART",
51
+ "39": "I-PRON",
52
+ "40": "I-PRON+ADP",
53
+ "41": "I-PROPN",
54
+ "42": "I-VERB",
55
+ "43": "I-VERB+AUX",
56
+ "44": "I-X",
57
+ "45": "INTJ",
58
+ "46": "NOUN",
59
+ "47": "NOUN+AUX",
60
+ "48": "NOUN+NOUN",
61
+ "49": "NUM",
62
+ "50": "NUM+NOUN",
63
+ "51": "NUM+PART",
64
+ "52": "PART",
65
+ "53": "PRON",
66
+ "54": "PROPN",
67
+ "55": "PUNCT",
68
+ "56": "SCONJ",
69
+ "57": "SYM",
70
+ "58": "VERB",
71
+ "59": "VERB+SCONJ"
72
+ },
73
+ "initializer_range": 0.02,
74
+ "intermediate_size": 4096,
75
+ "label2id": {
76
+ "ADJ": 0,
77
+ "ADP": 1,
78
+ "ADV": 2,
79
+ "AUX": 3,
80
+ "AUX+AUX": 4,
81
+ "B-ADJ": 5,
82
+ "B-ADJ+AUX": 6,
83
+ "B-ADP": 7,
84
+ "B-ADV": 8,
85
+ "B-ADV+ADV": 9,
86
+ "B-AUX": 10,
87
+ "B-CCONJ": 11,
88
+ "B-DET": 12,
89
+ "B-INTJ": 13,
90
+ "B-NOUN": 14,
91
+ "B-NOUN+VERB": 15,
92
+ "B-NUM": 16,
93
+ "B-PART": 17,
94
+ "B-PRON": 18,
95
+ "B-PRON+ADP": 19,
96
+ "B-PROPN": 20,
97
+ "B-VERB": 21,
98
+ "B-VERB+AUX": 22,
99
+ "B-X": 23,
100
+ "CCONJ": 24,
101
+ "DET": 25,
102
+ "I-ADJ": 26,
103
+ "I-ADJ+AUX": 27,
104
+ "I-ADP": 28,
105
+ "I-ADV": 29,
106
+ "I-ADV+ADV": 30,
107
+ "I-AUX": 31,
108
+ "I-CCONJ": 32,
109
+ "I-DET": 33,
110
+ "I-INTJ": 34,
111
+ "I-NOUN": 35,
112
+ "I-NOUN+VERB": 36,
113
+ "I-NUM": 37,
114
+ "I-PART": 38,
115
+ "I-PRON": 39,
116
+ "I-PRON+ADP": 40,
117
+ "I-PROPN": 41,
118
+ "I-VERB": 42,
119
+ "I-VERB+AUX": 43,
120
+ "I-X": 44,
121
+ "INTJ": 45,
122
+ "NOUN": 46,
123
+ "NOUN+AUX": 47,
124
+ "NOUN+NOUN": 48,
125
+ "NUM": 49,
126
+ "NUM+NOUN": 50,
127
+ "NUM+PART": 51,
128
+ "PART": 52,
129
+ "PRON": 53,
130
+ "PROPN": 54,
131
+ "PUNCT": 55,
132
+ "SCONJ": 56,
133
+ "SYM": 57,
134
+ "VERB": 58,
135
+ "VERB+SCONJ": 59
136
+ },
137
+ "layer_norm_eps": 1e-07,
138
+ "max_position_embeddings": 512,
139
+ "max_relative_positions": -1,
140
+ "model_type": "deberta-v2",
141
+ "num_attention_heads": 16,
142
+ "num_hidden_layers": 24,
143
+ "pad_token_id": 1,
144
+ "pooler_dropout": 0,
145
+ "pooler_hidden_act": "gelu",
146
+ "pooler_hidden_size": 1024,
147
+ "pos_att_type": null,
148
+ "position_biased_input": true,
149
+ "relative_attention": false,
150
+ "task_specific_params": {
151
+ "upos_multiword": {
152
+ "ADJ+AUX": {
153
+ "\u5e73\u304b\u306a\u308a": [
154
+ "\u5e73\u304b",
155
+ "\u306a\u308a"
156
+ ]
157
+ },
158
+ "ADV+ADV": {
159
+ "\u672a\u66fe\u3066": [
160
+ "\u672a",
161
+ "\u66fe\u3066"
162
+ ]
163
+ },
164
+ "AUX+AUX": {
165
+ "\u305b\u3057\u3081": [
166
+ "\u305b",
167
+ "\u3057\u3081"
168
+ ],
169
+ "\u306a\u3089\u305a": [
170
+ "\u306a\u3089",
171
+ "\u305a"
172
+ ]
173
+ },
174
+ "NOUN+AUX": {
175
+ "\u7570\u306a\u3089": [
176
+ "\u7570",
177
+ "\u306a\u3089"
178
+ ]
179
+ },
180
+ "NOUN+NOUN": {
181
+ "\u8fb2\u5de5": [
182
+ "\u8fb2",
183
+ "\u5de5"
184
+ ]
185
+ },
186
+ "NOUN+VERB": {
187
+ "\u4e16\u51fa\u3057": [
188
+ "\u4e16\u51fa",
189
+ "\u3057"
190
+ ],
191
+ "\u58d3\u4f0f\u3057": [
192
+ "\u58d3\u4f0f",
193
+ "\u3057"
194
+ ],
195
+ "\u653e\u8a31\u3057": [
196
+ "\u653e\u8a31",
197
+ "\u3057"
198
+ ],
199
+ "\u767e\u51fa\u3057": [
200
+ "\u767e\u51fa",
201
+ "\u3057"
202
+ ],
203
+ "\u7a81\u8d77\u3057": [
204
+ "\u7a81\u8d77",
205
+ "\u3057"
206
+ ],
207
+ "\u8655\u51b3\u3057": [
208
+ "\u8655\u51b3",
209
+ "\u3057"
210
+ ],
211
+ "\u8f29\u51fa\u3057": [
212
+ "\u8f29\u51fa",
213
+ "\u3057"
214
+ ],
215
+ "\u9192\u8d77\u3057": [
216
+ "\u9192\u8d77",
217
+ "\u3057"
218
+ ]
219
+ },
220
+ "NUM+NOUN": {
221
+ "\u4e00\u4eba": [
222
+ "\u4e00",
223
+ "\u4eba"
224
+ ]
225
+ },
226
+ "NUM+PART": {
227
+ "\u4e00\u4eba": [
228
+ "\u4e00",
229
+ "\u4eba"
230
+ ]
231
+ },
232
+ "PRON+ADP": {
233
+ "\u4f55\u3093\u305e": [
234
+ "\u4f55\u3093",
235
+ "\u305e"
236
+ ]
237
+ },
238
+ "VERB+AUX": {
239
+ "\u3059\u3079\u304b\u3089": [
240
+ "\u3059",
241
+ "\u3079\u304b\u3089"
242
+ ],
243
+ "\u3059\u3079\u304d": [
244
+ "\u3059",
245
+ "\u3079\u304d"
246
+ ],
247
+ "\u306a\u3059\u3079\u304b\u3089": [
248
+ "\u306a\u3059",
249
+ "\u3079\u304b\u3089"
250
+ ],
251
+ "\u4e00\u81f4\u305b": [
252
+ "\u4e00\u81f4",
253
+ "\u305b"
254
+ ],
255
+ "\u4ed8\u3059\u3079\u3051": [
256
+ "\u4ed8\u3059",
257
+ "\u3079\u3051"
258
+ ],
259
+ "\u4f3c\u3059\u3079\u304b\u3089": [
260
+ "\u4f3c\u3059",
261
+ "\u3079\u304b\u3089"
262
+ ],
263
+ "\u4fc3\u3055\u3093": [
264
+ "\u4fc3\u3055",
265
+ "\u3093"
266
+ ],
267
+ "\u5047\u3059\u3079\u304b\u3089": [
268
+ "\u5047\u3059",
269
+ "\u3079\u304b\u3089"
270
+ ],
271
+ "\u51b3\u3059\u3079\u3057": [
272
+ "\u51b3\u3059",
273
+ "\u3079\u3057"
274
+ ],
275
+ "\u5c5e\u305b\u3057\u3081": [
276
+ "\u5c5e\u305b",
277
+ "\u3057\u3081"
278
+ ],
279
+ "\u6210\u3059\u3079\u304b\u3089": [
280
+ "\u6210\u3059",
281
+ "\u3079\u304b\u3089"
282
+ ],
283
+ "\u653e\u6d41\u3057": [
284
+ "\u653e\u6d41",
285
+ "\u3057"
286
+ ],
287
+ "\u6575\u3059\u3079\u304b\u3089": [
288
+ "\u6575\u3059",
289
+ "\u3079\u304b\u3089"
290
+ ],
291
+ "\u6575\u3059\u3079\u3051": [
292
+ "\u6575\u3059",
293
+ "\u3079\u3051"
294
+ ],
295
+ "\u66f8\u3059\u3079\u304b\u3089": [
296
+ "\u66f8\u3059",
297
+ "\u3079\u304b\u3089"
298
+ ],
299
+ "\u66f8\u3059\u3079\u304d": [
300
+ "\u66f8\u3059",
301
+ "\u3079\u304d"
302
+ ],
303
+ "\u670d\u3059\u3079\u304d": [
304
+ "\u670d\u3059",
305
+ "\u3079\u304d"
306
+ ],
307
+ "\u6bba\u3055\u3093": [
308
+ "\u6bba\u3055",
309
+ "\u3093"
310
+ ],
311
+ "\u6df7\u5408\u305b": [
312
+ "\u6df7\u5408",
313
+ "\u305b"
314
+ ],
315
+ "\u7232\u3055\u3093": [
316
+ "\u7232\u3055",
317
+ "\u3093"
318
+ ],
319
+ "\u7232\u3059\u3079\u304b\u3089": [
320
+ "\u7232\u3059",
321
+ "\u3079\u304b\u3089"
322
+ ],
323
+ "\u7232\u3059\u3079\u304d": [
324
+ "\u7232\u3059",
325
+ "\u3079\u304d"
326
+ ],
327
+ "\u7232\u3059\u3079\u3057": [
328
+ "\u7232\u3059",
329
+ "\u3079\u3057"
330
+ ],
331
+ "\u72af\u3059\u3079\u304d": [
332
+ "\u72af\u3059",
333
+ "\u3079\u304d"
334
+ ],
335
+ "\u751f\u305c\u3093": [
336
+ "\u751f\u305c",
337
+ "\u3093"
338
+ ],
339
+ "\u76ee\u3059\u3079\u304d": [
340
+ "\u76ee\u3059",
341
+ "\u3079\u304d"
342
+ ],
343
+ "\u7f6e\u304b\u3056\u308a": [
344
+ "\u7f6e\u304b",
345
+ "\u3056\u308a"
346
+ ],
347
+ "\u7f70\u3059\u3079\u304b\u3089": [
348
+ "\u7f70\u3059",
349
+ "\u3079\u304b\u3089"
350
+ ],
351
+ "\u7f70\u3059\u3079\u3057": [
352
+ "\u7f70\u3059",
353
+ "\u3079\u3057"
354
+ ],
355
+ "\u8207\u3059\u3079\u304b\u3089": [
356
+ "\u8207\u3059",
357
+ "\u3079\u304b\u3089"
358
+ ],
359
+ "\u8655\u3059\u3079\u304b\u3089": [
360
+ "\u8655\u3059",
361
+ "\u3079\u304b\u3089"
362
+ ],
363
+ "\u89e3\u3059\u3079\u304b\u3089": [
364
+ "\u89e3\u3059",
365
+ "\u3079\u304b\u3089"
366
+ ],
367
+ "\u8aa6\u3059\u3079\u304f": [
368
+ "\u8aa6\u3059",
369
+ "\u3079\u304f"
370
+ ],
371
+ "\u8d77\u3055\u3093": [
372
+ "\u8d77\u3055",
373
+ "\u3093"
374
+ ],
375
+ "\u8d77\u3059\u3079\u304d": [
376
+ "\u8d77\u3059",
377
+ "\u3079\u304d"
378
+ ],
379
+ "\u8d77\u3059\u3079\u3057": [
380
+ "\u8d77\u3059",
381
+ "\u3079\u3057"
382
+ ]
383
+ },
384
+ "VERB+SCONJ": {
385
+ "\u7acb\u3066": [
386
+ "\u7acb",
387
+ "\u3066"
388
+ ]
389
+ }
390
+ }
391
+ },
392
+ "tokenizer_class": "DebertaV2TokenizerFast",
393
+ "torch_dtype": "float32",
394
+ "transformers_version": "4.19.2",
395
+ "type_vocab_size": 0,
396
+ "vocab_size": 32000
397
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e9c374541f717f9eabb6064a95a89766bb56a75bb226f7c8d4f45453338f77b
3
+ size 1342797683
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
spm.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
3
+ size 1
supar.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08261816ec7da11116a145547b46d89ea5e58b8fbcb3732cb99613ac6f38bc53
3
+ size 1391495275
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}