KoichiYasuoka committed on
Commit 1cf9623
1 Parent(s): e3e81af

initial release

Files changed (7)
  1. README.md +49 -0
  2. config.json +299 -0
  3. pytorch_model.bin +3 -0
  4. special_tokens_map.json +1 -0
  5. supar.model +3 -0
  6. tokenizer_config.json +1 -0
  7. vocab.txt +0 -0
README.md ADDED
@@ -0,0 +1,49 @@
+ ---
+ language:
+ - "ja"
+ tags:
+ - "japanese"
+ - "token-classification"
+ - "pos"
+ - "dependency-parsing"
+ datasets:
+ - "universal_dependencies"
+ license: "cc-by-sa-4.0"
+ pipeline_tag: "token-classification"
+ widget:
+ - text: "国境の長いトンネルを抜けると雪国であった。"
+ ---
+
+ # deberta-large-japanese-unidic-luw-upos
+
+ ## Model Description
+
+ This is a DeBERTa(V2) model pre-trained on 青空文庫 (Aozora Bunko) texts for POS-tagging and dependency-parsing, derived from [deberta-large-japanese-unidic](https://huggingface.co/KoichiYasuoka/deberta-large-japanese-unidic). Every long-unit word (LUW) is tagged with its [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) label.
+
+ ## How to Use
+
+ ```py
+ import torch
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/deberta-large-japanese-unidic-luw-upos")
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/deberta-large-japanese-unidic-luw-upos")
+ s="国境の長いトンネルを抜けると雪国であった。"
+ t=tokenizer.tokenize(s)
+ p=[model.config.id2label[q] for q in torch.argmax(model(tokenizer.encode(s,return_tensors="pt"))["logits"],dim=2)[0].tolist()[1:-1]]
+ print(list(zip(t,p)))
+ ```
+
+ or
+
+ ```py
+ import esupar
+ nlp=esupar.load("KoichiYasuoka/deberta-large-japanese-unidic-luw-upos")
+ print(nlp("国境の長いトンネルを抜けると雪国であった。"))
+ ```
+
+ [fugashi](https://pypi.org/project/fugashi), [unidic-lite](https://pypi.org/project/unidic-lite) and [pytokenizations](https://pypi.org/project/pytokenizations) are required.
+
+ ## See Also
+
+ [esupar](https://github.com/KoichiYasuoka/esupar): Tokenizer, POS-tagger, and Dependency-parser with BERT/RoBERTa/DeBERTa models
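The first snippet above prints subword-level predictions. As a minimal sketch of how they can be merged into long-unit words (assuming `t` and `p` from that snippet and the B-/I- prefix scheme visible in `id2label` of `config.json` below; the helper `merge_luw` is illustrative, not part of the model or esupar):

```py
# a minimal sketch: merge the subword-level (token, label) pairs produced by the
# first snippet into long-unit words; "B-"/"I-" prefixes mark word-initial /
# word-internal subwords, bare labels mark single-subword words
def merge_luw(tokens,labels):
  words=[]
  for tok,lab in zip(tokens,labels):
    if tok.startswith("##"):                  # strip the WordPiece continuation marker
      tok=tok[2:]
    pos=lab[2:] if lab[:2] in ("B-","I-") else lab
    if lab.startswith("I-") and words:        # continue the current long-unit word
      words[-1]=(words[-1][0]+tok,words[-1][1])
    else:                                     # start a new long-unit word
      words.append((tok,pos))
  return words
print(merge_luw(t,p))                         # t, p as computed in the first snippet
```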
+
config.json ADDED
@@ -0,0 +1,299 @@
+ {
+ "architectures": [
+ "DebertaV2ForTokenClassification"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "id2label": {
+ "0": "ADJ",
+ "1": "ADJ+VERB",
+ "2": "ADP",
+ "3": "ADP+ADP",
+ "4": "ADP+ADP+VERB",
+ "5": "ADP+VERB",
+ "6": "ADV",
+ "7": "AUX",
+ "8": "B-ADJ",
+ "9": "B-ADJ+VERB",
+ "10": "B-ADP",
+ "11": "B-ADP+ADJ",
+ "12": "B-ADP+NOUN+ADP",
+ "13": "B-ADV",
+ "14": "B-AUX",
+ "15": "B-AUX+AUX",
+ "16": "B-AUX+NOUN",
+ "17": "B-CCONJ",
+ "18": "B-INTJ",
+ "19": "B-NOUN",
+ "20": "B-NOUN+ADP",
+ "21": "B-NOUN+NOUN",
+ "22": "B-NUM",
+ "23": "B-PART",
+ "24": "B-PRON",
+ "25": "B-PROPN",
+ "26": "B-PROPN+ADP",
+ "27": "B-PUNCT",
+ "28": "B-SCONJ",
+ "29": "B-SYM",
+ "30": "B-VERB",
+ "31": "B-VERB+AUX",
+ "32": "B-VERB+SCONJ",
+ "33": "B-X",
+ "34": "CCONJ",
+ "35": "DET",
+ "36": "DET+NOUN",
+ "37": "I-ADJ",
+ "38": "I-ADJ+VERB",
+ "39": "I-ADP",
+ "40": "I-ADP+ADJ",
+ "41": "I-ADP+NOUN+ADP",
+ "42": "I-ADV",
+ "43": "I-AUX",
+ "44": "I-AUX+AUX",
+ "45": "I-AUX+NOUN",
+ "46": "I-CCONJ",
+ "47": "I-INTJ",
+ "48": "I-NOUN",
+ "49": "I-NOUN+ADP",
+ "50": "I-NOUN+NOUN",
+ "51": "I-NUM",
+ "52": "I-PART",
+ "53": "I-PRON",
+ "54": "I-PROPN",
+ "55": "I-PROPN+ADP",
+ "56": "I-PUNCT",
+ "57": "I-SCONJ",
+ "58": "I-SYM",
+ "59": "I-VERB",
+ "60": "I-VERB+AUX",
+ "61": "I-VERB+SCONJ",
+ "62": "I-X",
+ "63": "INTJ",
+ "64": "NOUN",
+ "65": "NUM",
+ "66": "PART",
+ "67": "PRON",
+ "68": "PROPN",
+ "69": "PUNCT",
+ "70": "SCONJ",
+ "71": "SYM",
+ "72": "VERB",
+ "73": "VERB+AUX",
+ "74": "X"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "label2id": {
+ "ADJ": 0,
+ "ADJ+VERB": 1,
+ "ADP": 2,
+ "ADP+ADP": 3,
+ "ADP+ADP+VERB": 4,
+ "ADP+VERB": 5,
+ "ADV": 6,
+ "AUX": 7,
+ "B-ADJ": 8,
+ "B-ADJ+VERB": 9,
+ "B-ADP": 10,
+ "B-ADP+ADJ": 11,
+ "B-ADP+NOUN+ADP": 12,
+ "B-ADV": 13,
+ "B-AUX": 14,
+ "B-AUX+AUX": 15,
+ "B-AUX+NOUN": 16,
+ "B-CCONJ": 17,
+ "B-INTJ": 18,
+ "B-NOUN": 19,
+ "B-NOUN+ADP": 20,
+ "B-NOUN+NOUN": 21,
+ "B-NUM": 22,
+ "B-PART": 23,
+ "B-PRON": 24,
+ "B-PROPN": 25,
+ "B-PROPN+ADP": 26,
+ "B-PUNCT": 27,
+ "B-SCONJ": 28,
+ "B-SYM": 29,
+ "B-VERB": 30,
+ "B-VERB+AUX": 31,
+ "B-VERB+SCONJ": 32,
+ "B-X": 33,
+ "CCONJ": 34,
+ "DET": 35,
+ "DET+NOUN": 36,
+ "I-ADJ": 37,
+ "I-ADJ+VERB": 38,
+ "I-ADP": 39,
+ "I-ADP+ADJ": 40,
+ "I-ADP+NOUN+ADP": 41,
+ "I-ADV": 42,
+ "I-AUX": 43,
+ "I-AUX+AUX": 44,
+ "I-AUX+NOUN": 45,
+ "I-CCONJ": 46,
+ "I-INTJ": 47,
+ "I-NOUN": 48,
+ "I-NOUN+ADP": 49,
+ "I-NOUN+NOUN": 50,
+ "I-NUM": 51,
+ "I-PART": 52,
+ "I-PRON": 53,
+ "I-PROPN": 54,
+ "I-PROPN+ADP": 55,
+ "I-PUNCT": 56,
+ "I-SCONJ": 57,
+ "I-SYM": 58,
+ "I-VERB": 59,
+ "I-VERB+AUX": 60,
+ "I-VERB+SCONJ": 61,
+ "I-X": 62,
+ "INTJ": 63,
+ "NOUN": 64,
+ "NUM": 65,
+ "PART": 66,
+ "PRON": 67,
+ "PROPN": 68,
+ "PUNCT": 69,
+ "SCONJ": 70,
+ "SYM": 71,
+ "VERB": 72,
+ "VERB+AUX": 73,
+ "X": 74
+ },
+ "layer_norm_eps": 1e-07,
+ "max_position_embeddings": 512,
+ "max_relative_positions": -1,
+ "model_type": "deberta-v2",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "pad_token_id": 1,
+ "pooler_dropout": 0,
+ "pooler_hidden_act": "gelu",
+ "pooler_hidden_size": 1024,
+ "pos_att_type": null,
+ "position_biased_input": true,
+ "relative_attention": false,
+ "task_specific_params": {
+ "upos_multiword": {
+ "ADJ+VERB": {
+ "\u7121\u304f\u3057": [
+ "\u7121\u304f",
+ "\u3057"
+ ],
+ "\u7a0b\u306a\u304f\u3057": [
+ "\u7a0b\u306a\u304f",
+ "\u3057"
+ ]
+ },
+ "ADP+ADJ": {
+ "\u306f\u3089\u30fb\u3080\u3046\u3093\u4f5c\u54c1\u5171\u901a": [
+ "\u306f",
+ "\u3089\u30fb\u3080\u3046\u3093\u4f5c\u54c1\u5171\u901a"
+ ]
+ },
+ "ADP+ADP": {
+ "\u3068\u3082": [
+ "\u3068",
+ "\u3082"
+ ]
+ },
+ "ADP+ADP+VERB": {
+ "\u3068\u3082\u3057": [
+ "\u3068",
+ "\u3082",
+ "\u3057"
+ ]
+ },
+ "ADP+NOUN+ADP": {
+ "\u306e\u307f\u305d\u306e": [
+ "\u306e",
+ "\u307f\u305d",
+ "\u306e"
+ ]
+ },
+ "ADP+VERB": {
+ "\u3067\u304d": [
+ "\u3067",
+ "\u304d"
+ ],
+ "\u3067\u3057": [
+ "\u3067",
+ "\u3057"
+ ],
+ "\u306f\u3057": [
+ "\u306f",
+ "\u3057"
+ ]
+ },
+ "AUX+AUX": {
+ "\u30c1\u30e3\u30c3\u30bf": [
+ "\u30c1\u30e3\u30c3",
+ "\u30bf"
+ ]
+ },
+ "AUX+NOUN": {
+ "\u306a\u304a\u5024\u6bb5": [
+ "\u306a",
+ "\u304a\u5024\u6bb5"
+ ],
+ "\u306a\u304a\u5e03\u65bd": [
+ "\u306a",
+ "\u304a\u5e03\u65bd"
+ ]
+ },
+ "DET+NOUN": {
+ "\u3053\u306e\u9803": [
+ "\u3053\u306e",
+ "\u9803"
+ ]
+ },
+ "NOUN+ADP": {
+ "\u3046\u305d\u306e": [
+ "\u3046\u305d",
+ "\u306e"
+ ]
+ },
+ "NOUN+NOUN": {
+ "\u9ce5\u53d6\u770c\u6559\u80b2\u59d4\u54e1\u4f1a\u793e\u4f1a\u6559\u80b2\u59d4\u54e1": [
+ "\u9ce5\u53d6\u770c\u6559\u80b2\u59d4\u54e1\u4f1a",
+ "\u793e\u4f1a\u6559\u80b2\u59d4\u54e1"
+ ]
+ },
+ "PROPN+ADP": {
+ "\u3061\u307b\u306e": [
+ "\u3061\u307b",
+ "\u306e"
+ ],
+ "\u3088\u3090\u3053\u306e": [
+ "\u3088\u3090\u3053",
+ "\u306e"
+ ]
+ },
+ "VERB+AUX": {
+ "\u305f\u3063\u305f": [
+ "\u305f\u3063",
+ "\u305f"
+ ],
+ "\u5c0f\u5c4b\u639b\u3051\u3055\u305b": [
+ "\u5c0f\u5c4b\u639b\u3051\u3055",
+ "\u305b"
+ ]
+ },
+ "VERB+SCONJ": {
+ "\u8cb8\u5207\u3063\u3066": [
+ "\u8cb8\u5207\u3063",
+ "\u3066"
+ ]
+ }
+ }
+ },
+ "tokenizer_class": "BertJapaneseTokenizer",
+ "torch_dtype": "float32",
+ "transformers_version": "4.19.2",
+ "type_vocab_size": 0,
+ "vocab_size": 32000
+ }
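The `upos_multiword` table in `task_specific_params` above lists, for each concatenated label such as `ADP+VERB`, the surface strings that should be split back into separate words. A minimal sketch of how that lookup could be applied (the helper `split_multiword` is illustrative, not the esupar implementation):

```py
# a minimal sketch: use the upos_multiword table from the config above to split
# a long-unit word whose label is a concatenation like "ADP+VERB"
from transformers import AutoConfig
config=AutoConfig.from_pretrained("KoichiYasuoka/deberta-large-japanese-unidic-luw-upos")
table=config.task_specific_params["upos_multiword"]
def split_multiword(surface,label):
  if "+" not in label:
    return [(surface,label)]
  parts=table.get(label,{}).get(surface)
  if parts is None:                      # surface not listed: leave the word unsplit
    return [(surface,label)]
  return list(zip(parts,label.split("+")))
print(split_multiword("でき","ADP+VERB"))  # [('で', 'ADP'), ('き', 'VERB')]
```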
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:08568e2215c4e5e6d62863a18c002922cc5a89451f1f3065a16bfc25c5c26703
+ size 1342859187
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
supar.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f3e447a16a225fb0952d8032d02a97aa3a8d619bcd61bc088ab6e162b02c1fb8
+ size 1391385771
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "do_lower_case": false, "do_word_tokenize": true, "do_subword_tokenize": true, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", "never_split": ["[CLS]", "[PAD]", "[SEP]", "[UNK]", "[MASK]"], "mecab_kwargs": {"mecab_dic": "unidic_lite"}, "model_max_length": 512, "tokenizer_class": "BertJapaneseTokenizer"}
vocab.txt ADDED
The diff for this file is too large to render.