KoichiYasuoka committed on
Commit
1c88002
1 Parent(s): cded958

initial release

Files changed (8)
  1. README.md +58 -0
  2. config.json +433 -0
  3. maker.py +47 -0
  4. pytorch_model.bin +3 -0
  5. special_tokens_map.json +9 -0
  6. spm.model +3 -0
  7. tokenizer.json +0 -0
  8. tokenizer_config.json +14 -0
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ language:
+ - "th"
+ tags:
+ - "thai"
+ - "token-classification"
+ - "pos"
+ - "dependency-parsing"
+ datasets:
+ - "universal_dependencies"
+ license: "apache-2.0"
+ pipeline_tag: "token-classification"
+ widget:
+ - text: "หลายหัวดีกว่าหัวเดียว"
+ ---
+
+ # deberta-base-thai-ud-goeswith
+
+ ## Model Description
+
+ This is a DeBERTa(V2) model pre-trained on Thai Wikipedia texts and fine-tuned for POS-tagging and dependency-parsing (using the `goeswith` relation to rejoin words split into subwords), derived from [deberta-base-thai](https://huggingface.co/KoichiYasuoka/deberta-base-thai).
+
+ ## How to Use
+
+ ```py
+ class UDgoeswith(object):
+   def __init__(self,bert):
+     from transformers import AutoTokenizer,AutoModelForTokenClassification
+     self.tokenizer=AutoTokenizer.from_pretrained(bert)
+     self.model=AutoModelForTokenClassification.from_pretrained(bert)
+   def __call__(self,text):
+     import numpy,torch,ufal.chu_liu_edmonds
+     w=self.tokenizer(text,return_offsets_mapping=True)
+     v=w["input_ids"]
+     n=len(v)-1
+     with torch.no_grad():
+       # one input per subword: subword i is masked and re-appended after [SEP]
+       d=self.model(input_ids=torch.tensor([v[0:i]+[self.tokenizer.mask_token_id]+v[i+1:]+[v[i]] for i in range(1,n)]))
+     e=d.logits.numpy()[:,1:n,:]
+     # ignore label 0 ("-|_|dep", i.e. no dependency on the masked subword)
+     e[:,:,0]=numpy.nan
+     # m[i,j]: score of subword j being the head of subword i
+     m=numpy.full((n,n),numpy.nan)
+     m[1:,1:]=numpy.nanmax(e,axis=2).transpose()
+     p=numpy.zeros((n,n))
+     p[1:,1:]=numpy.nanargmax(e,axis=2).transpose()
+     # a subword that selects itself as head is the root: move its score to column 0
+     for i in range(1,n):
+       m[i,0],m[i,i],p[i,0]=m[i,i],numpy.nan,p[i,i]
+     # maximum spanning tree (Chu-Liu-Edmonds) over the head scores
+     h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
+     u="# text = "+text+"\n"
+     v=[(s,e) for s,e in w["offset_mapping"] if s<e]
+     for i,(s,e) in enumerate(v,1):
+       q=self.model.config.id2label[p[i,h[i]]].split("|")
+       u+="\t".join([str(i),text[s:e],"_",q[0],"_","|".join(q[1:-1]),str(h[i]),q[-1],"_","_" if i<len(v) and e<v[i][0] else "SpaceAfter=No"])+"\n"
+     return u+"\n"
+
+ nlp=UDgoeswith("KoichiYasuoka/deberta-base-thai-ud-goeswith")
+ print(nlp("หลายหัวดีกว่าหัวเดียว"))
+ ```
+
+ [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/) is required.
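
The `__call__` method above returns its analysis as a plain CoNLL-U string. As a minimal sketch (with `read_conllu` as a hypothetical helper, not part of the model), the ten tab-separated columns it emits can be read back into tuples:

```py
# Sketch: split the CoNLL-U string produced by UDgoeswith.__call__ into
# (ID, FORM, UPOS, HEAD, DEPREL) tuples; the column layout follows the
# "\t".join(...) call in the class above.
def read_conllu(s):
  rows=[]
  for line in s.splitlines():
    if line=="" or line.startswith("#"):
      continue  # skip the "# text = ..." comment and blank lines
    t=line.split("\t")
    rows.append((int(t[0]),t[1],t[3],int(t[6]),t[7]))
  return rows

for i,form,upos,head,deprel in read_conllu(nlp("หลายหัวดีกว่าหัวเดียว")):
  print(i,form,upos,head,deprel)
```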
config.json ADDED
@@ -0,0 +1,433 @@
+ {
+   "architectures": [
+     "DebertaV2ForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "-|_|dep",
+     "1": "ADP|_|acl",
+     "2": "ADP|_|advcl",
+     "3": "ADP|_|advmod",
+     "4": "ADP|_|appos",
+     "5": "ADP|_|case",
+     "6": "ADP|_|cc",
+     "7": "ADP|_|cc:preconj",
+     "8": "ADP|_|csubj",
+     "9": "ADP|_|fixed",
+     "10": "ADP|_|mark",
+     "11": "ADP|_|obl",
+     "12": "ADP|_|root",
+     "13": "ADV|_|advcl",
+     "14": "ADV|_|advmod",
+     "15": "ADV|_|aux",
+     "16": "ADV|_|cc",
+     "17": "ADV|_|ccomp",
+     "18": "ADV|_|conj",
+     "19": "ADV|_|fixed",
+     "20": "ADV|_|mark",
+     "21": "ADV|_|obj",
+     "22": "ADV|_|root",
+     "23": "ADV|_|xcomp",
+     "24": "AUX|_|advmod",
+     "25": "AUX|_|aux",
+     "26": "AUX|_|aux:pass",
+     "27": "AUX|_|ccomp",
+     "28": "AUX|_|conj",
+     "29": "AUX|_|cop",
+     "30": "AUX|_|mark",
+     "31": "CCONJ|_|advmod",
+     "32": "CCONJ|_|case",
+     "33": "CCONJ|_|cc",
+     "34": "CCONJ|_|compound",
+     "35": "CCONJ|_|conj",
+     "36": "CCONJ|_|fixed",
+     "37": "CCONJ|_|mark",
+     "38": "CCONJ|_|nsubj",
+     "39": "CCONJ|_|obl",
+     "40": "CCONJ|_|root",
+     "41": "DET|_|advmod",
+     "42": "DET|_|case",
+     "43": "DET|_|cc:preconj",
+     "44": "DET|_|conj",
+     "45": "DET|_|det",
+     "46": "DET|_|det:predet",
+     "47": "DET|_|fixed",
+     "48": "DET|_|mark",
+     "49": "DET|_|nsubj",
+     "50": "DET|_|nsubj:pass",
+     "51": "DET|_|obj",
+     "52": "DET|_|obl",
+     "53": "DET|_|obl:tmod",
+     "54": "DET|_|root",
+     "55": "INTJ|_|acl",
+     "56": "INTJ|_|nsubj",
+     "57": "INTJ|_|root",
+     "58": "NOUN|_|acl",
+     "59": "NOUN|_|acl:relcl",
+     "60": "NOUN|_|advcl",
+     "61": "NOUN|_|advmod",
+     "62": "NOUN|_|appos",
+     "63": "NOUN|_|aux",
+     "64": "NOUN|_|case",
+     "65": "NOUN|_|cc",
+     "66": "NOUN|_|ccomp",
+     "67": "NOUN|_|clf",
+     "68": "NOUN|_|compound",
+     "69": "NOUN|_|conj",
+     "70": "NOUN|_|dislocated",
+     "71": "NOUN|_|fixed",
+     "72": "NOUN|_|flat:name",
+     "73": "NOUN|_|iobj",
+     "74": "NOUN|_|mark",
+     "75": "NOUN|_|nmod",
+     "76": "NOUN|_|nmod:poss",
+     "77": "NOUN|_|nsubj",
+     "78": "NOUN|_|nsubj:pass",
+     "79": "NOUN|_|obj",
+     "80": "NOUN|_|obl",
+     "81": "NOUN|_|obl:poss",
+     "82": "NOUN|_|obl:tmod",
+     "83": "NOUN|_|parataxis",
+     "84": "NOUN|_|root",
+     "85": "NOUN|_|vocative",
+     "86": "NOUN|_|xcomp",
+     "87": "NUM|_|acl",
+     "88": "NUM|_|acl:relcl",
+     "89": "NUM|_|advmod",
+     "90": "NUM|_|appos",
+     "91": "NUM|_|ccomp",
+     "92": "NUM|_|clf",
+     "93": "NUM|_|conj",
+     "94": "NUM|_|flat:name",
+     "95": "NUM|_|nmod",
+     "96": "NUM|_|nsubj",
+     "97": "NUM|_|nummod",
+     "98": "NUM|_|obj",
+     "99": "NUM|_|obl",
+     "100": "NUM|_|obl:poss",
+     "101": "NUM|_|obl:tmod",
+     "102": "NUM|_|root",
+     "103": "NUM|_|xcomp",
+     "104": "PART|_|acl",
+     "105": "PART|_|advmod",
+     "106": "PART|_|aux",
+     "107": "PART|_|cc",
+     "108": "PART|_|cc:preconj",
+     "109": "PART|_|ccomp",
+     "110": "PART|_|clf",
+     "111": "PART|_|compound",
+     "112": "PART|_|compound:prt",
+     "113": "PART|_|conj",
+     "114": "PART|_|discourse",
+     "115": "PART|_|fixed",
+     "116": "PART|_|mark",
+     "117": "PART|_|nmod",
+     "118": "PART|_|nmod:poss",
+     "119": "PART|_|nsubj",
+     "120": "PART|_|obj",
+     "121": "PART|_|obl",
+     "122": "PART|_|root",
+     "123": "PART|_|xcomp",
+     "124": "PRON|_|acl",
+     "125": "PRON|_|acl:relcl",
+     "126": "PRON|_|advcl",
+     "127": "PRON|_|advmod",
+     "128": "PRON|_|appos",
+     "129": "PRON|_|ccomp",
+     "130": "PRON|_|compound",
+     "131": "PRON|_|conj",
+     "132": "PRON|_|fixed",
+     "133": "PRON|_|nmod",
+     "134": "PRON|_|nmod:poss",
+     "135": "PRON|_|nsubj",
+     "136": "PRON|_|nsubj:pass",
+     "137": "PRON|_|obj",
+     "138": "PRON|_|obl",
+     "139": "PRON|_|obl:poss",
+     "140": "PRON|_|reparandum",
+     "141": "PRON|_|root",
+     "142": "PRON|_|xcomp",
+     "143": "PROPN|_|acl",
+     "144": "PROPN|_|acl:relcl",
+     "145": "PROPN|_|advmod",
+     "146": "PROPN|_|appos",
+     "147": "PROPN|_|aux",
+     "148": "PROPN|_|cc",
+     "149": "PROPN|_|ccomp",
+     "150": "PROPN|_|clf",
+     "151": "PROPN|_|compound",
+     "152": "PROPN|_|conj",
+     "153": "PROPN|_|flat:name",
+     "154": "PROPN|_|goeswith",
+     "155": "PROPN|_|nmod",
+     "156": "PROPN|_|nmod:poss",
+     "157": "PROPN|_|nsubj",
+     "158": "PROPN|_|nsubj:pass",
+     "159": "PROPN|_|obj",
+     "160": "PROPN|_|obl",
+     "161": "PROPN|_|obl:poss",
+     "162": "PROPN|_|obl:tmod",
+     "163": "PROPN|_|root",
+     "164": "PROPN|_|xcomp",
+     "165": "PUNCT|_|advmod",
+     "166": "PUNCT|_|clf",
+     "167": "PUNCT|_|punct",
+     "168": "PUNCT|_|root",
+     "169": "SCONJ|_|mark",
+     "170": "SYM|_|advmod",
+     "171": "SYM|_|clf",
+     "172": "SYM|_|nsubj",
+     "173": "SYM|_|obj",
+     "174": "SYM|_|obl",
+     "175": "VERB|_|acl",
+     "176": "VERB|_|acl:relcl",
+     "177": "VERB|_|advcl",
+     "178": "VERB|_|advmod",
+     "179": "VERB|_|appos",
+     "180": "VERB|_|aux",
+     "181": "VERB|_|aux:pass",
+     "182": "VERB|_|case",
+     "183": "VERB|_|cc",
+     "184": "VERB|_|ccomp",
+     "185": "VERB|_|compound",
+     "186": "VERB|_|conj",
+     "187": "VERB|_|csubj",
+     "188": "VERB|_|fixed",
+     "189": "VERB|_|mark",
+     "190": "VERB|_|nmod",
+     "191": "VERB|_|nmod:poss",
+     "192": "VERB|_|nsubj",
+     "193": "VERB|_|obj",
+     "194": "VERB|_|obl",
+     "195": "VERB|_|obl:poss",
+     "196": "VERB|_|parataxis",
+     "197": "VERB|_|root",
+     "198": "VERB|_|xcomp",
+     "199": "X|_|goeswith"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "-|_|dep": 0,
+     "ADP|_|acl": 1,
+     "ADP|_|advcl": 2,
+     "ADP|_|advmod": 3,
+     "ADP|_|appos": 4,
+     "ADP|_|case": 5,
+     "ADP|_|cc": 6,
+     "ADP|_|cc:preconj": 7,
+     "ADP|_|csubj": 8,
+     "ADP|_|fixed": 9,
+     "ADP|_|mark": 10,
+     "ADP|_|obl": 11,
+     "ADP|_|root": 12,
+     "ADV|_|advcl": 13,
+     "ADV|_|advmod": 14,
+     "ADV|_|aux": 15,
+     "ADV|_|cc": 16,
+     "ADV|_|ccomp": 17,
+     "ADV|_|conj": 18,
+     "ADV|_|fixed": 19,
+     "ADV|_|mark": 20,
+     "ADV|_|obj": 21,
+     "ADV|_|root": 22,
+     "ADV|_|xcomp": 23,
+     "AUX|_|advmod": 24,
+     "AUX|_|aux": 25,
+     "AUX|_|aux:pass": 26,
+     "AUX|_|ccomp": 27,
+     "AUX|_|conj": 28,
+     "AUX|_|cop": 29,
+     "AUX|_|mark": 30,
+     "CCONJ|_|advmod": 31,
+     "CCONJ|_|case": 32,
+     "CCONJ|_|cc": 33,
+     "CCONJ|_|compound": 34,
+     "CCONJ|_|conj": 35,
+     "CCONJ|_|fixed": 36,
+     "CCONJ|_|mark": 37,
+     "CCONJ|_|nsubj": 38,
+     "CCONJ|_|obl": 39,
+     "CCONJ|_|root": 40,
+     "DET|_|advmod": 41,
+     "DET|_|case": 42,
+     "DET|_|cc:preconj": 43,
+     "DET|_|conj": 44,
+     "DET|_|det": 45,
+     "DET|_|det:predet": 46,
+     "DET|_|fixed": 47,
+     "DET|_|mark": 48,
+     "DET|_|nsubj": 49,
+     "DET|_|nsubj:pass": 50,
+     "DET|_|obj": 51,
+     "DET|_|obl": 52,
+     "DET|_|obl:tmod": 53,
+     "DET|_|root": 54,
+     "INTJ|_|acl": 55,
+     "INTJ|_|nsubj": 56,
+     "INTJ|_|root": 57,
+     "NOUN|_|acl": 58,
+     "NOUN|_|acl:relcl": 59,
+     "NOUN|_|advcl": 60,
+     "NOUN|_|advmod": 61,
+     "NOUN|_|appos": 62,
+     "NOUN|_|aux": 63,
+     "NOUN|_|case": 64,
+     "NOUN|_|cc": 65,
+     "NOUN|_|ccomp": 66,
+     "NOUN|_|clf": 67,
+     "NOUN|_|compound": 68,
+     "NOUN|_|conj": 69,
+     "NOUN|_|dislocated": 70,
+     "NOUN|_|fixed": 71,
+     "NOUN|_|flat:name": 72,
+     "NOUN|_|iobj": 73,
+     "NOUN|_|mark": 74,
+     "NOUN|_|nmod": 75,
+     "NOUN|_|nmod:poss": 76,
+     "NOUN|_|nsubj": 77,
+     "NOUN|_|nsubj:pass": 78,
+     "NOUN|_|obj": 79,
+     "NOUN|_|obl": 80,
+     "NOUN|_|obl:poss": 81,
+     "NOUN|_|obl:tmod": 82,
+     "NOUN|_|parataxis": 83,
+     "NOUN|_|root": 84,
+     "NOUN|_|vocative": 85,
+     "NOUN|_|xcomp": 86,
+     "NUM|_|acl": 87,
+     "NUM|_|acl:relcl": 88,
+     "NUM|_|advmod": 89,
+     "NUM|_|appos": 90,
+     "NUM|_|ccomp": 91,
+     "NUM|_|clf": 92,
+     "NUM|_|conj": 93,
+     "NUM|_|flat:name": 94,
+     "NUM|_|nmod": 95,
+     "NUM|_|nsubj": 96,
+     "NUM|_|nummod": 97,
+     "NUM|_|obj": 98,
+     "NUM|_|obl": 99,
+     "NUM|_|obl:poss": 100,
+     "NUM|_|obl:tmod": 101,
+     "NUM|_|root": 102,
+     "NUM|_|xcomp": 103,
+     "PART|_|acl": 104,
+     "PART|_|advmod": 105,
+     "PART|_|aux": 106,
+     "PART|_|cc": 107,
+     "PART|_|cc:preconj": 108,
+     "PART|_|ccomp": 109,
+     "PART|_|clf": 110,
+     "PART|_|compound": 111,
+     "PART|_|compound:prt": 112,
+     "PART|_|conj": 113,
+     "PART|_|discourse": 114,
+     "PART|_|fixed": 115,
+     "PART|_|mark": 116,
+     "PART|_|nmod": 117,
+     "PART|_|nmod:poss": 118,
+     "PART|_|nsubj": 119,
+     "PART|_|obj": 120,
+     "PART|_|obl": 121,
+     "PART|_|root": 122,
+     "PART|_|xcomp": 123,
+     "PRON|_|acl": 124,
+     "PRON|_|acl:relcl": 125,
+     "PRON|_|advcl": 126,
+     "PRON|_|advmod": 127,
+     "PRON|_|appos": 128,
+     "PRON|_|ccomp": 129,
+     "PRON|_|compound": 130,
+     "PRON|_|conj": 131,
+     "PRON|_|fixed": 132,
+     "PRON|_|nmod": 133,
+     "PRON|_|nmod:poss": 134,
+     "PRON|_|nsubj": 135,
+     "PRON|_|nsubj:pass": 136,
+     "PRON|_|obj": 137,
+     "PRON|_|obl": 138,
+     "PRON|_|obl:poss": 139,
+     "PRON|_|reparandum": 140,
+     "PRON|_|root": 141,
+     "PRON|_|xcomp": 142,
+     "PROPN|_|acl": 143,
+     "PROPN|_|acl:relcl": 144,
+     "PROPN|_|advmod": 145,
+     "PROPN|_|appos": 146,
+     "PROPN|_|aux": 147,
+     "PROPN|_|cc": 148,
+     "PROPN|_|ccomp": 149,
+     "PROPN|_|clf": 150,
+     "PROPN|_|compound": 151,
+     "PROPN|_|conj": 152,
+     "PROPN|_|flat:name": 153,
+     "PROPN|_|goeswith": 154,
+     "PROPN|_|nmod": 155,
+     "PROPN|_|nmod:poss": 156,
+     "PROPN|_|nsubj": 157,
+     "PROPN|_|nsubj:pass": 158,
+     "PROPN|_|obj": 159,
+     "PROPN|_|obl": 160,
+     "PROPN|_|obl:poss": 161,
+     "PROPN|_|obl:tmod": 162,
+     "PROPN|_|root": 163,
+     "PROPN|_|xcomp": 164,
+     "PUNCT|_|advmod": 165,
+     "PUNCT|_|clf": 166,
+     "PUNCT|_|punct": 167,
+     "PUNCT|_|root": 168,
+     "SCONJ|_|mark": 169,
+     "SYM|_|advmod": 170,
+     "SYM|_|clf": 171,
+     "SYM|_|nsubj": 172,
+     "SYM|_|obj": 173,
+     "SYM|_|obl": 174,
+     "VERB|_|acl": 175,
+     "VERB|_|acl:relcl": 176,
+     "VERB|_|advcl": 177,
+     "VERB|_|advmod": 178,
+     "VERB|_|appos": 179,
+     "VERB|_|aux": 180,
+     "VERB|_|aux:pass": 181,
+     "VERB|_|case": 182,
+     "VERB|_|cc": 183,
+     "VERB|_|ccomp": 184,
+     "VERB|_|compound": 185,
+     "VERB|_|conj": 186,
+     "VERB|_|csubj": 187,
+     "VERB|_|fixed": 188,
+     "VERB|_|mark": 189,
+     "VERB|_|nmod": 190,
+     "VERB|_|nmod:poss": 191,
+     "VERB|_|nsubj": 192,
+     "VERB|_|obj": 193,
+     "VERB|_|obl": 194,
+     "VERB|_|obl:poss": 195,
+     "VERB|_|parataxis": 196,
+     "VERB|_|root": 197,
+     "VERB|_|xcomp": 198,
+     "X|_|goeswith": 199
+   },
+   "layer_norm_eps": 1e-07,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 768,
+   "pos_att_type": null,
+   "position_biased_input": true,
+   "relative_attention": false,
+   "tokenizer_class": "DebertaV2TokenizerFast",
+   "torch_dtype": "float32",
+   "transformers_version": "4.22.1",
+   "type_vocab_size": 0,
+   "vocab_size": 3000
+ }
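
Each `id2label` entry above packs a UPOS tag and a dependency relation into a single `UPOS|FEATS|DEPREL` string; the `split("|")` in the README code unpacks it, and in this model the FEATS field is always `_`. A small sketch of the decoding, reusing a label from the config:

```py
# Sketch: decode one packed label from id2label above.
label="NOUN|_|acl:relcl"                    # the config's id2label["59"]
q=label.split("|")
upos,feats,deprel=q[0],"|".join(q[1:-1]),q[-1]
print(upos,feats,deprel)                    # NOUN _ acl:relcl
```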
maker.py ADDED
@@ -0,0 +1,47 @@
+ #! /usr/bin/python3
+ src="KoichiYasuoka/deberta-base-thai"
+ tgt="KoichiYasuoka/deberta-base-thai-ud-goeswith"
+ url="https://github.com/KoichiYasuoka/spaCy-Thai"
+ import os
+ d=os.path.join(os.path.basename(url),"UD_Thai-Corpora")
+ os.system("test -d {} || git clone --depth=1 {}".format(d,url))
+ # keep only sentences that contain a root token
+ s='{if(NF>0)u=u$0"\\n";else{if(u~/\\t0\\troot\\t/)print u>"train.conllu";u=""}}'
+ os.system("nawk -F'\\t' '{}' {}/*-ud-*.conllu".format(s,d))
+ class UDgoeswithDataset(object):
+   def __init__(self,conllu,tokenizer):
+     self.ids,self.tags,label=[],[],set()
+     with open(conllu,"r",encoding="utf-8") as r:
+       cls,sep,msk=tokenizer.cls_token_id,tokenizer.sep_token_id,tokenizer.mask_token_id
+       dep,c="-|_|dep",[]
+       for s in r:
+         t=s.split("\t")
+         if len(t)==10 and t[0].isdecimal():
+           c.append(t)
+         elif c!=[]:
+           v=tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
+           # insert goeswith rows for words split into multiple subwords
+           for i in range(len(v)-1,-1,-1):
+             for j in range(1,len(v[i])):
+               c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"])
+           y=["0"]+[t[0] for t in c]
+           h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
+           p,v=[t[3]+"|_|"+t[7] for t in c],sum(v,[])
+           self.ids.append([cls]+v+[sep])
+           self.tags.append([dep]+p+[dep])
+           label=set(sum([self.tags[-1],list(label)],[]))
+           # one masked example per subword: only dependents of that subword keep labels
+           for i,k in enumerate(v):
+             self.ids.append([cls]+v[0:i]+[msk]+v[i+1:]+[sep,k])
+             self.tags.append([dep]+[t if h[j]==i+1 else dep for j,t in enumerate(p)]+[dep,dep])
+           c=[]
+     self.label2id={l:i for i,l in enumerate(sorted(label))}
+   __len__=lambda self:len(self.ids)
+   __getitem__=lambda self,i:{"input_ids":self.ids[i],"labels":[self.label2id[t] for t in self.tags[i]]}
+ from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
+ tkz=AutoTokenizer.from_pretrained(src)
+ trainDS=UDgoeswithDataset("train.conllu",tkz)
+ lid=trainDS.label2id
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()})
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=32,output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1)
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),train_dataset=trainDS)
+ trn.train()
+ trn.save_model(tgt)
+ tkz.save_pretrained(tgt)
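
For each training sentence, `UDgoeswithDataset` yields one plain example in which every subword is labeled `UPOS|_|DEPREL`, plus one masked example per subword i, where subword i is replaced by `[MASK]` and re-appended after `[SEP]`, and only the dependents of subword i keep their labels (a root keeps its own label at the masked position). A hedged usage sketch, reusing `UDgoeswithDataset` and `tkz` from the script above on a made-up two-token file:

```py
# Sketch: inspect the examples generated for one toy sentence
# ("many" -det-> "head", with the NOUN as root); toy.conllu is hypothetical.
toy="1\tหลาย\t_\tDET\t_\t_\t2\tdet\t_\t_\n"\
    "2\tหัว\t_\tNOUN\t_\t_\t0\troot\t_\t_\n\n"
with open("toy.conllu","w",encoding="utf-8") as w:
  w.write(toy)
ds=UDgoeswithDataset("toy.conllu",tkz)
print(len(ds))  # 1 plain example + 1 masked example per subword
print(ds[0])    # {"input_ids":[...],"labels":[...]}
```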
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2745c8d0e68df19842e0e9c8981ec4a47cb7df6c21f4b375b19caa4d4deb5974
+ size 351699123
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
+ size 1
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "[SEP]",
+   "keep_accents": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "split_by_punct": true,
+   "tokenizer_class": "DebertaV2TokenizerFast",
+   "unk_token": "[UNK]"
+ }