KoichiYasuoka committed
Commit 6577bc1 (parent: 728e6ce)

initial release
README.md ADDED
---
language:
- "th"
tags:
- "thai"
- "question-answering"
- "dependency-parsing"
datasets:
- "universal_dependencies"
license: "apache-2.0"
pipeline_tag: "question-answering"
widget:
- text: "กว่า"
  context: "หลายหัวดีกว่าหัวเดียว"
- text: "หลาย"
  context: "หลายหัวดีกว่าหัวเดียว"
- text: "หัว"
  context: "หลาย[MASK]ดีกว่าหัวเดียว"
---

# roberta-base-thai-spm-ud-head

## Model Description
This is a RoBERTa model pretrained on Thai Wikipedia texts for dependency parsing (head detection on Universal Dependencies) cast as question answering, derived from [roberta-base-thai-spm](https://huggingface.co/KoichiYasuoka/roberta-base-thai-spm). When the `question` word occurs more than once in the sentence, put `[MASK]` at the occurrence you mean inside `context` to avoid ambiguity.

## How to Use

```py
import torch
from transformers import AutoTokenizer,AutoModelForQuestionAnswering
tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-thai-spm-ud-head")
model=AutoModelForQuestionAnswering.from_pretrained("KoichiYasuoka/roberta-base-thai-spm-ud-head")
# the word whose head we want, and the sentence it occurs in
question="กว่า"
context="หลายหัวดีกว่าหัวเดียว"
inputs=tokenizer(question,context,return_tensors="pt",return_offsets_mapping=True)
offsets=inputs.pop("offset_mapping").tolist()[0]
outputs=model(**inputs)
# the answer span with the highest start/end logits is the syntactic head of the question word
start,end=torch.argmax(outputs.start_logits),torch.argmax(outputs.end_logits)
print(context[offsets[start][0]:offsets[end][-1]])
```
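
When the `question` word appears more than once in `context`, mark the occurrence you are asking about with `[MASK]`, as in the widget examples above. A minimal sketch of this usage (same loading and decoding as the snippet above), asking for the head of the first หัว:

```py
import torch
from transformers import AutoTokenizer,AutoModelForQuestionAnswering
tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-thai-spm-ud-head")
model=AutoModelForQuestionAnswering.from_pretrained("KoichiYasuoka/roberta-base-thai-spm-ud-head")
question="หัว"
context="หลาย[MASK]ดีกว่าหัวเดียว"  # [MASK] replaces the occurrence of หัว being asked about
inputs=tokenizer(question,context,return_tensors="pt",return_offsets_mapping=True)
offsets=inputs.pop("offset_mapping").tolist()[0]
outputs=model(**inputs)
start,end=torch.argmax(outputs.start_logits),torch.argmax(outputs.end_logits)
print(context[offsets[start][0]:offsets[end][-1]])  # prints the predicted head word
```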
42
+
43
+ or (with [ufal.chu-liu-edmonds](https://pypi.org/project/ufal.chu-liu-edmonds/))
44
+
45
+ ```py
46
+ class TransformersUD(object):
47
+ def __init__(self,bert):
48
+ import os
49
+ from transformers import (AutoTokenizer,AutoModelForQuestionAnswering,
50
+ AutoModelForTokenClassification,AutoConfig,TokenClassificationPipeline)
51
+ self.tokenizer=AutoTokenizer.from_pretrained(bert)
52
+ self.model=AutoModelForQuestionAnswering.from_pretrained(bert)
53
+ x=AutoModelForTokenClassification.from_pretrained
54
+ if os.path.isdir(bert):
55
+ d,t=x(os.path.join(bert,"deprel")),x(os.path.join(bert,"tagger"))
56
+ else:
57
+ from transformers.file_utils import hf_bucket_url
58
+ c=AutoConfig.from_pretrained(hf_bucket_url(bert,"deprel/config.json"))
59
+ d=x(hf_bucket_url(bert,"deprel/pytorch_model.bin"),config=c)
60
+ s=AutoConfig.from_pretrained(hf_bucket_url(bert,"tagger/config.json"))
61
+ t=x(hf_bucket_url(bert,"tagger/pytorch_model.bin"),config=s)
62
+ self.deprel=TokenClassificationPipeline(model=d,tokenizer=self.tokenizer,
63
+ aggregation_strategy="simple")
64
+ self.tagger=TokenClassificationPipeline(model=t,tokenizer=self.tokenizer)
65
+ def __call__(self,text):
66
+ import numpy,torch,ufal.chu_liu_edmonds
67
+ w=[(t["start"],t["end"],t["entity_group"]) for t in self.deprel(text)]
68
+ z,n={t["start"]:t["entity"].split("|") for t in self.tagger(text)},len(w)
69
+ r,m=[text[s:e] for s,e,p in w],numpy.full((n+1,n+1),numpy.nan)
70
+ v,c=self.tokenizer(r,add_special_tokens=False)["input_ids"],[]
71
+ for i,t in enumerate(v):
72
+ q=[self.tokenizer.cls_token_id]+t+[self.tokenizer.sep_token_id]
73
+ c.append([q]+v[0:i]+[[self.tokenizer.mask_token_id]]+v[i+1:]+[[q[-1]]])
74
+ b=[[len(sum(x[0:j+1],[])) for j in range(len(x))] for x in c]
75
+ with torch.no_grad():
76
+ d=self.model(input_ids=torch.tensor([sum(x,[]) for x in c]),
77
+ token_type_ids=torch.tensor([[0]*x[0]+[1]*(x[-1]-x[0]) for x in b]))
78
+ s,e=d.start_logits.tolist(),d.end_logits.tolist()
79
+ for i in range(n):
80
+ for j in range(n):
81
+ m[i+1,0 if i==j else j+1]=s[i][b[i][j]]+e[i][b[i][j+1]-1]
82
+ h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
83
+ if [0 for i in h if i==0]!=[0]:
84
+ i=([p for s,e,p in w]+["root"]).index("root")
85
+ j=i+1 if i<n else numpy.nanargmax(m[:,0])
86
+ m[0:j,0]=m[j+1:,0]=numpy.nan
87
+ h=ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
88
+ u="# text = "+text.replace("\n"," ")+"\n"
89
+ for i,(s,e,p) in enumerate(w,1):
90
+ p="root" if h[i]==0 else "dep" if p=="root" else p
91
+ u+="\t".join([str(i),r[i-1],"_",z[s][0][2:],"_","|".join(z[s][1:]),
92
+ str(h[i]),p,"_","_" if i<n and w[i][0]<e else "SpaceAfter=No"])+"\n"
93
+ return u+"\n"
94
+
95
+ nlp=TransformersUD("KoichiYasuoka/roberta-base-thai-spm-ud-head")
96
+ print(nlp("หลายหัวดีกว่าหัวเดียว"))
97
+ ```
98
+
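The string returned by `TransformersUD` is plain CoNLL-U, so it can be fed to any CoNLL-U consumer. As an illustrative sketch (assuming the separate [deplacy](https://pypi.org/project/deplacy/) package is installed; it is not part of this model), the parse can be printed as a tree diagram:

```py
import deplacy
doc=nlp("หลายหัวดีกว่าหัวเดียว")  # CoNLL-U string from the TransformersUD instance above
deplacy.render(doc)  # text diagram of the dependency tree
```
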
config.json ADDED
{
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "RemBertTokenizerFast",
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3005
}
deprel/config.json ADDED
{
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-acl",
    "1": "B-acl:relcl",
    "2": "B-advcl",
    "3": "B-advmod",
    "4": "B-appos",
    "5": "B-aux",
    "6": "B-aux:pass",
    "7": "B-case",
    "8": "B-cc",
    "9": "B-cc:preconj",
    "10": "B-ccomp",
    "11": "B-clf",
    "12": "B-compound",
    "13": "B-compound:prt",
    "14": "B-conj",
    "15": "B-cop",
    "16": "B-csubj",
    "17": "B-det",
    "18": "B-det:predet",
    "19": "B-discourse",
    "20": "B-dislocated",
    "21": "B-fixed",
    "22": "B-flat:name",
    "23": "B-goeswith",
    "24": "B-iobj",
    "25": "B-mark",
    "26": "B-nmod",
    "27": "B-nmod:poss",
    "28": "B-nsubj",
    "29": "B-nsubj:pass",
    "30": "B-nummod",
    "31": "B-obj",
    "32": "B-obl",
    "33": "B-obl:poss",
    "34": "B-obl:tmod",
    "35": "B-parataxis",
    "36": "B-punct",
    "37": "B-reparandum",
    "38": "B-root",
    "39": "B-vocative",
    "40": "B-xcomp",
    "41": "I-acl",
    "42": "I-acl:relcl",
    "43": "I-advcl",
    "44": "I-advmod",
    "45": "I-appos",
    "46": "I-aux",
    "47": "I-aux:pass",
    "48": "I-case",
    "49": "I-cc",
    "50": "I-ccomp",
    "51": "I-clf",
    "52": "I-compound",
    "53": "I-conj",
    "54": "I-cop",
    "55": "I-csubj",
    "56": "I-det",
    "57": "I-det:predet",
    "58": "I-discourse",
    "59": "I-dislocated",
    "60": "I-fixed",
    "61": "I-flat:name",
    "62": "I-goeswith",
    "63": "I-mark",
    "64": "I-nmod",
    "65": "I-nmod:poss",
    "66": "I-nsubj",
    "67": "I-nsubj:pass",
    "68": "I-nummod",
    "69": "I-obj",
    "70": "I-obl",
    "71": "I-obl:poss",
    "72": "I-obl:tmod",
    "73": "I-parataxis",
    "74": "I-punct",
    "75": "I-root",
    "76": "I-vocative",
    "77": "I-xcomp"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-acl": 0,
    "B-acl:relcl": 1,
    "B-advcl": 2,
    "B-advmod": 3,
    "B-appos": 4,
    "B-aux": 5,
    "B-aux:pass": 6,
    "B-case": 7,
    "B-cc": 8,
    "B-cc:preconj": 9,
    "B-ccomp": 10,
    "B-clf": 11,
    "B-compound": 12,
    "B-compound:prt": 13,
    "B-conj": 14,
    "B-cop": 15,
    "B-csubj": 16,
    "B-det": 17,
    "B-det:predet": 18,
    "B-discourse": 19,
    "B-dislocated": 20,
    "B-fixed": 21,
    "B-flat:name": 22,
    "B-goeswith": 23,
    "B-iobj": 24,
    "B-mark": 25,
    "B-nmod": 26,
    "B-nmod:poss": 27,
    "B-nsubj": 28,
    "B-nsubj:pass": 29,
    "B-nummod": 30,
    "B-obj": 31,
    "B-obl": 32,
    "B-obl:poss": 33,
    "B-obl:tmod": 34,
    "B-parataxis": 35,
    "B-punct": 36,
    "B-reparandum": 37,
    "B-root": 38,
    "B-vocative": 39,
    "B-xcomp": 40,
    "I-acl": 41,
    "I-acl:relcl": 42,
    "I-advcl": 43,
    "I-advmod": 44,
    "I-appos": 45,
    "I-aux": 46,
    "I-aux:pass": 47,
    "I-case": 48,
    "I-cc": 49,
    "I-ccomp": 50,
    "I-clf": 51,
    "I-compound": 52,
    "I-conj": 53,
    "I-cop": 54,
    "I-csubj": 55,
    "I-det": 56,
    "I-det:predet": 57,
    "I-discourse": 58,
    "I-dislocated": 59,
    "I-fixed": 60,
    "I-flat:name": 61,
    "I-goeswith": 62,
    "I-mark": 63,
    "I-nmod": 64,
    "I-nmod:poss": 65,
    "I-nsubj": 66,
    "I-nsubj:pass": 67,
    "I-nummod": 68,
    "I-obj": 69,
    "I-obl": 70,
    "I-obl:poss": 71,
    "I-obl:tmod": 72,
    "I-parataxis": 73,
    "I-punct": 74,
    "I-root": 75,
    "I-vocative": 76,
    "I-xcomp": 77
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "RemBertTokenizerFast",
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3005
}
deprel/pytorch_model.bin ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:5daa7d4b8c3c6e0ab27c91a0cdc7953972c61a981195e1d714ab33c46853838e
size 351345265
deprel/sentencepiece.model ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
size 1
deprel/special_tokens_map.json ADDED
{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
deprel/tokenizer.json ADDED
The diff for this file is too large to render.
deprel/tokenizer_config.json ADDED
{"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "tokenizer_class": "RemBertTokenizerFast"}
pytorch_model.bin ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:f8e2ad36dfd34f137d576a17c0f358dd1dac8eaf003475bdfd61581e7902b079
size 351111537
sentencepiece.model ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
size 1
special_tokens_map.json ADDED
{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tagger/config.json ADDED
{
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-ADP|_",
    "1": "B-ADV|PronType=Int",
    "2": "B-ADV|_",
    "3": "B-AUX|_",
    "4": "B-CCONJ|_",
    "5": "B-DET|PronType=Int",
    "6": "B-DET|_",
    "7": "B-INTJ|_",
    "8": "B-NOUN|_",
    "9": "B-NUM|_",
    "10": "B-PART|Aspect=Perf",
    "11": "B-PART|Aspect=Prog",
    "12": "B-PART|Polarity=Neg",
    "13": "B-PART|PronType=Int",
    "14": "B-PART|_",
    "15": "B-PRON|Person=1",
    "16": "B-PRON|Person=2",
    "17": "B-PRON|Person=3",
    "18": "B-PRON|PronType=Int",
    "19": "B-PRON|_",
    "20": "B-PROPN|_",
    "21": "B-PUNCT|_",
    "22": "B-SCONJ|_",
    "23": "B-SYM|_",
    "24": "B-VERB|Mood=Imp",
    "25": "B-VERB|Voice=Pass",
    "26": "B-VERB|_",
    "27": "B-X|_",
    "28": "I-ADP|_",
    "29": "I-ADV|PronType=Int",
    "30": "I-ADV|_",
    "31": "I-AUX|_",
    "32": "I-CCONJ|_",
    "33": "I-DET|_",
    "34": "I-INTJ|_",
    "35": "I-NOUN|_",
    "36": "I-NUM|_",
    "37": "I-PART|Aspect=Perf",
    "38": "I-PART|Aspect=Prog",
    "39": "I-PART|Polarity=Neg",
    "40": "I-PART|PronType=Int",
    "41": "I-PART|_",
    "42": "I-PRON|Person=1",
    "43": "I-PRON|Person=2",
    "44": "I-PRON|Person=3",
    "45": "I-PRON|_",
    "46": "I-PROPN|_",
    "47": "I-PUNCT|_",
    "48": "I-VERB|Voice=Pass",
    "49": "I-VERB|_",
    "50": "I-X|_"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-ADP|_": 0,
    "B-ADV|PronType=Int": 1,
    "B-ADV|_": 2,
    "B-AUX|_": 3,
    "B-CCONJ|_": 4,
    "B-DET|PronType=Int": 5,
    "B-DET|_": 6,
    "B-INTJ|_": 7,
    "B-NOUN|_": 8,
    "B-NUM|_": 9,
    "B-PART|Aspect=Perf": 10,
    "B-PART|Aspect=Prog": 11,
    "B-PART|Polarity=Neg": 12,
    "B-PART|PronType=Int": 13,
    "B-PART|_": 14,
    "B-PRON|Person=1": 15,
    "B-PRON|Person=2": 16,
    "B-PRON|Person=3": 17,
    "B-PRON|PronType=Int": 18,
    "B-PRON|_": 19,
    "B-PROPN|_": 20,
    "B-PUNCT|_": 21,
    "B-SCONJ|_": 22,
    "B-SYM|_": 23,
    "B-VERB|Mood=Imp": 24,
    "B-VERB|Voice=Pass": 25,
    "B-VERB|_": 26,
    "B-X|_": 27,
    "I-ADP|_": 28,
    "I-ADV|PronType=Int": 29,
    "I-ADV|_": 30,
    "I-AUX|_": 31,
    "I-CCONJ|_": 32,
    "I-DET|_": 33,
    "I-INTJ|_": 34,
    "I-NOUN|_": 35,
    "I-NUM|_": 36,
    "I-PART|Aspect=Perf": 37,
    "I-PART|Aspect=Prog": 38,
    "I-PART|Polarity=Neg": 39,
    "I-PART|PronType=Int": 40,
    "I-PART|_": 41,
    "I-PRON|Person=1": 42,
    "I-PRON|Person=2": 43,
    "I-PRON|Person=3": 44,
    "I-PRON|_": 45,
    "I-PROPN|_": 46,
    "I-PUNCT|_": 47,
    "I-VERB|Voice=Pass": 48,
    "I-VERB|_": 49,
    "I-X|_": 50
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "RemBertTokenizerFast",
  "torch_dtype": "float32",
  "transformers_version": "4.19.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 3005
}
tagger/pytorch_model.bin ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:7f651e118f06d2162ea925c32f7764e027782ecbbf1ea980586eb18673e99ae5
size 351262257
tagger/sentencepiece.model ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b
size 1
tagger/special_tokens_map.json ADDED
{"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tagger/tokenizer.json ADDED
The diff for this file is too large to render.
tagger/tokenizer_config.json ADDED
{"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "tokenizer_class": "RemBertTokenizerFast"}
tokenizer.json ADDED
The diff for this file is too large to render.
tokenizer_config.json ADDED
{"do_lower_case": false, "remove_space": true, "keep_accents": true, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": {"content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "tokenizer_class": "RemBertTokenizerFast"}