KoichiYasuoka committed
Commit e996923
1 Parent(s): 9f3eba5

model improved

config.json CHANGED
@@ -20,12 +20,15 @@
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
- "pos_att_type": null,
- "position_biased_input": true,
- "relative_attention": false,
+ "pos_att_type": [
+ "p2c",
+ "c2p"
+ ],
+ "position_biased_input": false,
+ "relative_attention": true,
  "tokenizer_class": "DebertaV2TokenizerFast",
  "torch_dtype": "float32",
- "transformers_version": "4.19.4",
+ "transformers_version": "4.22.1",
  "type_vocab_size": 0,
  "vocab_size": 32000
  }
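This change switches the DeBERTa backbone from absolute position embeddings to disentangled relative-position attention: relative_attention is turned on with the p2c/c2p attention types and position_biased_input is turned off. A minimal sketch, not part of this commit, for checking the updated settings once the model is downloaded (assuming the hub name matches the tgt used in maker.py below):

from transformers import AutoConfig
cfg=AutoConfig.from_pretrained("KoichiYasuoka/deberta-base-japanese-wikipedia-ud-head")
print(cfg.relative_attention)     # expected: True
print(cfg.pos_att_type)           # expected: ["p2c", "c2p"]
print(cfg.position_biased_input)  # expected: False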
deprel/config.json CHANGED
@@ -5,7 +5,6 @@
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
- "finetuning_task": "pos",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
@@ -21,43 +20,45 @@
  "8": "B-compound",
  "9": "B-cop",
  "10": "B-csubj",
- "11": "B-dep",
- "12": "B-det",
- "13": "B-discourse",
- "14": "B-dislocated",
+ "11": "B-csubj:outer",
+ "12": "B-dep",
+ "13": "B-det",
+ "14": "B-discourse",
  "15": "B-fixed",
  "16": "B-mark",
  "17": "B-nmod",
  "18": "B-nsubj",
- "19": "B-nummod",
- "20": "B-obj",
- "21": "B-obl",
- "22": "B-punct",
- "23": "B-root",
- "24": "I-acl",
- "25": "I-advcl",
- "26": "I-advmod",
- "27": "I-amod",
- "28": "I-aux",
- "29": "I-case",
- "30": "I-cc",
- "31": "I-ccomp",
- "32": "I-compound",
- "33": "I-cop",
- "34": "I-csubj",
- "35": "I-dep",
- "36": "I-det",
- "37": "I-discourse",
- "38": "I-dislocated",
- "39": "I-fixed",
- "40": "I-mark",
- "41": "I-nmod",
- "42": "I-nsubj",
- "43": "I-nummod",
- "44": "I-obj",
- "45": "I-obl",
- "46": "I-punct",
- "47": "I-root"
+ "19": "B-nsubj:outer",
+ "20": "B-nummod",
+ "21": "B-obj",
+ "22": "B-obl",
+ "23": "B-punct",
+ "24": "B-root",
+ "25": "I-acl",
+ "26": "I-advcl",
+ "27": "I-advmod",
+ "28": "I-amod",
+ "29": "I-aux",
+ "30": "I-case",
+ "31": "I-cc",
+ "32": "I-ccomp",
+ "33": "I-compound",
+ "34": "I-cop",
+ "35": "I-csubj",
+ "36": "I-csubj:outer",
+ "37": "I-dep",
+ "38": "I-det",
+ "39": "I-discourse",
+ "40": "I-fixed",
+ "41": "I-mark",
+ "42": "I-nmod",
+ "43": "I-nsubj",
+ "44": "I-nsubj:outer",
+ "45": "I-nummod",
+ "46": "I-obj",
+ "47": "I-obl",
+ "48": "I-punct",
+ "49": "I-root"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
@@ -73,43 +74,45 @@
  "B-compound": 8,
  "B-cop": 9,
  "B-csubj": 10,
- "B-dep": 11,
- "B-det": 12,
- "B-discourse": 13,
- "B-dislocated": 14,
+ "B-csubj:outer": 11,
+ "B-dep": 12,
+ "B-det": 13,
+ "B-discourse": 14,
  "B-fixed": 15,
  "B-mark": 16,
  "B-nmod": 17,
  "B-nsubj": 18,
- "B-nummod": 19,
- "B-obj": 20,
- "B-obl": 21,
- "B-punct": 22,
- "B-root": 23,
- "I-acl": 24,
- "I-advcl": 25,
- "I-advmod": 26,
- "I-amod": 27,
- "I-aux": 28,
- "I-case": 29,
- "I-cc": 30,
- "I-ccomp": 31,
- "I-compound": 32,
- "I-cop": 33,
- "I-csubj": 34,
- "I-dep": 35,
- "I-det": 36,
- "I-discourse": 37,
- "I-dislocated": 38,
- "I-fixed": 39,
- "I-mark": 40,
- "I-nmod": 41,
- "I-nsubj": 42,
- "I-nummod": 43,
- "I-obj": 44,
- "I-obl": 45,
- "I-punct": 46,
- "I-root": 47
+ "B-nsubj:outer": 19,
+ "B-nummod": 20,
+ "B-obj": 21,
+ "B-obl": 22,
+ "B-punct": 23,
+ "B-root": 24,
+ "I-acl": 25,
+ "I-advcl": 26,
+ "I-advmod": 27,
+ "I-amod": 28,
+ "I-aux": 29,
+ "I-case": 30,
+ "I-cc": 31,
+ "I-ccomp": 32,
+ "I-compound": 33,
+ "I-cop": 34,
+ "I-csubj": 35,
+ "I-csubj:outer": 36,
+ "I-dep": 37,
+ "I-det": 38,
+ "I-discourse": 39,
+ "I-fixed": 40,
+ "I-mark": 41,
+ "I-nmod": 42,
+ "I-nsubj": 43,
+ "I-nsubj:outer": 44,
+ "I-nummod": 45,
+ "I-obj": 46,
+ "I-obl": 47,
+ "I-punct": 48,
+ "I-root": 49
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
@@ -121,12 +124,15 @@
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
- "pos_att_type": null,
- "position_biased_input": true,
- "relative_attention": false,
+ "pos_att_type": [
+ "p2c",
+ "c2p"
+ ],
+ "position_biased_input": false,
+ "relative_attention": true,
  "tokenizer_class": "DebertaV2TokenizerFast",
  "torch_dtype": "float32",
- "transformers_version": "4.19.4",
+ "transformers_version": "4.22.1",
  "type_vocab_size": 0,
  "vocab_size": 32000
  }
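Besides the same attention settings, the deprel label set is renumbered to make room for the csubj:outer and nsubj:outer relations, growing from ids 0-47 to 0-49. A small sketch, assuming a local checkout of this repository, to confirm the new mapping:

import json
with open("deprel/config.json","r",encoding="utf-8") as r:
  cfg=json.load(r)
print(len(cfg["id2label"]))  # expected: 50 BIO labels (ids 0-49)
print(sorted(l for l in cfg["label2id"] if ":outer" in l))
# expected: ['B-csubj:outer', 'B-nsubj:outer', 'I-csubj:outer', 'I-nsubj:outer']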
deprel/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:423343f1b03611981be584d5b49b2d9903fb3e977298416953716ab89f5ba110
- size 440319475
+ oid sha256:1f79b5773908f6496ab08a9c0d54cca6da1170aa301142f7affe2cf27abbc18c
+ size 498612115
deprel/special_tokens_map.json CHANGED
@@ -1 +1,9 @@
- {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
deprel/tokenizer.json CHANGED
@@ -1,11 +1,6 @@
  {
  "version": "1.0",
- "truncation": {
- "direction": "Right",
- "max_length": 512,
- "strategy": "LongestFirst",
- "stride": 0
- },
+ "truncation": null,
  "padding": null,
  "added_tokens": [
  {
deprel/tokenizer_config.json CHANGED
@@ -1 +1,14 @@
- {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "eos_token": "[SEP]",
+ "keep_accents": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "split_by_punct": true,
+ "tokenizer_class": "DebertaV2TokenizerFast",
+ "unk_token": "[UNK]"
+ }
maker.py ADDED
@@ -0,0 +1,110 @@
+ #! /usr/bin/python3
+ import os
+ src="KoichiYasuoka/deberta-base-japanese-wikipedia"
+ tgt="KoichiYasuoka/deberta-base-japanese-wikipedia-ud-head"
+ url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
+ d=os.path.basename(url)
+ os.system("test -d {} || git clone --depth=1 {}".format(d,url))
+ os.system("for F in train dev test ; do cp "+d+"/*-$F*.conllu $F.conllu ; done")
+ from transformers import (AutoTokenizer,AutoModelForQuestionAnswering,
+   AutoModelForTokenClassification,AutoConfig,DefaultDataCollator,
+   DataCollatorForTokenClassification,TrainingArguments,Trainer)
+ class HEADDataset(object):
+   def __init__(self,conllu,tokenizer,augment=False,length=384):
+     self.qa,self.pad,self.length=[],tokenizer.pad_token_id,length
+     with open(conllu,"r",encoding="utf-8") as r:
+       form,head=[],[]
+       for t in r:
+         w=t.split("\t")
+         if len(w)==10 and w[0].isdecimal():
+           form.append(w[1])
+           head.append(len(head) if w[6]=="0" else int(w[6])-1)
+         elif t.strip()=="" and form!=[]:
+           v=tokenizer(form,add_special_tokens=False)["input_ids"]
+           for i,t in enumerate(v):
+             q=[tokenizer.cls_token_id]+t+[tokenizer.sep_token_id]
+             c=[q]+v[0:i]+[[tokenizer.mask_token_id]]+v[i+1:]+[[q[-1]]]
+             b=[len(sum(c[0:j+1],[])) for j in range(len(c))]
+             if b[-1]<length:
+               self.qa.append((sum(c,[]),head[i],b))
+             if augment and [1 for x in v if t==x]==[1]:
+               c[i+1]=t
+               b=[len(sum(c[0:j+1],[])) for j in range(len(c))]
+               if b[-1]<length:
+                 self.qa.append((sum(c,[]),head[i],b))
+           form,head=[],[]
+   __len__=lambda self:len(self.qa)
+   def __getitem__(self,i):
+     (v,h,b),k=self.qa[i],self.length-self.qa[i][2][-1]
+     return {"input_ids":v+[self.pad]*k,"attention_mask":[1]*b[-1]+[0]*k,
+       "token_type_ids":[0]*b[0]+[1]*(b[-1]-b[0])+[0]*k,
+       "start_positions":b[h],"end_positions":b[h+1]-1}
+ class UPOSDataset(object):
+   def __init__(self,conllu,tokenizer,fields=[3]):
+     self.ids,self.upos=[],[]
+     label,cls,sep=set(),tokenizer.cls_token_id,tokenizer.sep_token_id
+     with open(conllu,"r",encoding="utf-8") as r:
+       form,upos=[],[]
+       for t in r:
+         w=t.split("\t")
+         if len(w)==10 and w[0].isdecimal():
+           form.append(w[1])
+           upos.append("|".join(w[i] for i in fields))
+         elif t.strip()=="" and form!=[]:
+           v,u=tokenizer(form,add_special_tokens=False)["input_ids"],[]
+           for x,y in zip(v,upos):
+             u.extend(["B-"+y]*min(len(x),1)+["I-"+y]*(len(x)-1))
+           if len(u)>tokenizer.model_max_length-4:
+             self.ids.append(sum(v,[])[0:tokenizer.model_max_length-2])
+             self.upos.append(u[0:tokenizer.model_max_length-2])
+           elif len(u)>0:
+             self.ids.append([cls]+sum(v,[])+[sep])
+             self.upos.append([u[0]]+u+[u[0]])
+           label=set(sum([self.upos[-1],list(label)],[]))
+           form,upos=[],[]
+     self.label2id={l:i for i,l in enumerate(sorted(label))}
+   def __call__(*args):
+     label=set(sum([list(t.label2id) for t in args],[]))
+     lid={l:i for i,l in enumerate(sorted(label))}
+     for t in args:
+       t.label2id=lid
+     return lid
+   __len__=lambda self:len(self.ids)
+   __getitem__=lambda self,i:{"input_ids":self.ids[i],
+     "labels":[self.label2id[t] for t in self.upos[i]]}
+ tkz=AutoTokenizer.from_pretrained(src)
+ trainDS=HEADDataset("train.conllu",tkz,True)
+ devDS=HEADDataset("dev.conllu",tkz)
+ arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=8,
+   output_dir="/tmp",overwrite_output_dir=True,save_total_limit=2,
+   evaluation_strategy="epoch",learning_rate=5e-05,warmup_ratio=0.1)
+ trn=Trainer(args=arg,data_collator=DefaultDataCollator(),
+   model=AutoModelForQuestionAnswering.from_pretrained(src),
+   train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt)
+ tkz.save_pretrained(tgt)
+ trainDS=UPOSDataset("train.conllu",tkz,[7])
+ devDS=UPOSDataset("dev.conllu",tkz,[7])
+ testDS=UPOSDataset("test.conllu",tkz,[7])
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,
+   id2label={i:l for l,i in lid.items()})
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),
+   model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),
+   train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt+"/deprel")
+ tkz.save_pretrained(tgt+"/deprel")
+ trainDS=UPOSDataset("train.conllu",tkz,[3,5])
+ devDS=UPOSDataset("dev.conllu",tkz,[3,5])
+ testDS=UPOSDataset("test.conllu",tkz,[3,5])
+ lid=trainDS(devDS,testDS)
+ cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,
+   id2label={i:l for l,i in lid.items()})
+ trn=Trainer(args=arg,data_collator=DataCollatorForTokenClassification(tkz),
+   model=AutoModelForTokenClassification.from_pretrained(src,config=cfg),
+   train_dataset=trainDS,eval_dataset=devDS)
+ trn.train()
+ trn.save_model(tgt+"/tagger")
+ tkz.save_pretrained(tgt+"/tagger")
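maker.py (added in this commit) rebuilds all three parts of the repository from UD_Japanese-GSDLUW: an extractive-QA head detector saved at the root, a dependency-relation (DEPREL) token classifier saved under deprel/, and a UPOS|FEATS token classifier saved under tagger/. HEADDataset frames head selection as question answering: the question is [CLS] target-word [SEP], the context is the sentence with that word replaced by [MASK] (the augment pass also keeps the unmasked word when its tokenisation is unique in the sentence), and the answer span is the head word. Below is a sketch of head-word inference that mirrors that input layout; it is not part of the commit, and the example sentence is invented:

import torch
from transformers import AutoTokenizer,AutoModelForQuestionAnswering
mdl="KoichiYasuoka/deberta-base-japanese-wikipedia-ud-head"
tkz=AutoTokenizer.from_pretrained(mdl)
qa=AutoModelForQuestionAnswering.from_pretrained(mdl)
words=["国語","の","教科書","を","読む"]
i=0  # ask for the head of words[i]
v=tkz(words,add_special_tokens=False)["input_ids"]
q=[tkz.cls_token_id]+v[i]+[tkz.sep_token_id]
c=[q]+v[0:i]+[[tkz.mask_token_id]]+v[i+1:]+[[tkz.sep_token_id]]
b=[len(sum(c[0:j+1],[])) for j in range(len(c))]  # cumulative end offset of each segment
with torch.no_grad():
  out=qa(input_ids=torch.tensor([sum(c,[])]),
    token_type_ids=torch.tensor([[0]*b[0]+[1]*(b[-1]-b[0])]))
s=int(out.start_logits[0].argmax())
# map the predicted start position back to the word whose subtoken span contains it
head=max((j for j in range(len(words)) if b[j]<=s),default=i)
print(words[i],"->",words[head])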
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d4d0c84d139b98520551dc35b55ee63a40bf6e3eed3983a986234cb463feabdb
- size 440178035
+ oid sha256:688b9a34440a937c6a5aa93bbe75b24a474a768c0980b0218dd9f20d25f53f30
+ size 498464467
special_tokens_map.json CHANGED
@@ -1 +1,9 @@
- {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tagger/config.json CHANGED
@@ -5,7 +5,6 @@
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
- "finetuning_task": "pos",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
@@ -101,12 +100,15 @@
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
- "pos_att_type": null,
- "position_biased_input": true,
- "relative_attention": false,
+ "pos_att_type": [
+ "p2c",
+ "c2p"
+ ],
+ "position_biased_input": false,
+ "relative_attention": true,
  "tokenizer_class": "DebertaV2TokenizerFast",
  "torch_dtype": "float32",
- "transformers_version": "4.19.4",
+ "transformers_version": "4.22.1",
  "type_vocab_size": 0,
  "vocab_size": 32000
  }
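tagger/config.json gets the same relative-attention update; its labels (outside the hunks shown here) are the B-/I- tags over UPOS|FEATS built by UPOSDataset(...,[3,5]) in maker.py. A short sketch, assuming a local checkout so the tagger/ subdirectory can be loaded directly, of splitting the predictions back into UPOS and FEATS:

import torch
from transformers import AutoTokenizer,AutoModelForTokenClassification
tkz=AutoTokenizer.from_pretrained("tagger")
mdl=AutoModelForTokenClassification.from_pretrained("tagger")
v=tkz("国語の教科書を読む",return_tensors="pt")
with torch.no_grad():
  p=mdl(**v).logits.argmax(-1)[0]
for t,i in zip(tkz.convert_ids_to_tokens(v["input_ids"][0]),p):
  bio,_,tag=mdl.config.id2label[int(i)].partition("-")  # split off the B/I prefix
  upos,_,feats=tag.partition("|")                       # first field is UPOS, rest is FEATS
  print(t,bio,upos,feats)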
tagger/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:74d7b076bf46d851cb9643edb9b8c5b0dd16cc8ceffb1864bf0f692949653f44
- size 440288755
+ oid sha256:4551b4b329972065979303c79f1addac688f1ce5a6e4bbc0007ca52739d3ed91
+ size 498575187
tagger/special_tokens_map.json CHANGED
@@ -1 +1,9 @@
- {"bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "unk_token": "[UNK]"
+ }
tagger/tokenizer.json CHANGED
@@ -1,11 +1,6 @@
  {
  "version": "1.0",
- "truncation": {
- "direction": "Right",
- "max_length": 512,
- "strategy": "LongestFirst",
- "stride": 0
- },
+ "truncation": null,
  "padding": null,
  "added_tokens": [
  {
tagger/tokenizer_config.json CHANGED
@@ -1 +1,14 @@
- {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "eos_token": "[SEP]",
+ "keep_accents": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "split_by_punct": true,
+ "tokenizer_class": "DebertaV2TokenizerFast",
+ "unk_token": "[UNK]"
+ }
tokenizer.json CHANGED
@@ -1,21 +1,7 @@
  {
  "version": "1.0",
- "truncation": {
- "direction": "Right",
- "max_length": 384,
- "strategy": "OnlySecond",
- "stride": 128
- },
- "padding": {
- "strategy": {
- "Fixed": 384
- },
- "direction": "Right",
- "pad_to_multiple_of": null,
- "pad_id": 1,
- "pad_type_id": 0,
- "pad_token": "[PAD]"
- },
+ "truncation": null,
+ "padding": null,
  "added_tokens": [
  {
  "id": 0,
tokenizer_config.json CHANGED
@@ -1 +1,14 @@
- {"do_lower_case": false, "bos_token": "[CLS]", "eos_token": "[SEP]", "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "split_by_punct": true, "keep_accents": true, "model_max_length": 512, "tokenizer_class": "DebertaV2TokenizerFast"}
+ {
+ "bos_token": "[CLS]",
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "eos_token": "[SEP]",
+ "keep_accents": true,
+ "mask_token": "[MASK]",
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "split_by_punct": true,
+ "tokenizer_class": "DebertaV2TokenizerFast",
+ "unk_token": "[UNK]"
+ }
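Finally, the truncation and padding defaults that were baked into the root tokenizer.json (truncation to 384 with the OnlySecond strategy and stride 128, fixed padding to 384) are removed, so callers now request them per call. A short sketch, assuming the hub name matches this repository:

from transformers import AutoTokenizer
tkz=AutoTokenizer.from_pretrained("KoichiYasuoka/deberta-base-japanese-wikipedia-ud-head")
v=tkz("国語の教科書を読む",truncation=True,max_length=384,padding="max_length")
print(len(v["input_ids"]))  # expected: 384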