qilowoq committed
Commit 033550f
1 Parent(s): 9fc352f

Upload tokenizer

Files changed (4)
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +189 -0
  3. tokenizer_config.json +16 -0
  4. vocab.txt +25 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
@@ -0,0 +1,189 @@
+ {
+   "version": "1.0",
+   "truncation": {
+     "direction": "Right",
+     "max_length": 160,
+     "strategy": "LongestFirst",
+     "stride": 0
+   },
+   "padding": {
+     "strategy": {
+       "Fixed": 160
+     },
+     "direction": "Right",
+     "pad_to_multiple_of": null,
+     "pad_id": 21,
+     "pad_type_id": 0,
+     "pad_token": "[PAD]"
+   },
+   "added_tokens": [
+     {
+       "id": 0,
+       "content": "[CLS]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 21,
+       "content": "[PAD]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 22,
+       "content": "[SEP]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 23,
+       "content": "[MASK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     },
+     {
+       "id": 24,
+       "content": "[UNK]",
+       "single_word": false,
+       "lstrip": false,
+       "rstrip": false,
+       "normalized": false,
+       "special": true
+     }
+   ],
+   "normalizer": {
+     "type": "BertNormalizer",
+     "clean_text": true,
+     "handle_chinese_chars": true,
+     "strip_accents": null,
+     "lowercase": false
+   },
+   "pre_tokenizer": {
+     "type": "BertPreTokenizer"
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       }
+     ],
+     "pair": [
+       {
+         "SpecialToken": {
+           "id": "[CLS]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "B",
+           "type_id": 1
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "[SEP]",
+           "type_id": 1
+         }
+       }
+     ],
+     "special_tokens": {
+       "[CLS]": {
+         "id": "[CLS]",
+         "ids": [
+           0
+         ],
+         "tokens": [
+           "[CLS]"
+         ]
+       },
+       "[SEP]": {
+         "id": "[SEP]",
+         "ids": [
+           22
+         ],
+         "tokens": [
+           "[SEP]"
+         ]
+       }
+     }
+   },
+   "decoder": {
+     "type": "WordPiece",
+     "prefix": "##",
+     "cleanup": true
+   },
+   "model": {
+     "type": "WordPiece",
+     "unk_token": "[UNK]",
+     "continuing_subword_prefix": "##",
+     "max_input_chars_per_word": 100,
+     "vocab": {
+       "[CLS]": 0,
+       "M": 1,
+       "R": 2,
+       "H": 3,
+       "K": 4,
+       "D": 5,
+       "E": 6,
+       "S": 7,
+       "T": 8,
+       "N": 9,
+       "Q": 10,
+       "C": 11,
+       "G": 12,
+       "P": 13,
+       "A": 14,
+       "V": 15,
+       "I": 16,
+       "F": 17,
+       "Y": 18,
+       "W": 19,
+       "L": 20,
+       "[PAD]": 21,
+       "[SEP]": 22,
+       "[MASK]": 23,
+       "[UNK]": 24
+     }
+   }
+ }
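
The tokenizer.json above fully specifies the fast tokenizer: BertNormalizer/BertPreTokenizer splitting, a character-level WordPiece vocabulary of 20 amino acids plus 5 special tokens, a [CLS] ... [SEP] post-processing template, truncation and fixed padding to length 160. A minimal loading sketch with the tokenizers library, assuming the file has been downloaded locally as tokenizer.json (the sample sequence is illustrative):

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# The WordPiece vocab has no "##" continuation pieces, so residues must be
# whitespace-separated; BertPreTokenizer then emits one single-letter word each.
seq = "E V Q L V E S G G G L V Q"
enc = tok.encode(seq)

print(enc.tokens[:6])  # ['[CLS]', 'E', 'V', 'Q', 'L', 'V']
print(enc.ids[:6])     # [0, 6, 15, 10, 20, 15]
print(len(enc.ids))    # 160 -- Fixed(160) padding restored from the file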
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": false,
+   "mask_token": "[MASK]",
+   "model_max_length": 160,
+   "name_or_path": "ablang_tokenizer",
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "special_tokens_map_file": null,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
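
tokenizer_config.json tells transformers to instantiate a BertTokenizer (the fast variant when tokenizer.json is present) with do_lower_case=False and model_max_length=160. A minimal sketch, assuming the four files from this commit sit in a local directory ./ablang_tokenizer (the path is illustrative; substitute the actual Hub repo id):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./ablang_tokenizer")

batch = tok(
    ["E V Q L V E S G G", "D I Q M T Q S P S S"],  # space-separated residues
    padding="max_length",   # pads to model_max_length = 160
    truncation=True,
    return_tensors="pt",    # requires torch; drop to get plain Python lists
)
print(batch["input_ids"].shape)  # torch.Size([2, 160])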
vocab.txt ADDED
@@ -0,0 +1,25 @@
+ [CLS]
+ M
+ R
+ H
+ K
+ D
+ E
+ S
+ T
+ N
+ Q
+ C
+ G
+ P
+ A
+ V
+ I
+ F
+ Y
+ W
+ L
+ [PAD]
+ [SEP]
+ [MASK]
+ [UNK]
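
vocab.txt lists one token per line, and the line index (starting at 0) is the token id, matching the "vocab" block in tokenizer.json. A quick sketch to rebuild and sanity-check the mapping from the downloaded file:

with open("vocab.txt", encoding="utf-8") as f:
    vocab = {line.rstrip("\n"): i for i, line in enumerate(f)}

assert len(vocab) == 25                                # 20 amino acids + 5 special tokens
print(vocab["[CLS]"], vocab["[PAD]"], vocab["[UNK]"])  # 0 21 24
print(vocab["M"], vocab["L"])                          # 1 20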