lhy committed on
Commit
9a2144a
1 Parent(s): ad9c475

add tokenizer

Browse files
Files changed (4) hide show
  1. special_tokens_map.json +1 -0
  2. tokenizer.json +348 -0
  3. tokenizer_config.json +1 -0
  4. vocab.txt +198 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "[PAD]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 100,
17
+ "content": "[UNK]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 101,
26
+ "content": "[CLS]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 102,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 103,
44
+ "content": "[MASK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": false,
56
+ "strip_accents": false,
57
+ "lowercase": false
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 101
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 102
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "[PAD]": 0,
149
+ "[unused0]": 1,
150
+ "[unused1]": 2,
151
+ "[unused2]": 3,
152
+ "[unused3]": 4,
153
+ "[unused4]": 5,
154
+ "[unused5]": 6,
155
+ "[unused6]": 7,
156
+ "[unused7]": 8,
157
+ "[unused8]": 9,
158
+ "[unused9]": 10,
159
+ "[unused10]": 11,
160
+ "[unused11]": 12,
161
+ "[unused12]": 13,
162
+ "[unused13]": 14,
163
+ "[unused14]": 15,
164
+ "[unused15]": 16,
165
+ "[unused16]": 17,
166
+ "[unused17]": 18,
167
+ "[unused18]": 19,
168
+ "[unused19]": 20,
169
+ "[unused20]": 21,
170
+ "[unused21]": 22,
171
+ "[unused22]": 23,
172
+ "[unused23]": 24,
173
+ "[unused24]": 25,
174
+ "[unused25]": 26,
175
+ "[unused26]": 27,
176
+ "[unused27]": 28,
177
+ "[unused28]": 29,
178
+ "[unused29]": 30,
179
+ "[unused30]": 31,
180
+ "[unused31]": 32,
181
+ "[unused32]": 33,
182
+ "[unused33]": 34,
183
+ "[unused34]": 35,
184
+ "[unused35]": 36,
185
+ "[unused36]": 37,
186
+ "[unused37]": 38,
187
+ "[unused38]": 39,
188
+ "[unused39]": 40,
189
+ "[unused40]": 41,
190
+ "[unused41]": 42,
191
+ "[unused42]": 43,
192
+ "[unused43]": 44,
193
+ "[unused44]": 45,
194
+ "[unused45]": 46,
195
+ "[unused46]": 47,
196
+ "[unused47]": 48,
197
+ "[unused48]": 49,
198
+ "[unused49]": 50,
199
+ "[unused50]": 51,
200
+ "[unused51]": 52,
201
+ "[unused52]": 53,
202
+ "[unused53]": 54,
203
+ "[unused54]": 55,
204
+ "[unused55]": 56,
205
+ "[unused56]": 57,
206
+ "[unused57]": 58,
207
+ "[unused58]": 59,
208
+ "[unused59]": 60,
209
+ "[unused60]": 61,
210
+ "[unused61]": 62,
211
+ "[unused62]": 63,
212
+ "[unused63]": 64,
213
+ "[unused64]": 65,
214
+ "[unused65]": 66,
215
+ "[unused66]": 67,
216
+ "[unused67]": 68,
217
+ "[unused68]": 69,
218
+ "[unused69]": 70,
219
+ "[unused70]": 71,
220
+ "[unused71]": 72,
221
+ "[unused72]": 73,
222
+ "[unused73]": 74,
223
+ "[unused74]": 75,
224
+ "[unused75]": 76,
225
+ "[unused76]": 77,
226
+ "[unused77]": 78,
227
+ "[unused78]": 79,
228
+ "[unused79]": 80,
229
+ "[unused80]": 81,
230
+ "[unused81]": 82,
231
+ "[unused82]": 83,
232
+ "[unused83]": 84,
233
+ "[unused84]": 85,
234
+ "[unused85]": 86,
235
+ "[unused86]": 87,
236
+ "[unused87]": 88,
237
+ "[unused88]": 89,
238
+ "[unused89]": 90,
239
+ "[unused90]": 91,
240
+ "[unused91]": 92,
241
+ "[unused92]": 93,
242
+ "[unused93]": 94,
243
+ "[unused94]": 95,
244
+ "[unused95]": 96,
245
+ "[unused96]": 97,
246
+ "[unused97]": 98,
247
+ "[unused98]": 99,
248
+ "[UNK]": 100,
249
+ "[CLS]": 101,
250
+ "[SEP]": 102,
251
+ "[MASK]": 103,
252
+ "!": 104,
253
+ "\"": 105,
254
+ "#": 106,
255
+ "$": 107,
256
+ "%": 108,
257
+ "&": 109,
258
+ "'": 110,
259
+ "(": 111,
260
+ ")": 112,
261
+ "*": 113,
262
+ "+": 114,
263
+ ",": 115,
264
+ "-": 116,
265
+ ".": 117,
266
+ "/": 118,
267
+ "0": 119,
268
+ "1": 120,
269
+ "2": 121,
270
+ "3": 122,
271
+ "4": 123,
272
+ "5": 124,
273
+ "6": 125,
274
+ "7": 126,
275
+ "8": 127,
276
+ "9": 128,
277
+ ":": 129,
278
+ ";": 130,
279
+ "<": 131,
280
+ "=": 132,
281
+ ">": 133,
282
+ "?": 134,
283
+ "@": 135,
284
+ "[": 136,
285
+ "\\": 137,
286
+ "]": 138,
287
+ "^": 139,
288
+ "_": 140,
289
+ "`": 141,
290
+ "a": 142,
291
+ "b": 143,
292
+ "c": 144,
293
+ "d": 145,
294
+ "e": 146,
295
+ "f": 147,
296
+ "g": 148,
297
+ "h": 149,
298
+ "i": 150,
299
+ "j": 151,
300
+ "k": 152,
301
+ "l": 153,
302
+ "m": 154,
303
+ "n": 155,
304
+ "o": 156,
305
+ "p": 157,
306
+ "q": 158,
307
+ "r": 159,
308
+ "s": 160,
309
+ "t": 161,
310
+ "u": 162,
311
+ "v": 163,
312
+ "w": 164,
313
+ "x": 165,
314
+ "y": 166,
315
+ "z": 167,
316
+ "{": 168,
317
+ "|": 169,
318
+ "}": 170,
319
+ "~": 171,
320
+ "##a": 172,
321
+ "##b": 173,
322
+ "##c": 174,
323
+ "##d": 175,
324
+ "##e": 176,
325
+ "##f": 177,
326
+ "##g": 178,
327
+ "##h": 179,
328
+ "##i": 180,
329
+ "##j": 181,
330
+ "##k": 182,
331
+ "##l": 183,
332
+ "##m": 184,
333
+ "##n": 185,
334
+ "##o": 186,
335
+ "##p": 187,
336
+ "##q": 188,
337
+ "##r": 189,
338
+ "##s": 190,
339
+ "##t": 191,
340
+ "##u": 192,
341
+ "##v": 193,
342
+ "##w": 194,
343
+ "##x": 195,
344
+ "##y": 196,
345
+ "##z": 197
346
+ }
347
+ }
348
+ }
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": false, "strip_accents": false, "clean_text": false, "model_max_length": 512, "do_basic_tokenize": true, "never_split": null, "special_tokens_map_file": "char-bert-base-uncased/special_tokens_map.json", "name_or_path": "char-bert-base-uncased", "tokenizer_class": "BertTokenizer"}
vocab.txt ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [unused0]
3
+ [unused1]
4
+ [unused2]
5
+ [unused3]
6
+ [unused4]
7
+ [unused5]
8
+ [unused6]
9
+ [unused7]
10
+ [unused8]
11
+ [unused9]
12
+ [unused10]
13
+ [unused11]
14
+ [unused12]
15
+ [unused13]
16
+ [unused14]
17
+ [unused15]
18
+ [unused16]
19
+ [unused17]
20
+ [unused18]
21
+ [unused19]
22
+ [unused20]
23
+ [unused21]
24
+ [unused22]
25
+ [unused23]
26
+ [unused24]
27
+ [unused25]
28
+ [unused26]
29
+ [unused27]
30
+ [unused28]
31
+ [unused29]
32
+ [unused30]
33
+ [unused31]
34
+ [unused32]
35
+ [unused33]
36
+ [unused34]
37
+ [unused35]
38
+ [unused36]
39
+ [unused37]
40
+ [unused38]
41
+ [unused39]
42
+ [unused40]
43
+ [unused41]
44
+ [unused42]
45
+ [unused43]
46
+ [unused44]
47
+ [unused45]
48
+ [unused46]
49
+ [unused47]
50
+ [unused48]
51
+ [unused49]
52
+ [unused50]
53
+ [unused51]
54
+ [unused52]
55
+ [unused53]
56
+ [unused54]
57
+ [unused55]
58
+ [unused56]
59
+ [unused57]
60
+ [unused58]
61
+ [unused59]
62
+ [unused60]
63
+ [unused61]
64
+ [unused62]
65
+ [unused63]
66
+ [unused64]
67
+ [unused65]
68
+ [unused66]
69
+ [unused67]
70
+ [unused68]
71
+ [unused69]
72
+ [unused70]
73
+ [unused71]
74
+ [unused72]
75
+ [unused73]
76
+ [unused74]
77
+ [unused75]
78
+ [unused76]
79
+ [unused77]
80
+ [unused78]
81
+ [unused79]
82
+ [unused80]
83
+ [unused81]
84
+ [unused82]
85
+ [unused83]
86
+ [unused84]
87
+ [unused85]
88
+ [unused86]
89
+ [unused87]
90
+ [unused88]
91
+ [unused89]
92
+ [unused90]
93
+ [unused91]
94
+ [unused92]
95
+ [unused93]
96
+ [unused94]
97
+ [unused95]
98
+ [unused96]
99
+ [unused97]
100
+ [unused98]
101
+ [UNK]
102
+ [CLS]
103
+ [SEP]
104
+ [MASK]
105
+ !
106
+ "
107
+ #
108
+ $
109
+ %
110
+ &
111
+ '
112
+ (
113
+ )
114
+ *
115
+ +
116
+ ,
117
+ -
118
+ .
119
+ /
120
+ 0
121
+ 1
122
+ 2
123
+ 3
124
+ 4
125
+ 5
126
+ 6
127
+ 7
128
+ 8
129
+ 9
130
+ :
131
+ ;
132
+ <
133
+ =
134
+ >
135
+ ?
136
+ @
137
+ [
138
+ \
139
+ ]
140
+ ^
141
+ _
142
+ `
143
+ a
144
+ b
145
+ c
146
+ d
147
+ e
148
+ f
149
+ g
150
+ h
151
+ i
152
+ j
153
+ k
154
+ l
155
+ m
156
+ n
157
+ o
158
+ p
159
+ q
160
+ r
161
+ s
162
+ t
163
+ u
164
+ v
165
+ w
166
+ x
167
+ y
168
+ z
169
+ {
170
+ |
171
+ }
172
+ ~
173
+ ##a
174
+ ##b
175
+ ##c
176
+ ##d
177
+ ##e
178
+ ##f
179
+ ##g
180
+ ##h
181
+ ##i
182
+ ##j
183
+ ##k
184
+ ##l
185
+ ##m
186
+ ##n
187
+ ##o
188
+ ##p
189
+ ##q
190
+ ##r
191
+ ##s
192
+ ##t
193
+ ##u
194
+ ##v
195
+ ##w
196
+ ##x
197
+ ##y
198
+ ##z