bigmorning committed
Commit d2f4b13 · 1 Parent(s): f877627

add tokenizer

Files changed (4)
  1. special_tokens_map.json +1 -0
  2. tokenizer.json +225 -0
  3. tokenizer_config.json +1 -0
  4. vocab.json +1 -0
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
@@ -0,0 +1,225 @@
+ {
+ "version": "1.0",
+ "truncation": null,
+ "padding": null,
+ "added_tokens": [
+ {
+ "id": 0,
+ "content": "[UNK]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 1,
+ "content": "[PAD]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 2,
+ "content": "[CLS]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 3,
+ "content": "[SEP]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 4,
+ "content": "[MASK]",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ }
+ ],
+ "normalizer": {
+ "type": "Strip",
+ "strip_left": true,
+ "strip_right": true
+ },
+ "pre_tokenizer": {
+ "type": "WhitespaceSplit"
+ },
+ "post_processor": {
+ "type": "TemplateProcessing",
+ "single": [
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[SEP]",
+ "type_id": 0
+ }
+ }
+ ],
+ "pair": [
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[SEP]",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "B",
+ "type_id": 1
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "[SEP]",
+ "type_id": 1
+ }
+ }
+ ],
+ "special_tokens": {
+ "[SEP]": {
+ "id": "[SEP]",
+ "ids": [
+ 3
+ ],
+ "tokens": [
+ "[SEP]"
+ ]
+ }
+ }
+ },
+ "decoder": null,
+ "model": {
+ "type": "WordLevel",
+ "vocab": {
+ "[UNK]": 0,
+ "[PAD]": 1,
+ "[CLS]": 2,
+ "[SEP]": 3,
+ "[MASK]": 4,
+ "69": 5,
+ "79": 6,
+ "98": 7,
+ "26": 8,
+ "29": 9,
+ "65": 10,
+ "86": 11,
+ "91": 12,
+ "92": 13,
+ "05": 14,
+ "67": 15,
+ "85": 16,
+ "03": 17,
+ "11": 18,
+ "21": 19,
+ "28": 20,
+ "45": 21,
+ "48": 22,
+ "50": 23,
+ "52": 24,
+ "57": 25,
+ "59": 26,
+ "63": 27,
+ "66": 28,
+ "76": 29,
+ "87": 30,
+ "02": 31,
+ "06": 32,
+ "17": 33,
+ "25": 34,
+ "35": 35,
+ "44": 36,
+ "51": 37,
+ "53": 38,
+ "55": 39,
+ "56": 40,
+ "62": 41,
+ "64": 42,
+ "71": 43,
+ "82": 44,
+ "88": 45,
+ "94": 46,
+ "95": 47,
+ "96": 48,
+ "07": 49,
+ "10": 50,
+ "12": 51,
+ "14": 52,
+ "18": 53,
+ "19": 54,
+ "20": 55,
+ "24": 56,
+ "41": 57,
+ "43": 58,
+ "49": 59,
+ "58": 60,
+ "61": 61,
+ "72": 62,
+ "74": 63,
+ "81": 64,
+ "90": 65,
+ "00": 66,
+ "08": 67,
+ "09": 68,
+ "23": 69,
+ "33": 70,
+ "34": 71,
+ "36": 72,
+ "39": 73,
+ "68": 74,
+ "97": 75,
+ "99": 76,
+ "13": 77,
+ "15": 78,
+ "22": 79,
+ "27": 80,
+ "30": 81,
+ "38": 82,
+ "47": 83,
+ "75": 84,
+ "77": 85,
+ "80": 86,
+ "93": 87,
+ "31": 88,
+ "32": 89,
+ "37": 90,
+ "40": 91,
+ "42": 92,
+ "46": 93,
+ "54": 94,
+ "01": 95,
+ "04": 96,
+ "16": 97,
+ "60": 98,
+ "70": 99,
+ "73": 100,
+ "78": 101,
+ "83": 102,
+ "89": 103
+ },
+ "unk_token": "[UNK]"
+ }
+ }
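
Taken together, the serialized pipeline is: Strip normalizer, WhitespaceSplit pre-tokenizer, WordLevel lookup with [UNK] fallback, and a TemplateProcessing step that appends [SEP]. A minimal sketch of exercising it with the tokenizers library (the local path is an assumption):

    from tokenizers import Tokenizer

    # Load the serialized pipeline added above (path assumed).
    tok = Tokenizer.from_file("tokenizer.json")

    # Strip -> WhitespaceSplit -> WordLevel lookup -> append [SEP] (id 3).
    enc = tok.encode("69 79 98")
    print(enc.tokens)  # ['69', '79', '98', '[SEP]']
    print(enc.ids)     # [5, 6, 7, 3]

Any whitespace-delimited piece outside the vocabulary should map to [UNK] (id 0), since the WordLevel model declares it as the fallback.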
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "tokenizer_class": "DistilBertTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
+ {"[UNK]":0,"[PAD]":1,"[CLS]":2,"[SEP]":3,"[MASK]":4,"69":5,"79":6,"98":7,"26":8,"29":9,"65":10,"86":11,"91":12,"92":13,"05":14,"67":15,"85":16,"03":17,"11":18,"21":19,"28":20,"45":21,"48":22,"50":23,"52":24,"57":25,"59":26,"63":27,"66":28,"76":29,"87":30,"02":31,"06":32,"17":33,"25":34,"35":35,"44":36,"51":37,"53":38,"55":39,"56":40,"62":41,"64":42,"71":43,"82":44,"88":45,"94":46,"95":47,"96":48,"07":49,"10":50,"12":51,"14":52,"18":53,"19":54,"20":55,"24":56,"41":57,"43":58,"49":59,"58":60,"61":61,"72":62,"74":63,"81":64,"90":65,"00":66,"08":67,"09":68,"23":69,"33":70,"34":71,"36":72,"39":73,"68":74,"97":75,"99":76,"13":77,"15":78,"22":79,"27":80,"30":81,"38":82,"47":83,"75":84,"77":85,"80":86,"93":87,"31":88,"32":89,"37":90,"40":91,"42":92,"46":93,"54":94,"01":95,"04":96,"16":97,"60":98,"70":99,"73":100,"78":101,"83":102,"89":103}