shyamsn97 committed on
Commit
b57aa9d
1 Parent(s): a814a74

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +1 -0
  2. special_tokens_map.json +23 -0
  3. tokenizer.json +292 -0
  4. tokenizer_config.json +33 -0
  5. vocab.json +1 -0
merges.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [],
6
+ "normalizer": null,
7
+ "pre_tokenizer": {
8
+ "type": "ByteLevel",
9
+ "add_prefix_space": false,
10
+ "trim_offsets": true,
11
+ "use_regex": true
12
+ },
13
+ "post_processor": {
14
+ "type": "ByteLevel",
15
+ "add_prefix_space": true,
16
+ "trim_offsets": false,
17
+ "use_regex": true
18
+ },
19
+ "decoder": {
20
+ "type": "ByteLevel",
21
+ "add_prefix_space": true,
22
+ "trim_offsets": true,
23
+ "use_regex": true
24
+ },
25
+ "model": {
26
+ "type": "BPE",
27
+ "dropout": null,
28
+ "unk_token": null,
29
+ "continuing_subword_prefix": "",
30
+ "end_of_word_suffix": "",
31
+ "fuse_unk": false,
32
+ "vocab": {
33
+ "!": 0,
34
+ "\"": 1,
35
+ "#": 2,
36
+ "$": 3,
37
+ "%": 4,
38
+ "&": 5,
39
+ "'": 6,
40
+ "(": 7,
41
+ ")": 8,
42
+ "*": 9,
43
+ "+": 10,
44
+ ",": 11,
45
+ "-": 12,
46
+ ".": 13,
47
+ "/": 14,
48
+ "0": 15,
49
+ "1": 16,
50
+ "2": 17,
51
+ "3": 18,
52
+ "4": 19,
53
+ "5": 20,
54
+ "6": 21,
55
+ "7": 22,
56
+ "8": 23,
57
+ "9": 24,
58
+ ":": 25,
59
+ ";": 26,
60
+ "<": 27,
61
+ "=": 28,
62
+ ">": 29,
63
+ "?": 30,
64
+ "@": 31,
65
+ "A": 32,
66
+ "B": 33,
67
+ "C": 34,
68
+ "D": 35,
69
+ "E": 36,
70
+ "F": 37,
71
+ "G": 38,
72
+ "H": 39,
73
+ "I": 40,
74
+ "J": 41,
75
+ "K": 42,
76
+ "L": 43,
77
+ "M": 44,
78
+ "N": 45,
79
+ "O": 46,
80
+ "P": 47,
81
+ "Q": 48,
82
+ "R": 49,
83
+ "S": 50,
84
+ "T": 51,
85
+ "U": 52,
86
+ "V": 53,
87
+ "W": 54,
88
+ "X": 55,
89
+ "Y": 56,
90
+ "Z": 57,
91
+ "[": 58,
92
+ "\\": 59,
93
+ "]": 60,
94
+ "^": 61,
95
+ "_": 62,
96
+ "`": 63,
97
+ "a": 64,
98
+ "b": 65,
99
+ "c": 66,
100
+ "d": 67,
101
+ "e": 68,
102
+ "f": 69,
103
+ "g": 70,
104
+ "h": 71,
105
+ "i": 72,
106
+ "j": 73,
107
+ "k": 74,
108
+ "l": 75,
109
+ "m": 76,
110
+ "n": 77,
111
+ "o": 78,
112
+ "p": 79,
113
+ "q": 80,
114
+ "r": 81,
115
+ "s": 82,
116
+ "t": 83,
117
+ "u": 84,
118
+ "v": 85,
119
+ "w": 86,
120
+ "x": 87,
121
+ "y": 88,
122
+ "z": 89,
123
+ "{": 90,
124
+ "|": 91,
125
+ "}": 92,
126
+ "~": 93,
127
+ "¡": 94,
128
+ "¢": 95,
129
+ "£": 96,
130
+ "¤": 97,
131
+ "¥": 98,
132
+ "¦": 99,
133
+ "§": 100,
134
+ "¨": 101,
135
+ "©": 102,
136
+ "ª": 103,
137
+ "«": 104,
138
+ "¬": 105,
139
+ "®": 106,
140
+ "¯": 107,
141
+ "°": 108,
142
+ "±": 109,
143
+ "²": 110,
144
+ "³": 111,
145
+ "´": 112,
146
+ "µ": 113,
147
+ "¶": 114,
148
+ "·": 115,
149
+ "¸": 116,
150
+ "¹": 117,
151
+ "º": 118,
152
+ "»": 119,
153
+ "¼": 120,
154
+ "½": 121,
155
+ "¾": 122,
156
+ "¿": 123,
157
+ "À": 124,
158
+ "Á": 125,
159
+ "Â": 126,
160
+ "Ã": 127,
161
+ "Ä": 128,
162
+ "Å": 129,
163
+ "Æ": 130,
164
+ "Ç": 131,
165
+ "È": 132,
166
+ "É": 133,
167
+ "Ê": 134,
168
+ "Ë": 135,
169
+ "Ì": 136,
170
+ "Í": 137,
171
+ "Î": 138,
172
+ "Ï": 139,
173
+ "Ð": 140,
174
+ "Ñ": 141,
175
+ "Ò": 142,
176
+ "Ó": 143,
177
+ "Ô": 144,
178
+ "Õ": 145,
179
+ "Ö": 146,
180
+ "×": 147,
181
+ "Ø": 148,
182
+ "Ù": 149,
183
+ "Ú": 150,
184
+ "Û": 151,
185
+ "Ü": 152,
186
+ "Ý": 153,
187
+ "Þ": 154,
188
+ "ß": 155,
189
+ "à": 156,
190
+ "á": 157,
191
+ "â": 158,
192
+ "ã": 159,
193
+ "ä": 160,
194
+ "å": 161,
195
+ "æ": 162,
196
+ "ç": 163,
197
+ "è": 164,
198
+ "é": 165,
199
+ "ê": 166,
200
+ "ë": 167,
201
+ "ì": 168,
202
+ "í": 169,
203
+ "î": 170,
204
+ "ï": 171,
205
+ "ð": 172,
206
+ "ñ": 173,
207
+ "ò": 174,
208
+ "ó": 175,
209
+ "ô": 176,
210
+ "õ": 177,
211
+ "ö": 178,
212
+ "÷": 179,
213
+ "ø": 180,
214
+ "ù": 181,
215
+ "ú": 182,
216
+ "û": 183,
217
+ "ü": 184,
218
+ "ý": 185,
219
+ "þ": 186,
220
+ "ÿ": 187,
221
+ "Ā": 188,
222
+ "ā": 189,
223
+ "Ă": 190,
224
+ "ă": 191,
225
+ "Ą": 192,
226
+ "ą": 193,
227
+ "Ć": 194,
228
+ "ć": 195,
229
+ "Ĉ": 196,
230
+ "ĉ": 197,
231
+ "Ċ": 198,
232
+ "ċ": 199,
233
+ "Č": 200,
234
+ "č": 201,
235
+ "Ď": 202,
236
+ "ď": 203,
237
+ "Đ": 204,
238
+ "đ": 205,
239
+ "Ē": 206,
240
+ "ē": 207,
241
+ "Ĕ": 208,
242
+ "ĕ": 209,
243
+ "Ė": 210,
244
+ "ė": 211,
245
+ "Ę": 212,
246
+ "ę": 213,
247
+ "Ě": 214,
248
+ "ě": 215,
249
+ "Ĝ": 216,
250
+ "ĝ": 217,
251
+ "Ğ": 218,
252
+ "ğ": 219,
253
+ "Ġ": 220,
254
+ "ġ": 221,
255
+ "Ģ": 222,
256
+ "ģ": 223,
257
+ "Ĥ": 224,
258
+ "ĥ": 225,
259
+ "Ħ": 226,
260
+ "ħ": 227,
261
+ "Ĩ": 228,
262
+ "ĩ": 229,
263
+ "Ī": 230,
264
+ "ī": 231,
265
+ "Ĭ": 232,
266
+ "ĭ": 233,
267
+ "Į": 234,
268
+ "į": 235,
269
+ "İ": 236,
270
+ "ı": 237,
271
+ "Ĳ": 238,
272
+ "ĳ": 239,
273
+ "Ĵ": 240,
274
+ "ĵ": 241,
275
+ "Ķ": 242,
276
+ "ķ": 243,
277
+ "ĸ": 244,
278
+ "Ĺ": 245,
279
+ "ĺ": 246,
280
+ "Ļ": 247,
281
+ "ļ": 248,
282
+ "Ľ": 249,
283
+ "ľ": 250,
284
+ "Ŀ": 251,
285
+ "ŀ": 252,
286
+ "Ł": 253,
287
+ "ł": 254,
288
+ "Ń": 255
289
+ },
290
+ "merges": []
291
+ }
292
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "__type": "AddedToken",
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "errors": "replace",
21
+ "model_max_length": 1000000000000000019884624838656,
22
+ "pad_token": null,
23
+ "tokenizer_class": "GPT2Tokenizer",
24
+ "unk_token": {
25
+ "__type": "AddedToken",
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": true,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "vocab_size": 16
33
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"!":0,"\"":1,"#":2,"$":3,"%":4,"&":5,"'":6,"(":7,")":8,"*":9,"+":10,",":11,"-":12,".":13,"/":14,"0":15,"1":16,"2":17,"3":18,"4":19,"5":20,"6":21,"7":22,"8":23,"9":24,":":25,";":26,"<":27,"=":28,">":29,"?":30,"@":31,"A":32,"B":33,"C":34,"D":35,"E":36,"F":37,"G":38,"H":39,"I":40,"J":41,"K":42,"L":43,"M":44,"N":45,"O":46,"P":47,"Q":48,"R":49,"S":50,"T":51,"U":52,"V":53,"W":54,"X":55,"Y":56,"Z":57,"[":58,"\\":59,"]":60,"^":61,"_":62,"`":63,"a":64,"b":65,"c":66,"d":67,"e":68,"f":69,"g":70,"h":71,"i":72,"j":73,"k":74,"l":75,"m":76,"n":77,"o":78,"p":79,"q":80,"r":81,"s":82,"t":83,"u":84,"v":85,"w":86,"x":87,"y":88,"z":89,"{":90,"|":91,"}":92,"~":93,"¡":94,"¢":95,"£":96,"¤":97,"¥":98,"¦":99,"§":100,"¨":101,"©":102,"ª":103,"«":104,"¬":105,"®":106,"¯":107,"°":108,"±":109,"²":110,"³":111,"´":112,"µ":113,"¶":114,"·":115,"¸":116,"¹":117,"º":118,"»":119,"¼":120,"½":121,"¾":122,"¿":123,"À":124,"Á":125,"Â":126,"Ã":127,"Ä":128,"Å":129,"Æ":130,"Ç":131,"È":132,"É":133,"Ê":134,"Ë":135,"Ì":136,"Í":137,"Î":138,"Ï":139,"Ð":140,"Ñ":141,"Ò":142,"Ó":143,"Ô":144,"Õ":145,"Ö":146,"×":147,"Ø":148,"Ù":149,"Ú":150,"Û":151,"Ü":152,"Ý":153,"Þ":154,"ß":155,"à":156,"á":157,"â":158,"ã":159,"ä":160,"å":161,"æ":162,"ç":163,"è":164,"é":165,"ê":166,"ë":167,"ì":168,"í":169,"î":170,"ï":171,"ð":172,"ñ":173,"ò":174,"ó":175,"ô":176,"õ":177,"ö":178,"÷":179,"ø":180,"ù":181,"ú":182,"û":183,"ü":184,"ý":185,"þ":186,"ÿ":187,"Ā":188,"ā":189,"Ă":190,"ă":191,"Ą":192,"ą":193,"Ć":194,"ć":195,"Ĉ":196,"ĉ":197,"Ċ":198,"ċ":199,"Č":200,"č":201,"Ď":202,"ď":203,"Đ":204,"đ":205,"Ē":206,"ē":207,"Ĕ":208,"ĕ":209,"Ė":210,"ė":211,"Ę":212,"ę":213,"Ě":214,"ě":215,"Ĝ":216,"ĝ":217,"Ğ":218,"ğ":219,"Ġ":220,"ġ":221,"Ģ":222,"ģ":223,"Ĥ":224,"ĥ":225,"Ħ":226,"ħ":227,"Ĩ":228,"ĩ":229,"Ī":230,"ī":231,"Ĭ":232,"ĭ":233,"Į":234,"į":235,"İ":236,"ı":237,"Ĳ":238,"ĳ":239,"Ĵ":240,"ĵ":241,"Ķ":242,"ķ":243,"ĸ":244,"Ĺ":245,"ĺ":246,"Ļ":247,"ļ":248,"Ľ":249,"ľ":250,"Ŀ":251,"ŀ":252,"Ł":253,"ł":254,"Ń":255}