vldsavelyev committed on
Commit
c64f4f1
1 Parent(s): e4881f2

Upload tokenizer

Browse files
Files changed (5) hide show
  1. merges.txt +211 -0
  2. special_tokens_map.json +9 -0
  3. tokenizer.json +734 -0
  4. tokenizer_config.json +12 -0
  5. vocab.json +1 -0
merges.txt ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
2
+ Ġ (
3
+ Ġ 3
4
+ Ġ 0
5
+ Ġ |
6
+ Ġ 4
7
+ Ġ 1
8
+ Ġ r
9
+ 1 6
10
+ Ġ 5
11
+ Ġ 2
12
+ ) .
13
+ Ġ3 6
14
+ t u
15
+ Ġ 7
16
+ Ġ :
17
+ Ġ 6
18
+ } .
19
+ Ġ3 5
20
+ Ġ 8
21
+ Ġ4 0
22
+ Ġ 9
23
+ Ġ3 8
24
+ Ġ1 2
25
+ } )
26
+ Ġ4 2
27
+ Ġ1 0
28
+ p m
29
+ l r
30
+ 4 6
31
+ 4 2
32
+ 3 2
33
+ Ġ4 6
34
+ 3 6
35
+ } ).
36
+ 4 9
37
+ Ġ1 4
38
+ Ġ1 1
39
+ Ġ tu
40
+ Ġ1 5
41
+ 5 1
42
+ Ġ4 4
43
+ 1 0
44
+ Ġ5 1
45
+ 5 7
46
+ 3 8
47
+ Ġ1 3
48
+ Ġ1 7
49
+ 4 0
50
+ b e
51
+ 4 4
52
+ 1 2
53
+ 3 5
54
+ Ġ4 9
55
+ Ġ4 5
56
+ Ġ5 7
57
+ Ġ4 1
58
+ Ġ4 3
59
+ Ġ1 6
60
+ Ġ1 9
61
+ 1 1
62
+ Ġ5 4
63
+ 5 3
64
+ ) }
65
+ Ġ4 7
66
+ 5 9
67
+ 5 5
68
+ s t
69
+ Ġ5 9
70
+ 1 4
71
+ Ġ5 3
72
+ Ġ5 5
73
+ Ġ1 8
74
+ Ġ2 0
75
+ 6 4
76
+ ) }.
77
+ Ġ4 8
78
+ 4 5
79
+ 1 3
80
+ 4 1
81
+ 1 5
82
+ Ġ2 2
83
+ 4 3
84
+ a h
85
+ Ġ6 9
86
+ Ġ8 2
87
+ Ġ2 1
88
+ 1 7
89
+ 4 7
90
+ Ġ g
91
+ Ġ be
92
+ Ġ v
93
+ Ġ lr
94
+ Ġ3 7
95
+ 5 4
96
+ 5 2
97
+ Ġ5 0
98
+ Ġ2 4
99
+ Ġ7 0
100
+ n h
101
+ 4 8
102
+ Ġ3 9
103
+ 1 9
104
+ Ġ5 2
105
+ ) })
106
+ Ġ6 4
107
+ 2 0
108
+ Ġ6 3
109
+ 6 9
110
+ Ġ6 1
111
+ Ġ2 3
112
+ ) }).
113
+ Ġ2 7
114
+ Ġ3 3
115
+ 8 2
116
+ 1 8
117
+ 2 2
118
+ Ġ6 0
119
+ Ġ2 8
120
+ 5 0
121
+ Ġ3 1
122
+ 7 0
123
+ 2 1
124
+ Ġ h
125
+ Ġ6 2
126
+ 3 7
127
+ 2 4
128
+ Ġ2 5
129
+ Ġ f
130
+ Ġ5 6
131
+ Ġ8 1
132
+ Ġ8 7
133
+ Ġ2 9
134
+ Ġ2 6
135
+ 3 9
136
+ Ġ8 5
137
+ Ġ6 5
138
+ Ġ pm
139
+ Ġ ah
140
+ Ġ3 2
141
+ Ġ8 6
142
+ 5 6
143
+ 8 6
144
+ 3 3
145
+ Ġ3 4
146
+ 2 7
147
+ Ġ7 7
148
+ Ġ3 0
149
+ 6 2
150
+ 2 8
151
+ 6 0
152
+ 6 1
153
+ 2 3
154
+ Ġ8 0
155
+ s h
156
+ Ġ st
157
+ 6 3
158
+ 2 5
159
+ p h
160
+ Ġ7 5
161
+ 8 7
162
+ Ġ9 9
163
+ 3 1
164
+ Ġ7 6
165
+ Ġ nh
166
+ 2 6
167
+ 6 5
168
+ Ġ7 4
169
+ Ġ6 6
170
+ t r
171
+ t h
172
+ 7 6
173
+ 8 1
174
+ 2 9
175
+ Ġ6 8
176
+ 3 4
177
+ 7 7
178
+ Ġ8 3
179
+ Ġ6 7
180
+ Ġ8 4
181
+ 3 0
182
+ Ġ ph
183
+ 8 0
184
+ 8 4
185
+ Ġ7 3
186
+ 7 3
187
+ 7 5
188
+ Ġ sh
189
+ 7 4
190
+ 6 8
191
+ 5 8
192
+ Ġ5 8
193
+ 6 6
194
+ Ġ tr
195
+ Ġ7 1
196
+ Ġ9 1
197
+ 9 9
198
+ Ġ9 6
199
+ 6 7
200
+ Ġ7 9
201
+ 8 5
202
+ Ġ7 8
203
+ 8 3
204
+ Ġ7 2
205
+ Ġ9 4
206
+ Ġ th
207
+ 7 1
208
+ 7 9
209
+ 9 6
210
+ 7 2
211
+ 9 5
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|pad|>"
4
+ ],
5
+ "bos_token": "<|endoftext|>",
6
+ "eos_token": "<|endoftext|>",
7
+ "pad_token": "<|pad|>",
8
+ "unk_token": "<|endoftext|>"
9
+ }
tokenizer.json ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<|endoftext|>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<|pad|>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ }
24
+ ],
25
+ "normalizer": null,
26
+ "pre_tokenizer": {
27
+ "type": "ByteLevel",
28
+ "add_prefix_space": false,
29
+ "trim_offsets": true,
30
+ "use_regex": true
31
+ },
32
+ "post_processor": {
33
+ "type": "ByteLevel",
34
+ "add_prefix_space": true,
35
+ "trim_offsets": false,
36
+ "use_regex": true
37
+ },
38
+ "decoder": {
39
+ "type": "ByteLevel",
40
+ "add_prefix_space": true,
41
+ "trim_offsets": true,
42
+ "use_regex": true
43
+ },
44
+ "model": {
45
+ "type": "BPE",
46
+ "dropout": null,
47
+ "unk_token": null,
48
+ "continuing_subword_prefix": "",
49
+ "end_of_word_suffix": "",
50
+ "fuse_unk": false,
51
+ "vocab": {
52
+ "<|endoftext|>": 0,
53
+ "<|pad|>": 1,
54
+ "!": 2,
55
+ "\"": 3,
56
+ "#": 4,
57
+ "$": 5,
58
+ "%": 6,
59
+ "&": 7,
60
+ "'": 8,
61
+ "(": 9,
62
+ ")": 10,
63
+ "*": 11,
64
+ "+": 12,
65
+ ",": 13,
66
+ "-": 14,
67
+ ".": 15,
68
+ "/": 16,
69
+ "0": 17,
70
+ "1": 18,
71
+ "2": 19,
72
+ "3": 20,
73
+ "4": 21,
74
+ "5": 22,
75
+ "6": 23,
76
+ "7": 24,
77
+ "8": 25,
78
+ "9": 26,
79
+ ":": 27,
80
+ ";": 28,
81
+ "<": 29,
82
+ "=": 30,
83
+ ">": 31,
84
+ "?": 32,
85
+ "@": 33,
86
+ "A": 34,
87
+ "B": 35,
88
+ "C": 36,
89
+ "D": 37,
90
+ "E": 38,
91
+ "F": 39,
92
+ "G": 40,
93
+ "H": 41,
94
+ "I": 42,
95
+ "J": 43,
96
+ "K": 44,
97
+ "L": 45,
98
+ "M": 46,
99
+ "N": 47,
100
+ "O": 48,
101
+ "P": 49,
102
+ "Q": 50,
103
+ "R": 51,
104
+ "S": 52,
105
+ "T": 53,
106
+ "U": 54,
107
+ "V": 55,
108
+ "W": 56,
109
+ "X": 57,
110
+ "Y": 58,
111
+ "Z": 59,
112
+ "[": 60,
113
+ "\\": 61,
114
+ "]": 62,
115
+ "^": 63,
116
+ "_": 64,
117
+ "`": 65,
118
+ "a": 66,
119
+ "b": 67,
120
+ "c": 68,
121
+ "d": 69,
122
+ "e": 70,
123
+ "f": 71,
124
+ "g": 72,
125
+ "h": 73,
126
+ "i": 74,
127
+ "j": 75,
128
+ "k": 76,
129
+ "l": 77,
130
+ "m": 78,
131
+ "n": 79,
132
+ "o": 80,
133
+ "p": 81,
134
+ "q": 82,
135
+ "r": 83,
136
+ "s": 84,
137
+ "t": 85,
138
+ "u": 86,
139
+ "v": 87,
140
+ "w": 88,
141
+ "x": 89,
142
+ "y": 90,
143
+ "z": 91,
144
+ "{": 92,
145
+ "|": 93,
146
+ "}": 94,
147
+ "~": 95,
148
+ "¡": 96,
149
+ "¢": 97,
150
+ "£": 98,
151
+ "¤": 99,
152
+ "¥": 100,
153
+ "¦": 101,
154
+ "§": 102,
155
+ "¨": 103,
156
+ "©": 104,
157
+ "ª": 105,
158
+ "«": 106,
159
+ "¬": 107,
160
+ "®": 108,
161
+ "¯": 109,
162
+ "°": 110,
163
+ "±": 111,
164
+ "²": 112,
165
+ "³": 113,
166
+ "´": 114,
167
+ "µ": 115,
168
+ "¶": 116,
169
+ "·": 117,
170
+ "¸": 118,
171
+ "¹": 119,
172
+ "º": 120,
173
+ "»": 121,
174
+ "¼": 122,
175
+ "½": 123,
176
+ "¾": 124,
177
+ "¿": 125,
178
+ "À": 126,
179
+ "Á": 127,
180
+ "Â": 128,
181
+ "Ã": 129,
182
+ "Ä": 130,
183
+ "Å": 131,
184
+ "Æ": 132,
185
+ "Ç": 133,
186
+ "È": 134,
187
+ "É": 135,
188
+ "Ê": 136,
189
+ "Ë": 137,
190
+ "Ì": 138,
191
+ "Í": 139,
192
+ "Î": 140,
193
+ "Ï": 141,
194
+ "Ð": 142,
195
+ "Ñ": 143,
196
+ "Ò": 144,
197
+ "Ó": 145,
198
+ "Ô": 146,
199
+ "Õ": 147,
200
+ "Ö": 148,
201
+ "×": 149,
202
+ "Ø": 150,
203
+ "Ù": 151,
204
+ "Ú": 152,
205
+ "Û": 153,
206
+ "Ü": 154,
207
+ "Ý": 155,
208
+ "Þ": 156,
209
+ "ß": 157,
210
+ "à": 158,
211
+ "á": 159,
212
+ "â": 160,
213
+ "ã": 161,
214
+ "ä": 162,
215
+ "å": 163,
216
+ "æ": 164,
217
+ "ç": 165,
218
+ "è": 166,
219
+ "é": 167,
220
+ "ê": 168,
221
+ "ë": 169,
222
+ "ì": 170,
223
+ "í": 171,
224
+ "î": 172,
225
+ "ï": 173,
226
+ "ð": 174,
227
+ "ñ": 175,
228
+ "ò": 176,
229
+ "ó": 177,
230
+ "ô": 178,
231
+ "õ": 179,
232
+ "ö": 180,
233
+ "÷": 181,
234
+ "ø": 182,
235
+ "ù": 183,
236
+ "ú": 184,
237
+ "û": 185,
238
+ "ü": 186,
239
+ "ý": 187,
240
+ "þ": 188,
241
+ "ÿ": 189,
242
+ "Ā": 190,
243
+ "ā": 191,
244
+ "Ă": 192,
245
+ "ă": 193,
246
+ "Ą": 194,
247
+ "ą": 195,
248
+ "Ć": 196,
249
+ "ć": 197,
250
+ "Ĉ": 198,
251
+ "ĉ": 199,
252
+ "Ċ": 200,
253
+ "ċ": 201,
254
+ "Č": 202,
255
+ "č": 203,
256
+ "Ď": 204,
257
+ "ď": 205,
258
+ "Đ": 206,
259
+ "đ": 207,
260
+ "Ē": 208,
261
+ "ē": 209,
262
+ "Ĕ": 210,
263
+ "ĕ": 211,
264
+ "Ė": 212,
265
+ "ė": 213,
266
+ "Ę": 214,
267
+ "ę": 215,
268
+ "Ě": 216,
269
+ "ě": 217,
270
+ "Ĝ": 218,
271
+ "ĝ": 219,
272
+ "Ğ": 220,
273
+ "ğ": 221,
274
+ "Ġ": 222,
275
+ "ġ": 223,
276
+ "Ģ": 224,
277
+ "ģ": 225,
278
+ "Ĥ": 226,
279
+ "ĥ": 227,
280
+ "Ħ": 228,
281
+ "ħ": 229,
282
+ "Ĩ": 230,
283
+ "ĩ": 231,
284
+ "Ī": 232,
285
+ "ī": 233,
286
+ "Ĭ": 234,
287
+ "ĭ": 235,
288
+ "Į": 236,
289
+ "į": 237,
290
+ "İ": 238,
291
+ "ı": 239,
292
+ "Ĳ": 240,
293
+ "ĳ": 241,
294
+ "Ĵ": 242,
295
+ "ĵ": 243,
296
+ "Ķ": 244,
297
+ "ķ": 245,
298
+ "ĸ": 246,
299
+ "Ĺ": 247,
300
+ "ĺ": 248,
301
+ "Ļ": 249,
302
+ "ļ": 250,
303
+ "Ľ": 251,
304
+ "ľ": 252,
305
+ "Ŀ": 253,
306
+ "ŀ": 254,
307
+ "Ł": 255,
308
+ "ł": 256,
309
+ "Ń": 257,
310
+ "Ġ(": 258,
311
+ "Ġ3": 259,
312
+ "Ġ0": 260,
313
+ "Ġ|": 261,
314
+ "Ġ4": 262,
315
+ "Ġ1": 263,
316
+ "Ġr": 264,
317
+ "16": 265,
318
+ "Ġ5": 266,
319
+ "Ġ2": 267,
320
+ ").": 268,
321
+ "Ġ36": 269,
322
+ "tu": 270,
323
+ "Ġ7": 271,
324
+ "Ġ:": 272,
325
+ "Ġ6": 273,
326
+ "}.": 274,
327
+ "Ġ35": 275,
328
+ "Ġ8": 276,
329
+ "Ġ40": 277,
330
+ "Ġ9": 278,
331
+ "Ġ38": 279,
332
+ "Ġ12": 280,
333
+ "})": 281,
334
+ "Ġ42": 282,
335
+ "Ġ10": 283,
336
+ "pm": 284,
337
+ "lr": 285,
338
+ "46": 286,
339
+ "42": 287,
340
+ "32": 288,
341
+ "Ġ46": 289,
342
+ "36": 290,
343
+ "}).": 291,
344
+ "49": 292,
345
+ "Ġ14": 293,
346
+ "Ġ11": 294,
347
+ "Ġtu": 295,
348
+ "Ġ15": 296,
349
+ "51": 297,
350
+ "Ġ44": 298,
351
+ "10": 299,
352
+ "Ġ51": 300,
353
+ "57": 301,
354
+ "38": 302,
355
+ "Ġ13": 303,
356
+ "Ġ17": 304,
357
+ "40": 305,
358
+ "be": 306,
359
+ "44": 307,
360
+ "12": 308,
361
+ "35": 309,
362
+ "Ġ49": 310,
363
+ "Ġ45": 311,
364
+ "Ġ57": 312,
365
+ "Ġ41": 313,
366
+ "Ġ43": 314,
367
+ "Ġ16": 315,
368
+ "Ġ19": 316,
369
+ "11": 317,
370
+ "Ġ54": 318,
371
+ "53": 319,
372
+ ")}": 320,
373
+ "Ġ47": 321,
374
+ "59": 322,
375
+ "55": 323,
376
+ "st": 324,
377
+ "Ġ59": 325,
378
+ "14": 326,
379
+ "Ġ53": 327,
380
+ "Ġ55": 328,
381
+ "Ġ18": 329,
382
+ "Ġ20": 330,
383
+ "64": 331,
384
+ ")}.": 332,
385
+ "Ġ48": 333,
386
+ "45": 334,
387
+ "13": 335,
388
+ "41": 336,
389
+ "15": 337,
390
+ "Ġ22": 338,
391
+ "43": 339,
392
+ "ah": 340,
393
+ "Ġ69": 341,
394
+ "Ġ82": 342,
395
+ "Ġ21": 343,
396
+ "17": 344,
397
+ "47": 345,
398
+ "Ġg": 346,
399
+ "Ġbe": 347,
400
+ "Ġv": 348,
401
+ "Ġlr": 349,
402
+ "Ġ37": 350,
403
+ "54": 351,
404
+ "52": 352,
405
+ "Ġ50": 353,
406
+ "Ġ24": 354,
407
+ "Ġ70": 355,
408
+ "nh": 356,
409
+ "48": 357,
410
+ "Ġ39": 358,
411
+ "19": 359,
412
+ "Ġ52": 360,
413
+ ")})": 361,
414
+ "Ġ64": 362,
415
+ "20": 363,
416
+ "Ġ63": 364,
417
+ "69": 365,
418
+ "Ġ61": 366,
419
+ "Ġ23": 367,
420
+ ")}).": 368,
421
+ "Ġ27": 369,
422
+ "Ġ33": 370,
423
+ "82": 371,
424
+ "18": 372,
425
+ "22": 373,
426
+ "Ġ60": 374,
427
+ "Ġ28": 375,
428
+ "50": 376,
429
+ "Ġ31": 377,
430
+ "70": 378,
431
+ "21": 379,
432
+ "Ġh": 380,
433
+ "Ġ62": 381,
434
+ "37": 382,
435
+ "24": 383,
436
+ "Ġ25": 384,
437
+ "Ġf": 385,
438
+ "Ġ56": 386,
439
+ "Ġ81": 387,
440
+ "Ġ87": 388,
441
+ "Ġ29": 389,
442
+ "Ġ26": 390,
443
+ "39": 391,
444
+ "Ġ85": 392,
445
+ "Ġ65": 393,
446
+ "Ġpm": 394,
447
+ "Ġah": 395,
448
+ "Ġ32": 396,
449
+ "Ġ86": 397,
450
+ "56": 398,
451
+ "86": 399,
452
+ "33": 400,
453
+ "Ġ34": 401,
454
+ "27": 402,
455
+ "Ġ77": 403,
456
+ "Ġ30": 404,
457
+ "62": 405,
458
+ "28": 406,
459
+ "60": 407,
460
+ "61": 408,
461
+ "23": 409,
462
+ "Ġ80": 410,
463
+ "sh": 411,
464
+ "Ġst": 412,
465
+ "63": 413,
466
+ "25": 414,
467
+ "ph": 415,
468
+ "Ġ75": 416,
469
+ "87": 417,
470
+ "Ġ99": 418,
471
+ "31": 419,
472
+ "Ġ76": 420,
473
+ "Ġnh": 421,
474
+ "26": 422,
475
+ "65": 423,
476
+ "Ġ74": 424,
477
+ "Ġ66": 425,
478
+ "tr": 426,
479
+ "th": 427,
480
+ "76": 428,
481
+ "81": 429,
482
+ "29": 430,
483
+ "Ġ68": 431,
484
+ "34": 432,
485
+ "77": 433,
486
+ "Ġ83": 434,
487
+ "Ġ67": 435,
488
+ "Ġ84": 436,
489
+ "30": 437,
490
+ "Ġph": 438,
491
+ "80": 439,
492
+ "84": 440,
493
+ "Ġ73": 441,
494
+ "73": 442,
495
+ "75": 443,
496
+ "Ġsh": 444,
497
+ "74": 445,
498
+ "68": 446,
499
+ "58": 447,
500
+ "Ġ58": 448,
501
+ "66": 449,
502
+ "Ġtr": 450,
503
+ "Ġ71": 451,
504
+ "Ġ91": 452,
505
+ "99": 453,
506
+ "Ġ96": 454,
507
+ "67": 455,
508
+ "Ġ79": 456,
509
+ "85": 457,
510
+ "Ġ78": 458,
511
+ "83": 459,
512
+ "Ġ72": 460,
513
+ "Ġ94": 461,
514
+ "Ġth": 462,
515
+ "71": 463,
516
+ "79": 464,
517
+ "96": 465,
518
+ "72": 466,
519
+ "95": 467
520
+ },
521
+ "merges": [
522
+ "Ġ (",
523
+ "Ġ 3",
524
+ "Ġ 0",
525
+ "Ġ |",
526
+ "Ġ 4",
527
+ "Ġ 1",
528
+ "Ġ r",
529
+ "1 6",
530
+ "Ġ 5",
531
+ "Ġ 2",
532
+ ") .",
533
+ "Ġ3 6",
534
+ "t u",
535
+ "Ġ 7",
536
+ "Ġ :",
537
+ "Ġ 6",
538
+ "} .",
539
+ "Ġ3 5",
540
+ "Ġ 8",
541
+ "Ġ4 0",
542
+ "Ġ 9",
543
+ "Ġ3 8",
544
+ "Ġ1 2",
545
+ "} )",
546
+ "Ġ4 2",
547
+ "Ġ1 0",
548
+ "p m",
549
+ "l r",
550
+ "4 6",
551
+ "4 2",
552
+ "3 2",
553
+ "Ġ4 6",
554
+ "3 6",
555
+ "} ).",
556
+ "4 9",
557
+ "Ġ1 4",
558
+ "Ġ1 1",
559
+ "Ġ tu",
560
+ "Ġ1 5",
561
+ "5 1",
562
+ "Ġ4 4",
563
+ "1 0",
564
+ "Ġ5 1",
565
+ "5 7",
566
+ "3 8",
567
+ "Ġ1 3",
568
+ "Ġ1 7",
569
+ "4 0",
570
+ "b e",
571
+ "4 4",
572
+ "1 2",
573
+ "3 5",
574
+ "Ġ4 9",
575
+ "Ġ4 5",
576
+ "Ġ5 7",
577
+ "Ġ4 1",
578
+ "Ġ4 3",
579
+ "Ġ1 6",
580
+ "Ġ1 9",
581
+ "1 1",
582
+ "Ġ5 4",
583
+ "5 3",
584
+ ") }",
585
+ "Ġ4 7",
586
+ "5 9",
587
+ "5 5",
588
+ "s t",
589
+ "Ġ5 9",
590
+ "1 4",
591
+ "Ġ5 3",
592
+ "Ġ5 5",
593
+ "Ġ1 8",
594
+ "Ġ2 0",
595
+ "6 4",
596
+ ") }.",
597
+ "Ġ4 8",
598
+ "4 5",
599
+ "1 3",
600
+ "4 1",
601
+ "1 5",
602
+ "Ġ2 2",
603
+ "4 3",
604
+ "a h",
605
+ "Ġ6 9",
606
+ "Ġ8 2",
607
+ "Ġ2 1",
608
+ "1 7",
609
+ "4 7",
610
+ "Ġ g",
611
+ "Ġ be",
612
+ "Ġ v",
613
+ "Ġ lr",
614
+ "Ġ3 7",
615
+ "5 4",
616
+ "5 2",
617
+ "Ġ5 0",
618
+ "Ġ2 4",
619
+ "Ġ7 0",
620
+ "n h",
621
+ "4 8",
622
+ "Ġ3 9",
623
+ "1 9",
624
+ "Ġ5 2",
625
+ ") })",
626
+ "Ġ6 4",
627
+ "2 0",
628
+ "Ġ6 3",
629
+ "6 9",
630
+ "Ġ6 1",
631
+ "Ġ2 3",
632
+ ") }).",
633
+ "Ġ2 7",
634
+ "Ġ3 3",
635
+ "8 2",
636
+ "1 8",
637
+ "2 2",
638
+ "Ġ6 0",
639
+ "Ġ2 8",
640
+ "5 0",
641
+ "Ġ3 1",
642
+ "7 0",
643
+ "2 1",
644
+ "Ġ h",
645
+ "Ġ6 2",
646
+ "3 7",
647
+ "2 4",
648
+ "Ġ2 5",
649
+ "Ġ f",
650
+ "Ġ5 6",
651
+ "Ġ8 1",
652
+ "Ġ8 7",
653
+ "Ġ2 9",
654
+ "Ġ2 6",
655
+ "3 9",
656
+ "Ġ8 5",
657
+ "Ġ6 5",
658
+ "Ġ pm",
659
+ "Ġ ah",
660
+ "Ġ3 2",
661
+ "Ġ8 6",
662
+ "5 6",
663
+ "8 6",
664
+ "3 3",
665
+ "Ġ3 4",
666
+ "2 7",
667
+ "Ġ7 7",
668
+ "Ġ3 0",
669
+ "6 2",
670
+ "2 8",
671
+ "6 0",
672
+ "6 1",
673
+ "2 3",
674
+ "Ġ8 0",
675
+ "s h",
676
+ "Ġ st",
677
+ "6 3",
678
+ "2 5",
679
+ "p h",
680
+ "Ġ7 5",
681
+ "8 7",
682
+ "Ġ9 9",
683
+ "3 1",
684
+ "Ġ7 6",
685
+ "Ġ nh",
686
+ "2 6",
687
+ "6 5",
688
+ "Ġ7 4",
689
+ "Ġ6 6",
690
+ "t r",
691
+ "t h",
692
+ "7 6",
693
+ "8 1",
694
+ "2 9",
695
+ "Ġ6 8",
696
+ "3 4",
697
+ "7 7",
698
+ "Ġ8 3",
699
+ "Ġ6 7",
700
+ "Ġ8 4",
701
+ "3 0",
702
+ "Ġ ph",
703
+ "8 0",
704
+ "8 4",
705
+ "Ġ7 3",
706
+ "7 3",
707
+ "7 5",
708
+ "Ġ sh",
709
+ "7 4",
710
+ "6 8",
711
+ "5 8",
712
+ "Ġ5 8",
713
+ "6 6",
714
+ "Ġ tr",
715
+ "Ġ7 1",
716
+ "Ġ9 1",
717
+ "9 9",
718
+ "Ġ9 6",
719
+ "6 7",
720
+ "Ġ7 9",
721
+ "8 5",
722
+ "Ġ7 8",
723
+ "8 3",
724
+ "Ġ7 2",
725
+ "Ġ9 4",
726
+ "Ġ th",
727
+ "7 1",
728
+ "7 9",
729
+ "9 6",
730
+ "7 2",
731
+ "9 5"
732
+ ]
733
+ }
734
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "additional_special_tokens": [
4
+ "<|pad|>"
5
+ ],
6
+ "bos_token": "<|endoftext|>",
7
+ "eos_token": "<|endoftext|>",
8
+ "model_max_length": 1024,
9
+ "special_tokens_map_file": null,
10
+ "tokenizer_class": "GPT2Tokenizer",
11
+ "unk_token": "<|endoftext|>"
12
+ }
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<|endoftext|>":0,"<|pad|>":1,"!":2,"\"":3,"#":4,"$":5,"%":6,"&":7,"'":8,"(":9,")":10,"*":11,"+":12,",":13,"-":14,".":15,"/":16,"0":17,"1":18,"2":19,"3":20,"4":21,"5":22,"6":23,"7":24,"8":25,"9":26,":":27,";":28,"<":29,"=":30,">":31,"?":32,"@":33,"A":34,"B":35,"C":36,"D":37,"E":38,"F":39,"G":40,"H":41,"I":42,"J":43,"K":44,"L":45,"M":46,"N":47,"O":48,"P":49,"Q":50,"R":51,"S":52,"T":53,"U":54,"V":55,"W":56,"X":57,"Y":58,"Z":59,"[":60,"\\":61,"]":62,"^":63,"_":64,"`":65,"a":66,"b":67,"c":68,"d":69,"e":70,"f":71,"g":72,"h":73,"i":74,"j":75,"k":76,"l":77,"m":78,"n":79,"o":80,"p":81,"q":82,"r":83,"s":84,"t":85,"u":86,"v":87,"w":88,"x":89,"y":90,"z":91,"{":92,"|":93,"}":94,"~":95,"¡":96,"¢":97,"£":98,"¤":99,"¥":100,"¦":101,"§":102,"¨":103,"©":104,"ª":105,"«":106,"¬":107,"®":108,"¯":109,"°":110,"±":111,"²":112,"³":113,"´":114,"µ":115,"¶":116,"·":117,"¸":118,"¹":119,"º":120,"»":121,"¼":122,"½":123,"¾":124,"¿":125,"À":126,"Á":127,"Â":128,"Ã":129,"Ä":130,"Å":131,"Æ":132,"Ç":133,"È":134,"É":135,"Ê":136,"Ë":137,"Ì":138,"Í":139,"Î":140,"Ï":141,"Ð":142,"Ñ":143,"Ò":144,"Ó":145,"Ô":146,"Õ":147,"Ö":148,"×":149,"Ø":150,"Ù":151,"Ú":152,"Û":153,"Ü":154,"Ý":155,"Þ":156,"ß":157,"à":158,"á":159,"â":160,"ã":161,"ä":162,"å":163,"æ":164,"ç":165,"è":166,"é":167,"ê":168,"ë":169,"ì":170,"í":171,"î":172,"ï":173,"ð":174,"ñ":175,"ò":176,"ó":177,"ô":178,"õ":179,"ö":180,"÷":181,"ø":182,"ù":183,"ú":184,"û":185,"ü":186,"ý":187,"þ":188,"ÿ":189,"Ā":190,"ā":191,"Ă":192,"ă":193,"Ą":194,"ą":195,"Ć":196,"ć":197,"Ĉ":198,"ĉ":199,"Ċ":200,"ċ":201,"Č":202,"č":203,"Ď":204,"ď":205,"Đ":206,"đ":207,"Ē":208,"ē":209,"Ĕ":210,"ĕ":211,"Ė":212,"ė":213,"Ę":214,"ę":215,"Ě":216,"ě":217,"Ĝ":218,"ĝ":219,"Ğ":220,"ğ":221,"Ġ":222,"ġ":223,"Ģ":224,"ģ":225,"Ĥ":226,"ĥ":227,"Ħ":228,"ħ":229,"Ĩ":230,"ĩ":231,"Ī":232,"ī":233,"Ĭ":234,"ĭ":235,"Į":236,"į":237,"İ":238,"ı":239,"Ĳ":240,"ĳ":241,"Ĵ":242,"ĵ":243,"Ķ":244,"ķ":245,"ĸ":246,"Ĺ":247,"ĺ":248,"Ļ":249,"ļ":250,"Ľ":251,"ľ":252,"Ŀ":253,"ŀ":254,"Ł":255,"ł":256,"Ń":257,"Ġ(":258,"Ġ3":259,"Ġ0":260,"Ġ|":261,"Ġ4":262,"Ġ1":263,"Ġr":264,"16":265,"Ġ5":266,"Ġ2":267,").":268,"Ġ36":269,"tu":270,"Ġ7":271,"Ġ:":272,"Ġ6":273,"}.":274,"Ġ35":275,"Ġ8":276,"Ġ40":277,"Ġ9":278,"Ġ38":279,"Ġ12":280,"})":281,"Ġ42":282,"Ġ10":283,"pm":284,"lr":285,"46":286,"42":287,"32":288,"Ġ46":289,"36":290,"}).":291,"49":292,"Ġ14":293,"Ġ11":294,"Ġtu":295,"Ġ15":296,"51":297,"Ġ44":298,"10":299,"Ġ51":300,"57":301,"38":302,"Ġ13":303,"Ġ17":304,"40":305,"be":306,"44":307,"12":308,"35":309,"Ġ49":310,"Ġ45":311,"Ġ57":312,"Ġ41":313,"Ġ43":314,"Ġ16":315,"Ġ19":316,"11":317,"Ġ54":318,"53":319,")}":320,"Ġ47":321,"59":322,"55":323,"st":324,"Ġ59":325,"14":326,"Ġ53":327,"Ġ55":328,"Ġ18":329,"Ġ20":330,"64":331,")}.":332,"Ġ48":333,"45":334,"13":335,"41":336,"15":337,"Ġ22":338,"43":339,"ah":340,"Ġ69":341,"Ġ82":342,"Ġ21":343,"17":344,"47":345,"Ġg":346,"Ġbe":347,"Ġv":348,"Ġlr":349,"Ġ37":350,"54":351,"52":352,"Ġ50":353,"Ġ24":354,"Ġ70":355,"nh":356,"48":357,"Ġ39":358,"19":359,"Ġ52":360,")})":361,"Ġ64":362,"20":363,"Ġ63":364,"69":365,"Ġ61":366,"Ġ23":367,")}).":368,"Ġ27":369,"Ġ33":370,"82":371,"18":372,"22":373,"Ġ60":374,"Ġ28":375,"50":376,"Ġ31":377,"70":378,"21":379,"Ġh":380,"Ġ62":381,"37":382,"24":383,"Ġ25":384,"Ġf":385,"Ġ56":386,"Ġ81":387,"Ġ87":388,"Ġ29":389,"Ġ26":390,"39":391,"Ġ85":392,"Ġ65":393,"Ġpm":394,"Ġah":395,"Ġ32":396,"Ġ86":397,"56":398,"86":399,"33":400,"Ġ34":401,"27":402,"Ġ77":403,"Ġ30":404,"62":405,"28":406,"60":407,"61":408,"23":409,"Ġ80":410,"sh":411,"Ġst":412,"63":413,"25":414,"ph":415,"Ġ75":416,"87":417,"Ġ99":418,"31":419,"Ġ76":420,"Ġnh":421,"26":422,"65":423,"Ġ74":424,"Ġ66":425,"tr":426,"th":427,"76":428,"81":429,"29":430,"Ġ68":431,"34":432,"77":433,"Ġ83":434,"Ġ67":435,"Ġ84":436,"30":437,"Ġph":438,"80":439,"84":440,"Ġ73":441,"73":442,"75":443,"Ġsh":444,"74":445,"68":446,"58":447,"Ġ58":448,"66":449,"Ġtr":450,"Ġ71":451,"Ġ91":452,"99":453,"Ġ96":454,"67":455,"Ġ79":456,"85":457,"Ġ78":458,"83":459,"Ġ72":460,"Ġ94":461,"Ġth":462,"71":463,"79":464,"96":465,"72":466,"95":467}