susnato committed on
Commit
75a9688
1 Parent(s): 75fdab9

Uploaded new saved tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 255,
3
+ "[SPACE]": 2,
4
+ "[STOP]": 0,
5
+ "[UNK]": 1
6
+ }
merges.txt CHANGED
@@ -213,4 +213,3 @@ u se
213
  b l
214
  sa id
215
  o ver
216
- ge t
 
213
  b l
214
  sa id
215
  o ver
 
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "[STOP]",
4
+ "pad_token": "[STOP]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": true,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "[STOP]",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "1": {
15
+ "content": "[UNK]",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "2": {
23
+ "content": "[SPACE]",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "255": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "additional_special_tokens": [],
40
+ "bos_token": "<|endoftext|>",
41
+ "clean_up_tokenization_spaces": true,
42
+ "eos_token": "[STOP]",
43
+ "errors": "replace",
44
+ "model_max_length": 1000000000000000019884624838656,
45
+ "pad_token": "[STOP]",
46
+ "tokenizer_class": "ClvpTokenizer",
47
+ "tokenizer_file": "/home/susnato/.cache/huggingface/hub/models--susnato--clvp_dev/snapshots/75fdab928ec8ef23e0f66bb0bf70518989388456/tokenizer.json",
48
+ "unk_token": "[UNK]"
49
+ }
vocab.json CHANGED
@@ -1 +1,257 @@
1
- {"te": 136, "con": 135, "wh": 72, "ong": 222, "pp": 246, "ough": 202, "from": 154, "there": 172, "in": 41, "ul": 167, "ca": 183, "said": 252, "who": 214, "ou": 45, "id": 83, "ter": 114, "were": 184, "m": 26, "up": 152, "ant": 159, "en": 50, "ind": 168, "ere": 138, "into": 228, "el": 249, "ow": 69, "whi": 230, "y": 38, "th": 40, "n": 27, "od": 207, "ity": 240, "ate": 186, "you": 74, "ag": 143, "as": 55, "for": 87, "ha": 102, "so": 123, "f": 19, "if": 140, "ty": 210, "em": 225, "u": 34, "der": 234, "at": 48, "ous": 206, "but": 128, "have": 148, "li": 75, "co": 174, "back": 241, "r": 31, "way": 180, "e": 18, "j": 23, "ur": 121, "mo": 115, "this": 147, "ab": 109, "k": 24, "me": 80, "ill": 216, "o": 28, "was": 81, "some": 171, "pe": 124, "lo": 99, "res": 175, "we": 100, "bl": 251, "p": 29, "what": 193, "[SPACE]": 2, "ad": 68, "t": 33, "ld": 79, "use": 250, "z": 39, "could": 221, "al": 56, "ve": 76, "ay": 88, "ke": 93, "ts": 192, "ac": 77, "ain": 137, "pl": 178, "ch": 71, "qu": 194, "ight": 196, "w": 36, "ation": 158, "ir": 98, "ine": 203, "ri": 101, "il": 151, "h": 21, "my": 125, "just": 226, "ound": 189, "can": 201, "when": 199, "red": 219, "'": 4, "tion": 181, "do": 134, "any": 229, "had": 116, "go": 119, "oun": 146, "-": 8, "their": 224, "b": 15, "es": 61, "ther": 131, "st": 63, "sh": 120, "ap": 155, "ok": 166, "i": 22, "per": 205, "of": 58, "er": 44, "now": 145, "ak": 213, "ting": 242, "with": 103, "they": 141, "been": 237, "ted": 188, "ll": 84, "ut": 96, "am": 122, "ome": 129, "more": 244, "su": 133, "a": 14, "ack": 157, "them": 217, "fo": 182, "know": 209, "by": 185, "po": 139, "ce": 117, ";": 12, "ally": 195, "then": 247, "s": 32, "no": 95, "se": 66, "about": 215, "im": 106, "ti": 78, "our": 160, "ye": 220, "br": 243, "the": 42, "x": 37, "ard": 177, "gr": 198, "ink": 239, "ge": 112, "own": 190, "ide": 208, "gh": 82, "fe": 132, "ven": 200, "!": 3, "did": 235, "(": 5, "very": 211, "wi": 85, "his": 94, ")": 6, "ne": 111, "sa": 150, "tw": 232, "ly": 70, "fr": 130, 
"man": 176, "get": 254, "that": 73, "om": 65, "si": 212, "ast": 233, "l": 25, "us": 142, "an": 43, "and": 53, "end": 204, "g": 20, "to": 51, "one": 110, "ould": 113, "ver": 90, "ro": 89, "op": 161, "sp": 248, "on": 47, "him": 156, "c": 16, "ar": 59, "out": 105, "/": 10, "or": 57, "le": 64, "ion": 107, "com": 170, "would": 191, "ed": 49, "all": 108, "ent": 86, "thing": 153, "q": 30, "it": 60, "[UNK]": 1, "her": 92, ",": 7, "ex": 169, "ake": 245, "ers": 173, "d": 17, "ing": 52, "not": 149, "[STOP]": 0, ".": 9, "like": 162, "she": 118, "be": 67, "ace": 238, "un": 97, "tr": 144, "are": 127, "v": 35, "ic": 91, "is": 54, "over": 253, "your": 223, "ght": 104, "?": 13, "ie": 236, "he": 62, "est": 218, "ust": 163, "re": 46, "wor": 179, ":": 11, "de": 126, "other": 227, "ess": 164, "pro": 187, "um": 231, "ck": 197, "bo": 165}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "!": 3,
3
+ "'": 4,
4
+ "(": 5,
5
+ ")": 6,
6
+ ",": 7,
7
+ "-": 8,
8
+ ".": 9,
9
+ "/": 10,
10
+ ":": 11,
11
+ ";": 12,
12
+ "?": 13,
13
+ "[SPACE]": 2,
14
+ "[STOP]": 0,
15
+ "[UNK]": 1,
16
+ "a": 14,
17
+ "ab": 109,
18
+ "about": 215,
19
+ "ac": 77,
20
+ "ace": 238,
21
+ "ack": 157,
22
+ "ad": 68,
23
+ "ag": 143,
24
+ "ain": 137,
25
+ "ak": 213,
26
+ "ake": 245,
27
+ "al": 56,
28
+ "all": 108,
29
+ "ally": 195,
30
+ "am": 122,
31
+ "an": 43,
32
+ "and": 53,
33
+ "ant": 159,
34
+ "any": 229,
35
+ "ap": 155,
36
+ "ar": 59,
37
+ "ard": 177,
38
+ "are": 127,
39
+ "as": 55,
40
+ "ast": 233,
41
+ "at": 48,
42
+ "ate": 186,
43
+ "ation": 158,
44
+ "ay": 88,
45
+ "b": 15,
46
+ "back": 241,
47
+ "be": 67,
48
+ "been": 237,
49
+ "bl": 251,
50
+ "bo": 165,
51
+ "br": 243,
52
+ "but": 128,
53
+ "by": 185,
54
+ "c": 16,
55
+ "ca": 183,
56
+ "can": 201,
57
+ "ce": 117,
58
+ "ch": 71,
59
+ "ck": 197,
60
+ "co": 174,
61
+ "com": 170,
62
+ "con": 135,
63
+ "could": 221,
64
+ "d": 17,
65
+ "de": 126,
66
+ "der": 234,
67
+ "did": 235,
68
+ "do": 134,
69
+ "e": 18,
70
+ "ed": 49,
71
+ "el": 249,
72
+ "em": 225,
73
+ "en": 50,
74
+ "end": 204,
75
+ "ent": 86,
76
+ "er": 44,
77
+ "ere": 138,
78
+ "ers": 173,
79
+ "es": 61,
80
+ "ess": 164,
81
+ "est": 218,
82
+ "ex": 169,
83
+ "f": 19,
84
+ "fe": 132,
85
+ "fo": 182,
86
+ "for": 87,
87
+ "fr": 130,
88
+ "from": 154,
89
+ "g": 20,
90
+ "ge": 112,
91
+ "get": 254,
92
+ "gh": 82,
93
+ "ght": 104,
94
+ "go": 119,
95
+ "gr": 198,
96
+ "h": 21,
97
+ "ha": 102,
98
+ "had": 116,
99
+ "have": 148,
100
+ "he": 62,
101
+ "her": 92,
102
+ "him": 156,
103
+ "his": 94,
104
+ "i": 22,
105
+ "ic": 91,
106
+ "id": 83,
107
+ "ide": 208,
108
+ "ie": 236,
109
+ "if": 140,
110
+ "ight": 196,
111
+ "il": 151,
112
+ "ill": 216,
113
+ "im": 106,
114
+ "in": 41,
115
+ "ind": 168,
116
+ "ine": 203,
117
+ "ing": 52,
118
+ "ink": 239,
119
+ "into": 228,
120
+ "ion": 107,
121
+ "ir": 98,
122
+ "is": 54,
123
+ "it": 60,
124
+ "ity": 240,
125
+ "j": 23,
126
+ "just": 226,
127
+ "k": 24,
128
+ "ke": 93,
129
+ "know": 209,
130
+ "l": 25,
131
+ "ld": 79,
132
+ "le": 64,
133
+ "li": 75,
134
+ "like": 162,
135
+ "ll": 84,
136
+ "lo": 99,
137
+ "ly": 70,
138
+ "m": 26,
139
+ "man": 176,
140
+ "me": 80,
141
+ "mo": 115,
142
+ "more": 244,
143
+ "my": 125,
144
+ "n": 27,
145
+ "ne": 111,
146
+ "no": 95,
147
+ "not": 149,
148
+ "now": 145,
149
+ "o": 28,
150
+ "od": 207,
151
+ "of": 58,
152
+ "ok": 166,
153
+ "om": 65,
154
+ "ome": 129,
155
+ "on": 47,
156
+ "one": 110,
157
+ "ong": 222,
158
+ "op": 161,
159
+ "or": 57,
160
+ "other": 227,
161
+ "ou": 45,
162
+ "ough": 202,
163
+ "ould": 113,
164
+ "oun": 146,
165
+ "ound": 189,
166
+ "our": 160,
167
+ "ous": 206,
168
+ "out": 105,
169
+ "over": 253,
170
+ "ow": 69,
171
+ "own": 190,
172
+ "p": 29,
173
+ "pe": 124,
174
+ "per": 205,
175
+ "pl": 178,
176
+ "po": 139,
177
+ "pp": 246,
178
+ "pro": 187,
179
+ "q": 30,
180
+ "qu": 194,
181
+ "r": 31,
182
+ "re": 46,
183
+ "red": 219,
184
+ "res": 175,
185
+ "ri": 101,
186
+ "ro": 89,
187
+ "s": 32,
188
+ "sa": 150,
189
+ "said": 252,
190
+ "se": 66,
191
+ "sh": 120,
192
+ "she": 118,
193
+ "si": 212,
194
+ "so": 123,
195
+ "some": 171,
196
+ "sp": 248,
197
+ "st": 63,
198
+ "su": 133,
199
+ "t": 33,
200
+ "te": 136,
201
+ "ted": 188,
202
+ "ter": 114,
203
+ "th": 40,
204
+ "that": 73,
205
+ "the": 42,
206
+ "their": 224,
207
+ "them": 217,
208
+ "then": 247,
209
+ "ther": 131,
210
+ "there": 172,
211
+ "they": 141,
212
+ "thing": 153,
213
+ "this": 147,
214
+ "ti": 78,
215
+ "ting": 242,
216
+ "tion": 181,
217
+ "to": 51,
218
+ "tr": 144,
219
+ "ts": 192,
220
+ "tw": 232,
221
+ "ty": 210,
222
+ "u": 34,
223
+ "ul": 167,
224
+ "um": 231,
225
+ "un": 97,
226
+ "up": 152,
227
+ "ur": 121,
228
+ "us": 142,
229
+ "use": 250,
230
+ "ust": 163,
231
+ "ut": 96,
232
+ "v": 35,
233
+ "ve": 76,
234
+ "ven": 200,
235
+ "ver": 90,
236
+ "very": 211,
237
+ "w": 36,
238
+ "was": 81,
239
+ "way": 180,
240
+ "we": 100,
241
+ "were": 184,
242
+ "wh": 72,
243
+ "what": 193,
244
+ "when": 199,
245
+ "whi": 230,
246
+ "who": 214,
247
+ "wi": 85,
248
+ "with": 103,
249
+ "wor": 179,
250
+ "would": 191,
251
+ "x": 37,
252
+ "y": 38,
253
+ "ye": 220,
254
+ "you": 74,
255
+ "your": 223,
256
+ "z": 39
257
+ }