susnato committed on
Commit
ffe4a31
1 Parent(s): b068caa

Added files related to tokenizer

Browse files
added_tokens.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 255,
3
+ "[SPACE]": 2,
4
+ "[STOP]": 0,
5
+ "[UNK]": 1
6
+ }
merges.txt ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #version: 0.2
2
+ t h
3
+ i n
4
+ th e
5
+ a n
6
+ e r
7
+ o u
8
+ r e
9
+ o n
10
+ a t
11
+ e d
12
+ e n
13
+ t o
14
+ in g
15
+ an d
16
+ i s
17
+ a s
18
+ a l
19
+ o r
20
+ o f
21
+ a r
22
+ i t
23
+ e s
24
+ h e
25
+ s t
26
+ l e
27
+ o m
28
+ s e
29
+ b e
30
+ a d
31
+ o w
32
+ l y
33
+ c h
34
+ w h
35
+ th at
36
+ y ou
37
+ l i
38
+ v e
39
+ a c
40
+ t i
41
+ l d
42
+ m e
43
+ w as
44
+ g h
45
+ i d
46
+ l l
47
+ w i
48
+ en t
49
+ f or
50
+ a y
51
+ r o
52
+ v er
53
+ i c
54
+ h er
55
+ k e
56
+ h is
57
+ n o
58
+ u t
59
+ u n
60
+ i r
61
+ l o
62
+ w e
63
+ r i
64
+ h a
65
+ wi th
66
+ gh t
67
+ ou t
68
+ i m
69
+ i on
70
+ al l
71
+ a b
72
+ on e
73
+ n e
74
+ g e
75
+ ou ld
76
+ t er
77
+ m o
78
+ h ad
79
+ c e
80
+ s he
81
+ g o
82
+ s h
83
+ u r
84
+ a m
85
+ s o
86
+ p e
87
+ m y
88
+ d e
89
+ a re
90
+ b ut
91
+ om e
92
+ f r
93
+ the r
94
+ f e
95
+ s u
96
+ d o
97
+ c on
98
+ t e
99
+ a in
100
+ er e
101
+ p o
102
+ i f
103
+ the y
104
+ u s
105
+ a g
106
+ t r
107
+ n ow
108
+ ou n
109
+ th is
110
+ ha ve
111
+ no t
112
+ s a
113
+ i l
114
+ u p
115
+ th ing
116
+ fr om
117
+ a p
118
+ h im
119
+ ac k
120
+ at ion
121
+ an t
122
+ ou r
123
+ o p
124
+ li ke
125
+ u st
126
+ es s
127
+ b o
128
+ o k
129
+ u l
130
+ in d
131
+ e x
132
+ c om
133
+ s ome
134
+ the re
135
+ er s
136
+ c o
137
+ re s
138
+ m an
139
+ ar d
140
+ p l
141
+ w or
142
+ w ay
143
+ ti on
144
+ f o
145
+ c a
146
+ w ere
147
+ b y
148
+ at e
149
+ p ro
150
+ t ed
151
+ oun d
152
+ ow n
153
+ w ould
154
+ t s
155
+ wh at
156
+ q u
157
+ al ly
158
+ i ght
159
+ c k
160
+ g r
161
+ wh en
162
+ v en
163
+ c an
164
+ ou gh
165
+ in e
166
+ en d
167
+ p er
168
+ ou s
169
+ o d
170
+ id e
171
+ k now
172
+ t y
173
+ ver y
174
+ s i
175
+ a k
176
+ wh o
177
+ ab out
178
+ i ll
179
+ the m
180
+ es t
181
+ re d
182
+ y e
183
+ c ould
184
+ on g
185
+ you r
186
+ the ir
187
+ e m
188
+ j ust
189
+ o ther
190
+ in to
191
+ an y
192
+ wh i
193
+ u m
194
+ t w
195
+ as t
196
+ d er
197
+ d id
198
+ i e
199
+ be en
200
+ ac e
201
+ in k
202
+ it y
203
+ b ack
204
+ t ing
205
+ b r
206
+ mo re
207
+ a ke
208
+ p p
209
+ the n
210
+ s p
211
+ e l
212
+ u se
213
+ b l
214
+ sa id
215
+ o ver
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "[STOP]",
4
+ "pad_token": "[STOP]",
5
+ "unk_token": "[UNK]"
6
+ }
tokenizer.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"version":"1.0","truncation":null,"padding":null,"add_bos_token":true,"add_eos_token":true,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"hi
m":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w 
ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "[STOP]",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "1": {
15
+ "content": "[UNK]",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "2": {
23
+ "content": "[SPACE]",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "255": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "additional_special_tokens": [],
40
+ "bos_token": "<|endoftext|>",
41
+ "clean_up_tokenization_spaces": true,
42
+ "eos_token": "[STOP]",
43
+ "errors": "replace",
44
+ "model_max_length": 402,
45
+ "pad_token": "[STOP]",
46
+ "tokenizer_class": "ClvpTokenizer",
47
+ "tokenizer_file": "/home/susnato/.cache/huggingface/hub/models--susnato--clvp_dev/snapshots/75fdab928ec8ef23e0f66bb0bf70518989388456/tokenizer.json",
48
+ "unk_token": "[UNK]"
49
+ }
vocab.json ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "!": 3,
3
+ "'": 4,
4
+ "(": 5,
5
+ ")": 6,
6
+ ",": 7,
7
+ "-": 8,
8
+ ".": 9,
9
+ "/": 10,
10
+ ":": 11,
11
+ ";": 12,
12
+ "?": 13,
13
+ "[SPACE]": 2,
14
+ "[STOP]": 0,
15
+ "[UNK]": 1,
16
+ "a": 14,
17
+ "ab": 109,
18
+ "about": 215,
19
+ "ac": 77,
20
+ "ace": 238,
21
+ "ack": 157,
22
+ "ad": 68,
23
+ "ag": 143,
24
+ "ain": 137,
25
+ "ak": 213,
26
+ "ake": 245,
27
+ "al": 56,
28
+ "all": 108,
29
+ "ally": 195,
30
+ "am": 122,
31
+ "an": 43,
32
+ "and": 53,
33
+ "ant": 159,
34
+ "any": 229,
35
+ "ap": 155,
36
+ "ar": 59,
37
+ "ard": 177,
38
+ "are": 127,
39
+ "as": 55,
40
+ "ast": 233,
41
+ "at": 48,
42
+ "ate": 186,
43
+ "ation": 158,
44
+ "ay": 88,
45
+ "b": 15,
46
+ "back": 241,
47
+ "be": 67,
48
+ "been": 237,
49
+ "bl": 251,
50
+ "bo": 165,
51
+ "br": 243,
52
+ "but": 128,
53
+ "by": 185,
54
+ "c": 16,
55
+ "ca": 183,
56
+ "can": 201,
57
+ "ce": 117,
58
+ "ch": 71,
59
+ "ck": 197,
60
+ "co": 174,
61
+ "com": 170,
62
+ "con": 135,
63
+ "could": 221,
64
+ "d": 17,
65
+ "de": 126,
66
+ "der": 234,
67
+ "did": 235,
68
+ "do": 134,
69
+ "e": 18,
70
+ "ed": 49,
71
+ "el": 249,
72
+ "em": 225,
73
+ "en": 50,
74
+ "end": 204,
75
+ "ent": 86,
76
+ "er": 44,
77
+ "ere": 138,
78
+ "ers": 173,
79
+ "es": 61,
80
+ "ess": 164,
81
+ "est": 218,
82
+ "ex": 169,
83
+ "f": 19,
84
+ "fe": 132,
85
+ "fo": 182,
86
+ "for": 87,
87
+ "fr": 130,
88
+ "from": 154,
89
+ "g": 20,
90
+ "ge": 112,
91
+ "get": 254,
92
+ "gh": 82,
93
+ "ght": 104,
94
+ "go": 119,
95
+ "gr": 198,
96
+ "h": 21,
97
+ "ha": 102,
98
+ "had": 116,
99
+ "have": 148,
100
+ "he": 62,
101
+ "her": 92,
102
+ "him": 156,
103
+ "his": 94,
104
+ "i": 22,
105
+ "ic": 91,
106
+ "id": 83,
107
+ "ide": 208,
108
+ "ie": 236,
109
+ "if": 140,
110
+ "ight": 196,
111
+ "il": 151,
112
+ "ill": 216,
113
+ "im": 106,
114
+ "in": 41,
115
+ "ind": 168,
116
+ "ine": 203,
117
+ "ing": 52,
118
+ "ink": 239,
119
+ "into": 228,
120
+ "ion": 107,
121
+ "ir": 98,
122
+ "is": 54,
123
+ "it": 60,
124
+ "ity": 240,
125
+ "j": 23,
126
+ "just": 226,
127
+ "k": 24,
128
+ "ke": 93,
129
+ "know": 209,
130
+ "l": 25,
131
+ "ld": 79,
132
+ "le": 64,
133
+ "li": 75,
134
+ "like": 162,
135
+ "ll": 84,
136
+ "lo": 99,
137
+ "ly": 70,
138
+ "m": 26,
139
+ "man": 176,
140
+ "me": 80,
141
+ "mo": 115,
142
+ "more": 244,
143
+ "my": 125,
144
+ "n": 27,
145
+ "ne": 111,
146
+ "no": 95,
147
+ "not": 149,
148
+ "now": 145,
149
+ "o": 28,
150
+ "od": 207,
151
+ "of": 58,
152
+ "ok": 166,
153
+ "om": 65,
154
+ "ome": 129,
155
+ "on": 47,
156
+ "one": 110,
157
+ "ong": 222,
158
+ "op": 161,
159
+ "or": 57,
160
+ "other": 227,
161
+ "ou": 45,
162
+ "ough": 202,
163
+ "ould": 113,
164
+ "oun": 146,
165
+ "ound": 189,
166
+ "our": 160,
167
+ "ous": 206,
168
+ "out": 105,
169
+ "over": 253,
170
+ "ow": 69,
171
+ "own": 190,
172
+ "p": 29,
173
+ "pe": 124,
174
+ "per": 205,
175
+ "pl": 178,
176
+ "po": 139,
177
+ "pp": 246,
178
+ "pro": 187,
179
+ "q": 30,
180
+ "qu": 194,
181
+ "r": 31,
182
+ "re": 46,
183
+ "red": 219,
184
+ "res": 175,
185
+ "ri": 101,
186
+ "ro": 89,
187
+ "s": 32,
188
+ "sa": 150,
189
+ "said": 252,
190
+ "se": 66,
191
+ "sh": 120,
192
+ "she": 118,
193
+ "si": 212,
194
+ "so": 123,
195
+ "some": 171,
196
+ "sp": 248,
197
+ "st": 63,
198
+ "su": 133,
199
+ "t": 33,
200
+ "te": 136,
201
+ "ted": 188,
202
+ "ter": 114,
203
+ "th": 40,
204
+ "that": 73,
205
+ "the": 42,
206
+ "their": 224,
207
+ "them": 217,
208
+ "then": 247,
209
+ "ther": 131,
210
+ "there": 172,
211
+ "they": 141,
212
+ "thing": 153,
213
+ "this": 147,
214
+ "ti": 78,
215
+ "ting": 242,
216
+ "tion": 181,
217
+ "to": 51,
218
+ "tr": 144,
219
+ "ts": 192,
220
+ "tw": 232,
221
+ "ty": 210,
222
+ "u": 34,
223
+ "ul": 167,
224
+ "um": 231,
225
+ "un": 97,
226
+ "up": 152,
227
+ "ur": 121,
228
+ "us": 142,
229
+ "use": 250,
230
+ "ust": 163,
231
+ "ut": 96,
232
+ "v": 35,
233
+ "ve": 76,
234
+ "ven": 200,
235
+ "ver": 90,
236
+ "very": 211,
237
+ "w": 36,
238
+ "was": 81,
239
+ "way": 180,
240
+ "we": 100,
241
+ "were": 184,
242
+ "wh": 72,
243
+ "what": 193,
244
+ "when": 199,
245
+ "whi": 230,
246
+ "who": 214,
247
+ "wi": 85,
248
+ "with": 103,
249
+ "wor": 179,
250
+ "would": 191,
251
+ "x": 37,
252
+ "y": 38,
253
+ "ye": 220,
254
+ "you": 74,
255
+ "your": 223,
256
+ "z": 39
257
+ }