seny1004 committed on
Commit
af70e45
•
1 Parent(s): 1622ebb

Upload tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +4 -0
  2. special_tokens_map.json +22 -0
  3. tokenizer_config.json +13 -0
  4. vocab.json +359 -0
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 356,
3
+ "<s>": 355
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<s>",
19
+ "eos_token": "</s>",
20
+ "pad_token": "[PAD]",
21
+ "unk_token": "[UNK]"
22
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "clean_up_tokenization_spaces": true,
4
+ "do_lower_case": false,
5
+ "eos_token": "</s>",
6
+ "model_max_length": 1000000000000000019884624838656,
7
+ "pad_token": "[PAD]",
8
+ "replace_word_delimiter_char": " ",
9
+ "target_lang": "kor",
10
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
11
+ "unk_token": "[UNK]",
12
+ "word_delimiter_token": "|"
13
+ }
vocab.json ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "kor": {
3
+ "*": 1,
4
+ "0": 2,
5
+ "1": 3,
6
+ "2": 4,
7
+ "3": 5,
8
+ "4": 6,
9
+ "5": 7,
10
+ "6": 8,
11
+ "7": 9,
12
+ "8": 10,
13
+ "9": 11,
14
+ "[PAD]": 354,
15
+ "[UNK]": 353,
16
+ "a": 12,
17
+ "b": 13,
18
+ "c": 14,
19
+ "d": 15,
20
+ "e": 16,
21
+ "f": 17,
22
+ "g": 18,
23
+ "h": 19,
24
+ "i": 20,
25
+ "j": 21,
26
+ "k": 22,
27
+ "l": 23,
28
+ "m": 24,
29
+ "n": 25,
30
+ "o": 26,
31
+ "p": 27,
32
+ "q": 28,
33
+ "r": 29,
34
+ "s": 30,
35
+ "t": 31,
36
+ "u": 32,
37
+ "v": 33,
38
+ "w": 34,
39
+ "x": 35,
40
+ "y": 36,
41
+ "|": 0,
42
+ "ㄱ": 37,
43
+ "ㄲ": 38,
44
+ "ㄳ": 39,
45
+ "ㄴ": 40,
46
+ "ㄵ": 41,
47
+ "ㄶ": 42,
48
+ "ㄷ": 43,
49
+ "ㄹ": 44,
50
+ "ㄺ": 45,
51
+ "ㄻ": 46,
52
+ "ㄼ": 47,
53
+ "ㄾ": 48,
54
+ "ㅀ": 49,
55
+ "ㅁ": 50,
56
+ "ㅂ": 51,
57
+ "ㅄ": 52,
58
+ "ㅅ": 53,
59
+ "ㅆ": 54,
60
+ "ㅇ": 55,
61
+ "ㅈ": 56,
62
+ "ㅊ": 57,
63
+ "ㅋ": 58,
64
+ "ㅌ": 59,
65
+ "ㅍ": 60,
66
+ "ㅎ": 61,
67
+ "가": 62,
68
+ "개": 63,
69
+ "갸": 64,
70
+ "걔": 65,
71
+ "거": 66,
72
+ "게": 67,
73
+ "겨": 68,
74
+ "계": 69,
75
+ "고": 70,
76
+ "과": 71,
77
+ "괘": 72,
78
+ "괴": 73,
79
+ "교": 74,
80
+ "구": 75,
81
+ "궈": 76,
82
+ "궤": 77,
83
+ "귀": 78,
84
+ "규": 79,
85
+ "그": 80,
86
+ "기": 81,
87
+ "๊นŒ": 82,
88
+ "๊นจ": 83,
89
+ "๊บ ": 84,
90
+ "๊บผ": 85,
91
+ "๊ป˜": 86,
92
+ "๊ปด": 87,
93
+ "๊ผ": 88,
94
+ "๊ผฌ": 89,
95
+ "๊ฝˆ": 90,
96
+ "๊ฝค": 91,
97
+ "๊พ€": 92,
98
+ "๊พธ": 93,
99
+ "๊ฟ”": 94,
100
+ "๊ฟฐ": 95,
101
+ "๋€Œ": 96,
102
+ "๋„": 97,
103
+ "๋ผ": 98,
104
+ "๋‚˜": 99,
105
+ "๋‚ด": 100,
106
+ "๋ƒ": 101,
107
+ "๋ƒฌ": 102,
108
+ "๋„ˆ": 103,
109
+ "๋„ค": 104,
110
+ "๋…€": 105,
111
+ "๋…œ": 106,
112
+ "๋…ธ": 107,
113
+ "๋†”": 108,
114
+ "๋‡Œ": 109,
115
+ "๋‡จ": 110,
116
+ "๋ˆ„": 111,
117
+ "๋ˆ ": 112,
118
+ "๋‰˜": 113,
119
+ "๋‰ด": 114,
120
+ "๋Š": 115,
121
+ "๋Šฌ": 116,
122
+ "๋‹ˆ": 117,
123
+ "๋‹ค": 118,
124
+ "๋Œ€": 119,
125
+ "๋”": 120,
126
+ "๋ฐ": 121,
127
+ "๋ŽŒ": 122,
128
+ "๋„": 123,
129
+ "๋ผ": 124,
130
+ "๋˜": 125,
131
+ "๋‘": 126,
132
+ "๋‘ฌ": 127,
133
+ "๋’ค": 128,
134
+ "๋“€": 129,
135
+ "๋“œ": 130,
136
+ "๋””": 131,
137
+ "๋”ฐ": 132,
138
+ "๋•Œ": 133,
139
+ "๋–„": 134,
140
+ "๋– ": 135,
141
+ "๋–ผ": 136,
142
+ "๋—˜": 137,
143
+ "๋˜": 138,
144
+ "๋šœ": 139,
145
+ "๋›ฐ": 140,
146
+ "๋œจ": 141,
147
+ "๋„": 142,
148
+ "๋ ": 143,
149
+ "๋ผ": 144,
150
+ "๋ž˜": 145,
151
+ "๋žด": 146,
152
+ "๋Ÿฌ": 147,
153
+ "๋ ˆ": 148,
154
+ "๋ ค": 149,
155
+ "๋ก€": 150,
156
+ "๋กœ": 151,
157
+ "๋ขฐ": 152,
158
+ "๋ฃŒ": 153,
159
+ "๋ฃจ": 154,
160
+ "๋ค„": 155,
161
+ "๋ฅ˜": 156,
162
+ "๋ฅด": 157,
163
+ "๋ฆฌ": 158,
164
+ "๋งˆ": 159,
165
+ "๋งค": 160,
166
+ "๋จธ": 161,
167
+ "๋ฉ”": 162,
168
+ "๋ฉฐ": 163,
169
+ "๋ชจ": 164,
170
+ "๋ฌ˜": 165,
171
+ "๋ฌด": 166,
172
+ "๋ญ": 167,
173
+ "๋ฎค": 168,
174
+ "๋ฏ€": 169,
175
+ "๋ฏธ": 170,
176
+ "๋ฐ”": 171,
177
+ "๋ฐฐ": 172,
178
+ "๋ฒ„": 173,
179
+ "๋ฒ ": 174,
180
+ "๋ฒผ": 175,
181
+ "๋ณด": 176,
182
+ "๋ด": 177,
183
+ "๋ดฌ": 178,
184
+ "๋ตˆ": 179,
185
+ "๋ถ€": 180,
186
+ "๋ท”": 181,
187
+ "๋ทฐ": 182,
188
+ "๋ธŒ": 183,
189
+ "๋น„": 184,
190
+ "๋น ": 185,
191
+ "๋นผ": 186,
192
+ "๋ป": 187,
193
+ "๋ผˆ": 188,
194
+ "๋ฝ€": 189,
195
+ "๋พฐ": 190,
196
+ "๋ฟŒ": 191,
197
+ "์˜": 192,
198
+ "์‚": 193,
199
+ "์‚ฌ": 194,
200
+ "์ƒˆ": 195,
201
+ "์ƒค": 196,
202
+ "์„€": 197,
203
+ "์„œ": 198,
204
+ "์„ธ": 199,
205
+ "์…”": 200,
206
+ "์…ฐ": 201,
207
+ "์†Œ": 202,
208
+ "์‡„": 203,
209
+ "์‡ ": 204,
210
+ "์‡ผ": 205,
211
+ "์ˆ˜": 206,
212
+ "์ˆด": 207,
213
+ "์‰": 208,
214
+ "์‰ฌ": 209,
215
+ "์Šˆ": 210,
216
+ "์Šค": 211,
217
+ "์‹œ": 212,
218
+ "์‹ธ": 213,
219
+ "์Œ”": 214,
220
+ "์Œฐ": 215,
221
+ "์จ": 216,
222
+ "์Ž„": 217,
223
+ "์˜": 218,
224
+ "์ด": 219,
225
+ "์": 220,
226
+ "์‘ค": 221,
227
+ "์“ฐ": 222,
228
+ "์”Œ": 223,
229
+ "์”จ": 224,
230
+ "์•„": 225,
231
+ "์• ": 226,
232
+ "์•ผ": 227,
233
+ "์–˜": 228,
234
+ "์–ด": 229,
235
+ "์—": 230,
236
+ "์—ฌ": 231,
237
+ "์˜ˆ": 232,
238
+ "์˜ค": 233,
239
+ "์™€": 234,
240
+ "์™œ": 235,
241
+ "์™ธ": 236,
242
+ "์š”": 237,
243
+ "์šฐ": 238,
244
+ "์›Œ": 239,
245
+ "์›จ": 240,
246
+ "์œ„": 241,
247
+ "์œ ": 242,
248
+ "์œผ": 243,
249
+ "์˜": 244,
250
+ "์ด": 245,
251
+ "์ž": 246,
252
+ "์žฌ": 247,
253
+ "์Ÿˆ": 248,
254
+ "์Ÿค": 249,
255
+ "์ €": 250,
256
+ "์ œ": 251,
257
+ "์ ธ": 252,
258
+ "์กฐ": 253,
259
+ "์ขŒ": 254,
260
+ "์ฃ„": 255,
261
+ "์ฃ ": 256,
262
+ "์ฃผ": 257,
263
+ "์ค˜": 258,
264
+ "์ฅ": 259,
265
+ "์ฅฌ": 260,
266
+ "์ฆˆ": 261,
267
+ "์ง€": 262,
268
+ "์งœ": 263,
269
+ "์งธ": 264,
270
+ "์จฐ": 265,
271
+ "์ฉŒ": 266,
272
+ "์ฉจ": 267,
273
+ "์ช„": 268,
274
+ "์ชผ": 269,
275
+ "์ซ˜": 270,
276
+ "์ฌ": 271,
277
+ "์ญˆ": 272,
278
+ "์ญค": 273,
279
+ "์ฏ”": 274,
280
+ "์ฐŒ": 275,
281
+ "์ฐจ": 276,
282
+ "์ฑ„": 277,
283
+ "์ฒ˜": 278,
284
+ "์ฒด": 279,
285
+ "์ณ": 280,
286
+ "์ดˆ": 281,
287
+ "์ดค": 282,
288
+ "์ตœ": 283,
289
+ "์ตธ": 284,
290
+ "์ถ”": 285,
291
+ "์ถฐ": 286,
292
+ "์ทŒ": 287,
293
+ "์ทจ": 288,
294
+ "์ธ„": 289,
295
+ "์ธ ": 290,
296
+ "์น˜": 291,
297
+ "์นด": 292,
298
+ "์บ": 293,
299
+ "์บฌ": 294,
300
+ "์ปค": 295,
301
+ "์ผ€": 296,
302
+ "์ผœ": 297,
303
+ "์ฝ”": 298,
304
+ "์ฝฐ": 299,
305
+ "์พŒ": 300,
306
+ "์ฟ„": 301,
307
+ "์ฟ ": 302,
308
+ "์ฟผ": 303,
309
+ "ํ€˜": 304,
310
+ "ํ€ด": 305,
311
+ "ํ": 306,
312
+ "ํฌ": 307,
313
+ "ํ‚ค": 308,
314
+ "ํƒ€": 309,
315
+ "ํƒœ": 310,
316
+ "ํ„ฐ": 311,
317
+ "ํ…Œ": 312,
318
+ "ํ…จ": 313,
319
+ "ํ† ": 314,
320
+ "ํ‡ด": 315,
321
+ "ํˆฌ": 316,
322
+ "ํŠ€": 317,
323
+ "ํŠœ": 318,
324
+ "ํŠธ": 319,
325
+ "ํ‹ฐ": 320,
326
+ "ํŒŒ": 321,
327
+ "ํŒจ": 322,
328
+ "ํผ": 323,
329
+ "ํŽ˜": 324,
330
+ "ํŽด": 325,
331
+ "ํ": 326,
332
+ "ํฌ": 327,
333
+ "ํ‘œ": 328,
334
+ "ํ‘ธ": 329,
335
+ "ํ“จ": 330,
336
+ "ํ”„": 331,
337
+ "ํ”ผ": 332,
338
+ "ํ•˜": 333,
339
+ "ํ•ด": 334,
340
+ "ํ–": 335,
341
+ "ํ–ฌ": 336,
342
+ "ํ—ˆ": 337,
343
+ "ํ—ค": 338,
344
+ "ํ˜€": 339,
345
+ "ํ˜œ": 340,
346
+ "ํ˜ธ": 341,
347
+ "ํ™”": 342,
348
+ "ํ™ฐ": 343,
349
+ "ํšŒ": 344,
350
+ "ํšจ": 345,
351
+ "ํ›„": 346,
352
+ "ํ› ": 347,
353
+ "ํœ˜": 348,
354
+ "ํœด": 349,
355
+ "ํ": 350,
356
+ "ํฌ": 351,
357
+ "ํžˆ": 352
358
+ }
359
+ }