jaeyeon committed on
Commit
02697c4
1 Parent(s): dda9e1a

add tokenizer

Browse files
Files changed (4) hide show
  1. added_tokens.json +4 -0
  2. special_tokens_map.json +22 -0
  3. tokenizer_config.json +12 -0
  4. vocab.json +510 -0
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 509,
3
+ "<s>": 508
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "</s>",
12
+ "lstrip": false,
13
+ "normalized": true,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
+ ],
18
+ "bos_token": "<s>",
19
+ "eos_token": "</s>",
20
+ "pad_token": "[PAD]",
21
+ "unk_token": "[UNK]"
22
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "do_lower_case": false,
4
+ "eos_token": "</s>",
5
+ "name_or_path": "./",
6
+ "pad_token": "[PAD]",
7
+ "replace_word_delimiter_char": " ",
8
+ "special_tokens_map_file": null,
9
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
10
+ "unk_token": "[UNK]",
11
+ "word_delimiter_token": "|"
12
+ }
vocab.json ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 507,
3
+ "[UNK]": 506,
4
+ "|": 0,
5
+ "ㄱ": 1,
6
+ "ㄴ": 2,
7
+ "ㄷ": 3,
8
+ "ㄹ": 4,
9
+ "ㅁ": 5,
10
+ "ㅂ": 6,
11
+ "가": 7,
12
+ "각": 8,
13
+ "간": 9,
14
+ "갈": 10,
15
+ "감": 11,
16
+ "갑": 12,
17
+ "값": 13,
18
+ "갔": 14,
19
+ "강": 15,
20
+ "같": 16,
21
+ "개": 17,
22
+ "갯": 18,
23
+ "거": 19,
24
+ "건": 20,
25
+ "걸": 21,
26
+ "검": 22,
27
+ "겁": 23,
28
+ "것": 24,
29
+ "게": 25,
30
+ "겠": 26,
31
+ "겨": 27,
32
+ "결": 28,
33
+ "경": 29,
34
+ "계": 30,
35
+ "고": 31,
36
+ "곡": 32,
37
+ "곤": 33,
38
+ "골": 34,
39
+ "곱": 35,
40
+ "곳": 36,
41
+ "공": 37,
42
+ "과": 38,
43
+ "관": 39,
44
+ "굉": 40,
45
+ "교": 41,
46
+ "구": 42,
47
+ "국": 43,
48
+ "굳": 44,
49
+ "궁": 45,
50
+ "귀": 46,
51
+ "귓": 47,
52
+ "규": 48,
53
+ "그": 49,
54
+ "근": 50,
55
+ "귿": 51,
56
+ "금": 52,
57
+ "급": 53,
58
+ "기": 54,
59
+ "긴": 55,
60
+ "길": 56,
61
+ "김": 57,
62
+ "까": 58,
63
+ "깐": 59,
64
+ "깥": 60,
65
+ "꺼": 61,
66
+ "껏": 62,
67
+ "께": 63,
68
+ "꼭": 64,
69
+ "꼼": 65,
70
+ "꾸": 66,
71
+ "꾼": 67,
72
+ "꿔": 68,
73
+ "뀌": 69,
74
+ "끊": 70,
75
+ "끓": 71,
76
+ "끝": 72,
77
+ "끼": 73,
78
+ "나": 74,
79
+ "난": 75,
80
+ "날": 76,
81
+ "납": 77,
82
+ "낫": 78,
83
+ "났": 79,
84
+ "낮": 80,
85
+ "내": 81,
86
+ "낸": 82,
87
+ "낼": 83,
88
+ "냄": 84,
89
+ "냈": 85,
90
+ "냐": 86,
91
+ "너": 87,
92
+ "넣": 88,
93
+ "네": 89,
94
+ "넷": 90,
95
+ "념": 91,
96
+ "녕": 92,
97
+ "녹": 93,
98
+ "농": 94,
99
+ "높": 95,
100
+ "놓": 96,
101
+ "놨": 97,
102
+ "누": 98,
103
+ "눈": 99,
104
+ "눌": 100,
105
+ "느": 101,
106
+ "는": 102,
107
+ "늘": 103,
108
+ "니": 104,
109
+ "닌": 105,
110
+ "님": 106,
111
+ "다": 107,
112
+ "단": 108,
113
+ "달": 109,
114
+ "담": 110,
115
+ "답": 111,
116
+ "닷": 112,
117
+ "당": 113,
118
+ "대": 114,
119
+ "더": 115,
120
+ "던": 116,
121
+ "덩": 117,
122
+ "데": 118,
123
+ "델": 119,
124
+ "도": 120,
125
+ "돌": 121,
126
+ "동": 122,
127
+ "돼": 123,
128
+ "됐": 124,
129
+ "되": 125,
130
+ "된": 126,
131
+ "될": 127,
132
+ "됩": 128,
133
+ "두": 129,
134
+ "둔": 130,
135
+ "둘": 131,
136
+ "둬": 132,
137
+ "뒷": 133,
138
+ "드": 134,
139
+ "든": 135,
140
+ "들": 136,
141
+ "등": 137,
142
+ "디": 138,
143
+ "따": 139,
144
+ "땀": 140,
145
+ "때": 141,
146
+ "떠": 142,
147
+ "떤": 143,
148
+ "떨": 144,
149
+ "떻": 145,
150
+ "또": 146,
151
+ "똑": 147,
152
+ "뜨": 148,
153
+ "뜻": 149,
154
+ "띄": 150,
155
+ "라": 151,
156
+ "란": 152,
157
+ "람": 153,
158
+ "랐": 154,
159
+ "랑": 155,
160
+ "래": 156,
161
+ "랜": 157,
162
+ "랬": 158,
163
+ "량": 159,
164
+ "러": 160,
165
+ "런": 161,
166
+ "럴": 162,
167
+ "럼": 163,
168
+ "렀": 164,
169
+ "렇": 165,
170
+ "렌": 166,
171
+ "려": 167,
172
+ "력": 168,
173
+ "련": 169,
174
+ "렸": 170,
175
+ "례": 171,
176
+ "로": 172,
177
+ "록": 173,
178
+ "론": 174,
179
+ "료": 175,
180
+ "루": 176,
181
+ "류": 177,
182
+ "르": 178,
183
+ "른": 179,
184
+ "를": 180,
185
+ "름": 181,
186
+ "리": 182,
187
+ "린": 183,
188
+ "림": 184,
189
+ "립": 185,
190
+ "마": 186,
191
+ "막": 187,
192
+ "만": 188,
193
+ "많": 189,
194
+ "말": 190,
195
+ "맑": 191,
196
+ "맞": 192,
197
+ "맡": 193,
198
+ "매": 194,
199
+ "맺": 195,
200
+ "머": 196,
201
+ "먹": 197,
202
+ "먼": 198,
203
+ "멀": 199,
204
+ "멉": 200,
205
+ "며": 201,
206
+ "면": 202,
207
+ "명": 203,
208
+ "몇": 204,
209
+ "모": 205,
210
+ "몰": 206,
211
+ "못": 207,
212
+ "무": 208,
213
+ "묶": 209,
214
+ "문": 210,
215
+ "묻": 211,
216
+ "물": 212,
217
+ "뭐": 213,
218
+ "미": 214,
219
+ "밀": 215,
220
+ "밑": 216,
221
+ "바": 217,
222
+ "밖": 218,
223
+ "반": 219,
224
+ "받": 220,
225
+ "발": 221,
226
+ "방": 222,
227
+ "배": 223,
228
+ "백": 224,
229
+ "번": 225,
230
+ "법": 226,
231
+ "벗": 227,
232
+ "벳": 228,
233
+ "벼": 229,
234
+ "벽": 230,
235
+ "변": 231,
236
+ "볕": 232,
237
+ "보": 233,
238
+ "본": 234,
239
+ "볼": 235,
240
+ "봅": 236,
241
+ "봉": 237,
242
+ "봐": 238,
243
+ "봤": 239,
244
+ "부": 240,
245
+ "분": 241,
246
+ "불": 242,
247
+ "붙": 243,
248
+ "브": 244,
249
+ "비": 245,
250
+ "빈": 246,
251
+ "빠": 247,
252
+ "빨": 248,
253
+ "뺄": 249,
254
+ "뽀": 250,
255
+ "뿐": 251,
256
+ "뿜": 252,
257
+ "사": 253,
258
+ "삭": 254,
259
+ "산": 255,
260
+ "살": 256,
261
+ "삼": 257,
262
+ "상": 258,
263
+ "새": 259,
264
+ "색": 260,
265
+ "샘": 261,
266
+ "생": 262,
267
+ "샤": 263,
268
+ "서": 264,
269
+ "석": 265,
270
+ "섞": 266,
271
+ "선": 267,
272
+ "설": 268,
273
+ "섭": 269,
274
+ "섯": 270,
275
+ "성": 271,
276
+ "세": 272,
277
+ "셔": 273,
278
+ "셨": 274,
279
+ "소": 275,
280
+ "속": 276,
281
+ "손": 277,
282
+ "송": 278,
283
+ "수": 279,
284
+ "순": 280,
285
+ "술": 281,
286
+ "쉬": 282,
287
+ "쉽": 283,
288
+ "스": 284,
289
+ "슨": 285,
290
+ "슬": 286,
291
+ "습": 287,
292
+ "승": 288,
293
+ "시": 289,
294
+ "식": 290,
295
+ "신": 291,
296
+ "실": 292,
297
+ "심": 293,
298
+ "십": 294,
299
+ "썼": 295,
300
+ "쓴": 296,
301
+ "쓸": 297,
302
+ "씨": 298,
303
+ "씩": 299,
304
+ "아": 300,
305
+ "안": 301,
306
+ "않": 302,
307
+ "알": 303,
308
+ "암": 304,
309
+ "압": 305,
310
+ "았": 306,
311
+ "애": 307,
312
+ "액": 308,
313
+ "앤": 309,
314
+ "야": 310,
315
+ "약": 311,
316
+ "양": 312,
317
+ "얘": 313,
318
+ "어": 314,
319
+ "억": 315,
320
+ "언": 316,
321
+ "얻": 317,
322
+ "얼": 318,
323
+ "없": 319,
324
+ "엇": 320,
325
+ "었": 321,
326
+ "에": 322,
327
+ "엔": 323,
328
+ "여": 324,
329
+ "역": 325,
330
+ "연": 326,
331
+ "열": 327,
332
+ "염": 328,
333
+ "였": 329,
334
+ "영": 330,
335
+ "예": 331,
336
+ "오": 332,
337
+ "온": 333,
338
+ "올": 334,
339
+ "옳": 335,
340
+ "옷": 336,
341
+ "와": 337,
342
+ "왔": 338,
343
+ "왜": 339,
344
+ "외": 340,
345
+ "요": 341,
346
+ "용": 342,
347
+ "우": 343,
348
+ "운": 344,
349
+ "울": 345,
350
+ "움": 346,
351
+ "워": 347,
352
+ "원": 348,
353
+ "월": 349,
354
+ "웠": 350,
355
+ "위": 351,
356
+ "유": 352,
357
+ "육": 353,
358
+ "융": 354,
359
+ "으": 355,
360
+ "은": 356,
361
+ "을": 357,
362
+ "음": 358,
363
+ "읍": 359,
364
+ "응": 360,
365
+ "의": 361,
366
+ "이": 362,
367
+ "인": 363,
368
+ "일": 364,
369
+ "읽": 365,
370
+ "임": 366,
371
+ "입": 367,
372
+ "있": 368,
373
+ "잉": 369,
374
+ "잎": 370,
375
+ "자": 371,
376
+ "작": 372,
377
+ "잘": 373,
378
+ "잠": 374,
379
+ "장": 375,
380
+ "재": 376,
381
+ "저": 377,
382
+ "적": 378,
383
+ "전": 379,
384
+ "절": 380,
385
+ "점": 381,
386
+ "정": 382,
387
+ "젖": 383,
388
+ "제": 384,
389
+ "져": 385,
390
+ "졌": 386,
391
+ "조": 387,
392
+ "존": 388,
393
+ "좀": 389,
394
+ "종": 390,
395
+ "좋": 391,
396
+ "죠": 392,
397
+ "주": 393,
398
+ "준": 394,
399
+ "줄": 395,
400
+ "중": 396,
401
+ "즉": 397,
402
+ "즐": 398,
403
+ "증": 399,
404
+ "지": 400,
405
+ "직": 401,
406
+ "진": 402,
407
+ "질": 403,
408
+ "집": 404,
409
+ "짚": 405,
410
+ "짝": 406,
411
+ "짭": 407,
412
+ "짱": 408,
413
+ "째": 409,
414
+ "쨍": 410,
415
+ "쪼": 411,
416
+ "쫙": 412,
417
+ "쬐": 413,
418
+ "쭉": 414,
419
+ "차": 415,
420
+ "찬": 416,
421
+ "찰": 417,
422
+ "창": 418,
423
+ "찾": 419,
424
+ "책": 420,
425
+ "처": 421,
426
+ "철": 422,
427
+ "첫": 423,
428
+ "체": 424,
429
+ "쳐": 425,
430
+ "쵸": 426,
431
+ "추": 427,
432
+ "축": 428,
433
+ "출": 429,
434
+ "충": 430,
435
+ "측": 431,
436
+ "층": 432,
437
+ "치": 433,
438
+ "칙": 434,
439
+ "친": 435,
440
+ "칠": 436,
441
+ "칭": 437,
442
+ "캘": 438,
443
+ "커": 439,
444
+ "컵": 440,
445
+ "컷": 441,
446
+ "케": 442,
447
+ "켜": 443,
448
+ "켰": 444,
449
+ "코": 445,
450
+ "큐": 446,
451
+ "크": 447,
452
+ "큼": 448,
453
+ "키": 449,
454
+ "킨": 450,
455
+ "킬": 451,
456
+ "타": 452,
457
+ "탄": 453,
458
+ "탈": 454,
459
+ "탕": 455,
460
+ "태": 456,
461
+ "택": 457,
462
+ "터": 458,
463
+ "텐": 459,
464
+ "톤": 460,
465
+ "통": 461,
466
+ "투": 462,
467
+ "특": 463,
468
+ "틀": 464,
469
+ "티": 465,
470
+ "파": 466,
471
+ "판": 467,
472
+ "팔": 468,
473
+ "팽": 469,
474
+ "퍼": 470,
475
+ "펴": 471,
476
+ "편": 472,
477
+ "포": 473,
478
+ "표": 474,
479
+ "풀": 475,
480
+ "풍": 476,
481
+ "프": 477,
482
+ "피": 478,
483
+ "필": 479,
484
+ "하": 480,
485
+ "학": 481,
486
+ "한": 482,
487
+ "할": 483,
488
+ "함": 484,
489
+ "합": 485,
490
+ "항": 486,
491
+ "해": 487,
492
+ "햇": 488,
493
+ "했": 489,
494
+ "향": 490,
495
+ "험": 491,
496
+ "현": 492,
497
+ "형": 493,
498
+ "화": 494,
499
+ "확": 495,
500
+ "활": 496,
501
+ "황": 497,
502
+ "회": 498,
503
+ "횟": 499,
504
+ "후": 500,
505
+ "흘": 501,
506
+ "흡": 502,
507
+ "히": 503,
508
+ "힌": 504,
509
+ "힘": 505
510
+ }