ArthurZ HF staff committed on
Commit
73d8a6d
1 Parent(s): 1f7cf6f

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer_config.json +2 -2
  2. vocab.json +106 -0
tokenizer_config.json CHANGED
@@ -4,9 +4,9 @@
4
  "bos_token": "<|endoftext|>",
5
  "eos_token": "<|endoftext|>",
6
  "errors": "replace",
7
- "name_or_path": "whisper-tokenizer-eng",
8
  "pad_token": null,
9
- "special_tokens_map_file": "whisper-tokenizer-eng/special_tokens_map.json",
10
  "tokenizer_class": "WhisperTokenizer",
11
  "unk_token": "<|endoftext|>"
12
  }
 
4
  "bos_token": "<|endoftext|>",
5
  "eos_token": "<|endoftext|>",
6
  "errors": "replace",
7
+ "name_or_path": "working",
8
  "pad_token": null,
9
+ "special_tokens_map_file": "working/special_tokens_map.json",
10
  "tokenizer_class": "WhisperTokenizer",
11
  "unk_token": "<|endoftext|>"
12
  }
vocab.json CHANGED
@@ -313,7 +313,113 @@
313
  ";;": 35746,
314
  "<": 27,
315
  "</": 3433,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  "<|endoftext|>": 50256,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  "=": 28,
318
  "=\"": 13114,
319
  "=\"#": 34106,
 
313
  ";;": 35746,
314
  "<": 27,
315
  "</": 3433,
316
+ "<|af|>": 50326,
317
+ "<|am|>": 50333,
318
+ "<|ar|>": 50271,
319
+ "<|as|>": 50349,
320
+ "<|az|>": 50303,
321
+ "<|ba|>": 50354,
322
+ "<|be|>": 50329,
323
+ "<|bg|>": 50291,
324
+ "<|bn|>": 50301,
325
+ "<|bo|>": 50346,
326
+ "<|br|>": 50308,
327
+ "<|bs|>": 50314,
328
+ "<|ca|>": 50269,
329
+ "<|cs|>": 50282,
330
+ "<|cy|>": 50296,
331
+ "<|da|>": 50284,
332
+ "<|de|>": 50260,
333
+ "<|el|>": 50280,
334
  "<|endoftext|>": 50256,
335
+ "<|en|>": 50258,
336
+ "<|es|>": 50261,
337
+ "<|et|>": 50306,
338
+ "<|eu|>": 50309,
339
+ "<|fa|>": 50299,
340
+ "<|fi|>": 50276,
341
+ "<|fo|>": 50337,
342
+ "<|fr|>": 50264,
343
+ "<|gl|>": 50318,
344
+ "<|gu|>": 50332,
345
+ "<|haw|>": 50351,
346
+ "<|ha|>": 50353,
347
+ "<|hi|>": 50275,
348
+ "<|hr|>": 50290,
349
+ "<|ht|>": 50338,
350
+ "<|hu|>": 50285,
351
+ "<|hy|>": 50311,
352
+ "<|id|>": 50274,
353
+ "<|is|>": 50310,
354
+ "<|it|>": 50273,
355
+ "<|iw|>": 50278,
356
+ "<|ja|>": 50265,
357
+ "<|jw|>": 50355,
358
+ "<|ka|>": 50328,
359
+ "<|kk|>": 50315,
360
+ "<|km|>": 50322,
361
+ "<|kn|>": 50305,
362
+ "<|ko|>": 50263,
363
+ "<|la|>": 50293,
364
+ "<|lb|>": 50344,
365
+ "<|ln|>": 50352,
366
+ "<|lo|>": 50335,
367
+ "<|lt|>": 50292,
368
+ "<|lv|>": 50300,
369
+ "<|mg|>": 50348,
370
+ "<|mi|>": 50294,
371
+ "<|mk|>": 50307,
372
+ "<|ml|>": 50295,
373
+ "<|mn|>": 50313,
374
+ "<|mr|>": 50319,
375
+ "<|ms|>": 50281,
376
+ "<|mt|>": 50342,
377
+ "<|my|>": 50345,
378
+ "<|ne|>": 50312,
379
+ "<|nl|>": 50270,
380
+ "<|nn|>": 50341,
381
+ "<|nocaptions|>": 50361,
382
+ "<|notimestamps|>": 50362,
383
+ "<|no|>": 50287,
384
+ "<|oc|>": 50327,
385
+ "<|pa|>": 50320,
386
+ "<|pl|>": 50268,
387
+ "<|ps|>": 50339,
388
+ "<|pt|>": 50266,
389
+ "<|ro|>": 50283,
390
+ "<|ru|>": 50262,
391
+ "<|sa|>": 50343,
392
+ "<|sd|>": 50331,
393
+ "<|si|>": 50321,
394
+ "<|sk|>": 50297,
395
+ "<|sl|>": 50304,
396
+ "<|sn|>": 50323,
397
+ "<|so|>": 50325,
398
+ "<|sq|>": 50316,
399
+ "<|sr|>": 50302,
400
+ "<|startoflm|>": 50359,
401
+ "<|startofprev|>": 50360,
402
+ "<|startoftranscript|>": 50257,
403
+ "<|su|>": 50356,
404
+ "<|sv|>": 50272,
405
+ "<|sw|>": 50317,
406
+ "<|ta|>": 50286,
407
+ "<|te|>": 50298,
408
+ "<|tg|>": 50330,
409
+ "<|th|>": 50288,
410
+ "<|tk|>": 50340,
411
+ "<|tl|>": 50347,
412
+ "<|transcribe|>": 50358,
413
+ "<|translate|>": 50357,
414
+ "<|tr|>": 50267,
415
+ "<|tt|>": 50350,
416
+ "<|uk|>": 50279,
417
+ "<|ur|>": 50289,
418
+ "<|uz|>": 50336,
419
+ "<|vi|>": 50277,
420
+ "<|yi|>": 50334,
421
+ "<|yo|>": 50324,
422
+ "<|zh|>": 50259,
423
  "=": 28,
424
  "=\"": 13114,
425
  "=\"#": 34106,