ArthurZ HF staff committed on
Commit
7a2b53b
1 Parent(s): 8db4072

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +107 -106
  2. special_tokens_map.json +16 -3
  3. tokenizer_config.json +2 -3
added_tokens.json CHANGED
@@ -1,108 +1,109 @@
1
  {
2
- "<|af|>": 50326,
3
- "<|am|>": 50333,
4
- "<|ar|>": 50271,
5
- "<|as|>": 50349,
6
- "<|az|>": 50303,
7
- "<|ba|>": 50354,
8
- "<|be|>": 50329,
9
- "<|bg|>": 50291,
10
- "<|bn|>": 50301,
11
- "<|bo|>": 50346,
12
- "<|br|>": 50308,
13
- "<|bs|>": 50314,
14
- "<|ca|>": 50269,
15
- "<|cs|>": 50282,
16
- "<|cy|>": 50296,
17
- "<|da|>": 50284,
18
- "<|de|>": 50260,
19
- "<|el|>": 50280,
20
- "<|en|>": 50258,
21
- "<|es|>": 50261,
22
- "<|et|>": 50306,
23
- "<|eu|>": 50309,
24
- "<|fa|>": 50299,
25
- "<|fi|>": 50276,
26
- "<|fo|>": 50337,
27
- "<|fr|>": 50264,
28
- "<|gl|>": 50318,
29
- "<|gu|>": 50332,
30
- "<|haw|>": 50351,
31
- "<|ha|>": 50353,
32
- "<|hi|>": 50275,
33
- "<|hr|>": 50290,
34
- "<|ht|>": 50338,
35
- "<|hu|>": 50285,
36
- "<|hy|>": 50311,
37
- "<|id|>": 50274,
38
- "<|is|>": 50310,
39
- "<|it|>": 50273,
40
- "<|iw|>": 50278,
41
- "<|ja|>": 50265,
42
- "<|jw|>": 50355,
43
- "<|ka|>": 50328,
44
- "<|kk|>": 50315,
45
- "<|km|>": 50322,
46
- "<|kn|>": 50305,
47
- "<|ko|>": 50263,
48
- "<|la|>": 50293,
49
- "<|lb|>": 50344,
50
- "<|ln|>": 50352,
51
- "<|lo|>": 50335,
52
- "<|lt|>": 50292,
53
- "<|lv|>": 50300,
54
- "<|mg|>": 50348,
55
- "<|mi|>": 50294,
56
- "<|mk|>": 50307,
57
- "<|ml|>": 50295,
58
- "<|mn|>": 50313,
59
- "<|mr|>": 50319,
60
- "<|ms|>": 50281,
61
- "<|mt|>": 50342,
62
- "<|my|>": 50345,
63
- "<|ne|>": 50312,
64
- "<|nl|>": 50270,
65
- "<|nn|>": 50341,
66
- "<|nocaptions|>": 50361,
67
- "<|notimestamps|>": 50362,
68
- "<|no|>": 50287,
69
- "<|oc|>": 50327,
70
- "<|pa|>": 50320,
71
- "<|pl|>": 50268,
72
- "<|ps|>": 50339,
73
- "<|pt|>": 50266,
74
- "<|ro|>": 50283,
75
- "<|ru|>": 50262,
76
- "<|sa|>": 50343,
77
- "<|sd|>": 50331,
78
- "<|si|>": 50321,
79
- "<|sk|>": 50297,
80
- "<|sl|>": 50304,
81
- "<|sn|>": 50323,
82
- "<|so|>": 50325,
83
- "<|sq|>": 50316,
84
- "<|sr|>": 50302,
85
- "<|startoflm|>": 50359,
86
- "<|startofprev|>": 50360,
87
- "<|startoftranscript|>": 50257,
88
- "<|su|>": 50356,
89
- "<|sv|>": 50272,
90
- "<|sw|>": 50317,
91
- "<|ta|>": 50286,
92
- "<|te|>": 50298,
93
- "<|tg|>": 50330,
94
- "<|th|>": 50288,
95
- "<|tk|>": 50340,
96
- "<|tl|>": 50347,
97
- "<|transcribe|>": 50358,
98
- "<|translate|>": 50357,
99
- "<|tr|>": 50267,
100
- "<|tt|>": 50350,
101
- "<|uk|>": 50279,
102
- "<|ur|>": 50289,
103
- "<|uz|>": 50336,
104
- "<|vi|>": 50277,
105
- "<|yi|>": 50334,
106
- "<|yo|>": 50324,
107
- "<|zh|>": 50259
 
108
  }
 
1
  {
2
+ "<|af|>": 50327,
3
+ "<|am|>": 50334,
4
+ "<|ar|>": 50272,
5
+ "<|as|>": 50350,
6
+ "<|az|>": 50304,
7
+ "<|ba|>": 50355,
8
+ "<|be|>": 50330,
9
+ "<|bg|>": 50292,
10
+ "<|bn|>": 50302,
11
+ "<|bo|>": 50347,
12
+ "<|br|>": 50309,
13
+ "<|bs|>": 50315,
14
+ "<|ca|>": 50270,
15
+ "<|cs|>": 50283,
16
+ "<|cy|>": 50297,
17
+ "<|da|>": 50285,
18
+ "<|de|>": 50261,
19
+ "<|el|>": 50281,
20
+ "<|endoftext|>": 50257,
21
+ "<|en|>": 50259,
22
+ "<|es|>": 50262,
23
+ "<|et|>": 50307,
24
+ "<|eu|>": 50310,
25
+ "<|fa|>": 50300,
26
+ "<|fi|>": 50277,
27
+ "<|fo|>": 50338,
28
+ "<|fr|>": 50265,
29
+ "<|gl|>": 50319,
30
+ "<|gu|>": 50333,
31
+ "<|haw|>": 50352,
32
+ "<|ha|>": 50354,
33
+ "<|hi|>": 50276,
34
+ "<|hr|>": 50291,
35
+ "<|ht|>": 50339,
36
+ "<|hu|>": 50286,
37
+ "<|hy|>": 50312,
38
+ "<|id|>": 50275,
39
+ "<|is|>": 50311,
40
+ "<|it|>": 50274,
41
+ "<|iw|>": 50279,
42
+ "<|ja|>": 50266,
43
+ "<|jw|>": 50356,
44
+ "<|ka|>": 50329,
45
+ "<|kk|>": 50316,
46
+ "<|km|>": 50323,
47
+ "<|kn|>": 50306,
48
+ "<|ko|>": 50264,
49
+ "<|la|>": 50294,
50
+ "<|lb|>": 50345,
51
+ "<|ln|>": 50353,
52
+ "<|lo|>": 50336,
53
+ "<|lt|>": 50293,
54
+ "<|lv|>": 50301,
55
+ "<|mg|>": 50349,
56
+ "<|mi|>": 50295,
57
+ "<|mk|>": 50308,
58
+ "<|ml|>": 50296,
59
+ "<|mn|>": 50314,
60
+ "<|mr|>": 50320,
61
+ "<|ms|>": 50282,
62
+ "<|mt|>": 50343,
63
+ "<|my|>": 50346,
64
+ "<|ne|>": 50313,
65
+ "<|nl|>": 50271,
66
+ "<|nn|>": 50342,
67
+ "<|nocaptions|>": 50362,
68
+ "<|notimestamps|>": 50363,
69
+ "<|no|>": 50288,
70
+ "<|oc|>": 50328,
71
+ "<|pa|>": 50321,
72
+ "<|pl|>": 50269,
73
+ "<|ps|>": 50340,
74
+ "<|pt|>": 50267,
75
+ "<|ro|>": 50284,
76
+ "<|ru|>": 50263,
77
+ "<|sa|>": 50344,
78
+ "<|sd|>": 50332,
79
+ "<|si|>": 50322,
80
+ "<|sk|>": 50298,
81
+ "<|sl|>": 50305,
82
+ "<|sn|>": 50324,
83
+ "<|so|>": 50326,
84
+ "<|sq|>": 50317,
85
+ "<|sr|>": 50303,
86
+ "<|startoflm|>": 50360,
87
+ "<|startofprev|>": 50361,
88
+ "<|startoftranscript|>": 50258,
89
+ "<|su|>": 50357,
90
+ "<|sv|>": 50273,
91
+ "<|sw|>": 50318,
92
+ "<|ta|>": 50287,
93
+ "<|te|>": 50299,
94
+ "<|tg|>": 50331,
95
+ "<|th|>": 50289,
96
+ "<|tk|>": 50341,
97
+ "<|tl|>": 50348,
98
+ "<|transcribe|>": 50359,
99
+ "<|translate|>": 50358,
100
+ "<|tr|>": 50268,
101
+ "<|tt|>": 50351,
102
+ "<|uk|>": 50280,
103
+ "<|ur|>": 50290,
104
+ "<|uz|>": 50337,
105
+ "<|vi|>": 50278,
106
+ "<|yi|>": 50335,
107
+ "<|yo|>": 50325,
108
+ "<|zh|>": 50260
109
  }
special_tokens_map.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "additional_special_tokens": [
 
3
  "<|startoftranscript|>",
4
  "<|en|>",
5
  "<|zh|>",
@@ -107,13 +108,25 @@
107
  "<|nocaptions|>",
108
  "<|notimestamps|>"
109
  ],
110
- "bos_token": "<|endoftext|>",
111
- "eos_token": "<|endoftext|>",
112
- "unk_token": {
 
 
 
 
 
113
  "content": "<|endoftext|>",
114
  "lstrip": false,
115
  "normalized": true,
116
  "rstrip": false,
117
  "single_word": false
 
 
 
 
 
 
 
118
  }
119
  }
 
1
  {
2
  "additional_special_tokens": [
3
+ "<|endoftext|>",
4
  "<|startoftranscript|>",
5
  "<|en|>",
6
  "<|zh|>",
 
108
  "<|nocaptions|>",
109
  "<|notimestamps|>"
110
  ],
111
+ "bos_token": {
112
+ "content": "<|endoftext|>",
113
+ "lstrip": false,
114
+ "normalized": true,
115
+ "rstrip": false,
116
+ "single_word": false
117
+ },
118
+ "eos_token": {
119
  "content": "<|endoftext|>",
120
  "lstrip": false,
121
  "normalized": true,
122
  "rstrip": false,
123
  "single_word": false
124
+ },
125
+ "unk_token": {
126
+ "content": "",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false
131
  }
132
  }
tokenizer_config.json CHANGED
@@ -19,14 +19,13 @@
19
  },
20
  "errors": "replace",
21
  "model_max_length": 1024,
22
- "name_or_path": "openai/whisper-large",
23
  "pad_token": null,
24
- "processor_class": "WhisperProcessor",
25
  "special_tokens_map_file": null,
26
  "tokenizer_class": "WhisperTokenizer",
27
  "unk_token": {
28
  "__type": "AddedToken",
29
- "content": "<|endoftext|>",
30
  "lstrip": false,
31
  "normalized": true,
32
  "rstrip": false,
 
19
  },
20
  "errors": "replace",
21
  "model_max_length": 1024,
22
+ "name_or_path": "whisper-multi/test_added_eot",
23
  "pad_token": null,
 
24
  "special_tokens_map_file": null,
25
  "tokenizer_class": "WhisperTokenizer",
26
  "unk_token": {
27
  "__type": "AddedToken",
28
+ "content": "",
29
  "lstrip": false,
30
  "normalized": true,
31
  "rstrip": false,