ArthurZ HF staff committed on
Commit
13d8f54
1 Parent(s): c6da634

Upload processor

Browse files
added_tokens.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|af|>": 50326,
3
+ "<|am|>": 50333,
4
+ "<|ar|>": 50271,
5
+ "<|as|>": 50349,
6
+ "<|az|>": 50303,
7
+ "<|ba|>": 50354,
8
+ "<|be|>": 50329,
9
+ "<|bg|>": 50291,
10
+ "<|bn|>": 50301,
11
+ "<|bo|>": 50346,
12
+ "<|br|>": 50308,
13
+ "<|bs|>": 50314,
14
+ "<|ca|>": 50269,
15
+ "<|cs|>": 50282,
16
+ "<|cy|>": 50296,
17
+ "<|da|>": 50284,
18
+ "<|de|>": 50260,
19
+ "<|el|>": 50280,
20
+ "<|en|>": 50258,
21
+ "<|es|>": 50261,
22
+ "<|et|>": 50306,
23
+ "<|eu|>": 50309,
24
+ "<|fa|>": 50299,
25
+ "<|fi|>": 50276,
26
+ "<|fo|>": 50337,
27
+ "<|fr|>": 50264,
28
+ "<|gl|>": 50318,
29
+ "<|gu|>": 50332,
30
+ "<|haw|>": 50351,
31
+ "<|ha|>": 50353,
32
+ "<|hi|>": 50275,
33
+ "<|hr|>": 50290,
34
+ "<|ht|>": 50338,
35
+ "<|hu|>": 50285,
36
+ "<|hy|>": 50311,
37
+ "<|id|>": 50274,
38
+ "<|is|>": 50310,
39
+ "<|it|>": 50273,
40
+ "<|iw|>": 50278,
41
+ "<|ja|>": 50265,
42
+ "<|jw|>": 50355,
43
+ "<|ka|>": 50328,
44
+ "<|kk|>": 50315,
45
+ "<|km|>": 50322,
46
+ "<|kn|>": 50305,
47
+ "<|ko|>": 50263,
48
+ "<|la|>": 50293,
49
+ "<|lb|>": 50344,
50
+ "<|ln|>": 50352,
51
+ "<|lo|>": 50335,
52
+ "<|lt|>": 50292,
53
+ "<|lv|>": 50300,
54
+ "<|mg|>": 50348,
55
+ "<|mi|>": 50294,
56
+ "<|mk|>": 50307,
57
+ "<|ml|>": 50295,
58
+ "<|mn|>": 50313,
59
+ "<|mr|>": 50319,
60
+ "<|ms|>": 50281,
61
+ "<|mt|>": 50342,
62
+ "<|my|>": 50345,
63
+ "<|ne|>": 50312,
64
+ "<|nl|>": 50270,
65
+ "<|nn|>": 50341,
66
+ "<|nocaptions|>": 50361,
67
+ "<|notimestamps|>": 50362,
68
+ "<|no|>": 50287,
69
+ "<|oc|>": 50327,
70
+ "<|pa|>": 50320,
71
+ "<|pl|>": 50268,
72
+ "<|ps|>": 50339,
73
+ "<|pt|>": 50266,
74
+ "<|ro|>": 50283,
75
+ "<|ru|>": 50262,
76
+ "<|sa|>": 50343,
77
+ "<|sd|>": 50331,
78
+ "<|si|>": 50321,
79
+ "<|sk|>": 50297,
80
+ "<|sl|>": 50304,
81
+ "<|sn|>": 50323,
82
+ "<|so|>": 50325,
83
+ "<|sq|>": 50316,
84
+ "<|sr|>": 50302,
85
+ "<|startoflm|>": 50359,
86
+ "<|startofprev|>": 50360,
87
+ "<|startoftranscript|>": 50257,
88
+ "<|su|>": 50356,
89
+ "<|sv|>": 50272,
90
+ "<|sw|>": 50317,
91
+ "<|ta|>": 50286,
92
+ "<|te|>": 50298,
93
+ "<|tg|>": 50330,
94
+ "<|th|>": 50288,
95
+ "<|tk|>": 50340,
96
+ "<|tl|>": 50347,
97
+ "<|transcribe|>": 50358,
98
+ "<|translate|>": 50357,
99
+ "<|tr|>": 50267,
100
+ "<|tt|>": 50350,
101
+ "<|uk|>": 50279,
102
+ "<|ur|>": 50289,
103
+ "<|uz|>": 50336,
104
+ "<|vi|>": 50277,
105
+ "<|yi|>": 50334,
106
+ "<|yo|>": 50324,
107
+ "<|zh|>": 50259
108
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff
special_tokens_map.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|startoftranscript|>",
4
+ "<|en|>",
5
+ "<|zh|>",
6
+ "<|de|>",
7
+ "<|es|>",
8
+ "<|ru|>",
9
+ "<|ko|>",
10
+ "<|fr|>",
11
+ "<|ja|>",
12
+ "<|pt|>",
13
+ "<|tr|>",
14
+ "<|pl|>",
15
+ "<|ca|>",
16
+ "<|nl|>",
17
+ "<|ar|>",
18
+ "<|sv|>",
19
+ "<|it|>",
20
+ "<|id|>",
21
+ "<|hi|>",
22
+ "<|fi|>",
23
+ "<|vi|>",
24
+ "<|iw|>",
25
+ "<|uk|>",
26
+ "<|el|>",
27
+ "<|ms|>",
28
+ "<|cs|>",
29
+ "<|ro|>",
30
+ "<|da|>",
31
+ "<|hu|>",
32
+ "<|ta|>",
33
+ "<|no|>",
34
+ "<|th|>",
35
+ "<|ur|>",
36
+ "<|hr|>",
37
+ "<|bg|>",
38
+ "<|lt|>",
39
+ "<|la|>",
40
+ "<|mi|>",
41
+ "<|ml|>",
42
+ "<|cy|>",
43
+ "<|sk|>",
44
+ "<|te|>",
45
+ "<|fa|>",
46
+ "<|lv|>",
47
+ "<|bn|>",
48
+ "<|sr|>",
49
+ "<|az|>",
50
+ "<|sl|>",
51
+ "<|kn|>",
52
+ "<|et|>",
53
+ "<|mk|>",
54
+ "<|br|>",
55
+ "<|eu|>",
56
+ "<|is|>",
57
+ "<|hy|>",
58
+ "<|ne|>",
59
+ "<|mn|>",
60
+ "<|bs|>",
61
+ "<|kk|>",
62
+ "<|sq|>",
63
+ "<|sw|>",
64
+ "<|gl|>",
65
+ "<|mr|>",
66
+ "<|pa|>",
67
+ "<|si|>",
68
+ "<|km|>",
69
+ "<|sn|>",
70
+ "<|yo|>",
71
+ "<|so|>",
72
+ "<|af|>",
73
+ "<|oc|>",
74
+ "<|ka|>",
75
+ "<|be|>",
76
+ "<|tg|>",
77
+ "<|sd|>",
78
+ "<|gu|>",
79
+ "<|am|>",
80
+ "<|yi|>",
81
+ "<|lo|>",
82
+ "<|uz|>",
83
+ "<|fo|>",
84
+ "<|ht|>",
85
+ "<|ps|>",
86
+ "<|tk|>",
87
+ "<|nn|>",
88
+ "<|mt|>",
89
+ "<|sa|>",
90
+ "<|lb|>",
91
+ "<|my|>",
92
+ "<|bo|>",
93
+ "<|tl|>",
94
+ "<|mg|>",
95
+ "<|as|>",
96
+ "<|tt|>",
97
+ "<|haw|>",
98
+ "<|ln|>",
99
+ "<|ha|>",
100
+ "<|ba|>",
101
+ "<|jw|>",
102
+ "<|su|>",
103
+ "<|translate|>",
104
+ "<|transcribe|>",
105
+ "<|startoflm|>",
106
+ "<|startofprev|>",
107
+ "<|nocaptions|>",
108
+ "<|notimestamps|>"
109
+ ],
110
+ "bos_token": "<|endoftext|>",
111
+ "eos_token": "<|endoftext|>",
112
+ "unk_token": {
113
+ "content": "<|endoftext|>",
114
+ "lstrip": false,
115
+ "normalized": true,
116
+ "rstrip": false,
117
+ "single_word": false
118
+ }
119
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "eos_token": {
13
+ "__type": "AddedToken",
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "errors": "replace",
21
+ "model_max_length": 1024,
22
+ "name_or_path": "/home/arthur_huggingface_co/transformers/whisper-any.en",
23
+ "pad_token": null,
24
+ "processor_class": "WhisperProcessor",
25
+ "special_tokens_map_file": null,
26
+ "tokenizer_class": "WhisperTokenizer",
27
+ "unk_token": {
28
+ "__type": "AddedToken",
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ }
35
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff