peterBagnegaard committed on
Commit 78cdd8e
1 Parent(s): 7058e51

Upload tokenizer

Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +133 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +144 -0
  5. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,133 @@
+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nocaptions|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
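The tokens above are Whisper's multilingual control tokens (language codes, task markers, and timestamp markers). A minimal sketch of how they are typically combined into a decoder prompt, assuming the transformers library; the repository id is a placeholder, since the actual repo name is not shown on this page:

```python
# Minimal sketch, assuming transformers is installed; the repo id below is a
# placeholder for the repository this tokenizer was uploaded to.
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("<user>/<repo>")  # placeholder repo id

# Whisper prefixes every target sequence with control tokens from the list
# above: start-of-transcript, a language token, a task token, and optionally
# <|notimestamps|>. Danish transcription, for example:
prompt = ["<|startoftranscript|>", "<|da|>", "<|transcribe|>", "<|notimestamps|>"]
print(tokenizer.convert_tokens_to_ids(prompt))  # the four special-token ids
```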
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,144 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nocaptions|>",
+    "<|notimestamps|>"
+  ],
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "tokenizer_class": "WhisperTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
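Since tokenizer_config.json names WhisperTokenizer and WhisperProcessor as the loading classes, the uploaded files can be loaded directly with transformers. A minimal sketch, with a placeholder repository id (the actual repo name is not shown on this page):

```python
# Minimal sketch, assuming transformers is installed; the repo id below is a
# placeholder for the repository these tokenizer files were uploaded to.
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("<user>/<repo>")  # placeholder repo id

# Values from tokenizer_config.json surface on the loaded object.
print(tokenizer.model_max_length)                                     # 1024
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)  # all <|endoftext|>

# Round-trip a sentence; the Whisper prefix/suffix special tokens are added
# on encode and stripped again here on decode.
ids = tokenizer("hej verden").input_ids
print(tokenizer.decode(ids, skip_special_tokens=True))
```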
vocab.json ADDED
The diff for this file is too large to render. See raw diff