ylacombe committed
Commit f457685
1 Parent(s): 98a3618

Upload processor

added_tokens.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "__ace_Latn__": 256207,
+ "__ace__": 256206,
+ "__acm__": 256208,
+ "__acq__": 256209,
+ "__aeb__": 256210,
+ "__ajp__": 256211,
+ "__aka__": 256212,
+ "__als__": 256288,
+ "__apc__": 256213,
+ "__ars__": 256214,
+ "__ast__": 256215,
+ "__awa__": 256216,
+ "__ayr__": 256217,
+ "__azb__": 256218,
+ "__bak__": 256219,
+ "__bam__": 256220,
+ "__ban__": 256221,
+ "__bem__": 256222,
+ "__bho__": 256223,
+ "__bjn_Latn__": 256225,
+ "__bjn__": 256224,
+ "__bod__": 256226,
+ "__bug__": 256227,
+ "__cjk__": 256228,
+ "__crh__": 256229,
+ "__dik__": 256230,
+ "__dyu__": 256231,
+ "__dzo__": 256232,
+ "__epo__": 256233,
+ "__ewe__": 256234,
+ "__fao__": 256235,
+ "__fij__": 256236,
+ "__fon__": 256237,
+ "__fur__": 256238,
+ "__gla__": 256239,
+ "__grn__": 256240,
+ "__hat__": 256241,
+ "__hau__": 256242,
+ "__hne__": 256243,
+ "__ilo__": 256244,
+ "__kab__": 256245,
+ "__kac__": 256246,
+ "__kam__": 256247,
+ "__kas_Deva__": 256249,
+ "__kas__": 256248,
+ "__kbp__": 256252,
+ "__kea__": 256253,
+ "__kik__": 256254,
+ "__kin__": 256255,
+ "__kmb__": 256256,
+ "__kmr__": 256258,
+ "__knc_Latn__": 256251,
+ "__knc__": 256250,
+ "__kon__": 256257,
+ "__lij__": 256259,
+ "__lim__": 256260,
+ "__lin__": 256261,
+ "__lmo__": 256262,
+ "__ltg__": 256263,
+ "__ltz__": 256264,
+ "__lua__": 256265,
+ "__lus__": 256266,
+ "__mag__": 256267,
+ "__min__": 256268,
+ "__mos__": 256270,
+ "__mri__": 256271,
+ "__nso__": 256273,
+ "__nus__": 256274,
+ "__oci__": 256275,
+ "__pag__": 256276,
+ "__pap__": 256277,
+ "__plt__": 256269,
+ "__prs__": 256278,
+ "__quy__": 256279,
+ "__run__": 256280,
+ "__sag__": 256281,
+ "__san__": 256282,
+ "__scn__": 256283,
+ "__shn__": 256284,
+ "__sin__": 256285,
+ "__smo__": 256286,
+ "__sot__": 256287,
+ "__srd__": 256289,
+ "__ssw__": 256290,
+ "__sun__": 256291,
+ "__szl__": 256292,
+ "__taq_Tfng__": 256296,
+ "__taq__": 256295,
+ "__tat__": 256293,
+ "__tir__": 256294,
+ "__tpi__": 256297,
+ "__tsn__": 256298,
+ "__tso__": 256299,
+ "__tuk__": 256300,
+ "__tum__": 256301,
+ "__twi__": 256302,
+ "__tzm__": 256303,
+ "__uig__": 256304,
+ "__umb__": 256305,
+ "__vec__": 256306,
+ "__war__": 256307,
+ "__wol__": 256308,
+ "__xho__": 256309,
+ "__ydd__": 256310,
+ "__zsm__": 256272
+ }
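
Each entry above maps a language-code token string to its id in the extended vocabulary. A minimal sanity-check sketch, assuming a local checkout of this repository (the "./" path is a placeholder) and the tokenizer configured further below:

```python
from transformers import SeamlessM4TTokenizer

# Placeholder path: a local checkout of this repository containing
# sentencepiece.bpe.model, added_tokens.json and the tokenizer configs.
tokenizer = SeamlessM4TTokenizer.from_pretrained("./")

# The added tokens should resolve to the ids listed in added_tokens.json.
assert tokenizer.convert_tokens_to_ids("__ace__") == 256206
assert tokenizer.convert_tokens_to_ids("__zsm__") == 256272
```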
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "feature_extractor_type": "SeamlessM4TFeatureExtractor",
+ "feature_size": 80,
+ "num_mel_bins": 80,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "processor_class": "SeamlessM4TProcessor",
+ "return_attention_mask": true,
+ "sampling_rate": 16000,
+ "src_lang": "eng",
+ "stride": 2,
+ "tgt_lang": "fra"
+ }
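
This configures an 80-bin log-mel feature extractor for 16 kHz audio with frame stacking (stride 2). A minimal usage sketch, assuming a local checkout of this repository (the "./" path is a placeholder):

```python
import numpy as np
from transformers import SeamlessM4TFeatureExtractor

# Placeholder path: a local checkout of this repository.
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("./")

# One second of (silent) audio at the configured 16 kHz sampling rate.
waveform = np.zeros(16000, dtype=np.float32)

inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
# 80 mel bins per frame; with stride 2 consecutive frames are stacked,
# so the last dimension of input_features should be 160.
print(inputs["input_features"].shape)
```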
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14bb8dfb35c0ffdea7bc01e56cea38b9e3d5efcdcb9c251d6b40538e1aab555a
+ size 4852054
special_tokens_map.json ADDED
@@ -0,0 +1,215 @@
+ {
+ "additional_special_tokens": [
+ "__ace__",
+ "__ace_Latn__",
+ "__acm__",
+ "__acq__",
+ "__aeb__",
+ "__afr__",
+ "__ajp__",
+ "__aka__",
+ "__amh__",
+ "__apc__",
+ "__arb__",
+ "__ars__",
+ "__ary__",
+ "__arz__",
+ "__asm__",
+ "__ast__",
+ "__awa__",
+ "__ayr__",
+ "__azb__",
+ "__azj__",
+ "__bak__",
+ "__bam__",
+ "__ban__",
+ "__bel__",
+ "__bem__",
+ "__ben__",
+ "__bho__",
+ "__bjn__",
+ "__bjn_Latn__",
+ "__bod__",
+ "__bos__",
+ "__bug__",
+ "__bul__",
+ "__cat__",
+ "__ceb__",
+ "__ces__",
+ "__cjk__",
+ "__ckb__",
+ "__crh__",
+ "__cym__",
+ "__dan__",
+ "__deu__",
+ "__dik__",
+ "__dyu__",
+ "__dzo__",
+ "__ell__",
+ "__eng__",
+ "__epo__",
+ "__est__",
+ "__eus__",
+ "__ewe__",
+ "__fao__",
+ "__pes__",
+ "__fij__",
+ "__fin__",
+ "__fon__",
+ "__fra__",
+ "__fur__",
+ "__fuv__",
+ "__gla__",
+ "__gle__",
+ "__glg__",
+ "__grn__",
+ "__guj__",
+ "__hat__",
+ "__hau__",
+ "__heb__",
+ "__hin__",
+ "__hne__",
+ "__hrv__",
+ "__hun__",
+ "__hye__",
+ "__ibo__",
+ "__ilo__",
+ "__ind__",
+ "__isl__",
+ "__ita__",
+ "__jav__",
+ "__jpn__",
+ "__kab__",
+ "__kac__",
+ "__kam__",
+ "__kan__",
+ "__kas__",
+ "__kas_Deva__",
+ "__kat__",
+ "__knc__",
+ "__knc_Latn__",
+ "__kaz__",
+ "__kbp__",
+ "__kea__",
+ "__khm__",
+ "__kik__",
+ "__kin__",
+ "__kir__",
+ "__kmb__",
+ "__kon__",
+ "__kor__",
+ "__kmr__",
+ "__lao__",
+ "__lvs__",
+ "__lij__",
+ "__lim__",
+ "__lin__",
+ "__lit__",
+ "__lmo__",
+ "__ltg__",
+ "__ltz__",
+ "__lua__",
+ "__lug__",
+ "__luo__",
+ "__lus__",
+ "__mag__",
+ "__mai__",
+ "__mal__",
+ "__mar__",
+ "__min__",
+ "__mkd__",
+ "__plt__",
+ "__mlt__",
+ "__mni__",
+ "__khk__",
+ "__mos__",
+ "__mri__",
+ "__zsm__",
+ "__mya__",
+ "__nld__",
+ "__nno__",
+ "__nob__",
+ "__npi__",
+ "__nso__",
+ "__nus__",
+ "__nya__",
+ "__oci__",
+ "__gaz__",
+ "__ory__",
+ "__pag__",
+ "__pan__",
+ "__pap__",
+ "__pol__",
+ "__por__",
+ "__prs__",
+ "__pbt__",
+ "__quy__",
+ "__ron__",
+ "__run__",
+ "__rus__",
+ "__sag__",
+ "__san__",
+ "__sat__",
+ "__scn__",
+ "__shn__",
+ "__sin__",
+ "__slk__",
+ "__slv__",
+ "__smo__",
+ "__sna__",
+ "__snd__",
+ "__som__",
+ "__sot__",
+ "__spa__",
+ "__als__",
+ "__srd__",
+ "__srp__",
+ "__ssw__",
+ "__sun__",
+ "__swe__",
+ "__swh__",
+ "__szl__",
+ "__tam__",
+ "__tat__",
+ "__tel__",
+ "__tgk__",
+ "__tgl__",
+ "__tha__",
+ "__tir__",
+ "__taq__",
+ "__taq_Tfng__",
+ "__tpi__",
+ "__tsn__",
+ "__tso__",
+ "__tuk__",
+ "__tum__",
+ "__tur__",
+ "__twi__",
+ "__tzm__",
+ "__uig__",
+ "__ukr__",
+ "__umb__",
+ "__urd__",
+ "__uzn__",
+ "__vec__",
+ "__vie__",
+ "__war__",
+ "__wol__",
+ "__xho__",
+ "__ydd__",
+ "__yor__",
+ "__yue__",
+ "__cmn__",
+ "__cmn_Hant__",
+ "__zul__",
+ "<MINED_DATA>",
+ "<MMT_BT_DATA>",
+ "<SMT_BT_DATA>"
+ ],
+ "bos_token": "<s>",
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "pad_token": "<pad>",
+ "sep_token": "</s>",
+ "unk_token": "<unk>"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "additional_special_tokens": null,
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "<s>",
+ "eos_token": "</s>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "processor_class": "SeamlessM4TProcessor",
+ "sep_token": "</s>",
+ "sp_model_kwargs": {},
+ "src_lang": "eng",
+ "tgt_lang": "fra",
+ "tokenizer_class": "SeamlessM4TTokenizer",
+ "tokenizer_file": null,
+ "unk_token": "<unk>"
+ }
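
Together these files define the SeamlessM4TProcessor (tokenizer plus feature extractor), with "eng" as the default source language and "fra" as the default target. A minimal loading sketch, assuming a local checkout of this repository (the "./" path is a placeholder):

```python
from transformers import SeamlessM4TProcessor

# Placeholder path: a local checkout of this repository.
processor = SeamlessM4TProcessor.from_pretrained("./")

# Text goes through the SeamlessM4TTokenizer; src_lang defaults to "eng"
# and tgt_lang to "fra" as configured above.
text_inputs = processor(text="Hello, world!", src_lang="eng")
print(text_inputs["input_ids"])
```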