Spaces:
Sleeping
Sleeping
j-tobias
committed on
Commit
•
db6e0bb
1
Parent(s):
15f66cd
added new model
Browse files- app.py +2 -2
- cards.txt +11 -0
- processing.py +11 -0
app.py
CHANGED
@@ -26,7 +26,7 @@ login(hf_token)
|
|
26 |
|
27 |
|
28 |
# GENERAL OPTIONS FOR MODELS AND DATASETS
|
29 |
-
MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
|
30 |
DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
|
31 |
|
32 |
# HELPER FUNCTIONS
|
@@ -43,7 +43,7 @@ def get_card(selected_model:str)->str:
|
|
43 |
if "ID: "+selected_model in card:
|
44 |
return card
|
45 |
|
46 |
-
return "Unknown Model"
|
47 |
|
48 |
def is_own(selected_option):
|
49 |
"""
|
|
|
26 |
|
27 |
|
28 |
# GENERAL OPTIONS FOR MODELS AND DATASETS
|
29 |
+
MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2","facebook/hf-seamless-m4t-medium"]
|
30 |
DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
|
31 |
|
32 |
# HELPER FUNCTIONS
|
|
|
43 |
if "ID: "+selected_model in card:
|
44 |
return card
|
45 |
|
46 |
+
return "## Unknown Model"
|
47 |
|
48 |
def is_own(selected_option):
|
49 |
"""
|
cards.txt
CHANGED
@@ -34,4 +34,15 @@
|
|
34 |
- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
|
35 |
- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
(evaluating this model might take a while due to its size)
|
|
|
34 |
- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
|
35 |
- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
|
36 |
|
37 |
+
(evaluating this model might take a while due to its size)
|
38 |
+
@@
|
39 |
+
#### HF Seamless M4T Medium
|
40 |
+
- ID: facebook/hf-seamless-m4t-medium
|
41 |
+
- Hugging Face: [model](https://huggingface.co/facebook/hf-seamless-m4t-medium)
|
42 |
+
- Creator: facebook
|
43 |
+
- Finetuned: No
|
44 |
+
- Model Size: 1.2 B Parameters
|
45 |
+
- Model Paper: [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf)
|
46 |
+
- Training Data: ?
|
47 |
+
|
48 |
(evaluating this model might take a while due to its size)
|
processing.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
3 |
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
|
4 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
|
|
5 |
|
6 |
# Import Libraries to access Datasets
|
7 |
from datasets import load_dataset
|
@@ -251,6 +252,9 @@ def load_model(model_id:str):
|
|
251 |
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
|
252 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
|
253 |
model.config.forced_decoder_ids = None
|
|
|
|
|
|
|
254 |
else: # In case no model has been selected the Whisper-Tiny.En is selected - just for completeness
|
255 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
256 |
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
|
@@ -291,6 +295,13 @@ def model_compute(model, processor, sample, model_id):
|
|
291 |
transcription = processor.tokenizer.normalize(transcription[0])
|
292 |
print("TRANSCRIPTION Whisper Large v2: ", transcription)
|
293 |
return transcription
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
else: # In case no model has been selected the Whisper-Tiny.En is selected - just for completeness
|
295 |
sample = sample["audio"]
|
296 |
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
|
|
|
2 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
3 |
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
|
4 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
5 |
+
from transformers import AutoProcessor, SeamlessM4TModel
|
6 |
|
7 |
# Import Libraries to access Datasets
|
8 |
from datasets import load_dataset
|
|
|
252 |
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
|
253 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
|
254 |
model.config.forced_decoder_ids = None
|
255 |
+
elif model_id == "facebook/hf-seamless-m4t-medium":
|
256 |
+
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
|
257 |
+
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
|
258 |
else: # In case no model has been selected the Whisper-Tiny.En is selected - just for completeness
|
259 |
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
260 |
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
|
|
|
295 |
transcription = processor.tokenizer.normalize(transcription[0])
|
296 |
print("TRANSCRIPTION Whisper Large v2: ", transcription)
|
297 |
return transcription
|
298 |
+
elif model_id == "facebook/hf-seamless-m4t-medium":
|
299 |
+
sample = sample["audio"]
|
300 |
+
input_data = processor(audios=sample["array"], return_tensors="pt")
|
301 |
+
output_tokens = model.generate(**input_data, tgt_lang="eng", generate_speech=False)
|
302 |
+
print(output_tokens)
|
303 |
+
transcription = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
|
304 |
+
return transcription
|
305 |
else: # In case no model has been selected the Whisper-Tiny.En is selected - just for completeness
|
306 |
sample = sample["audio"]
|
307 |
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
|