# Page-capture residue (not part of the program): "Spaces: Sleeping"
import gradio as gr | |
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor | |
import torch | |
import numpy as np | |
# --- Device and precision ---------------------------------------------------
# Probe for a GPU first ...
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# ... then force CPU regardless: this assignment overrides the probe above.
device = "cpu"
# Full precision on CPU, half precision on any other device.
torch_dtype = torch.float32 if device == "cpu" else torch.float16
print("Device:", device)

# --- Speech recognition model -----------------------------------------------
model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
# The processor supplies both the tokenizer and the feature extractor
# handed to the recognition pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

# --- Pipelines used by the UI callbacks -------------------------------------
pipe_transcription = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
pipe_translate = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-fr-en",
    device=device,
)
# Better quality, way faster than bark
pipe_tts = pipeline(
    task="text-to-speech",
    model="facebook/mms-tts-eng",
    device=device,
)
def get_translation(text: str) -> str:
    """Translate *text* with the module-level ``pipe_translate`` pipeline.

    Args:
        text: Text to translate (wired to the input textbox in the UI).

    Returns:
        The ``"translation_text"`` field of the first pipeline result, or
        an empty string when *text* is empty or whitespace-only.
    """
    if not text or not text.strip():
        # Nothing to translate: skip the model call instead of feeding it
        # an empty prompt.
        return ""
    return pipe_translate(text)[0]["translation_text"]
def get_transcript(voice):
    """Run the recorded audio through ``pipe_transcription`` and return its text.

    The pipeline is called with ``task="translate"`` and
    ``language="french"`` as generation arguments.

    Args:
        voice: Audio input forwarded to the pipeline; the UI passes the
            value of a ``gr.Audio(type="filepath")`` component, which is
            ``None`` when nothing has been recorded.

    Returns:
        The ``"text"`` field of the pipeline result, or an empty string
        when no audio was provided.
    """
    if not voice:
        # No recording yet: avoid calling the pipeline with None / "".
        return ""
    result = pipe_transcription(
        voice,
        generate_kwargs={"task": "translate", "language": "french"},
    )
    return result["text"]
def get_audio(text):
    """Synthesize speech for *text* with the module-level ``pipe_tts`` pipeline.

    Args:
        text: Text to speak (wired to the translated-text box in the UI).

    Returns:
        ``None`` when *text* is empty or whitespace-only; otherwise a
        ``(sampling_rate, samples)`` tuple where ``samples`` is the pipeline's
        ``"audio"`` array scaled to 16-bit integers and transposed.
    """
    if not text or not text.strip():
        # Nothing to synthesize: return no audio rather than calling the
        # model with empty input.
        return None
    speech = pipe_tts(text)
    # Clamp to [-1, 1] first so that scaling by 32767 cannot exceed the int16
    # range and wrap around on the cast.
    samples = np.clip(speech["audio"], -1.0, 1.0)
    # NOTE(review): the transpose presumably turns a (channels, samples)
    # array into the samples-first layout the audio widget expects; confirm.
    pcm = (samples * 32767).astype(np.int16).T
    return speech["sampling_rate"], pcm
# Two-tab interface: one tab starts from typed text, the other from a
# microphone recording. Both end with a button that sends the translated
# text to get_audio.
with gr.Blocks() as demo:
    # Tab 1: typed text -> get_translation -> get_audio.
    with gr.Tab("Texte (rapide)"):
        input_text = gr.Textbox(
            label="Input text",
            info="Your text",
            lines=3,
            placeholder="Écrire le texte à traduire",
        )
        translation_button = gr.Button("Traduire...")
        output_text = gr.Textbox(
            label="Output text",
            info="Your text",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio...")

        translation_button.click(get_translation, inputs=[input_text], outputs=[output_text])
        # The audio player is created here so it is laid out after the button.
        text_tab_audio = gr.Audio(label="Output")
        speech_button.click(get_audio, inputs=[output_text], outputs=[text_tab_audio])

    # Tab 2: microphone recording -> get_transcript -> get_audio.
    with gr.Tab("Voix (plus lent)"):
        voice = gr.Audio(sources=["microphone"], type="filepath")
        translation_button = gr.Button("Traduire votre enregistrement !")
        output_text = gr.Textbox(
            label="Texte traduit",
            info="Votre texte",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio !")

        translation_button.click(get_transcript, inputs=[voice], outputs=[output_text])
        voice_tab_audio = gr.Audio(label="Output")
        speech_button.click(get_audio, inputs=[output_text], outputs=[voice_tab_audio])

demo.launch()