NouFuS's picture
added tab with voice input.
494f8dc verified
raw
history blame
No virus
3.42 kB
import gradio as gr
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
import numpy as np
# Pick the inference device and the matching tensor precision.
# NOTE(review): the CUDA probe result is immediately overridden with "cpu",
# so inference always runs on CPU — presumably intentional for the hosting
# environment; confirm before removing either line.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

# Half precision only on an accelerator; full precision on CPU.
if device == "cpu":
    torch_dtype = torch.float32
else:
    torch_dtype = torch.float16

print("Device:", device)
# Speech recognition: load the Whisper large-v3 checkpoint and its processor,
# then wrap them in a chunked automatic-speech-recognition pipeline.
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor bundles the tokenizer and the feature extractor that the
# pipeline needs.
processor = AutoProcessor.from_pretrained(model_id)

pipe_transcription = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,  # audio is processed in 30-second chunks
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
# Text translation pipeline (French -> English OPUS-MT checkpoint).
pipe_translate = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-fr-en",
    device=device,
)

# English text-to-speech pipeline.
# Better quality, way faster than bark
pipe_tts = pipeline(
    task="text-to-speech",
    model="facebook/mms-tts-eng",
    device=device,
)
def get_translation(text):
    """Translate *text* with the module-level translation pipeline.

    Args:
        text: The text typed by the user (may be empty or ``None`` when the
            button is clicked before anything is entered).

    Returns:
        The translated string, or ``""`` when there is nothing to translate.
    """
    # Skip the model call for empty/blank input: there is nothing to
    # translate and the model output for an empty prompt is meaningless.
    if not text or not text.strip():
        return ""
    return pipe_translate(text)[0]["translation_text"]
def get_transcript(voice):
    """Run the speech pipeline on a recording in Whisper's "translate" mode.

    The pipeline is called with ``task="translate"`` and
    ``language="french"``, i.e. the source speech is declared to be French.

    Args:
        voice: The recorded audio as passed by the Gradio audio component
            (a file path), or ``None`` when nothing has been recorded.

    Returns:
        The text produced by the pipeline, or ``""`` when there is no
        recording to process.
    """
    # The button can be clicked before a recording exists; the pipeline
    # cannot handle a missing input, so short-circuit instead of raising.
    if not voice:
        return ""
    generate_kwargs = {"task": "translate", "language": "french"}
    result = pipe_transcription(voice, generate_kwargs=generate_kwargs)
    return result["text"]
def get_audio(text):
    """Synthesize speech for *text* and return it as 16-bit PCM.

    Args:
        text: The text to speak (may be empty or ``None`` when the button is
            clicked before a translation exists).

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an
        ``int16`` array, or ``None`` when there is no text to synthesize.
    """
    # Nothing to say: return None rather than feeding an empty prompt to the
    # TTS model.
    if not text or not text.strip():
        return None

    speech = pipe_tts(text)

    # Assumes the pipeline yields float samples nominally in [-1, 1] — TODO
    # confirm. Clip before scaling so out-of-range samples saturate instead
    # of wrapping around when cast to int16 (wrap-around is heard as clicks).
    waveform = np.clip(np.asarray(speech["audio"]), -1.0, 1.0)
    pcm = (waveform * 32767).astype(np.int16)

    # Transposed exactly as before; presumably converts (channels, samples)
    # into the (samples, channels) layout expected by the audio widget.
    return speech["sampling_rate"], pcm.T
# Gradio UI with two tabs: one driven by a microphone recording, one driven
# by typed text. Each tab has a translate step and a "generate audio" step.
with gr.Blocks() as demo:
    # Tab 1: microphone recording -> text via get_transcript -> speech.
    with gr.Tab("Voix (plus lent)"):
        voice = gr.Audio(sources=["microphone"], type="filepath")
        voice_translate_button = gr.Button("Traduire votre enregistrement !")
        voice_output_text = gr.Textbox(
            label="Texte traduit",
            info="Votre texte",
            lines=3,
            placeholder="Votre traduction",
        )
        voice_speech_button = gr.Button("Générer audio !")

        voice_translate_button.click(
            get_transcript, inputs=[voice], outputs=[voice_output_text]
        )

        # Created after the button so it is rendered below it.
        voice_audio_output = gr.Audio(label="Output")
        voice_speech_button.click(
            get_audio, inputs=[voice_output_text], outputs=[voice_audio_output]
        )

    # Tab 2: typed text -> text via get_translation -> speech.
    with gr.Tab("Texte (rapide)"):
        input_text = gr.Textbox(
            label="Input text",
            info="Your text",
            lines=3,
            placeholder="Écrire le texte à traduire",
        )
        text_translate_button = gr.Button("Traduire...")
        text_output_text = gr.Textbox(
            label="Output text",
            info="Your text",
            lines=3,
            placeholder="Votre traduction",
        )
        text_speech_button = gr.Button("Générer audio...")

        text_translate_button.click(
            get_translation, inputs=[input_text], outputs=[text_output_text]
        )

        # Created after the button so it is rendered below it.
        text_audio_output = gr.Audio(label="Output")
        text_speech_button.click(
            get_audio, inputs=[text_output_text], outputs=[text_audio_output]
        )

# Start the web server for the app.
demo.launch()