import gradio as gr
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
import torch
import numpy as np

# Pick the best available device; the next line forces CPU (remove it to use the GPU detected above,
# which also enables float16 inference).
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
torch_dtype = torch.float16 if device != "cpu" else torch.float32
print("Device:", device)

# Speech recognition / speech translation model (Whisper large-v3).
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe_transcription = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# French -> English text translation and English text-to-speech.
pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)  # better quality and much faster than Bark


def get_translation(text):
    """Translate French text to English."""
    return pipe_translate(text)[0]["translation_text"]


def get_transcript(voice):
    """Transcribe a French recording; Whisper's 'translate' task outputs English text."""
    return pipe_transcription(
        voice, generate_kwargs={"task": "translate", "language": "french"}
    )["text"]


def get_audio(text):
    """Synthesize speech and convert the float waveform to 16-bit PCM for gr.Audio."""
    speech = pipe_tts(text)
    return speech["sampling_rate"], (speech["audio"] * 32767).astype(np.int16).T


# UI labels are in French: the app is aimed at French speakers translating into English.
with gr.Blocks() as demo:
    with gr.Tab("Voix (plus lent)"):  # "Voice (slower)"
        voice = gr.Audio(sources=["microphone"], type="filepath")
        translation_button = gr.Button("Traduire votre enregistrement !")  # "Translate your recording!"
        output_text = gr.Textbox(
            label="Texte traduit",  # "Translated text"
            info="Votre texte",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio !")  # "Generate audio!"

        translation_button.click(
            get_transcript,
            inputs=[voice],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )

    with gr.Tab("Texte (rapide)"):  # "Text (faster)"
        input_text = gr.Textbox(
            label="Input text",
            info="Your text",
            lines=3,
            placeholder="Écrire le texte à traduire",  # "Write the text to translate"
        )
        translation_button = gr.Button("Traduire...")  # "Translate..."
        output_text = gr.Textbox(
            label="Output text",
            info="Your text",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio...")  # "Generate audio..."

        translation_button.click(
            get_translation,
            inputs=[input_text],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )

demo.launch()