"""Gradio demo: French text or speech -> English text -> English speech.

Three Hugging Face pipelines are chained:

* ``pipe_transcription`` turns a recorded audio file into text,
* ``pipe_translate`` translates French text to English,
* ``pipe_tts`` synthesises English speech from text.

The UI exposes two tabs: one that starts from typed text and one that
starts from a microphone recording.
"""

from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision only makes sense on an accelerator. This value is kept for
# the optional generic Whisper setup; the pipelines below use their default
# dtype.
torch_dtype = torch.float16 if device != "cpu" else torch.float32
print("Device:", device)

# Generic Whisper checkpoint that can be swapped in for the speech model
# below (via AutoModelForSpeechSeq2Seq / AutoProcessor). Not loaded by default.
model_id = "openai/whisper-large-v3"

pipe_transcription = pipeline(
    "automatic-speech-recognition",
    model="pierreguillou/whisper-medium-french",
    # Whisper works on 30-second windows; chunking lets the pipeline process
    # recordings that are longer than a single window.
    chunk_length_s=30,
    # Run on the same device as the other two pipelines.
    device=device,
)
pipe_translate = pipeline(
    "translation", model="Helsinki-NLP/opus-mt-fr-en", device=device
)
# Better quality, way faster than bark
pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)


def get_translation(text: str) -> str:
    """Translate *text* with ``pipe_translate`` and return the translation.

    Args:
        text: Text to translate (the UI feeds it the content of a textbox).

    Returns:
        The ``translation_text`` of the first pipeline result, or an empty
        string when *text* is empty or only whitespace (the model is not
        called in that case).
    """
    if not text or not text.strip():
        return ""
    return pipe_translate(text)[0]["translation_text"]


def get_transcript(voice: Optional[str]) -> str:
    """Transcribe a recording and return its translation.

    Args:
        voice: Path of the recorded audio file, or ``None`` when the user
            clicked the button without recording anything.

    Returns:
        The translated transcript, or an empty string when there is no
        recording.
    """
    if not voice:
        # Nothing was recorded; calling the ASR pipeline with None would raise.
        return ""
    # Translation is delegated to the dedicated translation model rather than
    # to Whisper's own generate_kwargs={"task": "translate"} mode.
    transcript = pipe_transcription(voice)["text"]
    return get_translation(transcript)


def get_audio(text: str) -> Optional[Tuple[int, np.ndarray]]:
    """Synthesise speech for *text* with ``pipe_tts``.

    Args:
        text: Text to read aloud.

    Returns:
        ``(sampling_rate, samples)`` where *samples* is a transposed int16
        array, or ``None`` when *text* is empty or only whitespace so the
        audio output is simply left empty.
    """
    if not text or not text.strip():
        return None
    speech = pipe_tts(text)
    # Clip before scaling: a float sample outside [-1, 1] would overflow the
    # int16 range and wrap around, producing loud clicks.
    waveform = np.clip(speech["audio"], -1.0, 1.0)
    return speech["sampling_rate"], (waveform * 32767).astype(np.int16).T


with gr.Blocks() as demo:
    # Tab 1: typed text -> translation -> speech.
    with gr.Tab("Texte (rapide)"):
        input_text = gr.Textbox(
            label="Input text",
            info="Your text",
            lines=3,
            placeholder="Écrire le texte à traduire",
        )
        translation_button = gr.Button("Traduire...")
        output_text = gr.Textbox(
            label="Output text",
            info="Your text",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio...")
        translation_button.click(
            get_translation,
            inputs=[input_text],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            # Created inline so the player is rendered right after the buttons.
            outputs=[gr.Audio(label="Output")],
        )

    # Tab 2: microphone recording -> transcription -> translation -> speech.
    with gr.Tab("Voix (plus lent)"):
        voice = gr.Audio(sources=["microphone"], type="filepath")
        translation_button = gr.Button("Traduire votre enregistrement !")
        output_text = gr.Textbox(
            label="Texte traduit",
            info="Votre texte",
            lines=3,
            placeholder="Votre traduction",
        )
        speech_button = gr.Button("Générer audio !")
        translation_button.click(
            get_transcript,
            inputs=[voice],
            outputs=[output_text],
        )
        speech_button.click(
            get_audio,
            inputs=[output_text],
            outputs=[gr.Audio(label="Output")],
        )

demo.launch()