"""Gradio demo that turns English text into speech with SpeechT5."""

import io

import datasets
import gradio as gr
import numpy as np
import sentencepiece
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

# Sample rate (Hz) written into the WAV header.
# NOTE(review): presumably matches the vocoder's output rate -- confirm
# against the model card.
SAMPLE_RATE = 16000

# Row of the x-vector dataset used as the (single, fixed) speaker voice.
SPEAKER_INDEX = 7306

# Load the models and the speaker embedding once at import time so they are
# not reloaded on every request.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation"
)
# unsqueeze(0) adds a leading dimension to the stored x-vector before it is
# handed to the model.
speaker_embeddings = torch.tensor(
    embeddings_dataset[SPEAKER_INDEX]["xvector"]
).unsqueeze(0)


def _encode_wav(samples: np.ndarray) -> bytes:
    """Serialize audio samples to an in-memory WAV file and return its bytes."""
    with io.BytesIO() as buffer:
        sf.write(buffer, samples, samplerate=SAMPLE_RATE, format="WAV")
        return buffer.getvalue()


def text_to_speech(text: str) -> bytes:
    """Synthesize speech for *text* and return it as WAV-encoded bytes.

    Args:
        text: The text to speak. Empty or whitespace-only input yields a
            valid WAV file containing zero samples instead of running the
            model on nothing.

    Returns:
        The bytes of a WAV file written at ``SAMPLE_RATE`` Hz.
    """
    # Nothing to say: skip inference and return an empty clip.
    if not text or not text.strip():
        return _encode_wav(np.zeros(0, dtype=np.float32))

    # Tokenize the text.
    inputs = processor(text=text, return_tensors="pt")

    # Clip to the model's configured maximum number of text positions so that
    # over-long input is shortened instead of failing during generation.
    # NOTE(review): clipping drops the trailing tokens of very long inputs.
    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

    # Generate the waveform with the fixed speaker voice.
    speech = model.generate_speech(
        input_ids, speaker_embeddings, vocoder=vocoder
    )

    return _encode_wav(speech.numpy())


# Build the Gradio interface. The Interface object is kept in `interface`
# (rather than the return value of launch()) so the name refers to the UI.
interface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs=gr.Audio(label="Processed Audio"),
    title="Application du type Text to speech",
    description="Entrez un texte en anglais et l'application va la traduire en audio",
)
interface.launch()