# mafoaurelie's picture
# Update app.py
# be0bf55 verified
import datasets
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
import numpy as np
import gradio as gr
import io
import sentencepiece
# Load the processor, model, vocoder and speaker embedding once at import
# time so they are not reloaded on every call.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
_XVECTOR_DATASET = "Matthijs/cmu-arctic-xvectors"
_SPEAKER_INDEX = 7306

processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT)

embeddings_dataset = load_dataset(_XVECTOR_DATASET, split="validation")
# Pick one fixed x-vector from the dataset and add a leading dimension
# before handing it to the model.
_xvector = embeddings_dataset[_SPEAKER_INDEX]["xvector"]
speaker_embeddings = torch.tensor(_xvector).unsqueeze(0)
def text_to_speech(text: str) -> bytes:
    """Synthesize speech for *text* and return it as WAV-encoded bytes.

    The text is tokenized with the module-level ``processor``, turned into a
    waveform by ``model.generate_speech`` (using the module-level
    ``speaker_embeddings`` and ``vocoder``), and written as a WAV file at
    16000 Hz into an in-memory buffer.

    Args:
        text: The text to convert to speech.

    Returns:
        The complete contents of the in-memory WAV file.
    """
    # Preprocess the text into model inputs.
    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"]

    # Generate the speech waveform.
    waveform = model.generate_speech(
        token_ids,
        speaker_embeddings,
        vocoder=vocoder,
    )

    # Write the audio into an in-memory buffer rather than a file on disk.
    with io.BytesIO() as wav_stream:
        sf.write(wav_stream, waveform.numpy(), samplerate=16000, format="WAV")
        wav_bytes = wav_stream.getvalue()
    return wav_bytes
# Build the Gradio interface: text input, audio output, backed by
# text_to_speech.
interface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs=gr.Audio(label="Processed Audio"),
    title="Application du type Text to speech",
    description="Entrez un texte en anglais et l'application va la traduire en audio",
)

# Launch as a separate statement: chaining `.launch()` onto the constructor
# would bind `interface` to launch()'s return value instead of the Interface.
interface.launch()