# MeloTTS / app.py
# Author: m-ric (HF staff)
# Commit b4c7847: "Rebase on another space by PHBJT"
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Identifier handed to every from_pretrained() call below.
repo_id = "PHBJT/french_parler_tts_mini_v0.1"

# Load the TTS model and move it to the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)

# The same tokenizer is used for both the voice description and the spoken text.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate reported by the feature extractor; returned with generated audio.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed passed to set_seed() before each generation.
SEED = 42
# Default sentence to speak and default voice description.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."

# Each example row is [text to speak, voice description, None].
# The first row is exactly the default text/description pair.
examples = [
    [sentence, voice, None]
    for sentence, voice in (
        (default_text, default_description),
        (
            "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
            "A male voice delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice, creating a close-sounding audio experience.",
        ),
        (
            "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
            "A male voice provides a monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
        ),
        (
            "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
            "A female voice, in a very poor recording quality, delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
        ),
    )
]
# NOTE(review): this is the *English* number normalizer, while the repo id and
# the example sentences in this file suggest a French model — confirm that
# spelling numbers out in English is intended.
number_normalizer = EnglishNumberNormalizer()

# A capital letter followed by more capitals and/or dots, delimited by word
# boundaries (e.g. "SNCF", "U.S.A"); such runs are read out letter by letter.
_ABBREVIATION_RE = re.compile(r'\b[A-Z][A-Z\.]+\b')


def _spell_out(match):
    """Return the matched abbreviation without dots, letters separated by spaces."""
    return " ".join(match.group().replace(".", ""))


def preprocess(text):
    """Normalize raw text before it is tokenized as the TTS prompt.

    Steps, in order: run the number normalizer and strip surrounding
    whitespace, replace hyphens with spaces, make sure the text ends with a
    punctuation mark, and spell out upper-case abbreviations letter by letter
    ("SNCF" -> "S N C F").

    Args:
        text: The text to be spoken.

    Returns:
        The normalized text. An input that is empty after normalization is
        returned as an empty string instead of raising ``IndexError``.
    """
    text = number_normalizer(text).strip()
    text = text.replace("-", " ")

    # Nothing to punctuate or expand; indexing text[-1] below would raise.
    if not text:
        return text

    if text[-1] not in punctuation:
        text = f"{text}."

    # A single regex pass only rewrites the spans that actually matched at a
    # word boundary; a plain str.replace() of each match would also rewrite the
    # same letters wherever they occur inside other words.
    return _ABBREVIATION_RE.sub(_spell_out, text)
@spaces.GPU
def gen_tts(text, description):
    """Synthesize speech for *text* using the voice described by *description*.

    Args:
        text: The text to speak; it goes through ``preprocess`` first.
        description: Free-form description of the voice; stripped before
            tokenization.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple where ``audio`` is the model output
        moved to CPU, converted to a NumPy array and squeezed.
    """
    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(preprocess(text), return_tensors="pt").to(device)

    # Reset the seed to the fixed SEED right before sampling.
    set_seed(SEED)

    generate_kwargs = dict(
        input_ids=description_tokens.input_ids,
        prompt_input_ids=prompt_tokens.input_ids,
        attention_mask=description_tokens.attention_mask,
        prompt_attention_mask=prompt_tokens.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    waveform = model.generate(**generate_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()
def extract_text(file, max_pages=10):
    """Extract the text of the first pages of a PDF document.

    Args:
        file: The uploaded document, passed straight to ``pypdf.PdfReader``.
            ``None`` (no file uploaded yet) is accepted.
        max_pages: Maximum number of leading pages to read. Defaults to 10.

    Returns:
        The extracted text of the selected pages, one page per line break, or
        an empty string when ``file`` is ``None``.
    """
    # The extract button can be clicked before anything was uploaded; return
    # an empty text instead of letting PdfReader fail on None.
    if file is None:
        return ""

    # Imported lazily, as in the original, so pypdf is only needed when a
    # document is actually read.
    from pypdf import PdfReader

    reader = PdfReader(file)
    # Separate pages with a newline so the last word of one page is not glued
    # to the first word of the next.
    return "\n".join(page.extract_text() for page in reader.pages[:max_pages])
# Gradio UI: upload a PDF, extract its text, then have the TTS model read it.
with gr.Blocks() as demo:
    # The header credits Parler-TTS: speech is generated by the
    # ParlerTTSForConditionalGeneration model loaded at the top of this file.
    gr.Markdown("""# PDF reader
Un lecteur pdf construit avec [Parler-TTS](https://github.com/huggingface/parler-tts).
### Comment l'utiliser ?
1. Téléversez le document pdf à lire.
2. Cliquez sur "Extraire le texte" pour extraire les 10 premières pages.
3. Cliquez sur "Réciter le texte" pour générer l'audio.""")

    with gr.Group():
        # Voice description forwarded to gen_tts as its `description` argument.
        speaker_description = gr.Textbox(
            value='A male voice delivers a slightly expressive and animated speech with a quick speed. The recording features a low-pitch voice, creating a close-sounding audio experience.',
            label='Description de la voix',
        )
        file = gr.File(label="Document à lire")
        btn_extract = gr.Button('Extraire le texte', variant='primary')
        # Filled by extract_text; whatever it contains is what gets spoken.
        text = gr.Textbox(label="Texte extrait")
        btn = gr.Button('Réciter le texte', variant='primary')
        audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")

        # Step 1: PDF -> text box. Step 2: text box + voice description -> audio.
        btn_extract.click(extract_text, inputs=[file], outputs=[text])
        btn.click(gen_tts, inputs=[text, speaker_description], outputs=[audio_out])

    gr.Markdown('Demo by [m-ric](https://x.com/AymericRoucher).')

# Enable the request queue with an open API, then start the server.
demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True, share=True)