|
import os |
|
import time |
|
import torch |
|
import urllib.request |
|
import gradio as gr |
|
import nltk |
|
import numpy as np |
|
import soundfile as sf |
|
from espnet2.bin.tts_inference import Text2Speech |
|
from espnet2.utils.types import str_or_none |
|
from pathlib import Path |
|
from nltk.tokenize import sent_tokenize |
|
|
|
# Fetch the NLTK "punkt" resource at start-up (presumably the data that
# sent_tokenize relies on further down in this file).
nltk.download("punkt")

# Options passed to Text2Speech.from_pretrained for the Gronings TTS model;
# inference runs on the CPU.
_GOS_TTS_OPTIONS = {
    "model_tag": "bartelds/gos_tts",
    "device": "cpu",
    "speed_control_alpha": 1.0,
    "noise_scale": 1.0,
    "noise_scale_dur": 1.0,
}

# Module-level synthesizer, loaded once and shared by every request.
gos_text2speech = Text2Speech.from_pretrained(**_GOS_TTS_OPTIONS)
|
|
|
# Speaker id passed to the TTS model for each variant offered in the UI.
_SPEAKER_IDS = {
    "Hoogelaandsters": 1,
    "Oldambsters": 2,
    "Westerkertaaiers": 3,
}


def inference(text, lang):
    """Synthesize ``text`` in the requested variant and write it to ``out.wav``.

    The text is lower-cased and split into sentences; every sentence is
    synthesized separately with the speaker id belonging to ``lang``, and the
    flattened waveforms are concatenated into a single audio file.

    Args:
        text: Input text to synthesize.
        lang: Variant name; must be one of the keys of ``_SPEAKER_IDS``.

    Returns:
        The tuple ``("out.wav", "out.wav")`` -- the same path twice, one entry
        per output component of the interface.

    Raises:
        ValueError: If ``lang`` is not a known variant, or if ``text`` is
            empty/blank and there is nothing to synthesize.
    """
    # Validate before doing any work: previously an unknown variant left the
    # waveform variable unbound and failed with an unrelated error.
    if lang not in _SPEAKER_IDS:
        known = ", ".join(_SPEAKER_IDS)
        raise ValueError(f"Unknown variant {lang!r}; expected one of: {known}")
    if text is None or not text.strip():
        # np.concatenate rejects an empty list with an opaque message, so
        # report the real problem instead.
        raise ValueError("No input text to synthesize.")

    speaker_id = _SPEAKER_IDS[lang]

    with torch.no_grad():
        # Lower-casing once up front covers every sentence.
        sentences = sent_tokenize(text.lower())
        if not sentences:
            raise ValueError("No input text to synthesize.")

        waveforms = [
            gos_text2speech(sentence, sids=np.array([speaker_id]))["wav"]
            .view(-1)
            .cpu()
            .numpy()
            for sentence in sentences
        ]

    # NOTE(review): the output path is fixed, so overlapping requests would
    # overwrite each other's file -- confirm that requests are serialized.
    sf.write("out.wav", np.concatenate(waveforms), gos_text2speech.fs)

    return "out.wav", "out.wav"
|
|
|
# Page title and the example row (input text, variant) shown in the UI.
title = "Gronings text-to-speech"
examples = [["Mamme mos even noar winkel om n bosschop.", "Hoogelaandsters"]]

# Input components: free text plus a radio selector for the variant.
_text_input = gr.inputs.Textbox(label="Input text", lines=3)
_variant_input = gr.inputs.Radio(
    choices=["Hoogelaandsters", "Oldambsters", "Westerkertaaiers"],
    type="value",
    default="Hoogelaandsters",
    label="Variant",
)

# Output components: an audio player and a file entry; inference returns one
# path for each.
_audio_output = gr.outputs.Audio(type="file", label="Output")
_file_output = gr.outputs.File()

_demo = gr.Interface(
    inference,
    [_text_input, _variant_input],
    [_audio_output, _file_output],
    title=title,
    examples=examples,
)
_demo.launch(enable_queue=True)
|
|