| import os | |
| import time | |
| import torch | |
| import urllib.request | |
| import gradio as gr | |
| import nltk | |
| import numpy as np | |
| import soundfile as sf | |
| from espnet2.bin.tts_inference import Text2Speech | |
| from espnet2.utils.types import str_or_none | |
| from pathlib import Path | |
| from nltk.tokenize import sent_tokenize | |
# Fetch the "punkt" resource from NLTK at start-up; sent_tokenize is used
# below to split the input text into sentences.
nltk.download("punkt")

# Load the pretrained model once at import time so every request reuses it.
# Inference runs on the CPU; the speed and noise scales are all set to 1.0.
gos_text2speech = Text2Speech.from_pretrained(
    model_tag="bartelds/gos_tts",
    device="cpu",
    speed_control_alpha=1.0,
    noise_scale=1.0,
    noise_scale_dur=1.0,
)
# Value passed as ``sids`` to the model for each variant offered in the UI.
_SPEAKER_IDS = {
    "Hoogelaandsters": 1,
    "Oldambsters": 2,
    "Westerkertaaiers": 3,
}


def inference(text, lang):
    """Synthesize *text* in the requested variant and save it as ``out.wav``.

    The text is lower-cased and split into sentences; each sentence is
    synthesized separately with the ``sids`` value that belongs to ``lang``,
    and the resulting waveforms are concatenated into a single file.

    Args:
        text: Text to synthesize.
        lang: Variant name; must be one of the keys of ``_SPEAKER_IDS``.

    Returns:
        The tuple ``("out.wav", "out.wav")`` -- the same path twice, one for
        the audio output and one for the file output of the interface.

    Raises:
        ValueError: If ``lang`` is not a supported variant, or if ``text``
            is empty or contains only whitespace.
    """
    # Fail fast with a clear message instead of hitting an unbound ``wav``
    # inside the loop when the variant is not recognised.
    if lang not in _SPEAKER_IDS:
        raise ValueError(
            f"Unsupported variant {lang!r}; expected one of {sorted(_SPEAKER_IDS)}"
        )
    # Blank input yields no sentences, which would make np.concatenate fail
    # with an unhelpful "need at least one array" error.
    if not text or not text.strip():
        raise ValueError("Input text is empty; nothing to synthesize")

    sids = np.array([_SPEAKER_IDS[lang]])
    with torch.no_grad():
        # Lower-case once for the whole text; the sentences inherit it.
        waveforms = [
            gos_text2speech(sentence, sids=sids)["wav"].view(-1).cpu().numpy()
            for sentence in sent_tokenize(text.lower())
        ]

    concatenated_wav = np.concatenate(waveforms)
    # NOTE(review): the output path is fixed, so each call overwrites the
    # previous result -- confirm this is acceptable for concurrent users.
    sf.write("out.wav", concatenated_wav, gos_text2speech.fs)
    return "out.wav", "out.wav"
# Passed to the interface as its title.
title = "Gronings text-to-speech"

# Each example row is [input text, variant], in the order of the input widgets.
examples = [["Mamme mos even noar winkel om n bosschop.", "Hoogelaandsters"]]

# Inputs: free text plus a radio button that selects the variant.
text_box = gr.inputs.Textbox(label="Input text", lines=3)
variant_radio = gr.inputs.Radio(
    choices=["Hoogelaandsters", "Oldambsters", "Westerkertaaiers"],
    type="value",
    default="Hoogelaandsters",
    label="Variant",
)

# Outputs: inference returns the same WAV path twice, once for each of these.
audio_output = gr.outputs.Audio(type="file", label="Output")
file_output = gr.outputs.File()

demo = gr.Interface(
    inference,
    [text_box, variant_radio],
    [audio_output, file_output],
    title=title,
    examples=examples,
)
demo.launch(enable_queue=True)