SpeechT5-sv / app.py
CEHB's picture
Update app.py
67c5fd4
raw
history blame
4.26 kB
import gradio as gr
import librosa
import numpy as np
import torch
import re
from num2words import num2words
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Hub id of the Swedish fine-tuned SpeechT5 text-to-speech weights.
checkpoint = "GreenCounsel/speecht5_tts_common_voice_5_sv"

# The processor and the vocoder are loaded from the stock Microsoft
# repositories; only the TTS model itself comes from `checkpoint`.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Maps the speaker label offered in the UI to the .npy file holding that
# speaker's embedding (loaded with np.load at prediction time).
speaker_embeddings = {
    "Female": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    "Male": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
}
# Standalone runs of digits ("12" in "om 12 dagar", but not in "abc12def").
_NUMBER_PATTERN = re.compile(r'\b\d+\b')

# Applied in order after numbers have been spelled out. Swedish letters are
# mapped to ASCII approximations and dashes / typographic quotes are removed.
# NOTE(review): presumably needed because the English "microsoft/speecht5_tts"
# processor does not handle these characters - confirm against its vocabulary.
_CHAR_REPLACEMENTS = (
    ('Ä', 'ae'),
    ('Å', 'o'),
    ('Ö', 'oe'),
    ('ä', 'ae'),
    ('å', 'o'),
    ('ö', 'oe'),
    ('ô', 'oe'),
    ('-', ''),
    ('‘', ''),
    ('’', ''),
    ('“', ''),
    ('”', ''),
)


def _spell_out_numbers(text):
    """Replace every standalone digit run in *text* with its Swedish words."""
    # A single regex pass replaces each matched token in place. Chained
    # str.replace calls on the number's string form would also hit digits
    # inside longer numbers ("1" inside "10") and would miss tokens with
    # leading zeros ("007" parses to 7, whose string form is not "007").
    return _NUMBER_PATTERN.sub(
        lambda match: num2words(int(match.group()), lang="sv"),
        text,
    )


def _normalize_text(text):
    """Spell out numbers, then apply the character replacement table."""
    # Numbers go first: the Swedish number words can themselves contain
    # characters (e.g. "å") that the replacement table must still rewrite.
    text = _spell_out_numbers(text)
    for src, dst in _CHAR_REPLACEMENTS:
        text = text.replace(src, dst)
    return text


def predict(text, speaker):
    """Synthesize speech for *text* using the selected speaker embedding.

    Args:
        text: Text to speak. If it is empty/whitespace-only (or None) or
            longer than 200 characters after stripping, a fixed Swedish
            notice about the length limits is spoken instead.
        speaker: Key into the module-level ``speaker_embeddings`` table.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is an int16 NumPy
        array, the form handed to the ``gr.Audio(type="numpy")`` output.

    Raises:
        KeyError: If *speaker* is not a key of ``speaker_embeddings``.
    """
    stripped = (text or "").strip()
    if not stripped or len(stripped) > 200:
        text = "Du måste ha minst ett och max 200 tecken."

    text = _normalize_text(text)

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's configuration allows.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # Load the stored embedding and add a leading dimension of size 1.
    speaker_embedding = np.load(speaker_embeddings[speaker])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale the waveform by 32767 and store it as 16-bit integer samples.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)
# Heading of the demo page; passed as `title=` to gr.Interface below.
title = "SpeechT5 finetuned Swedish, TTS "
# Introductory text for the demo page; passed as `description=` to
# gr.Interface below. Contains a Markdown-style link to the fine-tuned model.
description = """
SpeechT5 text-to-speech model finetuned on the Swedish language from the
Common Voice dataset. Inference runs on a basic CPU (2 vCPU, 16 GB ram) so
please have patience if it takes some time. As a company founded by a female
coder, our resources are extremely limited (female founders in tech only get approx.
1 % of the venture capital and the women who receive funding seldom are the
ones actually handling the tech). We are in a very biased sphere where
female coders' companies seldom get the resources which would normally
be necessary to do what they do. The app uses the SpeechT5 model
finetuned for swedish by GreenCounsel, available here: [https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv](https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv).
"""
# HTML block with reference links and the BibTeX entry for the SpeechT5
# paper; passed as `article=` to gr.Interface below.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original SpeechT5</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<pre>
@article{Ao2021SpeechT5,
title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
author = {Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
eprint={2110.07205},
archivePrefix={arXiv},
primaryClass={eess.AS},
year={2021}
}
</pre>
</div>
"""
# Sample inputs for the demo, one [text, speaker] pair per row, in the same
# order as the interface inputs (text box, then speaker choice).
examples = [
    [
        "GreenCounsel grundades i Malmö för sex år sedan.",
        "Female",
    ],
    [
        "Med hjälp av maskininlärning kan mycket av juridiken automatiseras samtidigt som juristerna fokuserar på frågor där de ger störst värde.",
        "Male",
    ],
    [
        "GreenCounsel har byggt en chatbott som kan förstå frågor på många olika språk och ge kvalitetssäkrade svar.",
        "Female",
    ],
    [
        "Vi har också byggt ett system för att automatisera arbetsflöden för juridiska tjänster via internet.",
        "Male",
    ],
    [
        "Talsyntesen bygger på en engelsk modell och kan därför upplevas som att jag bryter lite på engelska.",
        "Female",
    ],
]
# Build the web UI: a text box and a speaker selector feed `predict`, whose
# result is shown in an audio component. The interface is launched as soon as
# this module is executed.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=["Female", "Male"],
            value="Female",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()