# Gradio demo: Swedish text-to-speech with a fine-tuned SpeechT5 model.
import gradio as gr
import librosa
import numpy as np
import torch
import re
from num2words import num2words

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# Hub id of the fine-tuned SpeechT5 checkpoint used for the acoustic model.
checkpoint = "GreenCounsel/speecht5_tts_common_voice_5_sv"
# The processor and the vocoder are loaded from the "microsoft/..." repos,
# while only the text-to-speech model itself comes from `checkpoint`.
# All three are loaded once at import time and reused by every request.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Speaker label (as offered in the UI) -> path of the .npy file holding that
# speaker's embedding. Insertion order is Female, Male, Experimental.
speaker_embeddings = dict(
    Female="spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    Male="spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    Experimental="spkemb/embeddings.npy",
)


def predict(text, speaker):
    """Synthesize speech for *text* with the chosen speaker embedding.

    The text is normalized before synthesis: standalone integers are spelled
    out in Swedish, Swedish vowels are transliterated to ASCII digraphs, and
    hyphens and typographic quotes are removed.

    Args:
        text: The text to speak. If it is empty/whitespace-only, ``None``, or
            longer than 200 characters after stripping, a fixed Swedish
            error sentence is spoken instead.
        speaker: A key of the module-level ``speaker_embeddings`` mapping.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is a NumPy ``int16``
        array, the form expected by ``gr.Audio(type="numpy")``.

    Raises:
        KeyError: If *speaker* is not a key of ``speaker_embeddings``.
    """
    # Treat a missing value like empty input rather than crashing on .strip().
    stripped = (text or "").strip()
    if not stripped or len(stripped) > 200:
        text = "Du måste ha minst ett och max 200 tecken."

    # Spell out every standalone integer in one pass. Replacing per match
    # (instead of str.replace on each number) keeps a short number such as
    # "1" from corrupting a longer one such as "10", and handles leading
    # zeros ("007") as a single token.
    text = re.sub(
        r"\b\d+\b",
        lambda match: num2words(int(match.group()), lang="sv"),
        text,
    )

    # Transliterate characters outside the processor's vocabulary and drop
    # hyphens and typographic quotes.
    replacements = (
        ('Ä', 'ae'),
        ('Å', 'o'),
        ('Ö', 'oe'),
        ('ä', 'ae'),
        ('å', 'o'),
        ('ö', 'oe'),
        ('ô', 'oe'),
        ('-', ''),
        ('‘', ''),
        ('’', ''),
        ('“', ''),
        ('”', ''),
    )
    for src, dst in replacements:
        text = text.replace(src, dst)

    inputs = processor(text=text, return_tensors="pt")

    # Limit the input length to what the model's text positions support.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # Load the selected speaker embedding and add a leading batch dimension.
    speaker_embedding = np.load(speaker_embeddings[speaker])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale the waveform to the int16 range for playback.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)


# Heading text; passed to gr.Interface as `title`.
title = "SpeechT5 finetuned Swedish, TTS "

# Markdown introduction; passed to gr.Interface as `description`.
description = """
SpeechT5 text-to-speech model finetuned on the Swedish language from the 
Common Voice dataset. Inference runs on a basic CPU (2 vCPU, 16 GB ram) so 
please have patience if it takes some time. As a company founded by a female 
coder, our resources are extremely limited (female founders in tech only get approx. 
1 % of the venture capital and the women who receive funding seldom are the 
ones actually handling the tech). We are in a very biased sphere where 
female coders' companies seldom get the resources which would normally 
be necessary to do what they do. The app uses the SpeechT5 model 
finetuned for swedish by GreenCounsel, available here: [https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv](https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv).
"""

# HTML block with reference links and the BibTeX entry for the SpeechT5
# paper; passed to gr.Interface as `article`.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
<a href="https://github.com/microsoft/SpeechT5/">original SpeechT5</a> |
<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
<pre>
@article{Ao2021SpeechT5,
  title   = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
  author  = {Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
  eprint={2110.07205},
  archivePrefix={arXiv},
  primaryClass={eess.AS},
  year={2021}
}
</pre>
</div>
"""


# Sample inputs offered in the UI: each entry is [text, speaker label], in
# the same order as the inputs of `predict`.
examples = [
    [
        "GreenCounsel grundades i Malmö för sex år sedan.",
        "Female",
    ],
    [
        "Med hjälp av maskininlärning kan mycket av juridiken automatiseras samtidigt som juristerna fokuserar på frågor där de ger störst värde.",
        "Male",
    ],
    [
        "GreenCounsel har byggt en chatbott som kan förstå frågor på många olika språk och ge kvalitetssäkrade svar.",
        "Female",
    ],
    [
        "Vi har också byggt ett system för att automatisera arbetsflöden för juridiska tjänster via internet.",
        "Male",
    ],
    [
        "Talsyntesen bygger på en engelsk modell och kan därför upplevas som att jag bryter lite på engelska.",
        "Female",
    ],
]

# Input widgets, in the same order as the parameters of `predict`.
text_input = gr.Text(label="Input Text")
speaker_input = gr.Radio(
    label="Speaker",
    choices=["Female", "Male", "Experimental"],
    value="Female",
)

# `predict` returns a (sample_rate, samples) tuple, hence type="numpy".
speech_output = gr.Audio(label="Generated Speech", type="numpy")

# Assemble the interface and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[text_input, speaker_input],
    outputs=[speech_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()