Spaces:
Running
Running
File size: 10,006 Bytes
b3fa29f e8958d3 b3fa29f e8958d3 b3fa29f e8958d3 b3fa29f e8958d3 b3fa29f e8958d3 b3fa29f e8958d3 b3fa29f e8958d3 b3fa29f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import gradio as gr
import numpy as np
import torch
import math
from Preprocessing.ArticulatoryCombinedTextFrontend import ArticulatoryCombinedTextFrontend
from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
from run_utterance_cloner import UtteranceCloner
# os.system("pip uninstall -y gradio")
# os.system("pip install gradio==2.7.5.2")
def float2pcm(sig, dtype='int16'):
    """
    Convert a floating point signal to integer PCM samples.

    The signal is scaled by ``2 ** (bits - 1)`` of the target type, shifted by
    half the range for unsigned targets, clipped to the representable range and
    truncated to the integer type. Values outside [-1.0, 1.0) therefore
    saturate instead of wrapping around.

    Adapted from
    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

    :param sig: array-like of floating point samples
    :param dtype: target integer dtype (anything ``np.dtype`` accepts)
    :return: array of the requested integer dtype
    :raises TypeError: if ``sig`` is not a float array or ``dtype`` is not an
        integer type
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")
    info = np.iinfo(dtype)
    abs_max = 2 ** (info.bits - 1)
    offset = info.min + abs_max

    # The lower bound (0 or a negative power of two) is always exactly
    # representable as a float. The upper bound (2**k - 1) is not for wide
    # integer types: it rounds UP to 2**k, which would overflow in ``astype``.
    # In that case fall back to the largest float that still fits.
    lower = float(info.min)
    upper = float(info.max)
    if upper > info.max:
        upper = float(np.nextafter(upper, 0.0))

    # Compute in float64 so that the clip bounds for 32-bit targets are exact
    # even when the input is float16/float32. For the common float32 -> int16
    # case the result is bit-identical because scaling by a power of two is
    # exact in either precision.
    scaled = sig.astype(np.float64) * float(abs_max) + float(offset)
    return scaled.clip(lower, upper).astype(dtype)
class TTS_Interface:
    """
    Backend of the poem demo: synthesises one hard-coded German poem.

    On construction the prosody (durations, pitch, energy) of a reference
    recording is extracted once, and four baseline readings are pre-computed:
    a prosody-cloned and a regular reading for each of the two voices.
    ``read`` then only has to synthesise the manually adjusted version.
    """

    # Reference recordings handed to ``set_utterance_embedding`` per voice.
    _VOICE_REFERENCES = {
        "male": "reference_audios/german_male.wav",
        "female": "reference_audios/german_female.wav",
    }

    # Values of the ``speaker`` argument of ``read`` mapped to voice keys.
    _SPEAKER_TO_VOICE = {"Female Voice": "female", "Male Voice": "male"}

    # Sample rate reported alongside every returned waveform.
    _SAMPLE_RATE = 48000

    # Text used for the regular (non-cloned) reading. It differs from
    # ``self.text`` only in punctuation.
    _PROSE_TEXT = "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild."

    # The tables below hold (phone, position) pairs. Only the position is used:
    # it indexes the duration / pitch tensors obtained from ``extract_prosody``.
    # The phone symbol is a label for human readers.
    # NOTE(review): the positions are presumably offsets into ``self.phones``
    # for exactly this poem text — confirm before changing the text.

    # Positions whose duration is increased by ``lengthening``.
    # Positions 27, 28, 31, 33, 66, 68, 84, 86 and 87 (f, l, t, ɡ, z, ə, b, p, t)
    # were disabled in an earlier revision of this table.
    _LENGTHENING_CANDIDATES = (
        ('ʏ', 29), ('ç', 30), ('ɪ', 32), ('ə', 34), ('n', 35),
        ('ɑ', 67), ('n', 69),
        ('e', 85), ('ə', 88),
    )

    # Positions (all labelled '~') whose duration is increased by ``pause_dur``.
    _PAUSE_CANDIDATES = (('~', 36), ('~', 70), ('~', 89))

    # Positions whose pitch is increased by ``pitch_up``.
    # Positions 37, 44, 45, 99 and 100 (k, ə, n, ə, n) were disabled in an
    # earlier revision of this table.
    _PITCH_RAISE_CANDIDATES = (
        ('y', 38), ('l', 39), ('ə', 40), ('ʃ', 41), ('a', 42), ('t', 43),
        ('a', 71), ('l', 72),
        ('v', 96), ('ɛ', 97), ('l', 98),
    )

    # Positions whose pitch is decreased by ``abs(pitch_up)``.
    # NOTE(review): this table was previously called ``fixme``; the reason for
    # lowering this span is not documented — confirm the intent.
    _PITCH_LOWER_CANDIDATES = (
        ('f', 27), ('l', 28), ('ʏ', 29), ('ç', 30), ('t', 31),
        ('ɪ', 32), ('ɡ', 33), ('ə', 34), ('n', 35),
    )

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.utterance_cloner = UtteranceCloner(device=self.device)
        self.utterance_cloner.tts.set_language("de")

        # The aligner, duration calculator and text frontend are not used by
        # ``read``; they are kept as attributes because they were part of the
        # object's public state.
        self.acoustic_model = Aligner()
        self.acoustic_model.load_state_dict(torch.load("Models/Aligner/aligner.pt", map_location='cpu')["asr_model"])
        self.acoustic_model = self.acoustic_model.to(self.device)
        self.dc = DurationCalculator(reduction_factor=1)
        # NOTE(review): the frontend is created for "en" while everything else
        # here uses "de" — confirm whether that is intended.
        self.tf = ArticulatoryCombinedTextFrontend(language="en")

        self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
        reference_audio = "reference_audios/2.wav"
        self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(self.text, reference_audio, lang="de", on_line_fine_tune=False)
        self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)

        # No voice is active yet, so every ``_set_voice`` call below really
        # sets the embedding.
        self.current_voice = None

        # Readings that reuse the prosody extracted from the reference audio.
        self._set_voice("male")
        self.cloned_speech_male = self._synthesize_with_prosody(self.duration, self.pitch).numpy()
        self._set_voice("female")
        self.cloned_speech_female = self._synthesize_with_prosody(self.duration, self.pitch).numpy()

        # Readings with the prosody the model predicts on its own.
        self._set_voice("male")
        self.reg_speech_male = self._synthesize_regular().numpy()
        self._set_voice("female")
        self.reg_speech_female = self._synthesize_regular().numpy()

    def _set_voice(self, voice):
        """Switch the synthesiser to ``voice`` ("male"/"female") unless it is already active."""
        if self.current_voice != voice:
            self.utterance_cloner.tts.set_utterance_embedding(self._VOICE_REFERENCES[voice])
            self.current_voice = voice

    def _synthesize_with_prosody(self, durations, pitch):
        """Synthesise ``self.phones`` with explicit durations and pitch; returns the result moved to CPU."""
        return self.utterance_cloner.tts(self.phones,
                                         view=False,
                                         durations=durations,
                                         pitch=pitch,
                                         energy=self.energy,
                                         phones=True).cpu()

    def _synthesize_regular(self):
        """Synthesise the prose text without prosody overrides; returns the result moved to CPU."""
        return self.utterance_cloner.tts(self._PROSE_TEXT, view=False).cpu()

    @staticmethod
    def _shift_values(values, candidates, delta):
        """Add ``delta`` in place to ``values`` at the position of every (phone, position) pair."""
        for _phone, position in candidates:
            # Deliberately one scalar-index assignment per position (not a
            # vectorised in-place add): this is the exact form the original
            # code used, so dtype handling on assignment stays the same.
            values[position] = values[position] + delta

    def read(self, _, speaker, lengthening, pause_dur, pitch_up):
        """
        Synthesise the poem with manual prosody adjustments.

        :param _: unused; placeholder for the first positional input of the
            calling interface
        :param speaker: "Female Voice" or "Male Voice"; any other value keeps
            the currently active voice
        :param lengthening: amount added to the durations of the lengthening
            positions
        :param pause_dur: amount added to the durations of the pause positions
        :param pitch_up: amount added to the pitch of the raise positions; its
            absolute value is subtracted at the lowering positions
        :return: three ``(sample_rate, int16 PCM array)`` tuples: the regular
            reading, the prosody-cloned reading and the adjusted reading, all
            in the active voice
        """
        voice = self._SPEAKER_TO_VOICE.get(speaker)
        if voice is not None:
            self._set_voice(voice)

        # Work on copies so the extracted reference prosody stays untouched
        # between calls.
        duration = self.duration.clone()
        self._shift_values(duration, self._LENGTHENING_CANDIDATES, lengthening)
        self._shift_values(duration, self._PAUSE_CANDIDATES, pause_dur)

        pitch = self.pitch.clone()
        self._shift_values(pitch, self._PITCH_RAISE_CANDIDATES, pitch_up)
        self._shift_values(pitch, self._PITCH_LOWER_CANDIDATES, -abs(pitch_up))

        manipulated_speech = self._synthesize_with_prosody(duration, pitch)

        if self.current_voice == "female":
            cloned_speech = self.cloned_speech_female
            reg_speech = self.reg_speech_female
        else:
            cloned_speech = self.cloned_speech_male
            reg_speech = self.reg_speech_male

        return ((self._SAMPLE_RATE, float2pcm(reg_speech)),
                (self._SAMPLE_RATE, float2pcm(cloned_speech)),
                (self._SAMPLE_RATE, float2pcm(manipulated_speech.numpy())))
# Build the model (including its pre-computed readings) once at start-up;
# the interface callback below reuses this single instance.
poem_model = TTS_Interface()

# HTML shown in the interface's article slot.
article = (
    "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones "
    "as soon as they are done. More diverse training data can help with more exact cloning and more "
    "controllability. For example we are still trying to incorporate more singing data. </p>"
    "<p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>"
    "Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"
)

# The only selectable transcript; it is both the sole choice and the default.
poem_transcript = "Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild."

# Input components, in the positional order expected by ``poem_model.read``.
transcript_dropdown = gr.inputs.Dropdown([poem_transcript],
                                         type="value",
                                         default=poem_transcript,
                                         label="Poem Transcript")
speaker_dropdown = gr.inputs.Dropdown(["Female Voice", "Male Voice"],
                                      type="value",
                                      default="Female Voice",
                                      label="Select a Speaker")
lengthening_slider = gr.inputs.Slider(minimum=0, maximum=4, step=1, default=2, label="Lengthening on verse end")
pause_slider = gr.inputs.Slider(minimum=0, maximum=20, step=1, default=8, label="Length of Pause after verse end")
pitch_slider = gr.inputs.Slider(minimum=-0.4, maximum=0.4, step=0.01, default=0.2, label="Raise Pitch on new verse")

# Output components, in the order of the tuples returned by ``poem_model.read``.
prose_audio = gr.outputs.Audio(type="numpy", label="Poem read with prose reading")
cloned_audio = gr.outputs.Audio(type="numpy", label="Poem cloned from a reference")
adjusted_audio = gr.outputs.Audio(type="numpy", label="Poem after human-in-the-loop adjustments")

iface = gr.Interface(
    fn=poem_model.read,
    inputs=[transcript_dropdown, speaker_dropdown, lengthening_slider, pause_slider, pitch_slider],
    outputs=[prose_audio, cloned_audio, adjusted_audio],
    layout="vertical",
    title="PoeticTTS - Customizing Poetry for Literary Studies",
    thumbnail="Utility/toucan.png",
    theme="default",
    allow_flagging="never",
    allow_screenshot=False,
    description="Customize how a poem is read by a text-to-speech system with intuitive high-level controls. You can control markers of syntactic phrasing ",
    article=article,
)
iface.launch(enable_queue=True)
|