Spaces:

Flux9665
/

PoeticTTS

Running

App Files Files

PoeticTTS / app.py

Flux9665

Update app.py

a8bfe3a about 2 years ago

raw

history blame

10.3 kB

	import os

	import gradio as gr
	import numpy as np
	import soundfile as sf
	import torch

	# Startup bootstrap: fetch the IMS-Toucan toolkit (ref v2.3) into a scratch directory
	# and move its contents into the current working directory.
	# NOTE(review): the return codes of both os.system calls are ignored, so a failed
	# clone or move is not reported here and would only show up in a later statement.
	os.system("git clone --branch v2.3 https://github.com/DigitalPhonetics/IMS-Toucan.git toucan_codebase")
	os.system("mv toucan_codebase/* .")

	# This import has to stay below the clone/move above.
	# NOTE(review): presumably run_model_downloader is shipped by the cloned repository — confirm.
	from run_model_downloader import download_models

	# NOTE(review): presumably fetches the pretrained checkpoints (such as the
	# Models/Aligner/aligner.pt file loaded further down) — confirm against the toolkit.
	download_models()



	import gradio as gr
	import numpy as np
	import torch
	import math
	import os
	from Preprocessing.TextFrontend import ArticulatoryCombinedTextFrontend
	from TrainingInterfaces.Text_to_Spectrogram.AutoAligner.Aligner import Aligner
	from TrainingInterfaces.Text_to_Spectrogram.FastSpeech2.DurationCalculator import DurationCalculator
	from InferenceInterfaces.UtteranceCloner import UtteranceCloner

	def float2pcm(sig, dtype='int16'):
	    """
	    Convert a floating point signal to integer PCM samples.

	    Adapted from
	    https://gist.github.com/HudsonHuang/fbdf8e9af7993fe2a91620d3fb86a182

	    Every sample is multiplied by 2 ** (bits - 1) of the target type, shifted by
	    ``iinfo.min + 2 ** (bits - 1)`` (0 for signed targets, half the range for
	    unsigned targets), clipped to the representable range of the target type and
	    finally cast to it.

	    Args:
	        sig: array-like of floating point samples.
	        dtype: integer target type, given as anything ``np.dtype`` accepts.

	    Returns:
	        The converted samples with the requested integer dtype.

	    Raises:
	        TypeError: if ``sig`` is not a float array or ``dtype`` is not an integer type.
	    """
	    samples = np.asarray(sig)
	    if samples.dtype.kind != 'f':
	        raise TypeError("'sig' must be a float array")

	    target = np.dtype(dtype)
	    if target.kind not in 'iu':
	        raise TypeError("'dtype' must be an integer type")

	    limits = np.iinfo(target)
	    half_range = 2 ** (limits.bits - 1)
	    shift = limits.min + half_range

	    scaled = samples * half_range + shift
	    # clip before casting so that e.g. +1.0 maps to the largest value instead of overflowing
	    bounded = scaled.clip(limits.min, limits.max)
	    return bounded.astype(target)


	class TTS_Interface:
	    """
	    Wraps an IMS-Toucan UtteranceCloner to read one fixed German poem in three ways.

	    At construction time the prosody (durations, pitch, energy) of a reference
	    recording is extracted once, and for both a male and a female voice the poem
	    is synthesized twice: once with that extracted prosody and once from plain
	    text without any prosody overrides. ``read`` then only has to synthesize the
	    manually adjusted variant on each call and returns it next to the two
	    precomputed renditions.
	    """

	    # Reference recordings used as utterance embedding, keyed by the internal voice name.
	    _VOICE_EMBEDDINGS = {
	        "male": "reference_audios/german_male.wav",
	        "female": "reference_audios/german_female.wav",
	    }

	    # Text of the rendition that is synthesized without prosody overrides.
	    _PROSE_TEXT = "Quellen hattest du ihm, hattest dem Flüchtigen kühle Schatten geschenkt, und die Gestade sahen all ihm nach, und es bebte aus den Wellen ihr lieblich Bild."

	    # The tables below hold (phone symbol, index) pairs. Only the index is used by
	    # the code; the symbol records which phone is expected at that position.
	    # Commented-out entries are positions that are currently left unmodified.
	    # NOTE(review): the indices are hard-coded and presumably only valid for the
	    # phone sequence of self.text — confirm whenever the text or phonemizer changes.

	    # positions whose duration grows by the "lengthening" amount
	    _LENGTHENING_TARGETS = (
	        # ('f', 27),
	        # ('l', 28),
	        ('ʏ', 29),
	        ('ç', 30),
	        # ('t', 31),
	        ('ɪ', 32),
	        # ('ɡ', 33),
	        ('ə', 34),
	        ('n', 35),

	        # ('z', 66),
	        ('ɑ', 67),
	        # ('ə', 68),
	        ('n', 69),

	        # ('b', 84),
	        ('e', 85),
	        # ('p', 86),
	        # ('t', 87),
	        ('ə', 88),
	    )

	    # pause positions whose duration grows by the "pause_dur" amount
	    _PAUSE_TARGETS = (
	        ('~', 36),
	        ('~', 70),
	        ('~', 89),
	    )

	    # positions whose pitch is shifted by "pitch_up"
	    _PITCH_RAISE_TARGETS = (
	        # ('k', 37),
	        ('y', 38),
	        ('l', 39),
	        ('ə', 40),
	        ('ʃ', 41),
	        ('a', 42),
	        ('t', 43),
	        # ('ə', 44),
	        # ('n', 45),

	        ('a', 71),
	        ('l', 72),

	        ('v', 96),
	        ('ɛ', 97),
	        ('l', 98),
	        # ('ə', 99),
	        # ('n', 100)
	    )

	    # positions whose pitch is lowered by abs(pitch_up) (called "fixme" originally)
	    _PITCH_LOWER_TARGETS = (
	        ('f', 27),
	        ('l', 28),
	        ('ʏ', 29),
	        ('ç', 30),
	        ('t', 31),
	        ('ɪ', 32),
	        ('ɡ', 33),
	        ('ə', 34),
	        ('n', 35),
	    )

	    def __init__(self):
	        """
	        Load the models, extract the reference prosody and precompute the fixed renditions.

	        After construction the female voice is the active one.
	        """
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"

	        self.utterance_cloner = UtteranceCloner(model_id="Meta", device=self.device)
	        self.utterance_cloner.tts.set_language("de")

	        # The aligner, duration calculator and text frontend are set up here but
	        # are not used by any method visible in this class.
	        aligner = Aligner()
	        checkpoint = torch.load("Models/Aligner/aligner.pt", map_location='cpu')
	        aligner.load_state_dict(checkpoint["asr_model"])
	        self.acoustic_model = aligner.to(self.device)
	        self.dc = DurationCalculator(reduction_factor=1)
	        self.tf = ArticulatoryCombinedTextFrontend(language="en")

	        # prosody of the reference recording, extracted once and reused for every call
	        self.text = "Quellen hattest du ihm, hattest dem Flüchtigen, kühle Schatten geschenkt, und die Gestade sahen, all ihm nach, und es bebte, aus den Wellen ihr lieblich Bild."
	        reference_audio = "reference_audios/2.wav"
	        self.duration, self.pitch, self.energy, _, _ = self.utterance_cloner.extract_prosody(
	            self.text,
	            reference_audio,
	            lang="de",
	            on_line_fine_tune=False)
	        self.phones = self.utterance_cloner.tts.text2phone.get_phone_string(self.text)

	        # renditions that follow the extracted reference prosody
	        self._activate_voice("male")
	        self.cloned_speech_male = self._synthesize_with_prosody(self.duration, self.pitch).numpy()
	        self._activate_voice("female")
	        self.cloned_speech_female = self._synthesize_with_prosody(self.duration, self.pitch).numpy()

	        # renditions synthesized from plain text without prosody overrides
	        self._activate_voice("male")
	        self.reg_speech_male = self._synthesize_prose()
	        self._activate_voice("female")
	        self.reg_speech_female = self._synthesize_prose()

	    def _activate_voice(self, voice):
	        """Set the utterance embedding for ``voice`` ("male" or "female") and remember it as active."""
	        self.utterance_cloner.tts.set_utterance_embedding(self._VOICE_EMBEDDINGS[voice])
	        self.current_voice = voice

	    def _synthesize_with_prosody(self, durations, pitch):
	        """Synthesize the stored phone string with the given durations and pitch and the stored energy; returns the result moved to the CPU."""
	        wave = self.utterance_cloner.tts(self.phones,
	                                         view=False,
	                                         durations=durations,
	                                         pitch=pitch,
	                                         energy=self.energy,
	                                         phones=True)
	        return wave.cpu()

	    def _synthesize_prose(self):
	        """Synthesize the prose text without prosody overrides and return it as a numpy array."""
	        return self.utterance_cloner.tts(self._PROSE_TEXT, view=False).cpu().numpy()

	    def read(self, _, speaker, lengthening, pause_dur, pitch_up):
	        """
	        Synthesize the poem with adjusted prosody and return it with the precomputed renditions.

	        Args:
	            _: ignored (the interface passes the selected poem transcript here).
	            speaker: "Female Voice" or "Male Voice"; any other value keeps the active voice.
	            lengthening: amount added to the durations at the lengthening positions.
	            pause_dur: amount added to the durations at the pause positions.
	            pitch_up: amount added to the pitch at the raise positions; its absolute
	                value is subtracted from the pitch at the lowering positions.

	        Returns:
	            Three (sample_rate, int16 samples) tuples with a sample rate of 48000:
	            the rendition without prosody overrides, the rendition with the
	            reference prosody and the rendition with the adjusted prosody.
	        """
	        # only swap the utterance embedding when the requested voice is not active yet
	        requested_voice = None
	        if speaker == "Female Voice":
	            requested_voice = "female"
	        elif speaker == "Male Voice":
	            requested_voice = "male"
	        if requested_voice is not None and requested_voice != self.current_voice:
	            self._activate_voice(requested_voice)

	        # Work on copies so that the extracted reference prosody stays untouched.
	        # The explicit "x[i] = x[i] + value" form is kept on purpose: an in-place
	        # "+=" can behave differently when the tensor dtype and the addend differ.
	        adjusted_duration = self.duration.clone()
	        for _symbol, position in self._LENGTHENING_TARGETS:
	            adjusted_duration[position] = adjusted_duration[position] + lengthening
	        for _symbol, position in self._PAUSE_TARGETS:
	            adjusted_duration[position] = adjusted_duration[position] + pause_dur

	        adjusted_pitch = self.pitch.clone()
	        for _symbol, position in self._PITCH_RAISE_TARGETS:
	            adjusted_pitch[position] = adjusted_pitch[position] + pitch_up
	        for _symbol, position in self._PITCH_LOWER_TARGETS:
	            adjusted_pitch[position] = adjusted_pitch[position] - abs(pitch_up)

	        manipulated_speech = self._synthesize_with_prosody(adjusted_duration, adjusted_pitch)

	        # pick the precomputed renditions that belong to the active voice
	        female_active = self.current_voice == "female"
	        reg_speech = self.reg_speech_female if female_active else self.reg_speech_male
	        cloned_speech = self.cloned_speech_female if female_active else self.cloned_speech_male

	        sample_rate = 48000
	        return ((sample_rate, float2pcm(reg_speech)),
	                (sample_rate, float2pcm(cloned_speech)),
	                (sample_rate, float2pcm(manipulated_speech.numpy())))


	# Build the TTS wrapper once at start-up; its constructor already synthesizes the
	# fixed renditions, so each request only has to produce the adjusted one.
	poem_model = TTS_Interface()

	# HTML shown in the article section of the interface
	article = "<p style='text-align: left'>This is still a work in progress, models will be exchanged for better ones as soon as they are done. More diverse training data can help with more exact cloning and more controllability. For example we are still trying to incorporate more singing data. </p><p style='text-align: center'><a href='https://github.com/DigitalPhonetics/IMS-Toucan' target='_blank'>Click here to learn more about the IMS Toucan Speech Synthesis Toolkit</a></p>"

	# The only selectable poem; "//" marks the verse boundaries in the displayed transcript.
	poem_transcript = "Quellen hattest du ihm, hattest dem Flüchtigen // kühle Schatten geschenkt, und die Gestade sahn // all ihm nach, und es bebte // aus den Wellen ihr lieblich Bild."

	# Input components, in the order in which poem_model.read receives their values.
	input_components = [
	    gr.inputs.Dropdown([poem_transcript],
	                       type="value",
	                       default=poem_transcript,
	                       label="Poem Transcript"),
	    gr.inputs.Dropdown(["Female Voice", "Male Voice"],
	                       type="value",
	                       default="Female Voice",
	                       label="Select a Speaker"),
	    gr.inputs.Slider(minimum=0, maximum=4, step=1, default=2, label="Lengthening on verse end"),
	    gr.inputs.Slider(minimum=0, maximum=20, step=1, default=8, label="Length of Pause after verse end"),
	    gr.inputs.Slider(minimum=-0.4, maximum=0.4, step=0.01, default=0.2, label="Raise Pitch on new verse"),
	]

	# Output components, in the order of the three tuples returned by poem_model.read.
	output_components = [
	    gr.outputs.Audio(type="numpy", label="Poem read with prose reading"),
	    gr.outputs.Audio(type="numpy", label="Poem cloned from a reference"),
	    gr.outputs.Audio(type="numpy", label="Poem after human-in-the-loop adjustments"),
	]

	iface = gr.Interface(
	    fn=poem_model.read,
	    inputs=input_components,
	    outputs=output_components,
	    layout="vertical",
	    title="PoeticTTS - Customizing Poetry for Literary Studies",
	    thumbnail="Utility/toucan.png",
	    theme="default",
	    allow_flagging="never",
	    allow_screenshot=False,
	    description="Customize how a poem is read by a text-to-speech system with intuitive high-level controls. You can control phrasing markers to go from prose style syntactic phrasing to verse aware poetry style phrasing with the sliders below.",
	    article=article,
	)
	iface.launch(enable_queue=True)