from pydantic import BaseModel, Field
import os
from pathlib import Path
from enum import Enum
from models.encoder import inference as encoder
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from control.mkgui.base.components.types import FileContent
from models.vocoder.hifigan import inference as gan_vocoder
from models.synthesizer.inference import Synthesizer
from typing import Any, Tuple
import matplotlib.pyplot as plt
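# Voice-cloning demo app: embed a reference voice with the encoder, synthesize
# mel spectrograms for the input text, then vocode them into audible speech.
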
# Constants
AUDIO_SAMPLES_DIR = f"data{os.sep}samples{os.sep}"
SYN_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}synthesizer"
ENC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}encoder"
VOC_MODELS_DIRT = f"data{os.sep}ckpt{os.sep}vocoder"
TEMP_SOURCE_AUDIO = f"wavs{os.sep}temp_source.wav"
TEMP_RESULT_AUDIO = f"wavs{os.sep}temp_result.wav"
if not os.path.isdir("wavs"):
    os.makedirs("wavs")
# Load local sample audio files as the selectable options TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
else:
    raise Exception(f"Sample folder {AUDIO_SAMPLES_DIR} doesn't exist. Add at least one .wav sample so it can be offered as a source option.")
# Pre-load available model checkpoints
if os.path.isdir(SYN_MODELS_DIRT):
    synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
    raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist. Please move the model files to the path above and retry!")

if os.path.isdir(ENC_MODELS_DIRT):
    encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded encoder models: " + str(len(encoders)))
else:
    raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
    vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
    print("Loaded vocoder models: " + str(len(vocoders)))
else:
    raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")

class Input(BaseModel):
    message: str = Field(
        ..., example="ๆฌข่ฟŽไฝฟ็”จๅทฅๅ…ท็ฎฑ, ็Žฐๅทฒๆ”ฏๆŒไธญๆ–‡่พ“ๅ…ฅ๏ผ", alias="Text content"
    )
    local_audio_file: audio_input_selection = Field(
        ..., alias="Select audio (local wav)",
        description="Select a local audio file."
    )
    record_audio_file: FileContent = Field(default=None, alias="Record audio",
        description="Record audio.", is_recorder=True, mime_type="audio/wav")
    upload_audio_file: FileContent = Field(default=None, alias="Or upload audio",
        description="Drag and drop or click to upload.", mime_type="audio/wav")
    encoder: encoders = Field(
        ..., alias="Encoder model",
        description="Select a voice encoder model file."
    )
    synthesizer: synthesizers = Field(
        ..., alias="Synthesizer model",
        description="Select a voice synthesizer model file."
    )
    vocoder: vocoders = Field(
        ..., alias="Vocoder model",
        description="Select a vocoder model file (currently only the HifiGan type is supported)."
    )

class AudioEntity(BaseModel):
    content: bytes
    mel: Any

class Output(BaseModel):
    __root__: Tuple[AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        src, result = self.__root__
        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        fig, ax = plt.subplots()
        ax.imshow(src.mel, aspect="equal", interpolation="none")
        ax.set_title("Mel Spectrogram (Source Audio)")
        streamlit_app.pyplot(fig)
        fig, ax = plt.subplots()
        ax.imshow(result.mel, aspect="equal", interpolation="none")
        ax.set_title("Mel Spectrogram (Result Audio)")
        streamlit_app.pyplot(fig)

def synthesize(input: Input) -> Output:
    """synthesize"""
    # Load the selected models
    encoder.load_model(Path(input.encoder.value))
    current_synt = Synthesizer(Path(input.synthesizer.value))
    gan_vocoder.load_model(Path(input.vocoder.value))
    # Load the reference audio: recorded, uploaded, or a local sample file
    if input.record_audio_file is not None:
        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
            f.write(input.record_audio_file.as_bytes())
            f.seek(0)
        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    elif input.upload_audio_file is not None:
        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file.as_bytes())
            f.seek(0)
        wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    else:
        wav, sample_rate = librosa.load(input.local_audio_file.value)
        write(TEMP_SOURCE_AUDIO, sample_rate, wav)  # Make sure we get the correct wav
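    # Whichever branch ran, TEMP_SOURCE_AUDIO now holds a copy of the
    # reference audio that is returned to the UI alongside the result.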
    source_spec = Synthesizer.make_spectrogram(wav)

    # Preprocess the reference audio and derive the speaker embedding
    encoder_wav = encoder.preprocess_wav(wav, sample_rate)
    embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
    # Split the input text on newlines and Chinese/Western punctuation
    texts = filter(None, input.message.split("\n"))
    punctuation = '๏ผ๏ผŒใ€‚ใ€,'
    processed_texts = []
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
    texts = processed_texts
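    # The loop above turns e.g. "ไฝ ๅฅฝ๏ผŒไธ–็•Œ๏ผๆฌข่ฟŽไฝฟ็”จ" into ["ไฝ ๅฅฝ", "ไธ–็•Œ", "ๆฌข่ฟŽไฝฟ็”จ"]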
    # Synthesize one mel spectrogram per sentence, then vocode the concatenation
    # (the vocoder returns its own sample rate along with the waveform)
    embeds = [embed] * len(texts)
    specs = current_synt.synthesize_spectrograms(texts, embeds)
    spec = np.concatenate(specs, axis=1)
    wav, sample_rate = gan_vocoder.infer_waveform(spec)

    # Write the result and return both source and result audio
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # Make sure we get the correct wav
    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()
    return Output(__root__=(AudioEntity(content=source_file, mel=source_spec), AudioEntity(content=result_file, mel=spec)))
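
# A minimal local smoke test (a sketch, assuming the checkpoint folders and at
# least one sample wav are already in place). Normally this module is served
# through the mkgui web front end rather than run directly; taking the first
# member of each Enum is an arbitrary choice.
if __name__ == "__main__":
    demo_input = Input.parse_obj({
        "Text content": "ๆฌข่ฟŽไฝฟ็”จๅทฅๅ…ท็ฎฑ, ็Žฐๅทฒๆ”ฏๆŒไธญๆ–‡่พ“ๅ…ฅ๏ผ",
        "Select audio (local wav)": list(audio_input_selection)[0],
        "Encoder model": list(encoders)[0],
        "Synthesizer model": list(synthesizers)[0],
        "Vocoder model": list(vocoders)[0],
    })
    demo_output = synthesize(demo_input)
    print(f"Wrote {TEMP_RESULT_AUDIO} ({len(demo_output.__root__[1].content)} bytes)")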