import os
from enum import Enum
from pathlib import Path
from typing import Any, Tuple

import librosa
import matplotlib.pyplot as plt
import torch
from pydantic import BaseModel, Field
from scipy.io.wavfile import write

import models.ppg2mel as Convertor
import models.ppg_extractor as Extractor
from control.mkgui.base.components.types import FileContent
from models.encoder import inference as speacker_encoder
from models.synthesizer.inference import Synthesizer
from models.vocoder.hifigan import inference as gan_vocoder
# Constants
AUDIO_SAMPLES_DIR = f'data{os.sep}samples{os.sep}'
EXT_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg_extractor'
CONV_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}ppg2mel'
VOC_MODELS_DIRT = f'data{os.sep}ckpt{os.sep}vocoder'
TEMP_SOURCE_AUDIO = f'wavs{os.sep}temp_source.wav'
TEMP_TARGET_AUDIO = f'wavs{os.sep}temp_target.wav'
TEMP_RESULT_AUDIO = f'wavs{os.sep}temp_result.wav'
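# Note: the temp wav paths above live in a relative `wavs` directory. This module
# assumes that directory already exists, since scipy.io.wavfile.write does not
# create missing parent directories.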
# Load local sample audio as options TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR):
    audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
else:
    raise Exception(f"Sample folder {AUDIO_SAMPLES_DIR} doesn't exist.")
# Pre-load models
if os.path.isdir(EXT_MODELS_DIRT):
    extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
    print("Loaded extractor models: " + str(len(extractors)))
else:
    raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")

if os.path.isdir(CONV_MODELS_DIRT):
    convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
    print("Loaded convertor models: " + str(len(convertors)))
else:
    raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
    vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
    print("Loaded vocoder models: " + str(len(vocoders)))
else:
    raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")
class Input(BaseModel):
    local_audio_file: audio_input_selection = Field(
        ..., alias="Input audio (local wav)",
        description="Select a local audio file."
    )
    upload_audio_file: FileContent = Field(default=None, alias="Or upload audio",
        description="Drag and drop, or click to upload.", mime_type="audio/wav")
    local_audio_file_target: audio_input_selection = Field(
        ..., alias="Target audio (local wav)",
        description="Select a local audio file."
    )
    upload_audio_file_target: FileContent = Field(default=None, alias="Or upload target audio",
        description="Drag and drop, or click to upload.", mime_type="audio/wav")
    extractor: extractors = Field(
        ..., alias="Encoder model",
        description="Select a speech encoder model file."
    )
    convertor: convertors = Field(
        ..., alias="Conversion model",
        description="Select a voice conversion model file."
    )
    vocoder: vocoders = Field(
        ..., alias="Vocoder model",
        description="Select a vocoder model file (currently only the HifiGan type is supported)."
    )
class AudioEntity(BaseModel):
    content: bytes
    mel: Any
class Output(BaseModel):
    __root__: Tuple[AudioEntity, AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.
        If this method is implemented, it will be used instead of the default Output UI renderer.
        """
        src, target, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        fig, ax = plt.subplots()
        ax.imshow(src.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram (Source Audio)")
        streamlit_app.pyplot(fig)

        fig, ax = plt.subplots()
        ax.imshow(target.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram (Target Audio)")
        streamlit_app.pyplot(fig)

        fig, ax = plt.subplots()
        ax.imshow(result.mel, aspect="equal", interpolation="none")
        ax.set_title("mel spectrogram (Result Audio)")
        streamlit_app.pyplot(fig)
def convert(input: Input) -> Output:
    """convert"""
    # Load models
    extractor = Extractor.load_model(Path(input.extractor.value))
    convertor = Convertor.load_model(Path(input.convertor.value))
    # current_synt = Synthesizer(Path(input.synthesizer.value))
    gan_vocoder.load_model(Path(input.vocoder.value))
    # Load source and reference audio, preferring an uploaded file over the local selection
    if input.upload_audio_file is not None:
        with open(TEMP_SOURCE_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file.as_bytes())
            f.seek(0)
        src_wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
    else:
        src_wav, sample_rate = librosa.load(input.local_audio_file.value)
        write(TEMP_SOURCE_AUDIO, sample_rate, src_wav)  # Make sure we get the correct wav

    if input.upload_audio_file_target is not None:
        with open(TEMP_TARGET_AUDIO, "w+b") as f:
            f.write(input.upload_audio_file_target.as_bytes())
            f.seek(0)
        ref_wav, _ = librosa.load(TEMP_TARGET_AUDIO)
    else:
        ref_wav, _ = librosa.load(input.local_audio_file_target.value)
        write(TEMP_TARGET_AUDIO, sample_rate, ref_wav)  # Make sure we get the correct wav
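    # What follows is the core voice-conversion pipeline: phonetic posteriorgrams (PPG)
    # are extracted from the source audio, while the reference audio contributes pitch
    # statistics (log-F0 mean/std) and a speaker embedding; the ppg2mel convertor then
    # predicts a mel spectrogram, which the HiFi-GAN vocoder turns back into a waveform.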
    ppg = extractor.extract_from_wav(src_wav)
    # Import necessary dependencies of voice conversion
    from utils.f0_utils import (compute_f0, compute_mean_std, f02lf0,
                                get_converted_lf0uv)
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
    speacker_encoder.load_model(Path(f"data{os.sep}ckpt{os.sep}encoder{os.sep}pretrained_bak_5805000.pt"))
    embed = speacker_encoder.embed_utterance(ref_wav)
    lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
    min_len = min(ppg.shape[1], len(lf0_uv))
    ppg = ppg[:, :min_len]
    lf0_uv = lf0_uv[:min_len]
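    # The PPG frames and the converted lf0/UV track are computed independently and may
    # differ slightly in length, so both are truncated to their common length above
    # before being fed to the convertor.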
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _, mel_pred, att_ws = convertor.inference(
        ppg,
        logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
        spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
    )
    mel_pred = mel_pred.transpose(0, 1)
    breaks = [mel_pred.shape[1]]
    mel_pred = mel_pred.detach().cpu().numpy()
    # Vocode the predicted mel spectrogram back into a waveform
    wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)

    # Write the result and collect the outputs
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # Make sure we get the correct wav
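    # The three intermediate wavs are read back as raw bytes so the Output UI can play
    # them directly via streamlit_app.audio(); the mel spectrograms shown alongside are
    # recomputed from the in-memory waveforms.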
    with open(TEMP_SOURCE_AUDIO, "rb") as f:
        source_file = f.read()
    with open(TEMP_TARGET_AUDIO, "rb") as f:
        target_file = f.read()
    with open(TEMP_RESULT_AUDIO, "rb") as f:
        result_file = f.read()

    return Output(__root__=(
        AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)),
        AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)),
        AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav)),
    ))