# Spaces: Running on Zero
import torch | |
import librosa | |
import soundfile as sf | |
import gradio as gr | |
import torchaudio | |
import os | |
from Amphion.models.ns3_codec import FACodecEncoder, FACodecDecoder | |
# Build the FACodec encoder/decoder pair. load_state_dict() below is strict by
# default, so these hyperparameters have to agree with the checkpoint files.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)
fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

# Deserialize onto the CPU first so the checkpoints also load on hosts without
# CUDA; the modules are moved to the selected device right after.
# NOTE(review): torch.load unpickles the files - only load trusted checkpoints.
fa_encoder.load_state_dict(
    torch.load("ns3_facodec_encoder.bin", map_location="cpu")
)
fa_decoder.load_state_dict(
    torch.load("ns3_facodec_decoder.bin", map_location="cpu")
)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fa_encoder = fa_encoder.to(device)
fa_decoder = fa_decoder.to(device)

# Inference only: switch both modules to eval mode.
fa_encoder.eval()
fa_decoder.eval()
def codec_inference(speech_path):
    """Run a speech file through the FACodec encoder/decoder and save the result.

    Args:
        speech_path: Path of the input audio file. It is loaded with
            ``librosa.load`` at a 16 kHz sampling rate.

    Returns:
        Path of a newly created WAV file inside the ``temp`` directory that
        holds the reconstructed waveform, written at 16 kHz.

    Raises:
        gr.Error: If ``speech_path`` is empty or ``None`` (no audio provided).
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    sample_rate = 16000

    # Gradio hands over None when the user submits without any audio; fail
    # with a readable message instead of an opaque loader error.
    if not speech_path:
        raise gr.Error("Please upload or record a speech file first.")

    with torch.no_grad():
        wav, _ = librosa.load(speech_path, sr=sample_rate)
        # Add two leading singleton dimensions before feeding the encoder.
        wav = torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)

        enc_out = fa_encoder(wav)
        # Only the post-quantization embedding and the speaker embedding are
        # needed for reconstruction; the other outputs are discarded.
        vq_post_emb, _, _, _, spk_embs = fa_decoder(
            enc_out, eval_vq=False, vq=True
        )
        recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)

    os.makedirs("temp", exist_ok=True)
    # A unique file per call: a single shared "temp/result.wav" would be
    # overwritten when several requests are processed at the same time.
    # NOTE(review): files are not removed afterwards - add cleanup if needed.
    with tempfile.NamedTemporaryFile(
        suffix=".wav", dir="temp", delete=False
    ) as tmp_file:
        result_path = tmp_file.name
    sf.write(result_path, recon_wav[0, 0].cpu().numpy(), sample_rate)

    return result_path
# Input widget: a speech clip, either uploaded or recorded, handed to the
# callback as a file path.
_speech_input = gr.Audio(
    type="filepath",
    label="Upload the speech file",
    sources=["upload", "microphone"],
)
demo_inputs = [_speech_input]

# Output widget: audio player for the file path returned by the callback.
demo_outputs = gr.Audio(label="")

# Wire the widgets to codec_inference.
demo = gr.Interface(
    title="NaturalSpeech3 FACodec",
    fn=codec_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()