"""Gradio demo for the NaturalSpeech3 FACodec: encode an uploaded speech clip
to discrete codes and decode it back, returning the reconstructed waveform."""

import os
import uuid

import torch
import librosa
import soundfile as sf
import gradio as gr
import torchaudio

from Amphion.models.ns3_codec import FACodecEncoder, FACodecDecoder

# Pick the device before loading checkpoints so the state dicts can be mapped
# directly onto it; without map_location, a checkpoint saved on CUDA fails to
# load on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)
fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

fa_encoder.load_state_dict(
    torch.load("ns3_facodec_encoder.bin", map_location=device)
)
fa_decoder.load_state_dict(
    torch.load("ns3_facodec_decoder.bin", map_location=device)
)

fa_encoder = fa_encoder.to(device)
fa_decoder = fa_decoder.to(device)
fa_encoder.eval()
fa_decoder.eval()


def codec_inference(speech_path):
    """Round-trip *speech_path* through the FACodec encoder/decoder.

    Args:
        speech_path: Path to the uploaded audio file; resampled to 16 kHz mono.

    Returns:
        Path to a 16 kHz WAV file containing the reconstructed speech.
    """
    with torch.no_grad():
        # librosa returns a mono float32 array at the requested 16 kHz rate.
        wav, sr = librosa.load(speech_path, sr=16000)
        # Shape the signal as (batch=1, channels=1, samples) for the encoder.
        wav = torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)
        enc_out = fa_encoder(wav)
        vq_post_emb, vq_id, _, quantized, spk_embs = fa_decoder(
            enc_out, eval_vq=False, vq=True
        )
        recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)
    os.makedirs("temp", exist_ok=True)
    # Unique filename per request: a single fixed path would let concurrent
    # Gradio users overwrite each other's results.
    result_path = os.path.join("temp", f"result_{uuid.uuid4().hex}.wav")
    sf.write(result_path, recon_wav[0, 0].cpu().numpy(), 16000)
    return result_path


demo_inputs = [
    gr.Audio(
        sources=["upload", "microphone"],
        label="Upload the speech file",
        type="filepath",
    ),
]
demo_outputs = gr.Audio(label="")

demo = gr.Interface(
    fn=codec_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
    title="NaturalSpeech3 FACodec",
)

if __name__ == "__main__":
    demo.launch()