Hecheng0625 committed on
Commit
3629250
1 Parent(s): cb83a6c

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +82 -0
  2. ns3_facodec_decoder.bin +3 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import librosa
3
+ import soundfile as sf
4
+ import gradio as gr
5
+ import torchaudio
6
+ import os
7
+
8
+ from Amphion.models.ns3_codec import FACodecEncoder, FACodecDecoder
9
+
10
# Instantiate the FACodec encoder/decoder. The hyper-parameters must match the
# checkpoints loaded below, otherwise load_state_dict() rejects them.
# NOTE(review): values presumably mirror the released Amphion config -- confirm.
fa_encoder = FACodecEncoder(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)

fa_decoder = FACodecDecoder(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,
    vq_num_q_p=1,
    vq_num_q_r=3,
    vq_dim=256,
    codebook_dim=8,
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)

# map_location="cpu" makes deserialization independent of the device the
# checkpoint was saved from; without it, a checkpoint holding CUDA tensors
# cannot be loaded on a CPU-only host. The modules are moved to the target
# device right after.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
fa_encoder.load_state_dict(
    torch.load("ns3_facodec_encoder.bin", map_location="cpu")
)
fa_decoder.load_state_dict(
    torch.load("ns3_facodec_decoder.bin", map_location="cpu")
)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fa_encoder = fa_encoder.to(device)
fa_decoder = fa_decoder.to(device)

# Inference only: switch both modules to evaluation mode.
fa_encoder.eval()
fa_decoder.eval()
42
+
43
+
44
def codec_inference(speech_path):
    """Run a speech file through FACodec and return the reconstruction's path.

    The audio is loaded at 16 kHz, encoded, quantized by the decoder's VQ
    stage and decoded back to a waveform, which is written to a WAV file.

    Args:
        speech_path: Path of the input audio file (the Gradio input component
            is configured with ``type="filepath"``). May be ``None`` when the
            user submits without providing any audio.

    Returns:
        Path of a newly created 16 kHz WAV file inside ``temp/``.

    Raises:
        gr.Error: If no audio was provided.
    """
    # Local import keeps this block self-contained; the module's import
    # header does not bring in tempfile.
    import tempfile

    if not speech_path:
        # Surface a readable message in the UI instead of an opaque
        # failure from inside librosa.
        raise gr.Error("Please upload or record a speech file first.")

    sample_rate = 16000

    with torch.no_grad():
        wav, _ = librosa.load(speech_path, sr=sample_rate)
        # Two leading singleton axes are added in front of the sample axis.
        wav = torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)

        enc_out = fa_encoder(wav)
        # Only the post-VQ embedding and the speaker embedding are needed
        # for reconstruction; the other three outputs are discarded.
        vq_post_emb, _, _, _, spk_embs = fa_decoder(
            enc_out, eval_vq=False, vq=True
        )
        recon_wav = fa_decoder.inference(vq_post_emb, spk_embs)

    # Write each result to its own file: a single shared "temp/result.wav"
    # lets overlapping requests overwrite one another's output.
    # NOTE: files accumulate in temp/; clean the directory periodically.
    os.makedirs("temp", exist_ok=True)
    fd, result_path = tempfile.mkstemp(suffix=".wav", dir="temp")
    os.close(fd)
    sf.write(result_path, recon_wav[0, 0].cpu().numpy(), sample_rate)

    return result_path
62
+
63
+
64
# One audio input, delivered to the callback as a path on disk; the user can
# either upload a file or record from the microphone.
demo_inputs = [
    gr.Audio(
        label="Upload the speech file",
        type="filepath",
        sources=["upload", "microphone"],
    ),
]

# Unlabelled audio player for the reconstructed speech.
demo_outputs = gr.Audio(label="")

# Wire the codec round-trip into a simple one-input / one-output interface.
demo = gr.Interface(
    title="NaturalSpeech3 FACodec",
    fn=codec_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
)

# Start the web UI only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()
ns3_facodec_decoder.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32bfe7d5145052b55bcc36790d12b4ff826c5e60ff197c45f37cd6c87a44a179
3
+ size 397810979