Hecheng0625 committed on
Commit
24e0fd9
1 Parent(s): 1adbad7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -1
app.py CHANGED
@@ -11,6 +11,8 @@ from Amphion.models.ns3_codec import (
11
  FACodecEncoder,
12
  FACodecDecoder,
13
  FACodecRedecoder,
 
 
14
  )
15
 
16
  fa_encoder = FACodecEncoder(
@@ -39,25 +41,60 @@ fa_decoder = FACodecDecoder(
39
 
40
  fa_redecoder = FACodecRedecoder()
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # encoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_encoder.bin")
43
  # decoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_decoder.bin")
44
  # redecoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_redecoder.bin")
 
 
45
 
46
  encoder_ckpt = "ns3_facodec_encoder.bin"
47
  decoder_ckpt = "ns3_facodec_decoder.bin"
48
  redecoder_ckpt = "ns3_facodec_redecoder.bin"
 
 
49
 
50
  fa_encoder.load_state_dict(torch.load(encoder_ckpt))
51
  fa_decoder.load_state_dict(torch.load(decoder_ckpt))
52
  fa_redecoder.load_state_dict(torch.load(redecoder_ckpt))
 
 
53
 
54
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
55
  fa_encoder = fa_encoder.to(device)
56
  fa_decoder = fa_decoder.to(device)
57
  fa_redecoder = fa_redecoder.to(device)
 
 
58
  fa_encoder.eval()
59
  fa_decoder.eval()
60
  fa_redecoder.eval()
 
 
61
 
62
  @spaces.GPU
63
  def codec_inference(speech_path):
@@ -117,6 +154,45 @@ def codec_voice_conversion(speech_path_a, speech_path_b):
117
 
118
  return recon_a_result_path, recon_b_result_path, vc_result_path
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  demo_inputs = [
122
  gr.Audio(
@@ -182,7 +258,7 @@ with gr.Blocks() as demo:
182
  )
183
 
184
  gr.Interface(
185
- fn=codec_voice_conversion,
186
  inputs=vc_demo_inputs,
187
  outputs=vc_demo_outputs,
188
  title="FACodec Voice Conversion",
 
11
  FACodecEncoder,
12
  FACodecDecoder,
13
  FACodecRedecoder,
14
+ FACodecEncoderV2,
15
+ FACodecDecoderV2,
16
  )
17
 
18
  fa_encoder = FACodecEncoder(
 
41
 
42
  fa_redecoder = FACodecRedecoder()
43
 
44
# V2 FACodec pair used by the voice-conversion demo: the encoder produces a
# 256-dim frame representation (overall downsample factor 2*4*5*5 = 200
# samples per frame at 16 kHz), the decoder quantizes it into prosody /
# content / residual codebooks and reconstructs the waveform.
fa_encoder_v2 = FACodecEncoderV2(
    ngf=32,
    up_ratios=[2, 4, 5, 5],
    out_channels=256,
)

fa_decoder_v2 = FACodecDecoderV2(
    in_channels=256,
    upsample_initial_channel=1024,
    ngf=32,
    # Mirror of the encoder's down-sampling ratios, applied in reverse.
    up_ratios=[5, 5, 4, 2],
    vq_num_q_c=2,   # content quantizers
    vq_num_q_p=1,   # prosody quantizer
    vq_num_q_r=3,   # residual quantizers
    vq_dim=256,
    codebook_dim=8,
    # NOTE(review): these look like log2 codebook sizes (2**10 entries each)
    # — confirm against the FACodecDecoderV2 implementation.
    codebook_size_prosody=10,
    codebook_size_content=10,
    codebook_size_residual=10,
    use_gr_x_timbre=True,
    use_gr_residual_f0=True,
    use_gr_residual_phone=True,
)
69
# Checkpoints can alternatively be fetched from the Hugging Face hub:
# encoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_encoder.bin")
# decoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_decoder.bin")
# redecoder_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_redecoder.bin")
# encoder_v2_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_encoder_v2.bin")
# decoder_v2_ckpt = hf_hub_download(repo_id="amphion/naturalspeech3_facodec", filename="ns3_facodec_decoder_v2.bin")

# Local checkpoint paths (expected alongside app.py in the Space).
encoder_ckpt = "ns3_facodec_encoder.bin"
decoder_ckpt = "ns3_facodec_decoder.bin"
redecoder_ckpt = "ns3_facodec_redecoder.bin"
encoder_v2_ckpt = "ns3_facodec_encoder_v2.bin"
decoder_v2_ckpt = "ns3_facodec_decoder_v2.bin"

# Load onto CPU first so GPU-saved checkpoints also load on CPU-only hosts;
# the modules are moved to the target device right below anyway.
fa_encoder.load_state_dict(torch.load(encoder_ckpt, map_location="cpu"))
fa_decoder.load_state_dict(torch.load(decoder_ckpt, map_location="cpu"))
fa_redecoder.load_state_dict(torch.load(redecoder_ckpt, map_location="cpu"))
fa_encoder_v2.load_state_dict(torch.load(encoder_v2_ckpt, map_location="cpu"))
fa_decoder_v2.load_state_dict(torch.load(decoder_v2_ckpt, map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fa_encoder = fa_encoder.to(device)
fa_decoder = fa_decoder.to(device)
fa_redecoder = fa_redecoder.to(device)
fa_encoder_v2 = fa_encoder_v2.to(device)
fa_decoder_v2 = fa_decoder_v2.to(device)

# Inference only: disable dropout / switch batch-norm to eval statistics.
fa_encoder.eval()
fa_decoder.eval()
fa_redecoder.eval()
fa_encoder_v2.eval()
fa_decoder_v2.eval()
98
 
99
  @spaces.GPU
100
  def codec_inference(speech_path):
 
154
 
155
  return recon_a_result_path, recon_b_result_path, vc_result_path
156
 
157
@spaces.GPU
def codec_voice_conversion_v2(speech_path_a, speech_path_b):
    """Reconstruct both inputs and convert utterance A to speaker B's voice.

    Args:
        speech_path_a: path to the source utterance (content/prosody donor).
        speech_path_b: path to the reference utterance (timbre donor).

    Returns:
        Tuple of three wav-file paths (16 kHz): reconstruction of A,
        reconstruction of B, and the A-to-B voice-conversion result.
    """

    def _load_padded(path):
        # Load mono 16 kHz audio and zero-pad so the length is a multiple of
        # 200 samples (the codec's frame hop). The outer `% 200` makes this a
        # no-op for already-aligned audio instead of appending 200 extra
        # silent samples. Returns a (1, 1, T) tensor on `device`.
        wav, _ = librosa.load(path, sr=16000)
        pad = (200 - len(wav) % 200) % 200
        wav = np.pad(wav, (0, pad))
        return torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)

    with torch.no_grad():
        wav_a = _load_padded(speech_path_a)
        wav_b = _load_padded(speech_path_b)

        enc_out_a = fa_encoder_v2(wav_a)
        prosody_a = fa_encoder_v2.get_prosody_feature(wav_a)
        enc_out_b = fa_encoder_v2(wav_b)
        prosody_b = fa_encoder_v2.get_prosody_feature(wav_b)

        # Quantize: keep the post-VQ embedding, the code ids and the speaker
        # embedding; the intermediate outputs are not needed here.
        vq_post_emb_a, vq_id_a, _, _, spk_embs_a = fa_decoder_v2(
            enc_out_a, prosody_a, eval_vq=False, vq=True
        )
        vq_post_emb_b, _, _, _, spk_embs_b = fa_decoder_v2(
            enc_out_b, prosody_b, eval_vq=False, vq=True
        )

        # Plain reconstructions of both inputs with their own speaker embedding.
        recon_wav_a = fa_decoder_v2.inference(vq_post_emb_a, spk_embs_a)
        recon_wav_b = fa_decoder_v2.inference(vq_post_emb_b, spk_embs_b)

        # Voice conversion: re-embed A's codes without the residual codebooks,
        # then decode with B's speaker embedding.
        vq_post_emb_a_to_b = fa_decoder_v2.vq2emb(vq_id_a, use_residual=False)
        recon_wav_a_to_b = fa_decoder_v2.inference(vq_post_emb_a_to_b, spk_embs_b)

        os.makedirs("temp", exist_ok=True)
        recon_a_result_path = "temp/result_a.wav"
        recon_b_result_path = "temp/result_b.wav"
        vc_result_path = "temp/result_vc.wav"
        sf.write(vc_result_path, recon_wav_a_to_b[0, 0].cpu().numpy(), 16000)
        sf.write(recon_a_result_path, recon_wav_a[0, 0].cpu().numpy(), 16000)
        sf.write(recon_b_result_path, recon_wav_b[0, 0].cpu().numpy(), 16000)

    return recon_a_result_path, recon_b_result_path, vc_result_path
196
 
197
  demo_inputs = [
198
  gr.Audio(
 
258
  )
259
 
260
  gr.Interface(
261
+ fn=codec_voice_conversion_v2,
262
  inputs=vc_demo_inputs,
263
  outputs=vc_demo_outputs,
264
  title="FACodec Voice Conversion",