Hecheng0625 committed on
Commit
7634b6c
1 Parent(s): 223914e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -9
app.py CHANGED
@@ -4,8 +4,13 @@ import soundfile as sf
4
  import gradio as gr
5
  import torchaudio
6
  import os
 
7
 
8
- from Amphion.models.ns3_codec import FACodecEncoder, FACodecDecoder
 
 
 
 
9
 
10
  fa_encoder = FACodecEncoder(
11
  ngf=32,
@@ -31,15 +36,27 @@ fa_decoder = FACodecDecoder(
31
  use_gr_residual_phone=True,
32
  )
33
 
34
- fa_encoder.load_state_dict(torch.load("ns3_facodec_encoder.bin"))
35
- fa_decoder.load_state_dict(torch.load("ns3_facodec_decoder.bin"))
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
38
  fa_encoder = fa_encoder.to(device)
39
  fa_decoder = fa_decoder.to(device)
 
40
  fa_encoder.eval()
41
  fa_decoder.eval()
42
-
43
 
44
  def codec_inference(speech_path):
45
 
@@ -61,23 +78,69 @@ def codec_inference(speech_path):
61
  return result_path
62
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  demo_inputs = [
65
  gr.Audio(
66
  sources=["upload", "microphone"],
67
- label="Upload the speech file",
 
 
 
 
 
68
  type="filepath",
69
  ),
70
  ]
71
 
72
- demo_outputs = gr.Audio(label="")
 
 
 
 
73
 
74
  demo = gr.Interface(
75
- fn=codec_inference,
76
  inputs=demo_inputs,
77
  outputs=demo_outputs,
78
  title="NaturalSpeech3 FACodec",
79
- description=
80
- """
81
  ## FACodec: Speech Codec with Attribute Factorization used for NaturalSpeech 3
82
 
83
  [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2403.03100.pdf)
@@ -96,3 +159,4 @@ demo = gr.Interface(
96
 
97
  if __name__ == "__main__":
98
  demo.launch()
 
 
4
  import gradio as gr
5
  import torchaudio
6
  import os
7
+ from huggingface_hub import hf_hub_download
8
 
9
+ from Amphion.models.ns3_codec import (
10
+ FACodecEncoder,
11
+ FACodecDecoder,
12
+ FACodecRedecoder,
13
+ )
14
 
15
  fa_encoder = FACodecEncoder(
16
  ngf=32,
 
36
  use_gr_residual_phone=True,
37
  )
38
 
39
fa_redecoder = FACodecRedecoder()

# Hub repository that hosts the released FACodec checkpoints.
_CKPT_REPO_ID = "amphion/naturalspeech3_facodec"


def _resolve_ckpt(filename):
    """Return a usable path for the checkpoint named *filename*.

    A copy in the working directory is used when present; otherwise the file
    is fetched from the Hugging Face Hub, so a missing local file no longer
    aborts start-up.
    """
    if os.path.isfile(filename):
        return filename
    return hf_hub_download(repo_id=_CKPT_REPO_ID, filename=filename)


encoder_ckpt = _resolve_ckpt("ns3_facodec_encoder.bin")
decoder_ckpt = _resolve_ckpt("ns3_facodec_decoder.bin")
redecoder_ckpt = _resolve_ckpt("ns3_facodec_redecoder.bin")

# NOTE(review): torch.load unpickles its input; only load trusted checkpoints.
# map_location="cpu" keeps loading working on hosts without CUDA in case the
# checkpoints were saved from GPU tensors; the models are moved to `device`
# right below.
fa_encoder.load_state_dict(torch.load(encoder_ckpt, map_location="cpu"))
fa_decoder.load_state_dict(torch.load(decoder_ckpt, map_location="cpu"))
fa_redecoder.load_state_dict(torch.load(redecoder_ckpt, map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fa_encoder = fa_encoder.to(device)
fa_decoder = fa_decoder.to(device)
fa_redecoder = fa_redecoder.to(device)

# Inference only: switch all three modules to evaluation mode.
fa_encoder.eval()
fa_decoder.eval()
fa_redecoder.eval()
60
 
61
  def codec_inference(speech_path):
62
 
 
78
  return result_path
79
 
80
 
81
def _load_speech_tensor(path, sample_rate):
    """Load *path* resampled to *sample_rate* and add two leading unit dims."""
    wav, _ = librosa.load(path, sr=sample_rate)
    return torch.tensor(wav).to(device).unsqueeze(0).unsqueeze(0)


def codec_voice_conversion(speech_path_a, speech_path_b):
    """Reconstruct two utterances and convert the first into the second's voice.

    Both files are encoded with ``fa_encoder`` and decoded with ``fa_decoder``
    to produce plain reconstructions. The conversion feeds the vq ids of A
    together with the speaker embedding of B through ``fa_redecoder``.

    Args:
        speech_path_a: Path to the source speech file (supplies the vq ids).
        speech_path_b: Path to the reference speech file (supplies the
            speaker embedding).

    Returns:
        A tuple ``(recon_a_path, recon_b_path, vc_path)`` of WAV files written
        at 16 kHz: reconstruction of A, reconstruction of B, and A converted
        with B's speaker embedding.

    Raises:
        gr.Error: If either input path is missing (nothing uploaded/recorded).
    """
    import tempfile

    if not speech_path_a or not speech_path_b:
        raise gr.Error(
            "Please provide both a source and a reference speech file."
        )

    sample_rate = 16000

    with torch.no_grad():
        wav_a = _load_speech_tensor(speech_path_a, sample_rate)
        wav_b = _load_speech_tensor(speech_path_b, sample_rate)

        enc_out_a = fa_encoder(wav_a)
        enc_out_b = fa_encoder(wav_b)

        vq_post_emb_a, vq_id_a, _, _, spk_embs_a = fa_decoder(
            enc_out_a, eval_vq=False, vq=True
        )
        vq_post_emb_b, _, _, _, spk_embs_b = fa_decoder(
            enc_out_b, eval_vq=False, vq=True
        )

        recon_wav_a = fa_decoder.inference(vq_post_emb_a, spk_embs_a)
        recon_wav_b = fa_decoder.inference(vq_post_emb_b, spk_embs_b)

        vq_post_emb_a_to_b = fa_redecoder.vq2emb(
            vq_id_a, spk_embs_b, use_residual=False
        )
        recon_wav_a_to_b = fa_redecoder.inference(vq_post_emb_a_to_b, spk_embs_b)

        # Each call writes into its own directory so that concurrent requests
        # cannot overwrite one another's result files.
        os.makedirs("temp", exist_ok=True)
        out_dir = tempfile.mkdtemp(prefix="vc_", dir="temp")
        recon_a_result_path = os.path.join(out_dir, "result_a.wav")
        recon_b_result_path = os.path.join(out_dir, "result_b.wav")
        vc_result_path = os.path.join(out_dir, "result_vc.wav")

        outputs = (
            (vc_result_path, recon_wav_a_to_b),
            (recon_a_result_path, recon_wav_a),
            (recon_b_result_path, recon_wav_b),
        )
        for out_path, wav in outputs:
            sf.write(out_path, wav[0, 0].cpu().numpy(), sample_rate)

        return recon_a_result_path, recon_b_result_path, vc_result_path
117
+
118
+
119
  demo_inputs = [
120
  gr.Audio(
121
  sources=["upload", "microphone"],
122
+ label="Upload the source speech file",
123
+ type="filepath",
124
+ ),
125
+ gr.Audio(
126
+ sources=["upload", "microphone"],
127
+ label="Upload the reference speech file",
128
  type="filepath",
129
  ),
130
  ]
131
 
132
+ demo_outputs = [
133
+ gr.Audio(label="Source speech reconstructed"),
134
+ gr.Audio(label="Reference speech reconstructed"),
135
+ gr.Audio(label="Voice conversion result"),
136
+ ]
137
 
138
  demo = gr.Interface(
139
+ fn=codec_voice_conversion,
140
  inputs=demo_inputs,
141
  outputs=demo_outputs,
142
  title="NaturalSpeech3 FACodec",
143
+ description="""
 
144
  ## FACodec: Speech Codec with Attribute Factorization used for NaturalSpeech 3
145
 
146
  [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/pdf/2403.03100.pdf)
 
159
 
160
  if __name__ == "__main__":
161
  demo.launch()
162
+