ramkamal2000 committed
Commit 8313242
1 Parent(s): 02f4ff2

'normalization'

Files changed (1):
  1. app.py +15 -12
app.py CHANGED
@@ -107,7 +107,7 @@ print("Loading WavLM for content...")
 cmodel = utils.get_cmodel(device).to(device)
 # cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
 
-def voice_conversion_yourtts(da, ta):
+def voice_conversion_yourtts(da, ta, normalize=False):
 
     # write(target_audio, ta[0], ta[1])
     # write(driving_audio, da[0], da[1])
@@ -118,20 +118,20 @@ def voice_conversion_yourtts(da, ta):
 
     files = [da, ta]
 
-    for file in files:
-        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
+    subprocess.run(["ffmpeg-normalize", da, "-nt", "rms", "-t=-27", "-o", "source_yourtts.wav", "-ar", "16000", "-f"])
+    subprocess.run(["ffmpeg-normalize", ta, "-nt", "rms", "-t=-27", "-o", "target_yourtts.wav", "-ar", "16000", "-f"])
 
     # ta_ = read(target_audio)
 
-    target_emb = SE_speaker_manager.compute_d_vector_from_clip([ta])
+    target_emb = SE_speaker_manager.compute_d_vector_from_clip(["target_yourtts.wav"])
     target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
 
-    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([da])
+    driving_emb = SE_speaker_manager.compute_d_vector_from_clip(["source_yourtts.wav"])
     driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
 
     # Convert the voice
 
-    driving_spec = compute_spec(da)
+    driving_spec = compute_spec("source_yourtts.wav")
     y_lengths = torch.tensor([driving_spec.size(-1)])
     if USE_CUDA:
         ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
@@ -145,13 +145,16 @@ def voice_conversion_yourtts(da, ta):
 
     return (ap.sample_rate, ref_wav_voc)
 
-def voice_conversion_freevc(src, tgt):
+
+def voice_conversion_freevc(src, tgt, normalize=False):
     with torch.no_grad():
-        wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
+        subprocess.run(["ffmpeg-normalize", tgt, "-nt", "rms", "-t=-27", "-o", "target_fvc.wav", "-ar", "16000", "-f"])
+        wav_tgt, _ = librosa.load("target_fvc.wav", sr=hps.data.sampling_rate)
         wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
         g_tgt = smodel.embed_utterance(wav_tgt)
         g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)
-        wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
+        subprocess.run(["ffmpeg-normalize", src, "-nt", "rms", "-t=-27", "-o", "source_fvc.wav", "-ar", "16000", "-f"])
+        wav_src, _ = librosa.load("source_fvc.wav", sr=hps.data.sampling_rate)
         wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
         # c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
         c = utils.get_content(cmodel, wav_src)
@@ -178,9 +181,9 @@ outputs_2 = gr.outputs.Audio(label="Target Speaker - Output Audio", type='filep
 def voice_conversion(mod, sa, ta):
 
     if mod=='FreeVC':
-        return voice_conversion_yourtts(sa, ta)
-    else:
         return voice_conversion_freevc(sa, ta)
+    else:
+        return voice_conversion_yourtts(sa, ta)
 
 examples_1 = [['FreeVC', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav'], ['YourTTS', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav']]
 
@@ -199,5 +202,5 @@ vc_2 = gr.Interface(
     description="Use this cool tool to convert your voice to another person's! \n Upload files in wav format for the target speaker and record the voice of the input speaker using the microphone.\n \nThis demonstration is made by T B Ramkamal, for partial credit towards completion of my Dual Degree Project"
     )
 
-demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"], title="Voice Conversion")
+demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"], title="Voice Conversion Demo")
 demo.launch(debug='True')
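
Note: both converted functions gain a normalize=False keyword in this commit, but their bodies still shell out to ffmpeg-normalize unconditionally, so the flag is not yet consulted. A minimal sketch of how the flag could gate the step; maybe_normalize is a hypothetical helper, not part of this commit, and it reuses the exact ffmpeg-normalize invocation the diff already relies on:

import subprocess

def maybe_normalize(in_path, out_path, normalize=True):
    # Hypothetical helper: RMS-normalize to -27 dB and resample to 16 kHz,
    # mirroring the ffmpeg-normalize call introduced in this commit.
    if not normalize:
        return in_path  # skip normalization; feed the raw file downstream
    subprocess.run(
        ["ffmpeg-normalize", in_path, "-nt", "rms", "-t=-27",
         "-o", out_path, "-ar", "16000", "-f"],
        check=True,  # surface ffmpeg failures instead of loading a stale output file
    )
    return out_path

voice_conversion_freevc could then open with wav_tgt, _ = librosa.load(maybe_normalize(tgt, "target_fvc.wav", normalize), sr=hps.data.sampling_rate), and the YourTTS path could do the same for source_yourtts.wav and target_yourtts.wav.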