import os

import torch
import soundfile as sf
import gradio as gr
import spaces
from clearvoice import ClearVoice
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of this call
def fn_clearvoice_se(input_wav, sr):
    # Pick the model that matches the requested sampling rate:
    # FRCRN for 16 kHz, MossFormer2 for 48 kHz.
    if sr == "16000":
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice returns a dict keyed by model name when several models are
    # loaded; with a single model it returns the waveform directly.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'
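# A minimal local sanity check, assuming a mono WAV file 'noisy_16k.wav'
# exists in the working directory (hypothetical filename):
#
#   enhanced_path = fn_clearvoice_se('noisy_16k.wav', "16000")
#   print(enhanced_path)  # -> 'enhanced.wav', written at 16 kHz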
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of this call
def fn_clearvoice_ss(input_wav):
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # Unwrap the per-model dict if present; either way the result is a list
    # with one waveform per separated speaker.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav_list = output_wav_dict[key]
    else:
        output_wav_list = output_wav_dict
    output_wav_s1 = output_wav_list[0]
    output_wav_s2 = output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1, 16000)
    sf.write('separated_s2.wav', output_wav_s2, 16000)
    return "separated_s1.wav", "separated_s2.wav"
def find_mp4_files(directory):
    mp4_files = []
    # Walk through the directory and its subdirectories
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Keep only the extracted-speaker clips, written with an
            # 'est' prefix and an .mp4 extension.
            if file.endswith(".mp4") and file.startswith("est"):
                mp4_files.append(os.path.join(root, file))
    return mp4_files
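# After a TSE run, this returns paths such as
# 'path_to_output_videos_tse/<clip>/est_s1.mp4' (illustrative layout; the
# exact structure is determined by ClearVoice's online_write output).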
@spaces.GPU  # ZeroGPU: allocate a GPU for the duration of this call
def fn_clearvoice_tse(input_video):
    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    # online_write=True makes ClearVoice write one output video per detected
    # speaker directly to output_path instead of returning waveforms.
    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
    output_list = find_mp4_files('path_to_output_videos_tse/')
    print(output_list)
    return output_list
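# A minimal local sanity check, assuming a short talking-head clip
# 'interview.mp4' exists (hypothetical filename):
#
#   clips = fn_clearvoice_tse('interview.mp4')
#   # -> one 'est*.mp4' per detected speaker under path_to_output_videos_tse/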
demo = gr.Blocks()

se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000", "48000"], value="16000", multiselect=False,
            label="Sampling Rate", info="Choose the sampling rate for your output."
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Enhancement",
    description=("Gradio demo for speech enhancement with ClearVoice. The models support 16 kHz (FRCRN backbone) and 48 kHz (MossFormer2 backbone) sampling rates. "
                 "We provide generalized models trained on large-scale data to handle a wide range of background environments. "
                 "To test it, simply upload your audio, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> | <a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a></p>"),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000"],
        ["examples/english_speech_48kHz.wav", "48000"],
    ],
    cache_examples=True,
)
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio (Speaker 1)", type="filepath"),
        gr.Audio(label="Output Audio (Speaker 2)", type="filepath"),
    ],
    title="ClearVoice: Speech Separation",
    description=("Gradio demo for speech separation with ClearVoice. The model (MossFormer2 backbone) supports two-speaker mixtures sampled at 16 kHz. "
                 "We provide generalized models trained on large-scale data for speaker-independent separation across a wide range of background environments. "
                 "To test it, simply upload your audio, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)
tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List"),
    ],
    title="ClearVoice: Audio-Visual Speaker Extraction",
    description=("Gradio demo for audio-visual speaker extraction with ClearVoice. The model (AV_MossFormer2_TSE_16K) supports a 16 kHz sampling rate. "
                 "We provide generalized models trained on mid-scale data for speaker-independent extraction across a wide range of background environments. "
                 "To test it, simply upload your video, or click one of the examples to load it. Read more at the links below."),
    article=("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
             "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    # NOTE: gr.Video expects video example files; these paths point to audio.
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)
with demo:
    gr.TabbedInterface(
        [se_demo, ss_demo, tse_demo],
        ["Speech Enhancement", "Speech Separation", "Target Speaker Extraction"],
    )

demo.launch()