Spaces:
Runtime error
Runtime error
agorlanov
committed on
Commit
•
ad99144
1
Parent(s):
a227627
fix readme
Browse files
- README.md +13 -6
- app.py +6 -2
- main_pipeline.py +3 -2
- utils/denoise_pipeline.py +3 -5
README.md
CHANGED
@@ -9,8 +9,7 @@ app_file: app.py
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
13 |
-
How inference:
|
14 |
1) [huggingface](https://huggingface.co/spaces/deepkotix/denoise_and_diarization)
|
15 |
2) [telegram bot](https://t.me/diarizarion_bot)
|
16 |
3) run local inference:
|
@@ -19,11 +18,19 @@ How inference:
|
|
19 |
2) Inference local:
|
20 |
`python main_pipeline.py --audio-path dialog.mp3`
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
| |
|
24 |
-
|
25 |
-
| cpu 2v CPU huggingface |
|
26 |
-
| gpu tesla v100 |
|
27 |
|
28 |
|
29 |
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
# How to run inference:
|
|
|
13 |
1) [huggingface](https://huggingface.co/spaces/deepkotix/denoise_and_diarization)
|
14 |
2) [telegram bot](https://t.me/diarizarion_bot)
|
15 |
3) run local inference:
|
|
|
18 |
2) Inference local:
|
19 |
`python main_pipeline.py --audio-path dialog.mp3`
|
20 |
|
21 |
+
# About pipeline:
|
22 |
+
+ denoise audio
|
23 |
+
+ vad(voice activity detector)
|
24 |
+
+ speaker embeddings from each vad fragments
|
25 |
+
+ clustering this embeddings
|
26 |
+
|
27 |
+
|
28 |
+
# Inference for hardware
|
29 |
|
30 |
+
| | inference time for file dialog.mp3 |
|
31 |
+
|-----------------------|:------------------------------------:|
|
32 |
+
| cpu 2v CPU huggingface | 453.8 s/it |
|
33 |
+
| gpu tesla v100 | 8.23 s/it |
|
34 |
|
35 |
|
36 |
|
app.py
CHANGED
@@ -3,12 +3,14 @@ import gradio as gr
|
|
3 |
from main_pipeline import main_pipeline
|
4 |
from scipy.io.wavfile import write
|
5 |
|
6 |
-
title = "audio_denoise and speakser diarization. Faster inference [tg_bot](
|
|
|
7 |
|
8 |
example_list = [
|
9 |
["dialog.mp3"]
|
10 |
]
|
11 |
|
|
|
12 |
def app_pipeline(audio):
|
13 |
audio_path = 'test.wav'
|
14 |
write(audio_path, audio[0], audio[1])
|
@@ -17,12 +19,14 @@ def app_pipeline(audio):
|
|
17 |
return result_diarization + [None] * (10 - len(result_diarization))
|
18 |
|
19 |
|
|
|
20 |
gr.Interface(
|
21 |
app_pipeline,
|
22 |
gr.Audio(type="numpy", label="Input"),
|
23 |
[gr.Audio(visible=True) for i in range(10)],
|
24 |
title=title,
|
25 |
examples=example_list,
|
26 |
-
cache_examples=False
|
|
|
27 |
|
28 |
).launch(enable_queue=True)
|
|
|
3 |
from main_pipeline import main_pipeline
|
4 |
from scipy.io.wavfile import write
|
5 |
|
6 |
+
title = "audio_denoise and speakser diarization. Faster inference [tg_bot]()"
|
7 |
+
description = '''Faster inference tg_bot - https://t.me/diarizarion_bot '''
|
8 |
|
9 |
example_list = [
|
10 |
["dialog.mp3"]
|
11 |
]
|
12 |
|
13 |
+
|
14 |
def app_pipeline(audio):
|
15 |
audio_path = 'test.wav'
|
16 |
write(audio_path, audio[0], audio[1])
|
|
|
19 |
return result_diarization + [None] * (10 - len(result_diarization))
|
20 |
|
21 |
|
22 |
+
|
23 |
gr.Interface(
|
24 |
app_pipeline,
|
25 |
gr.Audio(type="numpy", label="Input"),
|
26 |
[gr.Audio(visible=True) for i in range(10)],
|
27 |
title=title,
|
28 |
examples=example_list,
|
29 |
+
cache_examples=False,
|
30 |
+
description=description
|
31 |
|
32 |
).launch(enable_queue=True)
|
main_pipeline.py
CHANGED
@@ -31,7 +31,7 @@ def save_speaker_audios(segments, denoised_audio_path, out_folder='out', out_f=4
|
|
31 |
|
32 |
out_wav_paths.append(out_wav_path)
|
33 |
|
34 |
-
return out_wav_paths
|
35 |
|
36 |
|
37 |
def main_pipeline(audio_path):
|
@@ -39,7 +39,7 @@ def main_pipeline(audio_path):
|
|
39 |
|
40 |
denoised_audio_path = denoise(audio_path, device)
|
41 |
segments = diarization(denoised_audio_path)
|
42 |
-
result_diarization = save_speaker_audios(segments, denoised_audio_path)
|
43 |
return denoised_audio_path, result_diarization
|
44 |
|
45 |
|
@@ -47,5 +47,6 @@ if __name__ == '__main__':
|
|
47 |
parser = argparse.ArgumentParser()
|
48 |
parser.add_argument('--audio-path', default='dialog.mp3', help='Path to audio')
|
49 |
opt = parser.parse_args()
|
|
|
50 |
for _ in tqdm(range(10)):
|
51 |
main_pipeline(audio_path=opt.audio_path)
|
|
|
31 |
|
32 |
out_wav_paths.append(out_wav_path)
|
33 |
|
34 |
+
return out_wav_paths[:10]
|
35 |
|
36 |
|
37 |
def main_pipeline(audio_path):
|
|
|
39 |
|
40 |
denoised_audio_path = denoise(audio_path, device)
|
41 |
segments = diarization(denoised_audio_path)
|
42 |
+
result_diarization = save_speaker_audios(segments, denoised_audio_path, out_folder='out')
|
43 |
return denoised_audio_path, result_diarization
|
44 |
|
45 |
|
|
|
47 |
parser = argparse.ArgumentParser()
|
48 |
parser.add_argument('--audio-path', default='dialog.mp3', help='Path to audio')
|
49 |
opt = parser.parse_args()
|
50 |
+
|
51 |
for _ in tqdm(range(10)):
|
52 |
main_pipeline(audio_path=opt.audio_path)
|
utils/denoise_pipeline.py
CHANGED
@@ -10,10 +10,8 @@ from demucs.pretrained import get_model
|
|
10 |
demucs_model = get_model('htdemucs')
|
11 |
|
12 |
|
13 |
-
def denoise(filename: str, device: str) -> str:
|
14 |
-
|
15 |
wav_ref, sr = librosa.load(filename, mono=False, sr=44100)
|
16 |
-
|
17 |
wav = torch.tensor(wav_ref)
|
18 |
wav = torch.cat([wav.unsqueeze(0), wav.unsqueeze(0)]) if len(wav.shape) == 1 else wav
|
19 |
ref = wav.mean(0)
|
@@ -28,9 +26,9 @@ def denoise(filename: str, device: str) -> str:
|
|
28 |
vocal_wav = librosa.to_mono(vocal_wav)
|
29 |
vocal_wav = vocal_wav.T
|
30 |
vocal_wav = librosa.resample(vocal_wav, orig_sr=44100, target_sr=48000)
|
31 |
-
write(
|
32 |
|
33 |
-
return
|
34 |
|
35 |
|
36 |
if __name__ == '__main__':
|
|
|
10 |
demucs_model = get_model('htdemucs')
|
11 |
|
12 |
|
13 |
+
def denoise(filename: str, device: str, out_filename='denoise.wav') -> str:
|
|
|
14 |
wav_ref, sr = librosa.load(filename, mono=False, sr=44100)
|
|
|
15 |
wav = torch.tensor(wav_ref)
|
16 |
wav = torch.cat([wav.unsqueeze(0), wav.unsqueeze(0)]) if len(wav.shape) == 1 else wav
|
17 |
ref = wav.mean(0)
|
|
|
26 |
vocal_wav = librosa.to_mono(vocal_wav)
|
27 |
vocal_wav = vocal_wav.T
|
28 |
vocal_wav = librosa.resample(vocal_wav, orig_sr=44100, target_sr=48000)
|
29 |
+
write(out_filename, 48000, vocal_wav)
|
30 |
|
31 |
+
return out_filename
|
32 |
|
33 |
|
34 |
if __name__ == '__main__':
|