agorlanov committed
Commit ad99144
1 Parent(s): a227627

fix readme

Files changed (4)
  1. README.md +13 -6
  2. app.py +6 -2
  3. main_pipeline.py +3 -2
  4. utils/denoise_pipeline.py +3 -5
README.md CHANGED
@@ -9,8 +9,7 @@ app_file: app.py
pinned: false
---

-
- How inference:
+ # How to run inference:
1) [huggingface](https://huggingface.co/spaces/deepkotix/denoise_and_diarization)
2) [telegram bot](https://t.me/diarizarion_bot)
3) run local inference:
@@ -19,11 +18,19 @@ How inference:
2) Inference local:
`python main_pipeline.py --audio-path dialog.mp3`

+ # About pipeline:
+ + denoise audio
+ + vad (voice activity detection)
+ + speaker embeddings from each vad fragment
+ + clustering of these embeddings
+
+
+ # Inference time by hardware

- | | inference time for file dialog.mp3 |
- |-----------------------|:----------------------------------:|
- | cpu 2v CPU huggingece | 600 s/it |
- | gpu tesla v100 | 8.23 s/it |
+ | hardware | inference time for file dialog.mp3 |
+ |------------------------|:----------------------------------:|
+ | cpu 2 vCPU huggingface | 453.8 s/it |
+ | gpu tesla v100 | 8.23 s/it |

 
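The "About pipeline" list added above names four stages (denoise, VAD, per-fragment speaker embeddings, clustering). The commit does not show how the clustering stage is implemented; the sketch below is only an illustration of the last two stages, with random vectors standing in for real speaker embeddings and scikit-learn's AgglomerativeClustering as an assumed stand-in algorithm.

```python
# Illustrative sketch only: random 192-dim vectors stand in for speaker
# embeddings, and AgglomerativeClustering is an assumption, not the repo's code.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(0)
embeddings = np.vstack([
    rng.normal(0.0, 1.0, size=(5, 192)),  # fragments from "speaker A"
    rng.normal(5.0, 1.0, size=(5, 192)),  # fragments from "speaker B"
])

# assign each VAD fragment to a speaker cluster (n_clusters=2 assumed for demo)
labels = AgglomerativeClustering(n_clusters=2).fit_predict(embeddings)
print(labels)  # e.g. [0 0 0 0 0 1 1 1 1 1]
```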
app.py CHANGED
@@ -3,12 +3,14 @@ import gradio as gr
from main_pipeline import main_pipeline
from scipy.io.wavfile import write

- title = "audio_denoise and speakser diarization. Faster inference [tg_bot](https://t.me/diarizarion_bot)"
+ title = "audio_denoise and speaker diarization. Faster inference [tg_bot]()"
+ description = '''Faster inference tg_bot - https://t.me/diarizarion_bot'''


example_list = [
    ["dialog.mp3"]
]

+
def app_pipeline(audio):
    audio_path = 'test.wav'
    write(audio_path, audio[0], audio[1])
@@ -17,12 +19,14 @@ def app_pipeline(audio):
    return result_diarization + [None] * (10 - len(result_diarization))


+
gr.Interface(
    app_pipeline,
    gr.Audio(type="numpy", label="Input"),
    [gr.Audio(visible=True) for i in range(10)],
    title=title,
    examples=example_list,
-     cache_examples=False
+     cache_examples=False,
+     description=description

).launch(enable_queue=True)
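The padding in `app_pipeline` exists because the interface declares a fixed list of 10 `gr.Audio` outputs, so the function must always return exactly 10 values. A quick illustration (the file names are hypothetical):

```python
# Two hypothetical speaker files from the diarization step:
result_diarization = ['out/speaker_0.wav', 'out/speaker_1.wav']
padded = result_diarization + [None] * (10 - len(result_diarization))
print(len(padded))  # 10 -- unused output slots are filled with None
```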
main_pipeline.py CHANGED
@@ -31,7 +31,7 @@ def save_speaker_audios(segments, denoised_audio_path, out_folder='out', out_f=4

        out_wav_paths.append(out_wav_path)

-     return out_wav_paths
+     return out_wav_paths[:10]


def main_pipeline(audio_path):
@@ -39,7 +39,7 @@ def main_pipeline(audio_path):

    denoised_audio_path = denoise(audio_path, device)
    segments = diarization(denoised_audio_path)
-     result_diarization = save_speaker_audios(segments, denoised_audio_path)
+     result_diarization = save_speaker_audios(segments, denoised_audio_path, out_folder='out')
    return denoised_audio_path, result_diarization


@@ -47,5 +47,6 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio-path', default='dialog.mp3', help='Path to audio')
    opt = parser.parse_args()
+
    for _ in tqdm(range(10)):
        main_pipeline(audio_path=opt.audio_path)
 
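The new `[:10]` cap keeps `save_speaker_audios` aligned with the app's 10 fixed audio outputs, and the `tqdm(range(10))` loop in `__main__` is what produces the per-iteration (s/it) timings reported in the README table. A hypothetical local run, assuming `dialog.mp3` sits next to the script:

```python
# Hypothetical usage; returns the denoised file plus at most 10 speaker files.
from main_pipeline import main_pipeline

denoised_path, speaker_wavs = main_pipeline('dialog.mp3')
print(denoised_path)      # path to the denoised audio
print(len(speaker_wavs))  # at most 10, matching the app's output slots
```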
utils/denoise_pipeline.py CHANGED
@@ -10,10 +10,8 @@ from demucs.pretrained import get_model
demucs_model = get_model('htdemucs')


- def denoise(filename: str, device: str) -> str:
-
+ def denoise(filename: str, device: str, out_filename='denoise.wav') -> str:
    wav_ref, sr = librosa.load(filename, mono=False, sr=44100)
-
    wav = torch.tensor(wav_ref)
    wav = torch.cat([wav.unsqueeze(0), wav.unsqueeze(0)]) if len(wav.shape) == 1 else wav
    ref = wav.mean(0)
@@ -28,9 +26,9 @@ def denoise(filename: str, device: str) -> str:
    vocal_wav = librosa.to_mono(vocal_wav)
    vocal_wav = vocal_wav.T
    vocal_wav = librosa.resample(vocal_wav, orig_sr=44100, target_sr=48000)
-     write('denoise.wav', 48000, vocal_wav)
+     write(out_filename, 48000, vocal_wav)

-     return 'denoise.wav'
+     return out_filename


if __name__ == '__main__':
 
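The new `out_filename` parameter replaces the hard-coded `'denoise.wav'` path, so callers can direct the denoised output wherever they need. A hypothetical call with the updated signature (`device='cpu'` to avoid assuming a GPU):

```python
# Hypothetical usage of the updated denoise() signature:
from utils.denoise_pipeline import denoise

out_path = denoise('dialog.mp3', device='cpu', out_filename='denoise.wav')
print(out_path)  # 'denoise.wav' -- 48 kHz mono vocals extracted via htdemucs
```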