import os
import shutil
from glob import glob

import gradio as gr
import scipy.io.wavfile as wav
import soundfile as sf
import torch
from transformers import pipeline

from svoice.separate import *  # provides load_model() and separate_demo()

device = "cuda" if torch.cuda.is_available() else "cpu"
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
os.makedirs('input', exist_ok=True)
os.makedirs('separated', exist_ok=True)

# Load the svoice separation model once at startup.
load_model()

print(f"Loading ASR model on {device}...")
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=0 if device == "cuda" else -1,
)
print("ASR model loaded!")

def transcribe_audio(audiopath):
    # Pass the sampling rate explicitly so audio that is not already 16 kHz
    # gets resampled by the pipeline instead of being misread.
    audio_input, sr = sf.read(audiopath)
    return pipe({"array": audio_input, "sampling_rate": sr}, max_new_tokens=500)["text"]

def separator(audio, rec_audio, example):
    # Start from clean working directories.
    for f in glob('input/*'):
        os.remove(f)
    for f in glob('separated/*'):
        os.remove(f)

    # Gradio numpy audio arrives as (sample_rate, data).
    if audio:
        wav.write('input/original.wav', audio[0], audio[1])
    elif rec_audio:
        wav.write('input/original.wav', rec_audio[0], rec_audio[1])
    else:
        shutil.copy(example, 'input/original.wav')

    # Run svoice separation on everything in ./input.
    separate_demo(mix_dir="./input")
    separated_files = glob(os.path.join('separated', "*.wav"))
    separated_files = sorted(f for f in separated_files if "original.wav" not in f)

    transcripts = []
    for i, f in enumerate(separated_files):
        print(f"Transcribing separated audio {i + 1} ...")
        transcripts.append(transcribe_audio(f))
        print("Text:", transcripts[-1])

    # The UI exposes seven audio/text slots; pad so the output count always matches.
    separated_files += [None] * (7 - len(separated_files))
    transcripts += [None] * (7 - len(transcripts))
    return separated_files + transcripts

def set_example_audio(example: str) -> dict:
    # Helper to push a selected example into the input audio widget
    # (defined but not wired to any event in this demo).
    return gr.Audio.update(value=example)

demo = gr.Blocks()
with demo:
    gr.Markdown('''
    <center>
        <h1>Multiple Voice Separation with Transcription DEMO</h1>
        <div style="display:flex;align-items:center;justify-content:center;"><iframe src="https://streamable.com/e/0x8osl?autoplay=1&nocontrols=1" frameborder="0" allow="autoplay"></iframe></div>
        <p>
            This demo separates multiple voices from a single audio file and transcribes each separated track. The separation model is trained on the LibriMix7 dataset.
            *This is an intermediate checkpoint released for experimentation. It does not perform well at a 16 kHz sample rate, so see <b><a href="https://github.com/muhammad-ahmed-ghani/svoice_demo">svoice_demo</a></b> to train it at 8 kHz.
        </p>
    </center>
    ''')
    
    with gr.Row():
        input_audio = gr.Audio(label="Input audio", type="numpy")
        rec_audio = gr.Audio(label="Record Using Microphone", type="numpy", source="microphone")

    with gr.Row():
        output_audio1 = gr.Audio(label='Speaker 1', interactive=False)
        output_text1 = gr.Text(label='Speaker 1', interactive=False)
        output_audio2 = gr.Audio(label='Speaker 2', interactive=False)
        output_text2 = gr.Text(label='Speaker 2', interactive=False)

    with gr.Row():
        output_audio3 = gr.Audio(label='Speaker 3', interactive=False)
        output_text3 = gr.Text(label='Speaker 3', interactive=False)
        output_audio4 = gr.Audio(label='Speaker 4', interactive=False)
        output_text4 = gr.Text(label='Speaker 4', interactive=False)

    with gr.Row():
        output_audio5 = gr.Audio(label='Speaker 5', interactive=False)
        output_text5 = gr.Text(label='Speaker 5', interactive=False)
        output_audio6 = gr.Audio(label='Speaker 6', interactive=False)
        output_text6 = gr.Text(label='Speaker 6', interactive=False)

    with gr.Row():
        output_audio7 = gr.Audio(label='Speaker 7', interactive=False)
        output_text7 = gr.Text(label='Speaker 7', interactive=False)

    outputs_audio = [output_audio1, output_audio2, output_audio3, output_audio4, output_audio5, output_audio6, output_audio7]
    outputs_text = [output_text1, output_text2, output_text3, output_text4, output_text5, output_text6, output_text7]
    button = gr.Button("Separate")
    examples = [
        "samples/mixture1.wav",
        "samples/mixture2.wav",
        "samples/mixture3.wav"
    ]
    example_selector = gr.Radio(choices=examples, label="Example Audio")
    button.click(separator, inputs=[input_audio, rec_audio, example_selector], outputs=outputs_audio + outputs_text)
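    # Hypothetical wiring for the set_example_audio helper above, if live
    # preview of a selected example were wanted (not enabled in the original app):
    # example_selector.change(set_example_audio, inputs=example_selector, outputs=input_audio)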

demo.launch()
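# Note: demo.launch(share=True) would additionally serve a temporary public URL.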