File size: 2,219 Bytes
2ddf3fe
 
0412962
 
2ddf3fe
b0ab37c
a00d114
0412962
b0ab37c
a00d114
0412962
a00d114
0412962
2ddf3fe
 
b0ab37c
a00d114
 
 
 
 
 
 
 
 
2ddf3fe
0412962
 
 
2ddf3fe
 
 
b865c6d
a00d114
 
2ddf3fe
0412962
2ddf3fe
 
0412962
 
 
 
 
2ddf3fe
0c8ffff
2ddf3fe
b0ab37c
2ddf3fe
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch

# Shared state handed from process_audio (writer) to process_parameters
# (reader); both names stay None until an audio file has been analysed.
probs = audio_length_samples = None
def process_audio(audio_input):
    """Compute speech probabilities for an audio file and plot them.

    The probabilities and the audio length (in samples) are stored in the
    module-level ``probs`` and ``audio_length_samples`` globals so that
    ``process_parameters`` can reuse them without re-reading the audio.

    Args:
        audio_input: Path of the audio file to analyse, as delivered by the
            ``gr.Audio(type="filepath")`` component. Empty/``None`` when the
            user has not provided any audio.

    Returns:
        The result of ``make_visualization`` for the computed probabilities
        (``main`` wires it to a ``gr.Plot`` output).

    Raises:
        gr.Error: If no audio was provided.
    """
    global probs
    global audio_length_samples

    # Surface a readable message in the UI instead of an obscure failure
    # from read_audio when the button is clicked with no audio loaded.
    if not audio_input:
        raise gr.Error("Please provide an audio file first.")

    sampling_rate = 16_000
    # NOTE(review): presumably the number of samples per probability frame
    # produced by get_speech_probs -- confirm against vad_utils.
    frame_samples = 512

    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    speech_probs = get_speech_probs(wav, sampling_rate=sampling_rate)

    # Publish both globals only after every computation succeeded, so a
    # failure above never leaves the length of one file paired with the
    # probabilities of another.
    audio_length_samples = len(wav)
    probs = speech_probs

    return make_visualization(speech_probs, frame_samples / sampling_rate)

def process_parameters(threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Turn the cached speech probabilities into speech timestamps.

    Reads the module-level ``probs`` and ``audio_length_samples`` set by
    ``process_audio`` and forwards them, together with the tuning parameters
    below, to ``probs2speech_timestamps``.

    Args:
        threshold: Forwarded unchanged as ``threshold``.
        min_speech_duration_ms: Forwarded unchanged as
            ``min_speech_duration_ms``.
        min_silence_duration_ms: Forwarded unchanged as
            ``min_silence_duration_ms``.
        window_size_samples: Forwarded unchanged as ``window_size_samples``.
        speech_pad_ms: Forwarded unchanged as ``speech_pad_ms``.

    Returns:
        The timestamps returned by ``probs2speech_timestamps``.

    Raises:
        gr.Error: If no probabilities have been computed yet.
    """
    # Without this guard, clicking "Process Parameters" before any audio was
    # analysed would pass None into probs2speech_timestamps and fail with an
    # unhelpful error.
    if probs is None or audio_length_samples is None:
        raise gr.Error('No speech probabilities available; click "Compute Probabilities" first.')

    # Console trace of the input probabilities (debugging aid).
    print(probs)
    timestamps = probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
    )
    # Console trace of the result (debugging aid).
    print(timestamps)
    return timestamps

def main():
    """Assemble the two-step Gradio demo and start serving it.

    The first row takes an audio file and plots the result of
    ``process_audio``; the second row collects the tuning parameters and
    shows the result of ``process_parameters`` in a text box.
    """
    with gr.Blocks() as app:
        # Step 1: audio file in, plot out.
        with gr.Row():
            audio_in = gr.Audio(type="filepath")
            compute_btn = gr.Button("Compute Probabilities")
            prob_plot = gr.Plot()

        compute_btn.click(fn=process_audio, inputs=[audio_in], outputs=prob_plot)

        # Step 2: tuning parameters in, timestamps out. The list order must
        # match the positional parameters of process_parameters.
        with gr.Row():
            param_inputs = [
                gr.Number(label="Threshold", value=0.5, minimum=0.0, maximum=1.0),
                gr.Number(label="Min Speech Duration (ms)", value=250),
                gr.Number(label="Min Silence Duration (ms)", value=100),
                gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536),
                gr.Number(label="Speech Pad (ms)", value=30),
            ]
            apply_btn = gr.Button("Process Parameters")
            timestamps_box = gr.Textbox()

        apply_btn.click(fn=process_parameters, inputs=param_inputs, outputs=timestamps_box)

    app.launch()

# Start the demo only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()