# Gradio demo for voice-activity detection: compute per-window speech
# probabilities from an audio file, then tune post-processing parameters
# to extract speech timestamps.
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch
# Module-level state shared between the two Gradio callbacks:
# ``process_audio`` fills these in; ``process_parameters`` reads them.
probs = None  # per-window speech probabilities (None until audio is processed)
audio_length_samples = None  # length of the loaded waveform, in samples
def process_audio(audio_input):
    """Load an audio file, compute speech probabilities, and return a plot.

    Side effects: stores the probabilities and the waveform length in the
    module-level globals so ``process_parameters`` can reuse them later.

    :param audio_input: path to the uploaded audio file (Gradio ``filepath``)
    :return: matplotlib figure produced by ``make_visualization``
    """
    global probs, audio_length_samples
    waveform = read_audio(audio_input, sampling_rate=16_000)
    audio_length_samples = len(waveform)
    probs = get_speech_probs(waveform, sampling_rate=16_000)
    # 512 samples per probability window at 16 kHz -> seconds per window.
    # NOTE(review): presumably matches the window size used inside
    # get_speech_probs — confirm against vad_utils.
    return make_visualization(probs, 512 / 16_000)
def process_parameters(threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Convert previously computed speech probabilities into speech timestamps.

    Reads the module-level ``probs`` and ``audio_length_samples`` populated by
    ``process_audio``; raises a user-visible error if they are missing (i.e.
    the user clicked "Process Parameters" before computing probabilities).

    :param threshold: probability above which a window counts as speech
    :param min_speech_duration_ms: discard speech segments shorter than this
    :param min_silence_duration_ms: merge segments separated by shorter silence
    :param window_size_samples: samples per probability window
    :param speech_pad_ms: padding added around each detected segment
    :return: list of speech timestamps from ``probs2speech_timestamps``
    """
    # Guard: the probabilities button must be pressed first, otherwise the
    # globals are still None and probs2speech_timestamps would crash opaquely.
    if probs is None or audio_length_samples is None:
        raise gr.Error("Compute probabilities first: upload audio and click 'Compute Probabilities'.")
    timestamps = probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
    )
    return timestamps
def main():
    """Assemble the two-stage VAD demo UI and start the Gradio server."""
    with gr.Blocks() as demo:
        # Stage 1: upload audio and visualise per-window speech probabilities.
        with gr.Row():
            audio_input = gr.Audio(type="filepath")
            compute_button = gr.Button("Compute Probabilities")
            probability_plot = gr.Plot()
        compute_button.click(process_audio, inputs=[audio_input], outputs=probability_plot)
        # Stage 2: tune post-processing parameters and extract timestamps.
        with gr.Row():
            threshold = gr.Number(label="Threshold", value=0.5, minimum=0.0, maximum=1.0)
            min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
            min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
            window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
            speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
            process_button = gr.Button("Process Parameters")
            timestamp_box = gr.Textbox()
        process_button.click(
            process_parameters,
            inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms],
            outputs=timestamp_box,
        )
        demo.launch()
# Launch the demo only when executed as a script (not when imported).
if __name__ == "__main__":
    main()