File size: 3,479 Bytes
2ddf3fe
 
0412962
 
50a5992
 
2ddf3fe
0412962
 
a00d114
0412962
8e14b4c
 
 
9117637
a00d114
 
 
 
 
8e14b4c
 
 
 
50a5992
 
 
 
2ddf3fe
50a5992
 
 
 
4b0b51b
0412962
50a5992
2ddf3fe
8e14b4c
 
50a5992
865b8d5
 
 
 
50a5992
 
2ddf3fe
b865c6d
50a5992
a00d114
2ddf3fe
50a5992
 
8e14b4c
2ddf3fe
 
865b8d5
 
 
0412962
 
50a5992
 
 
 
2ddf3fe
8e14b4c
50a5992
2ddf3fe
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch
import pandas as pd
import gdown

def process_audio(audio_input):
    """Load an audio input, compute its speech probabilities and plot them.

    Args:
        audio_input: Value forwarded unchanged to ``read_audio``. The UI in
            this file wires it to a ``gr.Audio(type="filepath")`` component.

    Returns:
        A 3-tuple ``(figure, probs, audio_length_samples)``:
        the result of ``make_visualization``, the output of
        ``get_speech_probs``, and ``len()`` of the loaded waveform.
    """
    sample_rate = 16_000

    waveform = read_audio(audio_input, sampling_rate=sample_rate)
    n_samples = len(waveform)
    speech_probs = get_speech_probs(waveform, sampling_rate=sample_rate)

    # 512 samples expressed in seconds at 16 kHz; presumably the spacing
    # between consecutive probabilities -- confirm against vad_utils.
    step_seconds = 512 / sample_rate
    figure = make_visualization(speech_probs, step_seconds)
    return figure, speech_probs, n_samples

def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Convert speech probabilities into timestamps and export them.

    The timestamps are computed by ``probs2speech_timestamps`` (called with
    ``return_seconds=True`` and ``rounding=3``), loaded into a DataFrame,
    given an extra empty ``note`` column, and written to ``timestamps.txt``
    as tab-separated values without header or index.

    Args:
        probs: Speech probabilities as produced by ``get_speech_probs``.
        audio_length_samples: Length of the analysed audio in samples.
        threshold: Forwarded to ``probs2speech_timestamps``.
        min_speech_duration_ms: Forwarded to ``probs2speech_timestamps``.
        min_silence_duration_ms: Forwarded to ``probs2speech_timestamps``.
        window_size_samples: Forwarded to ``probs2speech_timestamps``.
            NOTE(review): ``process_audio`` plots with a 512-sample step
            while the UI default here is 1536 -- confirm the value must
            match the window used when ``probs`` was computed.
        speech_pad_ms: Forwarded to ``probs2speech_timestamps``.

    Returns:
        A tuple ``(path, df)`` with the path of the written file
        (``"timestamps.txt"``) and the DataFrame that was written.

    Raises:
        ValueError: If ``probs`` or ``audio_length_samples`` is ``None``,
            i.e. no probabilities have been computed yet.
    """
    # The UI state holders start out empty, so this callback can be invoked
    # before any audio was analysed; fail early with a clear message instead
    # of an obscure error from inside the helper.
    if probs is None or audio_length_samples is None:
        raise ValueError(
            "Speech probabilities are not available; "
            "compute them before requesting timestamps."
        )

    timestamps = probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
        return_seconds=True,
        rounding=3,
    )

    df = pd.DataFrame(timestamps)
    # Empty column for the user to fill in after download.
    df["note"] = ""

    # NOTE(review): fixed path in the working directory; concurrent requests
    # would overwrite each other's file.
    output_path = "timestamps.txt"
    df.to_csv(output_path, sep="\t", header=False, index=False)
    return output_path, df

def download_gdrive(id):
    """Download a shared Google Drive file to ``audio.wav``.

    Args:
        id: Google Drive file id taken from the share link. Surrounding
            whitespace is ignored. (The name shadows the ``id`` builtin but
            is kept so existing callers keep working.)

    Returns:
        The local path of the downloaded file, ``"audio.wav"``.

    Raises:
        ValueError: If ``id`` is ``None``, empty, or only whitespace.
        RuntimeError: If ``gdown.download`` returns ``None``, which is
            treated here as a failed download.
    """
    file_id = "" if id is None else str(id).strip()
    if not file_id:
        raise ValueError("A Google Drive file id is required.")

    # NOTE(review): fixed path in the working directory; concurrent
    # downloads would overwrite each other's file.
    output_file = "audio.wav"

    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_file)
    # Without this check a failed download could hand back a stale
    # audio.wav left over from an earlier request.
    if downloaded is None:
        raise RuntimeError(f"Could not download Google Drive file {file_id!r}.")
    return output_file

def main():
    """Build the Gradio Blocks interface and launch it.

    Layout, top to bottom:
      1. A row with a Google Drive file-id box and a download button.
      2. A row with the audio component, a button that runs
         ``process_audio``, and the resulting plot.
      3. A row with the timestamp parameters, a button that runs
         ``process_parameters``, and the downloadable output file.
      4. A row with the timestamps rendered as a table.
    """
    with gr.Blocks() as app:
        # State holders that carry process_audio's outputs over to
        # process_parameters.
        probs_state = gr.State()
        length_state = gr.State()

        with gr.Row():
            file_id_help = """Input the Google Drive file id from the shared link.
            It comes after https://drive.google.com/file/d/ <id here.
            For example the link https://drive.google.com/file/d/15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8/view?usp=drive_link has id 15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8"""
            file_id_box = gr.Text(label="File ID", info=file_id_help)
            fetch_btn = gr.Button("Download Audio")

        with gr.Row():
            audio_box = gr.Audio(type="filepath")
            probs_btn = gr.Button("Compute Speech Probabilities")
            probs_plot = gr.Plot()

        # The downloaded file path is fed straight into the audio component.
        fetch_btn.click(download_gdrive, inputs=[file_id_box], outputs=audio_box)

        probs_btn.click(
            process_audio,
            inputs=[audio_box],
            outputs=[probs_plot, probs_state, length_state],
        )

        with gr.Row():
            threshold_box = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
            min_speech_box = gr.Number(label="Min Speech Duration (ms)", value=10_000)
            min_silence_box = gr.Number(label="Min Silence Duration (ms)", value=5_000)
            window_box = gr.Dropdown(
                label="Window Size Samples",
                choices=[512, 1024, 1536],
                value=1536,
            )
            pad_box = gr.Number(label="Speech Pad (ms)", value=30)
            timestamps_btn = gr.Button("Compute Speech Timestamps")
            timestamps_file = gr.File()

        with gr.Row():
            timestamps_table = gr.DataFrame()

        # Input order must match process_parameters' positional parameters.
        timestamp_inputs = [
            probs_state,
            length_state,
            threshold_box,
            min_speech_box,
            min_silence_box,
            window_box,
            pad_box,
        ]
        timestamps_btn.click(
            process_parameters,
            inputs=timestamp_inputs,
            outputs=[timestamps_file, timestamps_table],
        )

    app.launch()

# Build and launch the UI only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()