File size: 3,479 Bytes
2ddf3fe 0412962 50a5992 2ddf3fe 0412962 a00d114 0412962 8e14b4c 9117637 a00d114 8e14b4c 50a5992 2ddf3fe 50a5992 4b0b51b 0412962 50a5992 2ddf3fe 8e14b4c 50a5992 865b8d5 50a5992 2ddf3fe b865c6d 50a5992 a00d114 2ddf3fe 50a5992 8e14b4c 2ddf3fe 865b8d5 0412962 50a5992 2ddf3fe 8e14b4c 50a5992 2ddf3fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch
import pandas as pd
import gdown
def process_audio(audio_input):
    """Compute speech probabilities for an audio file and visualize them.

    Args:
        audio_input: Path to the audio file, as delivered by the
            ``gr.Audio(type="filepath")`` component. Falsy (``None``) when
            no audio has been loaded yet.

    Returns:
        A 3-tuple ``(figure, probs, audio_length_samples)``: the value
        returned by ``make_visualization``, the value returned by
        ``get_speech_probs``, and ``len()`` of the waveform read with a
        sampling rate of 16 kHz.

    Raises:
        gr.Error: If no audio has been provided.
    """
    if not audio_input:
        # The button can be pressed before any audio was uploaded or
        # downloaded; show a readable message instead of failing inside
        # read_audio with an opaque traceback.
        raise gr.Error("Please upload or download an audio file first.")

    sampling_rate = 16_000
    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    audio_length_samples = len(wav)
    probs = get_speech_probs(wav, sampling_rate=sampling_rate)

    # NOTE(review): 512 / 16_000 is presumably the duration in seconds of one
    # probability frame (512 samples at 16 kHz) -- confirm against
    # make_visualization / get_speech_probs in vad_utils.
    step = 512 / sampling_rate
    return make_visualization(probs, step), probs, audio_length_samples
def process_parameters(probs, audio_length_samples, threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Convert speech probabilities into timestamps and export them as TSV.

    Args:
        probs: Speech probabilities as produced by ``process_audio``;
            ``None`` until that step has been run.
        audio_length_samples: Length of the analysed waveform in samples, as
            produced by ``process_audio``; ``None`` until that step has run.
        threshold: Forwarded unchanged to ``probs2speech_timestamps``.
        min_speech_duration_ms: Forwarded unchanged to
            ``probs2speech_timestamps``.
        min_silence_duration_ms: Forwarded unchanged to
            ``probs2speech_timestamps``.
        window_size_samples: Forwarded unchanged to
            ``probs2speech_timestamps``.
        speech_pad_ms: Forwarded unchanged to ``probs2speech_timestamps``.

    Returns:
        A 2-tuple ``("timestamps.txt", df)``: the path of the written
        tab-separated file and the DataFrame it was written from. The
        DataFrame holds the timestamps plus an empty ``note`` column.

    Raises:
        gr.Error: If the speech probabilities have not been computed yet.
    """
    if probs is None or audio_length_samples is None:
        # Both values come from gr.State() objects that start out empty, so
        # this path is hit whenever timestamps are requested first.
        raise gr.Error(
            "Compute the speech probabilities before computing timestamps."
        )

    timestamps = probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
        return_seconds=True,
        rounding=3,
    )

    df = pd.DataFrame(timestamps)
    df["note"] = ""  # empty column written alongside each timestamp row

    # NOTE: the file name is fixed and relative to the working directory, so
    # concurrent sessions overwrite each other's export.
    output_path = "timestamps.txt"
    df.to_csv(output_path, sep="\t", header=False, index=False)
    return output_path, df
def download_gdrive(id):
    """Download a Google Drive file by id and store it as ``audio.wav``.

    Args:
        id: Google Drive file id (the part of a share link that follows
            ``/file/d/``). The name shadows the ``id`` builtin but is kept
            so existing callers keep working.

    Returns:
        The path of the downloaded file, always ``"audio.wav"``.

    Raises:
        gr.Error: If the id is empty or the download produced no file.
    """
    file_id = (id or "").strip()  # tolerate whitespace from copy/paste
    if not file_id:
        raise gr.Error("Please enter a Google Drive file id.")

    # NOTE: fixed name in the working directory; every download overwrites
    # the previous one.
    output_file = "audio.wav"
    downloaded = gdown.download(
        f"https://drive.google.com/uc?id={file_id}", output_file
    )
    if downloaded is None:
        # Some gdown versions signal failure by returning None instead of
        # raising; without this check a stale or missing audio.wav would be
        # handed to the audio component.
        raise gr.Error(
            "Download failed. Check the file id and that the file is shared "
            "with anyone who has the link."
        )
    return output_file
def main():
    """Assemble the Gradio Blocks UI for the speech-timestamp demo and launch it.

    The interface has three stages: download an audio file from Google Drive,
    compute and plot speech probabilities, then derive speech timestamps from
    those probabilities with user-tunable parameters.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation, so which components sit inside each gr.Row() is a
    # conventional reconstruction -- confirm the intended layout.
    help_text = (
        "Input the Google Drive file id from the shared link.\n"
        "It comes after https://drive.google.com/file/d/ <id here.\n"
        "For example the link https://drive.google.com/file/d/15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8/view?usp=drive_link has id 15C6aHry8sJr43r0EYPPrIlPjMWp6SDb8"
    )

    with gr.Blocks() as demo:
        # Per-session values handed from the probability step to the
        # timestamp step.
        probs_state = gr.State()
        length_state = gr.State()

        # Stage 1: fetch audio from Google Drive.
        with gr.Row():
            file_id_box = gr.Text(label="File ID", info=help_text)
            fetch_btn = gr.Button("Download Audio")

        # Stage 2: compute and plot speech probabilities.
        with gr.Row():
            audio_box = gr.Audio(type="filepath")
            probs_btn = gr.Button("Compute Speech Probabilities")
        plot = gr.Plot()

        fetch_btn.click(fn=download_gdrive, inputs=[file_id_box], outputs=audio_box)
        probs_btn.click(
            fn=process_audio,
            inputs=[audio_box],
            outputs=[plot, probs_state, length_state],
        )

        # Stage 3: tune parameters and derive timestamps.
        with gr.Row():
            threshold_box = gr.Number(label="Threshold", value=0.6, minimum=0.0, maximum=1.0)
            min_speech_box = gr.Number(label="Min Speech Duration (ms)", value=10_000)
            min_silence_box = gr.Number(label="Min Silence Duration (ms)", value=5_000)
            window_box = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
            pad_box = gr.Number(label="Speech Pad (ms)", value=30)
        timestamps_btn = gr.Button("Compute Speech Timestamps")
        result_file = gr.File()
        with gr.Row():
            result_table = gr.DataFrame()

        # Input order must match the positional parameters of
        # process_parameters.
        timestamp_inputs = [
            probs_state,
            length_state,
            threshold_box,
            min_speech_box,
            min_silence_box,
            window_box,
            pad_box,
        ]
        timestamps_btn.click(
            fn=process_parameters,
            inputs=timestamp_inputs,
            outputs=[result_file, result_table],
        )

    demo.launch()
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|