|
import gradio as gr |
|
import os |
|
from uuid import uuid4 |
|
from pydub.silence import detect_nonsilent |
|
from pydub import AudioSegment |
|
|
|
|
|
def get_labels(audio_fp, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length, end_extension, extend_small_segments, show_duration_label):
    """Detect non-silent regions of an audio file and write them as label files.

    Each detected region becomes one tab-separated line
    ``start<TAB>end<TAB>label`` (times in seconds). The raw regions are always
    written; when ``auto_merge`` is set, a second file with merged (and
    optionally extended / relabelled) regions is written as well.

    Args:
        audio_fp: Path of the audio file, or an object exposing the path as
            ``.name``.
        min_speech_duration_ms: Regions that are not strictly longer than this
            many milliseconds are discarded.
        min_silence_duration_ms: Passed to pydub as ``min_silence_len``
            (milliseconds); truncated to an integer.
        auto_merge: When falsy, only the raw label file is produced.
        uppper_merge_threshold: Largest gap (seconds) between two neighbouring
            regions that may still be merged. Anything ``float()`` accepts.
        max_segment_length: A merge only happens while the resulting region
            stays strictly shorter than this many seconds.
        end_extension: Seconds added to the end of a region when that still
            leaves it ending before the next region starts. ``0`` disables it.
        extend_small_segments: When truthy, regions of at most 1.02 s are
            stretched to exactly 1.02 s if that does not reach the next region.
        show_duration_label: When truthy, every merged label text is replaced
            by the region's duration rounded to 3 decimals.

    Returns:
        ``(raw_labels_path, merged_labels_path)``; the second item is ``None``
        when ``auto_merge`` is falsy.

    Raises:
        ValueError: If no audio file was supplied or the merge threshold is
            not numeric.
    """
    if audio_fp is None:
        raise ValueError("No audio file was provided.")
    # The argument may be a plain path string or a file wrapper that carries
    # the path in ``.name``; accept both instead of assuming the wrapper.
    audio_path = getattr(audio_fp, "name", audio_fp)

    audio = AudioSegment.from_file(audio_path).set_channels(1)
    nonsilent_ms = detect_nonsilent(
        audio,
        # UI number fields may deliver floats; pydub needs an integer
        # millisecond count for its internal range/slice arithmetic.
        min_silence_len=int(min_silence_duration_ms),
        silence_thresh=-40,
    )

    # Fail on a malformed threshold before any file is written.
    uppper_merge_threshold = float(uppper_merge_threshold)

    # Keep only regions strictly longer than the minimum; numbering is applied
    # after filtering so labels are consecutive. Times become seconds.
    kept_ms = [
        (start_ms, end_ms)
        for start_ms, end_ms in nonsilent_ms
        if end_ms - start_ms > min_speech_duration_ms
    ]
    labels = [
        (start_ms / 1000, end_ms / 1000, f"Sound {index}")
        for index, (start_ms, end_ms) in enumerate(kept_ms, start=1)
    ]

    fn = str(uuid4()) + ".txt"
    with open(fn, "w", encoding="utf-8") as f:
        f.write("\n".join(f"{start}\t{end}\t{name}" for start, end, name in labels))

    if not auto_merge:
        return fn, None

    # Greedy left-to-right merge: fold a region into the previous merged one
    # when the gap is small enough and the combined span stays under the limit.
    # An empty ``labels`` list simply yields an empty merged list.
    new_labels = []
    for start, end, name in labels:
        if new_labels:
            previous = new_labels[-1]
            gap = start - previous[1]
            combined = (previous[1] - previous[0]) + gap + (end - start)
            if gap <= uppper_merge_threshold and combined < max_segment_length:
                previous[1] = end
                previous[2] = f'{previous[2]} |{round(gap, 2)}s| {name}'
                continue
        new_labels.append([start, end, name])

    # The last region has no successor to bound it, so neither extension below
    # is ever applied to it; ``zip`` with the shifted list skips it naturally.
    min_segment_seconds = 1.02
    extended = [False] * max(len(new_labels) - 1, 0)
    if extend_small_segments:
        for i, (segment, following) in enumerate(zip(new_labels, new_labels[1:])):
            is_short = segment[1] - segment[0] <= min_segment_seconds
            if is_short and segment[0] + min_segment_seconds < following[0]:
                segment[1] = segment[0] + min_segment_seconds
                extended[i] = True

    if end_extension:
        for was_extended, segment, following in zip(extended, new_labels, new_labels[1:]):
            # Regions already stretched to the minimum length are left alone.
            if not was_extended and segment[1] + end_extension < following[0]:
                segment[1] = segment[1] + end_extension

    if show_duration_label:
        for segment in new_labels:
            segment[2] = round(segment[1] - segment[0], 3)

    translate_labels = [f"{start}\t{end}\t{text}" for start, end, text in new_labels]

    filename_path = f"{fn}_translate_label.txt"
    with open(filename_path, "w", encoding="utf-8") as f:
        f.write("\n".join(translate_labels))

    return fn, filename_path
|
|
|
|
|
# Gradio UI definition. The input components are listed in the same order as
# the parameters of get_labels; the two file outputs receive the two paths it
# returns (raw labels, merged labels).
interface = gr.Interface(
    fn=get_labels,
    inputs=[
        gr.File(
            label="Audio file",
            type="filepath",
            file_types=["audio"],
            file_count="single",
        ),
        gr.Number(value=40, label="min_speech_duration_ms", info="default (40)"),
        gr.Number(value=40, label="min_silence_duration_ms", info="default (40)"),
        gr.Checkbox(value=True, label="Auto merge"),
        # Free-text field; get_labels converts the entry with float().
        gr.Textbox(value=0.350, label="Gap max threshold value (seconds)"),
        gr.Number(value=7, label="Approx Max Segment Length"),
        gr.Number(value=0, label="Extend end by (seconds)"),
        gr.Checkbox(value=False, label="Extend small segments (minimum 1.02 seconds)"),
        gr.Checkbox(value=False, label="Show only duration in labels"),
    ],
    outputs=[
        gr.File(label="VAD Labels"),
        gr.File(label="Merged Labels File"),
    ],
)
|
|
|
if __name__ == "__main__":
    # Script entry point: enable request queuing, then start the Gradio server.
    queued_interface = interface.queue()
    queued_interface.launch()