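"""Gradio demo for SpeechLine audio segmentation.

Transcribes an uploaded audio file with a Wav2Vec2 model, then splits it into
chunks either by silence gaps or by word overlap against a ground-truth text,
displaying each chunk's audio and word-level offsets.
"""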
import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Dataset, Audio

from speechline.transcribers import Wav2Vec2Transcriber
from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.utils.tokenizer import WordTokenizer

# Maximum number of (transcript, audio) output pairs shown in the UI.
max_textboxes = 5


def preprocess(audio_path, transcriber):
    # Wrap the single audio file in a Dataset and resample it to the
    # transcriber's expected sampling rate.
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
    return dataset


def transcribe(audio_path, transcriber):
    # Run inference and return word-level offsets for the utterance.
    dataset = preprocess(audio_path, transcriber)
    output_offsets = transcriber.predict(dataset, output_offsets=True)
    return output_offsets


def segmentation_interface(choice):
    # Toggle the extra input each segmentation method needs:
    # silence segmentation uses the duration slider,
    # word-overlap segmentation uses the ground-truth textbox.
    if choice == "silence":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "word_overlap":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)


def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
    output_dir = "./audio_chunks"
    transcriber = Wav2Vec2Transcriber(model)
    output_offsets = transcribe(audio_path, transcriber)

    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()

    tokenizer = WordTokenizer()
    # Clear leftovers from a previous run before chunking.
    if os.path.exists(f"{output_dir}/tmp"):
        shutil.rmtree(f"{output_dir}/tmp")

    segmenter.chunk_audio_segments(
        audio_path,
        output_dir,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )
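    # chunk_audio_segments() is expected to write each chunk as a .wav file plus
    # a matching .tsv of (start_offset, end_offset, text) rows under
    # f"{output_dir}/tmp"; the loop below reads that layout back.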
    outputs = []
    idx = 0
    # Collect each chunk's transcript table and audio clip, in filename order.
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            gt = pd.read_csv(path, sep="\t", names=["start_offset", "end_offset", "text"])
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
            idx += 1
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))

    # Hide the component pairs that were not filled by this run.
    for _ in range(max_textboxes - idx):
        outputs.append(gr.Dataframe.update(visible=False))
        outputs.append(gr.Audio.update(visible=False))

    # Reveal the output column.
    outputs.append(gr.Column.update(visible=True))
    return outputs
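

# NOTE: process() returns one update per output component, in the same order
# they are registered below: max_textboxes (Dataframe, Audio) pairs, followed
# by the enclosing output column.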
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            radio = gr.Radio(["silence", "word_overlap"], label="Select Segmentation Method")
            model = gr.Dropdown(
                ["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"],
                value="facebook/wav2vec2-base-960h",
                label="Select Model",
            )
            slider = gr.Slider(0, 100, value=3, step=0.1, label="Silence Duration", visible=False)
            gt = gr.Textbox(label="Ground Truth", placeholder="Enter Ground Truth Text", interactive=True, visible=False)
            radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
            inputs = [audio, model, radio, slider, gt]
            transcribe_btn = gr.Button("Transcribe")
        with gr.Column(visible=False) as output_col:
            # Hidden (Dataframe, Audio) pairs, revealed per chunk by process().
            outputs = []
            for _ in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
            outputs.append(output_col)

    transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)

demo.queue().launch()
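
# To try it locally: `python app.py`, then open the printed local URL
# (Gradio serves on http://127.0.0.1:7860 by default).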