from pathlib import Path
import os
import shutil

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer

# Maximum number of (transcript, audio) chunk pairs shown in the output column.
max_textboxes = 5


def preprocess(audio_path, transcriber):
    """Wrap a single audio file in a `datasets.Dataset`, resampled to the transcriber's rate."""
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column(
        "audio", Audio(sampling_rate=transcriber.sampling_rate)
    )
    return dataset


def transcribe(audio_path, transcriber):
    """Transcribe the audio file and return word-level output offsets."""
    dataset = preprocess(audio_path, transcriber)
    output_offsets = transcriber.predict(dataset, output_offsets=True)
    return output_offsets


def segmentation_interface(choice):
    """Toggle visibility of the silence-duration slider and the ground-truth textbox."""
    if choice == "silence":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "word_overlap":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)


def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
    """Transcribe and segment the audio, then build the Gradio component updates."""
    output_dir = "./audio_chunks"
    transcriber = Wav2Vec2Transcriber(model)
    output_offsets = transcribe(audio_path, transcriber)

    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()
    else:
        raise ValueError(f"Unknown segmentation method: {segmentation_type}")

    tokenizer = WordTokenizer()

    # Clear chunks left over from a previous run before re-segmenting.
    if os.path.exists(f"{output_dir}/tmp"):
        shutil.rmtree(f"{output_dir}/tmp")

    segmenter.chunk_audio_segments(
        audio_path,
        output_dir,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )

    outputs = []
    idx = 0
    # Each chunk is exported as a .tsv (offsets + text) and a .wav (audio) pair.
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            chunk_df = pd.read_csv(
                path, sep="\t", names=["start_offset", "end_offset", "text"]
            )
            outputs.append(gr.Dataframe.update(value=chunk_df, visible=True))
            idx += 1
        if path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))

    # Keep the remaining placeholder components hidden.
    for _ in range(max_textboxes - idx):
        outputs.append(gr.Dataframe.update(visible=False))
        outputs.append(gr.Audio.update(visible=False))

    # Reveal the output column.
    outputs.append(gr.Column.update(visible=True))
    return outputs


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            radio = gr.Radio(
                ["silence", "word_overlap"], label="Select Segmentation Method"
            )
            model = gr.Dropdown(
                [
                    "facebook/wav2vec2-base-960h",
                    "bookbot/wav2vec-en",
                    "bookbot/wav2vec-id",
                ],
                value="facebook/wav2vec2-base-960h",
                label="Select Model",
            )
            slider = gr.Slider(
                0, 100, value=3, step=0.1, label="Silence Duration", visible=False
            )
            gt = gr.Textbox(
                label="Ground Truth",
                placeholder="Enter Ground Truth Text",
                interactive=True,
                visible=False,
            )
            radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
            inputs = [audio, model, radio, slider, gt]
            transcribe_btn = gr.Button("Transcribe")

        # Hidden placeholder components, revealed and filled by `process`.
        with gr.Column(visible=False) as output_col:
            outputs = []
            for _ in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
    outputs.append(output_col)

    transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)

demo.queue().launch()