"""Gradio demo: transcribe an audio file with a wav2vec 2.0 model, then split it
into chunks using either silence-based or word-overlap segmentation."""

import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset

from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer

max_textboxes = 5


def preprocess(audio_path, transcriber):
    # Wrap the single audio file in a dataset and resample it to the
    # transcriber's expected sampling rate.
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
    return dataset


def transcribe(audio_path, transcriber):
    dataset = preprocess(audio_path, transcriber)
    # Predict transcripts together with word-level time offsets.
    output_offsets = transcriber.predict(dataset, output_offsets=True)
    return output_offsets


def segmentation_interface(choice):
    # Toggle visibility of the silence-duration slider and the ground-truth
    # textbox depending on the selected segmentation method.
    if choice == "silence":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "word_overlap":
        return gr.update(visible=False), gr.update(visible=True)
    else:
        return gr.update(visible=False), gr.update(visible=False)


def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
    output_dir = "./audio_chunks"
    transcriber = Wav2Vec2Transcriber(model)
    output_offsets = transcribe(audio_path, transcriber)

    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()

    tokenizer = WordTokenizer()

    # Clear any chunks left over from a previous run.
    if os.path.exists(f"{output_dir}/tmp"):
        shutil.rmtree(f"{output_dir}/tmp")

    segmenter.chunk_audio_segments(
        audio_path,
        output_dir,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )

    outputs = []
    idx = 0
    # Each chunk produces a .tsv (offsets + text) and a .wav file; show them
    # in the corresponding Dataframe/Audio output slots.
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            gt = pd.read_csv(path, sep="\t", names=["start_offset", "end_offset", "text"])
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
            idx += 1
        if path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))

    # Hide the remaining unused output slots.
    for _ in range(max_textboxes - idx):
        outputs.append(gr.Dataframe.update(visible=False))
        outputs.append(gr.Audio.update(visible=False))

    outputs.append(gr.Column.update(visible=True))
    return outputs


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            radio = gr.Radio(
                ["silence", "word_overlap"],
                label="Select Segmentation Method",
                required=True,
            )
            model = gr.Dropdown(
                ["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"],
                value="facebook/wav2vec2-base-960h",
                label="Select Model",
            )
            slider = gr.Slider(0, 100, value=3, step=0.1, visible=False)
            gt = gr.Textbox(
                label="Ground Truth",
                placeholder="Enter Ground Truth Text",
                interactive=True,
                visible=False,
            )
            radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
            inputs = [audio, model, radio, slider, gt]
            transcribe_btn = gr.Button("Transcribe")

        with gr.Column(visible=False) as output_col:
            # One (Dataframe, Audio) pair per possible chunk, hidden until filled.
            outputs = []
            for _ in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
            outputs.append(output_col)

    transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)

demo.queue().launch()