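"""Gradio demo for SpeechLine: transcribe an uploaded audio file with a wav2vec2
model and split it into chunks using silence- or word-overlap-based segmentation."""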
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.utils.tokenizer import WordTokenizer
from datasets import Dataset, Audio
from pathlib import Path

import os
import shutil

import gradio as gr
import pandas as pd

max_textboxes = 5  # maximum number of (dataframe, audio) chunk outputs rendered in the UI

def preprocess(audio_path, transcriber):
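    """Wrap a single audio path in a `datasets.Dataset` resampled to the transcriber's sampling rate."""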
dataset = Dataset.from_dict({"audio": [audio_path]})
dataset = dataset.cast_column("audio", Audio(sampling_rate=transcriber.sampling_rate))
return dataset
def transcribe(audio_path, transcriber):
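    """Transcribe the audio and return the predictions together with their time offsets."""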
dataset = preprocess(audio_path, transcriber)
output_offsets = transcriber.predict(dataset, output_offsets=True)
return output_offsets
def segmentation_interface(choice):
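    """Show the silence-duration slider or the ground-truth textbox depending on the chosen segmentation method."""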
if choice == "silence":
return gr.update(visible=True), gr.update(visible=False)
elif choice == "word_overlap":
return gr.update(visible=False), gr.update(visible=True)
else:
return gr.update(visible=False), gr.update(visible=False)
def process(audio_path, model, segmentation_type, silence_duration, ground_truth):
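    """Transcribe and segment the audio, then return visibility/value updates for the paired dataframe and audio outputs."""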
output_dir = "./audio_chunks"
transcriber = Wav2Vec2Transcriber(model)
output_offsets = transcribe(audio_path, transcriber)
    if segmentation_type == "silence":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "word_overlap":
        segmenter = WordOverlapSegmenter()
    else:
        raise ValueError(f"Unsupported segmentation type: {segmentation_type}")
tokenizer = WordTokenizer()
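    # remove chunks left over from a previous run before writing new ones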
if os.path.exists(f"{output_dir}/tmp"):
shutil.rmtree(f"{output_dir}/tmp")
segmenter.chunk_audio_segments(
audio_path,
output_dir,
output_offsets[0],
minimum_chunk_duration=0,
silence_duration=silence_duration,
ground_truth=tokenizer(ground_truth),
)
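    # pair each generated .tsv offset table with its corresponding .wav chunk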
outputs = []
idx = 0
    for path in sorted(Path(f"{output_dir}/tmp").iterdir()):
        if path.suffix == ".tsv":
            gt = pd.read_csv(path, sep="\t", names=["start_offset", "end_offset", "text"])
            outputs.append(gr.Dataframe.update(value=gt, visible=True))
            idx += 1
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))
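    # hide any output slots that were not filled by this run, then reveal the output column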
for i in range(max_textboxes-idx):
outputs.append(gr.Dataframe.update(visible=False))
outputs.append(gr.Audio.update(visible=False))
outputs.append(gr.Column.update(visible=True))
return outputs
with gr.Blocks() as demo:
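    # layout: input controls on the left, dynamically revealed chunk outputs on the right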
with gr.Row():
with gr.Column():
audio = gr.Audio(type="filepath")
            radio = gr.Radio(["silence", "word_overlap"], label="Select Segmentation Method")
model = gr.Dropdown(["facebook/wav2vec2-base-960h", "bookbot/wav2vec-en", "bookbot/wav2vec-id"], value="facebook/wav2vec2-base-960h", label="Select Model")
            slider = gr.Slider(0, 100, value=3, step=0.1, label="Silence Duration", visible=False)
gt = gr.Textbox(label="Ground Truth", placeholder="Enter Ground Truth Text", interactive=True, visible=False)
radio.change(fn=segmentation_interface, inputs=radio, outputs=[slider, gt])
inputs = [audio, model, radio, slider, gt]
transcribe_btn = gr.Button("Transcribe")
with gr.Column(visible=False) as output_col:
            outputs = []
            # build max_textboxes hidden (dataframe, audio) output pairs up front;
            # process() reveals one pair per generated chunk and leaves the rest hidden
            for i in range(max_textboxes):
                outputs.append(gr.Dataframe(visible=False))
                outputs.append(gr.Audio(visible=False))
outputs.append(output_col)
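    # run the full pipeline on click; results fan out to the paired dataframe/audio slots and the output column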
transcribe_btn.click(fn=process, inputs=inputs, outputs=outputs)
demo.queue().launch()