import os
import shutil
from pathlib import Path

import gradio as gr
import pandas as pd
from datasets import Audio, Dataset
from speechline.segmenters import SilenceSegmenter, WordOverlapSegmenter
from speechline.transcribers import Wav2Vec2Transcriber
from speechline.utils.tokenizer import WordTokenizer

MAX_SEGMENTS = 100  # maximum number of (transcript, audio) output slot pairs in the UI
OUTPUT_DIR = "tmp"  # temporary directory for segmented audio chunks and transcripts


def segmentation_interface(choice: str):
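    """Toggle which extra input is visible for the chosen segmentation method.

    "Silence Gap" shows the silence-duration slider; "Word Overlap" shows the
    ground-truth textbox instead.
    """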
    if choice == "Silence Gap":
        return gr.update(visible=True), gr.update(visible=False)
    elif choice == "Word Overlap":
        return gr.update(visible=False), gr.update(visible=True)


def run(audio_path, model, segmentation_type, silence_duration, ground_truth):
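    """Transcribe an audio file, segment it, and return Gradio output updates.

    Returns a flat list of component updates: a (transcript table, audio player)
    pair for each produced segment, followed by hidden updates for the unused
    output slots.
    """
    # load the transcriber and wrap the single input file in a Dataset,
    # resampled to the model's expected sampling rate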
    transcriber = Wav2Vec2Transcriber(model)
    dataset = Dataset.from_dict({"audio": [audio_path]})
    dataset = dataset.cast_column(
        "audio", Audio(sampling_rate=transcriber.sampling_rate)
    )
    output_offsets = transcriber.predict(dataset, output_offsets=True)

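    # choose the segmentation strategy selected in the UI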
    if segmentation_type == "Silence Gap":
        segmenter = SilenceSegmenter()
    elif segmentation_type == "Word Overlap":
        segmenter = WordOverlapSegmenter()

    tokenizer = WordTokenizer()

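    # clear outputs from any previous run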
    if os.path.exists(OUTPUT_DIR):
        shutil.rmtree(OUTPUT_DIR)

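    # split the audio into chunks and write each chunk's .wav and .tsv
    # (word offsets) under OUTPUT_DIR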
    segmenter.chunk_audio_segments(
        audio_path,
        OUTPUT_DIR,
        output_offsets[0],
        minimum_chunk_duration=0,
        silence_duration=silence_duration,
        ground_truth=tokenizer(ground_truth),
    )

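    # build a (transcript table, audio player) update pair for every segment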
    outputs, idx = [], 0

    for path in sorted(Path(OUTPUT_DIR).rglob("*")):
        if path.suffix == ".tsv":
            transcript = pd.read_csv(
                path, sep="\t", names=["start_offset", "end_offset", "text"]
            )
            outputs.append(gr.Dataframe.update(value=transcript, visible=True))
        elif path.suffix == ".wav":
            outputs.append(gr.Audio.update(value=str(path), visible=True))
            idx += 1

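    # hide the remaining, unused output slots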
    for _ in range(MAX_SEGMENTS - idx):
        outputs += [gr.Dataframe.update(visible=False), gr.Audio.update(visible=False)]
    return outputs


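# UI: input controls in the left column, per-segment outputs in the right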
with gr.Blocks() as demo:
    gr.Markdown(
        f"""
        <center>

        # 🎙️ SpeechLine Demo 
        [Repository](https://github.com/bookbot-kids/speechline) | [Documentation](https://bookbot-kids.github.io/speechline/)

        </center>
        """
    )

    with gr.Row():
        with gr.Column():
            audio = gr.Audio(type="filepath")
            model = gr.Dropdown(
                choices=[
                    "facebook/wav2vec2-base-960h",
                ],
                value="facebook/wav2vec2-base-960h",
                label="Transcriber Model",
            )
            segmenter = gr.Radio(
                choices=["Silence Gap", "Word Overlap"],
                value="Silence Gap",
                label="Segmentation Method",
            )
            sil = gr.Slider(
                0, 1, value=0.1, step=0.1, label="Silence Duration (s)", visible=True
            )
            gt = gr.Textbox(
                label="Ground Truth",
                placeholder="Enter Ground Truth Text",
                interactive=True,
                visible=False,
            )

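            # swap the visible extra input when the segmentation method changes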
            segmenter.change(
                fn=segmentation_interface, inputs=segmenter, outputs=[sil, gt]
            )

            inputs = [audio, model, segmenter, sil, gt]
            transcribe_btn = gr.Button("Transcribe")

        with gr.Column():
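            # pre-allocate MAX_SEGMENTS (transcript, audio) output pairs;
            # run() fills and shows only as many as there are segments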
            outputs = [
                gr.Dataframe(
                    visible=True, headers=["start_offset", "end_offset", "text"]
                ),
                gr.Audio(visible=True),
            ]
            for _ in range(MAX_SEGMENTS - 1):
                outputs += [gr.Dataframe(visible=False), gr.Audio(visible=False)]
            transcribe_btn.click(fn=run, inputs=inputs, outputs=outputs)

demo.launch()