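# Gradio demo: speech transcription with speaker diarization, combining
# Whisper ASR and pyannote speaker labels via 🤗 Speechbox.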
import os

import torch
import gradio as gr
import pytube as pt
from speechbox import ASRDiarizationPipeline

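# Whisper checkpoint used for the ASR half of the pipeline.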
MODEL_NAME = "openai/whisper-small"

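# Use the first GPU if available, otherwise fall back to CPU.
# HF_TOKEN is needed to download the gated pyannote diarization model.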
device = 0 if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.environ.get("HF_TOKEN")

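# Chained pipeline: Whisper transcribes, pyannote assigns a speaker label to each segment.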
pipe = ASRDiarizationPipeline.from_pretrained(
    asr_model=MODEL_NAME,
    device=device,
    use_auth_token=HF_TOKEN,
)

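# Render a (start, end) timestamp tuple as a rounded string, e.g. "(0.0, 5.2)".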
def tuple_to_string(start_end_tuple, ndigits=1):
    return str((round(start_end_tuple[0], ndigits), round(start_end_tuple[1], ndigits)))


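# Join diarized chunks into plain text, one "SPEAKER_XX <text>" block per segment.
# (Whisper chunk text usually carries its own leading space, so no extra separator is added.)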
def format_as_transcription(raw_segments, with_timestamps=False):
    if with_timestamps:
        return "\n\n".join(
            chunk["speaker"] + " " + tuple_to_string(chunk["timestamp"]) + chunk["text"]
            for chunk in raw_segments
        )
    return "\n\n".join(chunk["speaker"] + chunk["text"] for chunk in raw_segments)


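# Callback for the audio-upload tab: run the pipeline on the uploaded file.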
def transcribe(file_upload, with_timestamps):
    raw_segments = pipe(file_upload)
    transcription = format_as_transcription(raw_segments, with_timestamps=with_timestamps)
    return transcription


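# Build an HTML <iframe> embed for the video; assumes a standard "watch?v=<id>" URL.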
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


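# Callback for the YouTube tab: grab the audio-only stream with pytube,
# save it locally, then transcribe and diarize it.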
def yt_transcribe(yt_url, with_timestamps):
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")

    text = pipe("audio.mp3")

    return html_embed_str, format_as_transcription(text, with_timestamps=with_timestamps)


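# Two gr.Interface front-ends, combined into one tabbed Blocks app below.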
demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe Audio",
    description=(
        "Transcribe audio files with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        "Demo uses the pre-trained checkpoint [Whisper Small](https://huggingface.co/openai/whisper-small) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    examples=[
        ["./processed.wav", True],
        ["./processed.wav", False],
    ],
    allow_flagging="never",
)

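# Note: this assignment shadows the yt_transcribe function defined above;
# that is safe here because fn=yt_transcribe is evaluated before the name is rebound.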
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe YouTube",
    description=(
        "Transcribe YouTube videos with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        "Demo uses the pre-trained checkpoint [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    examples=[
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", True],
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", False],
    ],
    allow_flagging="never",
)

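# Serve both interfaces as tabs of the single Blocks demo.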
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True)