import os

import torch
import gradio as gr
import pytube as pt
from speechbox import ASRDiarizationPipeline

MODEL_NAME = "openai/whisper-small"

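# transformers-style device spec: GPU index 0 if CUDA is available, else CPU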
device = 0 if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.environ.get("HF_TOKEN")


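# ASRDiarizationPipeline chains Whisper (ASR) with pyannote.audio speaker
# diarization; the auth token is required for the gated pyannote models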
pipe = ASRDiarizationPipeline.from_pretrained(
    asr_model=MODEL_NAME,
    device=device,
    use_auth_token=HF_TOKEN,
)
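# As consumed below, calling pipe(<audio>) returns a list of diarized
# segments, each a dict with "speaker", "text" and "timestamp" (start, end)
# keys.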

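# e.g. tuple_to_string((13.472, 15.943)) -> "(13.5, 15.9)"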
def tuple_to_string(start_end_tuple, ndigits=1):
    return str((round(start_end_tuple[0], ndigits), round(start_end_tuple[1], ndigits)))


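# Joins segments into readable text, e.g. (with pyannote-style labels):
#   SPEAKER_01 (0.0, 15.9) Hello and welcome...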
def format_as_transcription(raw_segments, with_timestamps=False):
    if with_timestamps:
        return "\n\n".join(
            chunk["speaker"] + " " + tuple_to_string(chunk["timestamp"]) + chunk["text"]
            for chunk in raw_segments
        )
    return "\n\n".join(chunk["speaker"] + chunk["text"] for chunk in raw_segments)


def transcribe(file_upload, with_timestamps):
    if file_upload is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
    raw_segments = pipe(file_upload)
    transcription = format_as_transcription(raw_segments, with_timestamps=with_timestamps)
    return transcription


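# Assumes a standard "watch?v=" URL; short youtu.be links would need extra
# parsing.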
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str


def yt_transcribe(yt_url, with_timestamps):
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
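    # grab the first audio-only stream and save it locally for the pipeline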
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")

    raw_segments = pipe("audio.mp3")

    return html_embed_str, format_as_transcription(raw_segments, with_timestamps=with_timestamps)


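# Two tabs: local file upload and YouTube URL transcription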
demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe Audio",
    description=(
        "Transcribe audio files with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        "Demo uses the pre-trained checkpoint [Whisper Small](https://huggingface.co/openai/whisper-small) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    #examples=[
    #    ["./processed.wav", True],
    #    ["./processed.wav", False],
    #],
    allow_flagging="never",
)

yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe YouTube",
    description=(
        "Transcribe YouTube videos with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        "Demo uses the pre-trained checkpoint [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    examples=[
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", True],
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", False],
    ],
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe_interface], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True)