Spaces:
Runtime error
Runtime error
File size: 3,754 Bytes
851ae91 2503b95 746f081 851ae91 b263b21 851ae91 0fec7fe 851ae91 ff878ab 4b01587 ff878ab 4b01587 851ae91 ff878ab 851ae91 38d85c1 851ae91 38d85c1 851ae91 38d85c1 851ae91 38d85c1 851ae91 4b01587 851ae91 38d85c1 851ae91 6e91234 38d85c1 851ae91 e0361cf 851ae91 9c2d9a5 0fec7fe 9c2d9a5 241f12e c993b6c ff9897a 02215da c993b6c 851ae91 38d85c1 851ae91 e0361cf 851ae91 9c2d9a5 241f12e 38d85c1 02215da 38d85c1 851ae91 9c51c8a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import html
import os
import tempfile
from urllib.parse import parse_qs, urlparse

import gradio as gr
import pytube as pt
import torch
from speechbox import ASRDiarizationPipeline
# Whisper checkpoint used for the speech-recognition half of the pipeline.
MODEL_NAME = "openai/whisper-small"

# Use CUDA device index 0 when a GPU is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Hub access token taken from the environment; None when the variable is unset.
# NOTE(review): presumably required to fetch the diarization model - confirm.
HF_TOKEN = os.getenv("HF_TOKEN")

# Combined ASR + speaker-diarization pipeline shared by every request handler.
pipe = ASRDiarizationPipeline.from_pretrained(
    use_auth_token=HF_TOKEN,
    device=device,
    asr_model=MODEL_NAME,
)
def tuple_to_string(start_end_tuple, ndigits=1):
    """Render a ``(start, end)`` timestamp pair as a rounded tuple string.

    Args:
        start_end_tuple: Indexable whose first two items are the start and
            end times. Either bound may be ``None``.
        ndigits: Number of decimal places to round each bound to.

    Returns:
        The ``str()`` of the two-element tuple of rounded bounds, e.g.
        ``"(0.1, 4.6)"``. A ``None`` bound is kept as ``None``.
    """
    bounds = (start_end_tuple[0], start_end_tuple[1])
    # round(None, ...) raises TypeError, so pass missing bounds through
    # untouched instead of failing the whole transcription.
    rounded = tuple(
        bound if bound is None else round(bound, ndigits) for bound in bounds
    )
    return str(rounded)
def format_as_transcription(raw_segments, with_timestamps=False):
    """Join diarized segments into one transcript string.

    Each segment is a mapping with ``"speaker"`` and ``"text"`` keys, plus a
    ``"timestamp"`` key that is only read when ``with_timestamps`` is true.
    Segments are separated by a blank line.

    Args:
        raw_segments: Iterable of segment mappings.
        with_timestamps: When true, insert the rounded timestamp tuple
            between the speaker label and the text.

    Returns:
        The formatted transcript; an empty string for no segments.
    """
    paragraphs = []
    for segment in raw_segments:
        label = segment["speaker"]
        if with_timestamps:
            label = label + " " + tuple_to_string(segment["timestamp"])
        # No separator is added before the text; it is appended as-is.
        paragraphs.append(label + segment["text"])
    return "\n\n".join(paragraphs)
def transcribe(file_upload, with_timestamps):
    """Transcribe and diarize an uploaded audio file.

    Args:
        file_upload: Path of the uploaded audio file, or ``None``/empty when
            the form was submitted without a file.
        with_timestamps: Whether to include segment timestamps in the output.

    Returns:
        The formatted transcript, or an error message when no file was given.
    """
    # Without this guard a missing upload is handed straight to the pipeline,
    # which fails with an opaque error instead of telling the user what to do.
    if not file_upload:
        return "ERROR: You have to upload an audio file"

    raw_segments = pipe(file_upload)
    return format_as_transcription(raw_segments, with_timestamps=with_timestamps)
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def yt_transcribe(yt_url, with_timestamps):
    """Download a YouTube video's audio, then transcribe and diarize it.

    Args:
        yt_url: URL of the YouTube video.
        with_timestamps: Whether to include segment timestamps in the output.

    Returns:
        A ``(html_embed, transcript)`` pair: the iframe HTML for the video and
        the formatted transcript.
    """
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    # First audio-only stream, as before; indexing raises if there is none.
    stream = yt.streams.filter(only_audio=True)[0]

    # Download into a per-call temporary directory rather than a fixed
    # "audio.mp3" in the working directory: overlapping requests no longer
    # share one file, and the audio is removed once it has been processed.
    # NOTE(review): the stream container may not really be MP3 - the original
    # filename is kept unchanged; confirm the decoder sniffs the content.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        raw_segments = pipe(audio_path)

    return html_embed_str, format_as_transcription(raw_segments, with_timestamps=with_timestamps)
# Top-level Blocks container; the tabbed interfaces are mounted into it and
# it is launched at the end of the script.
demo = gr.Blocks()
# Interface for the "Transcribe Audio" tab: upload a file, get a diarized
# transcript back as plain text.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        # Use the top-level component (like the Checkbox below) instead of the
        # deprecated `gr.inputs` namespace; the arguments are unchanged.
        gr.Audio(source="upload", type="filepath"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe Audio",
    description=(
        "Transcribe audio files with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        "Demo uses the pre-trained checkpoint [Whisper Small](https://huggingface.co/openai/whisper-small) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    # The same sample file, once with and once without timestamps.
    examples=[
        ["./processed.wav", True],
        ["./processed.wav", False],
    ],
    allow_flagging="never",
)
# Interface for the "Transcribe YouTube" tab: paste a URL, get the embedded
# video plus a diarized transcript.
# NOTE: this rebinds the module-level name `yt_transcribe` from the handler
# function to the Interface. `fn=yt_transcribe` is evaluated before the
# assignment, so the Interface still captures the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        # Use the top-level component (like the Checkbox below) instead of the
        # deprecated `gr.inputs` namespace; the arguments are unchanged.
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe YouTube",
    description=(
        "Transcribe YouTube videos with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        # The description previously named Whisper Tiny, but both tabs share
        # the single pipeline built from the whisper-small checkpoint.
        "Demo uses the pre-trained checkpoint [Whisper Small](https://huggingface.co/openai/whisper-small) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    # The same sample video, once with and once without timestamps.
    examples=[
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", True],
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", False],
    ],
    allow_flagging="never",
)
# Mount both interfaces as tabs inside the shared Blocks container.
with demo:
    gr.TabbedInterface(
        interface_list=[mf_transcribe, yt_transcribe],
        tab_names=["Transcribe Audio", "Transcribe YouTube"],
    )

# Start the app with request queuing enabled.
demo.launch(enable_queue=True)
|