Spaces:
Runtime error
Runtime error
File size: 3,932 Bytes
851ae91 2503b95 746f081 851ae91 b263b21 99975a5 851ae91 bf72a19 851ae91 ff878ab 4b01587 99975a5 ff878ab 4b01587 851ae91 659a76f 851ae91 38d85c1 851ae91 38d85c1 851ae91 38d85c1 2a98152 38d85c1 851ae91 38d85c1 851ae91 4b01587 851ae91 38d85c1 851ae91 6e91234 38d85c1 851ae91 e0361cf 851ae91 9c2d9a5 bf72a19 9c2d9a5 241f12e bf72a19 851ae91 38d85c1 851ae91 e0361cf 851ae91 9c2d9a5 86aed81 9c2d9a5 241f12e 270ca3e 851ae91 9c51c8a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import html
import os
import tempfile
from urllib.parse import parse_qs, urlparse

import gradio as gr
import pytube as pt
import torch
from huggingface_hub import login
from speechbox import ASRDiarizationPipeline
# Hugging Face access token taken from the environment (None when the variable is unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Whisper checkpoint handed to the pipeline as its ASR model.
MODEL_NAME = "openai/whisper-small"

# Device index 0 when torch reports CUDA as available, otherwise the CPU.
device = "cpu" if not torch.cuda.is_available() else 0

# Single pipeline instance, created once at import time and reused by the request handlers.
pipe = ASRDiarizationPipeline.from_pretrained(
    asr_model=MODEL_NAME,
    use_auth_token=HF_TOKEN,
    device=device,
)
def tuple_to_string(start_end_tuple, ndigits=1):
    """Render a ``(start, end)`` timestamp pair as a rounded tuple string.

    Args:
        start_end_tuple: Sequence whose first two items are the start and end
            values; any further items are ignored.
        ndigits: Number of decimal places passed to ``round``.

    Returns:
        The ``str()`` of the rounded 2-tuple, e.g. ``"(1.2, 5.7)"``. A bound
        that is ``None`` is passed through unrounded (defensive: ``round(None)``
        would otherwise raise ``TypeError``).
    """
    start, end = (
        bound if bound is None else round(bound, ndigits)
        for bound in (start_end_tuple[0], start_end_tuple[1])
    )
    return str((start, end))
def format_as_transcription(raw_segments, with_timestamps=False):
    """Join diarized segments into one human-readable transcript.

    Args:
        raw_segments: Iterable of mappings with ``"speaker"`` and ``"text"``
            keys, plus a ``"timestamp"`` key when ``with_timestamps`` is true.
        with_timestamps: When true, each segment's ``"timestamp"`` is rendered
            with ``tuple_to_string`` and placed between the speaker and the text.

    Returns:
        The segments rendered one per paragraph, separated by a blank line;
        an empty string when there are no segments.
    """

    def render(chunk):
        # The text is appended directly after the prefix, exactly as before:
        # no separator is inserted between the prefix and ``chunk["text"]``.
        if with_timestamps:
            return f'{chunk["speaker"]} {tuple_to_string(chunk["timestamp"])}{chunk["text"]}'
        return f'{chunk["speaker"]}{chunk["text"]}'

    return "\n\n".join(render(chunk) for chunk in raw_segments)
def transcribe(file_upload, with_timestamps):
    """Transcribe an uploaded audio file and label who is speaking.

    Args:
        file_upload: Value delivered by the audio input component; ``None``
            when the user submitted without providing a file.
        with_timestamps: Whether each segment should carry its timestamp.

    Returns:
        The formatted transcription produced by ``format_as_transcription``.

    Raises:
        gr.Error: If no audio file was submitted.
    """
    if file_upload is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")

    raw_segments = pipe(file_upload)
    return format_as_transcription(raw_segments, with_timestamps=with_timestamps)
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def yt_transcribe(yt_url, with_timestamps):
    """Download a YouTube video's audio, transcribe it and label the speakers.

    Args:
        yt_url: URL of the YouTube video, as entered by the user.
        with_timestamps: Whether each segment should carry its timestamp.

    Returns:
        A ``(html_embed, transcription)`` tuple: the iframe snippet from
        ``_return_yt_html_embed`` and the text from ``format_as_transcription``.

    Raises:
        gr.Error: If no URL was submitted or the video exposes no audio-only
            stream.
    """
    if not yt_url or not yt_url.strip():
        raise gr.Error("No YouTube URL submitted! Please paste a YouTube URL before submitting your request.")
    yt_url = yt_url.strip()

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)

    # `.first()` yields None for an empty result instead of raising IndexError.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise gr.Error("No audio-only stream is available for this YouTube video.")

    # Download into a per-request temporary directory: a fixed file in the
    # working directory would be shared by every request and never cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        raw_segments = pipe(audio_path)

    return html_embed_str, format_as_transcription(raw_segments, with_timestamps=with_timestamps)
# Top-level container that the tabbed UI is rendered into.
demo = gr.Blocks()

# "Transcribe Audio" tab: an uploaded file goes to `transcribe`, the result is shown as text.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        # Top-level component class (same namespace as gr.Checkbox) instead of the
        # legacy `gr.inputs` one. The source selector is left at the component's
        # default because its keyword differs between Gradio major versions.
        gr.Audio(type="filepath"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs="text",
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe Audio",
    description=(
        "Transcribe audio files with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        "Demo uses the pre-trained checkpoint [Whisper Small](https://huggingface.co/openai/whisper-small) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    # examples=[
    #     ["./processed.wav", True],
    #     ["./processed.wav", False],
    # ],
    allow_flagging="never",
)
# "Transcribe YouTube" tab: a URL goes to the `yt_transcribe` handler, which returns
# the video embed and the transcript.
# NOTE: this assignment rebinds the module-level name `yt_transcribe` from the handler
# function to the Interface; `fn=` is evaluated first, so the function is what gets wired in.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        # Top-level component class (same namespace as gr.Checkbox) instead of the
        # legacy `gr.inputs` one.
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Checkbox(label="With timestamps?", value=True),
    ],
    outputs=["html", "text"],
    theme="huggingface",
    title="Whisper Speaker Diarization: Transcribe YouTube",
    description=(
        "Transcribe YouTube videos with speaker diarization using [🤗 Speechbox](https://github.com/huggingface/speechbox/). "
        # Both tabs share one pipeline, which is loaded from openai/whisper-small.
        "Demo uses the pre-trained checkpoint [Whisper Small](https://huggingface.co/openai/whisper-small) for the ASR "
        "transcriptions and [pyannote.audio](https://huggingface.co/pyannote/speaker-diarization) to label the speakers."
        "\n\n"
        "Check out the repo here: https://github.com/huggingface/speechbox/"
    ),
    examples=[
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", True],
        ["https://www.youtube.com/watch?v=9dAWIPixYxc", False],
    ],
    allow_flagging="never",
)
# Render both interfaces as tabs inside the top-level Blocks container.
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

# Enable request queuing through `.queue()`; the `enable_queue` keyword of
# `launch()` is deprecated.
demo.queue().launch()
|