import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline, AutoProcessor, BitsAndBytesConfig, WhisperForConditionalGeneration
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
from huggingface_hub import CommitScheduler
import spaces
import tempfile
import os
import time
import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4
from functools import lru_cache
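# Enable the hf_transfer download backend for faster model downloads (requires the hf_transfer package)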
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
MODEL_NAME = "dwb2023/whisper-large-v3-quantized"
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 4800 # 1 hour 20 minutes
# Load the model with bitsandbytes 4-bit quantization; accelerate handles device placement,
# so no explicit device index is passed to the pipeline (quantized models cannot be moved with .to()).
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
# The pipeline needs an explicit tokenizer and feature extractor when given a model object instead of a repo id
processor = AutoProcessor.from_pretrained(MODEL_NAME)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
)
# Define paths and create directory if not exists
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
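# Each run writes to its own uniquely named file, so concurrent replicas or restarts never append to the same file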
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"
# Initialize CommitScheduler for saving data to Hugging Face Dataset
scheduler = CommitScheduler(
    repo_id="transcript-dataset-repo",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)
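# CommitScheduler pushes the contents of folder_path to the dataset repo from a background thread
# (every few minutes by default); its lock is held in save_transcription to avoid committing half-written files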
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
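# Illustrative example (hypothetical URL): "https://www.youtube.com/watch?v=abc123" yields an iframe
# pointing at https://www.youtube.com/embed/abc123; note the naive split keeps anything after "?v=".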
@spaces.GPU
@lru_cache(maxsize=10)
def transcribe_audio(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text
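# Fetch the video metadata first to enforce the duration limit, then download the best available audio stream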
def download_yt_audio(yt_url, filename):
    info_loader = youtube_dl.YoutubeDL()
    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err))
    file_length = info["duration"]
    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(f"Maximum YouTube video length is {yt_length_limit_hms}, but this video is {file_length_hms} long.")
    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
@spaces.GPU
@lru_cache(maxsize=10)
def yt_transcribe(yt_url, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()
    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    save_transcription(yt_url, text)
    return text
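# Append each result as one JSON line; holding scheduler.lock keeps a background commit from
# uploading a partially written file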
def save_transcription(yt_url, transcription):
    with scheduler.lock:
        with JSON_DATASET_PATH.open("a") as f:
            json.dump({"url": yt_url, "transcription": transcription, "datetime": datetime.now().isoformat()}, f)
            f.write("\n")
@spaces.GPU
def yt_transcribe2(yt_url, task, max_filesize=75.0):
    html_embed_str = _return_yt_html_embed(yt_url)
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()
    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return html_embed_str, text
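# Gradio UI: two tabs, both taking a YouTube URL; the second tab additionally shows an embedded player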
demo = gr.Blocks()
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)
# Renamed so the interface does not shadow the yt_transcribe function used by the first tab
yt_transcribe_html_interface = gr.Interface(
    fn=yt_transcribe2,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface([yt_transcribe_interface, yt_transcribe_html_interface], ["YouTube", "YouTube HF"])

demo.queue().launch()