File size: 5,678 Bytes
6c226f9
8e787d3
2d4d7e5
d790c0b
a414c37
7bbd83c
554c0b5
d790c0b
88183ad
7bbd83c
 
 
 
592b794
6c226f9
a11fbef
 
6b09585
9d6fa91
554c0b5
6c226f9
 
bf63579
35c87e9
 
755f3a2
6b09585
94e9e1d
f4720e3
35c87e9
 
6c226f9
554c0b5
8dba9f0
7bbd83c
 
6c226f9
554c0b5
7bbd83c
520f263
7bbd83c
 
 
 
6c226f9
06d68be
 
 
 
 
 
 
 
6b09585
592b794
7bbd83c
3c0cd8e
 
 
7bbd83c
6c226f9
d790c0b
 
 
 
 
 
554c0b5
 
 
 
d790c0b
7bbd83c
d790c0b
7bbd83c
66efbc3
6b09585
592b794
7bbd83c
d790c0b
 
 
 
 
b97a3c2
 
3c0cd8e
7bbd83c
 
6c226f9
7bbd83c
 
 
 
 
6c226f9
7505a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c226f9
 
7bbd83c
6c226f9
7097513
3ce82e9
 
7097513
7bbd83c
a5bfe25
6c226f9
b95b5ca
 
6c226f9
 
 
 
 
7505a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c226f9
7505a12
6c226f9
ab14d7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import html
import json
import os
import tempfile
import time
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from uuid import uuid4

import gradio as gr
import spaces
import torch
import yt_dlp as youtube_dl
from huggingface_hub import CommitScheduler
from transformers import pipeline, BitsAndBytesConfig, WhisperForConditionalGeneration
from transformers.pipelines.audio_utils import ffmpeg_read

# NOTE(review): huggingface_hub appears to read this flag when it is first
# imported; since the imports at the top of the file run before this line, the
# assignment may come too late to take effect -- confirm, and move it above
# the third-party imports if so.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

MODEL_NAME = "dwb2023/whisper-large-v3-quantized"
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes

device = 0 if torch.cuda.is_available() else "cpu"

# 4-bit bitsandbytes quantization settings used when loading the checkpoint.
bnb_config = BitsAndBytesConfig(load_in_4bit=True)

# Quantization settings must be passed as `quantization_config`; the `config`
# argument is reserved for the model's own configuration object/path.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, quantization_config=bnb_config)

# A model that accelerate has already placed on a device (exposed through
# `hf_device_map`) must not also be given an explicit `device`; only forward
# `device` when no such placement exists.
_pipe_kwargs = {}
if getattr(model, "hf_device_map", None) is None:
    _pipe_kwargs["device"] = device

# `pipeline` can only infer the tokenizer / feature extractor when `model` is
# a string, so both are named explicitly because a model object is passed.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=MODEL_NAME,
    feature_extractor=MODEL_NAME,
    chunk_length_s=30,
    **_pipe_kwargs,
)

# Local folder watched by the scheduler; each process writes to its own
# uniquely named JSON-lines file inside it.
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"

# Initialize CommitScheduler for saving data to Hugging Face Dataset
scheduler = CommitScheduler(
    repo_id="transcript-dataset-repo",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)

def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

@spaces.GPU
@lru_cache(maxsize=10)
def transcribe_audio(inputs, task):
    """Run the speech-recognition pipeline on ``inputs`` and return its text.

    Results are memoized by ``lru_cache`` for up to 10 distinct
    ``(inputs, task)`` pairs, so both arguments must be hashable.

    Args:
        inputs: Audio input handed straight to ``pipe``; ``None`` is rejected.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` field of the pipeline output.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    generation_options = {"task": task}
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=generation_options,
        return_timestamps=True,
    )
    return result["text"]

def download_yt_audio(yt_url, filename):
    """Download the audio of a YouTube video to ``filename``.

    The video's metadata is fetched first (without downloading) so that
    videos longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected before any
    media is transferred.

    Args:
        yt_url: URL of the video to fetch.
        filename: Output path, passed to yt-dlp as its ``outtmpl``.

    Raises:
        gr.Error: If yt-dlp cannot extract the metadata, if the metadata
            carries no duration, or if the duration exceeds
            ``YT_LENGTH_LIMIT_S``.
    """
    # Context manager so the metadata-only instance is closed as well.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

    # Not every result carries a usable duration; fail with a readable
    # message instead of a KeyError/TypeError on the comparison below.
    file_length = info.get("duration")
    if file_length is None:
        raise gr.Error(
            "Could not determine the duration of this YouTube video, so it cannot be checked against the length limit."
        )

    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])

@spaces.GPU
@lru_cache(maxsize=10)
def yt_transcribe(yt_url, task):
    """Transcribe a YouTube video and record the result.

    The audio is downloaded into a temporary directory, read into memory,
    decoded with ``ffmpeg_read`` at the pipeline feature extractor's sampling
    rate, and passed through ``pipe``. The resulting text is handed to
    ``save_transcription`` before being returned. ``lru_cache`` memoizes up
    to 10 distinct ``(yt_url, task)`` pairs; a cache hit returns the stored
    text without re-running the body.

    Args:
        yt_url: URL of the video to transcribe.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, audio_path)
        # Read everything before the temporary directory is removed.
        with open(audio_path, "rb") as audio_file:
            raw_bytes = audio_file.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)

    result = pipe(
        {"array": waveform, "sampling_rate": sampling_rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = result["text"]

    save_transcription(yt_url, transcript)
    return transcript

def save_transcription(yt_url, transcription):
    """Append one transcription record to ``JSON_DATASET_PATH`` as a JSON line.

    The write happens while holding ``scheduler.lock``.

    Args:
        yt_url: URL stored under the ``"url"`` key.
        transcription: Text stored under the ``"transcription"`` key.
    """
    with scheduler.lock, JSON_DATASET_PATH.open("a") as sink:
        record = {
            "url": yt_url,
            "transcription": transcription,
            "datetime": datetime.now().isoformat(),
        }
        # One JSON object per line.
        sink.write(json.dumps(record))
        sink.write("\n")

@spaces.GPU
def yt_transcribe2(yt_url, task, max_filesize=75.0):
    """Transcribe a YouTube video and return an embed snippet with the text.

    The audio is downloaded into a temporary directory, read into memory,
    decoded with ``ffmpeg_read`` at the pipeline feature extractor's sampling
    rate, and passed through ``pipe``.

    Args:
        yt_url: URL of the video to transcribe.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.
        max_filesize: Accepted for interface compatibility; not referenced
            anywhere in the body.

    Returns:
        A ``(html_embed, text)`` tuple: the iframe HTML from
        ``_return_yt_html_embed`` and the ``"text"`` field of the pipeline
        output.
    """
    embed_html = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, audio_path)
        # Read everything before the temporary directory is removed.
        with open(audio_path, "rb") as audio_file:
            raw_bytes = audio_file.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)

    result = pipe(
        {"array": waveform, "sampling_rate": sampling_rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )

    return embed_html, result["text"]

demo = gr.Blocks()

# Tab 1: text-only output, backed by the `yt_transcribe` handler.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

# Tab 2: embedded player plus text, backed by `yt_transcribe2`. Bound to its
# own name so it does not rebind (shadow) the `yt_transcribe` handler function.
yt_transcribe_hf_interface = gr.Interface(
    fn=yt_transcribe2,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [yt_transcribe_interface, yt_transcribe_hf_interface],
        ["YouTube", "YouTube HF"],
    )

demo.queue().launch()