whisper-large-v3

Running

File size: 5,864 Bytes

6c226f9
 
5a985c6
6c226f9
d790c0b
 
 
88183ad
7bab834
6c226f9
a5bfe25
9d6fa91
66efbc3
d790c0b
6c226f9
 
 
 
 
 
 
 
 
 
3c0cd8e
 
 
6c226f9
3c0cd8e
7bab834
6c226f9
 
 
 
 
 
 
 
 
d790c0b
 
3c0cd8e
d790c0b
 
 
 
3c0cd8e
1b51c36
 
3c0cd8e
d790c0b
1b51c36
 
d790c0b
3c0cd8e
1b51c36
3c0cd8e
d790c0b
 
 
 
 
 
66efbc3
6c226f9
66efbc3
d790c0b
1b51c36
d790c0b
 
 
6c226f9
b97a3c2
 
0a7fcda
3c0cd8e
6c226f9
 
 
2fbf87f
 
 
 
 
 
0752331
2fbf87f
 
3c0cd8e
 
 
567ec12
7bab834
6c226f9
 
a5bfe25
2fbf87f
6c226f9
 
 
2fbf87f
 
 
 
 
 
 
 
 
6c226f9
 
7097513
7bab834
 
7097513
6c226f9
a5bfe25
2fbf87f
6c226f9
 
 
7bab834
567ec12
6c226f9
2fbf87f

import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

import tempfile
import os
import time

# Checkpoint identifier handed to the ASR pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Forwarded as `batch_size` on every `pipe(...)` call in this file.
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere else in the visible code — confirm whether it is still needed.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use device index 0 when CUDA is available, otherwise fall back to the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Built once at import time and shared by both transcription entry points
# (`transcribe` and `yt_transcribe`).
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

def transcribe(inputs, task):
    """Run the shared ASR pipeline on an audio input and return its text.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe``; ``None`` means
            the user submitted the form without providing any audio.
        task: Value placed under the ``"task"`` key of ``generate_kwargs``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return result["text"]

def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

def download_yt_audio(yt_url, filename):
    """Download the audio of a YouTube video to ``filename``.

    The video's metadata is fetched first (without downloading media) so that
    videos longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected before any
    download starts.

    Args:
        yt_url: URL of the YouTube video.
        filename: Output path, passed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.Error: If yt-dlp fails to extract or download the video, if the
            metadata carries no duration, or if the video exceeds the
            length limit.
    """
    # yt-dlp generally surfaces extraction failures as DownloadError; catching
    # only ExtractorError lets them escape as raw exceptions.
    yt_errors = (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError)

    # Context manager so the metadata-only client is closed when we are done.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except yt_errors as err:
            raise gr.Error(str(err)) from err

    # The duration may be absent or None; without it the length limit cannot
    # be enforced, so refuse instead of failing with KeyError/TypeError.
    file_length = info.get("duration") if info else None
    if file_length is None:
        raise gr.Error("Could not determine the length of the YouTube video, so the length limit cannot be checked.")
    file_length_s = int(file_length)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except yt_errors as err:
            raise gr.Error(str(err)) from err

def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video's audio and run the ASR pipeline on it.

    Args:
        yt_url: URL of the YouTube video.
        task: Value placed under the ``"task"`` key of ``generate_kwargs``.
        max_filesize: Not used by this function; kept so existing callers
            that pass it keep working.

    Returns:
        A ``(html_embed_str, text)`` tuple: the iframe snippet for the video
        and the ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If no URL was submitted, or if ``download_yt_audio``
            rejects the video.
    """
    # Mirror `transcribe`: fail early with a clear message on empty input
    # rather than letting the downloader report an obscure URL error.
    if yt_url is None or not yt_url.strip():
        raise gr.Error("No YouTube URL submitted! Please paste the URL of a YouTube video before submitting your request.")
    yt_url = yt_url.strip()

    html_embed_str = _return_yt_html_embed(yt_url)

    # The raw bytes are read inside the `with` block because the directory
    # (and the downloaded file) is removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.m4a")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    inputs = ffmpeg_read(inputs, sampling_rate)
    inputs = {"array": inputs, "sampling_rate": sampling_rate}

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]

    return html_embed_str, text

# HTML blurb passed as `description` to the audio-file interface: a one-line
# summary of the demo followed by links to the author's site and blog posts.
description = """
Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the checkpoint openai/whisper-large-v3 and Transformers to transcribe audio files of arbitrary length.<br>
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. </a><br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a><br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a><br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型，它是什麼？想要嗎？</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PPOCRLabel來幫PaddleOCR做OCR的微調和標註</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""

# Interface for the "Audio file" tab: an audio input plus a task selector,
# wired to `transcribe`, with the result shown as plain text.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath"),
        # The selected value becomes the `task` argument of `transcribe`.
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=description,
    allow_flagging="never",
)

# HTML blurb passed as `description` to the YouTube interface.
# NOTE(review): the link list mirrors `description`, but the label of the
# wsl.html link differs between the two strings — confirm which is intended.
yt_description = """
Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint openai/whisper-large-v3 and Transformers to transcribe audio files of arbitrary length.<br>
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. </a><br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a><br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a><br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型，它是什麼？想要嗎？</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PaddleOCR的PPOCRLabel來微調醫療診斷書和收據</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""

# Interface for the "YouTube" tab: a URL textbox plus a task selector; the
# two outputs match the (html, text) tuple returned by the function.
# NOTE: `fn=yt_transcribe` is evaluated before the assignment completes, so it
# still refers to the function defined above. After this statement the
# module-level name `yt_transcribe` refers to the Interface, not the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description=yt_description,
    allow_flagging="never",
)

# Combine both interfaces into one app; the tab labels are given in the same
# order as the interfaces.
with gr.Blocks() as demo:
    gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])

# Runs at import time: starts the app with debug mode enabled.
demo.launch(debug=True)