Spaces:
Sleeping
Sleeping
File size: 5,870 Bytes
6c226f9 5a985c6 6c226f9 d790c0b 88183ad 7bab834 6c226f9 a5bfe25 9d6fa91 66efbc3 d790c0b 6c226f9 3c0cd8e 6c226f9 3c0cd8e 7bab834 6c226f9 d790c0b 3c0cd8e d790c0b 3c0cd8e 1b51c36 3c0cd8e d790c0b 1b51c36 d790c0b 3c0cd8e 1b51c36 3c0cd8e d790c0b 66efbc3 6c226f9 66efbc3 d790c0b 1b51c36 d790c0b 6c226f9 b97a3c2 0a7fcda 3c0cd8e 6c226f9 2fbf87f 3c0cd8e 567ec12 7bab834 6c226f9 a5bfe25 2fbf87f 6c226f9 2fbf87f 6c226f9 7097513 7bab834 7097513 6c226f9 a5bfe25 2fbf87f 6c226f9 7bab834 567ec12 6c226f9 2fbf87f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile
import os
import time
# Checkpoint and inference settings used by the functions below.
MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8  # passed as ``batch_size`` on every pipeline call
FILE_LIMIT_MB = 1000  # NOTE(review): not referenced anywhere in the visible code
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Single speech-recognition pipeline, built once at import time and shared by
# both the file tab and the YouTube tab.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
def transcribe(inputs, task):
    """Run the shared pipeline on an uploaded or recorded audio input.

    Args:
        inputs: Audio handed over by the Gradio audio component; ``None``
            when nothing was submitted.
        task: Forwarded to generation as the ``task`` kwarg (the UI offers
            "transcribe" and "translate").

    Returns:
        The ``"text"`` entry of the pipeline output.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def download_yt_audio(yt_url, filename):
    """Download the best available audio stream of a YouTube video.

    The video metadata is fetched first so that videos longer than
    ``YT_LENGTH_LIMIT_S`` seconds are rejected before anything is downloaded.

    Args:
        yt_url: URL of the YouTube video.
        filename: Output path template handed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If the metadata cannot be fetched, the video length is
            unknown, the video exceeds the length limit, or the download
            itself fails.
    """
    # yt-dlp reports failures from extract_info()/download() as DownloadError;
    # ExtractorError is kept in the tuple for backward compatibility.
    yt_errors = (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError)

    # Context manager so the metadata-only YoutubeDL instance is closed too.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except yt_errors as err:
            raise gr.Error(str(err)) from err

    # "duration" may be absent or None in the metadata; without it the length
    # limit cannot be enforced, so fail with a readable message instead of a
    # KeyError/TypeError.
    file_length = info.get("duration") if info else None
    if file_length is None:
        raise gr.Error("Could not determine the length of the YouTube video, so the length limit cannot be checked.")

    file_length_s = int(file_length)
    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except yt_errors as err:
            raise gr.Error(str(err)) from err
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Fetch a YouTube video's audio and run it through the shared pipeline.

    Args:
        yt_url: URL of the YouTube video.
        task: Forwarded to generation as the ``task`` kwarg (the UI offers
            "transcribe" and "translate").
        max_filesize: Not used by this function; kept so the signature stays
            unchanged.

    Returns:
        A ``(embed_html, text)`` tuple: the player embed HTML for the video
        and the ``"text"`` entry of the pipeline output.
    """
    embed_html = _return_yt_html_embed(yt_url)

    # The download lives in a throwaway directory; only the raw bytes are kept
    # once the directory is cleaned up.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "audio.m4a")
        download_yt_audio(yt_url, audio_path)
        with open(audio_path, "rb") as audio_file:
            raw_audio = audio_file.read()

    # Decode to the sampling rate the pipeline's feature extractor expects.
    waveform = ffmpeg_read(raw_audio, pipe.feature_extractor.sampling_rate)
    model_input = {
        "array": waveform,
        "sampling_rate": pipe.feature_extractor.sampling_rate,
    }

    result = pipe(
        model_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return embed_html, result["text"]
# HTML blurb passed as the ``description`` of the audio-file interface: a short
# summary of the demo followed by links to the author's site and blog posts.
description = """
Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the checkpoint openai/whisper-large-v3 and Transformers to transcribe audio files of arbitrary length.<br>
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. </a><br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a><br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a><br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型,它是什麼?想要嗎?</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PaddleOCR的PPOCRLabel來微調醫療診斷書和收據</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""
# "Audio file" tab: an audio input plus a task selector, wired to transcribe().
file_transcribe = gr.Interface(
    title="Whisper Large V3: Transcribe Audio",
    description=description,
    fn=transcribe,
    inputs=[
        # The audio component hands transcribe() a file path.
        gr.Audio(type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    allow_flagging="never",
)
# HTML blurb passed as the ``description`` of the YouTube interface: a short
# summary of the demo followed by links to the author's site and blog posts.
yt_description = """
Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint openai/whisper-large-v3 and Transformers to transcribe audio files of arbitrary length.<br>
<a href='https://www.twman.org' target='_blank'>TonTon Huang Ph.D. </a><br>
<a href='https://blog.twman.org/2021/04/ASR.html' target='_blank'>那些語音處理 (Speech Processing) 踩的坑</a> | <a href='https://blog.twman.org/2021/04/NLP.html' target='_blank'>那些自然語言處理 (Natural Language Processing, NLP) 踩的坑</a><br>
<a href='https://blog.twman.org/2024/02/asr-tts.html' target='_blank'>那些ASR和TTS可能會踩的坑</a> | <a href='https://blog.twman.org/2024/02/LLM.html' target='_blank'>那些大模型開發會踩的坑</a><br>
<a href='https://blog.twman.org/2023/04/GPT.html' target='_blank'>什麼是大語言模型,它是什麼?想要嗎?</a><br>
<a href='https://blog.twman.org/2023/07/wsl.html' target='_blank'>用PaddleOCR的PPOCRLabel來微調醫療診斷書和收據</a> | <a href='https://blog.twman.org/2023/07/HugIE.html' target='_blank'>基於機器閱讀理解和指令微調的統一信息抽取框架之診斷書醫囑資訊擷取分析</a><br>
"""
# "YouTube" tab: a URL box plus a task selector, wired to the yt_transcribe()
# function. NOTE: this assignment rebinds the module-level name
# ``yt_transcribe`` from the function to the Interface; ``fn=`` is evaluated
# before the rebinding, so the Interface still receives the function.
yt_transcribe = gr.Interface(
    title="Whisper Large V3: Transcribe YouTube",
    description=yt_description,
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    # First output is the player embed HTML, second is the recognized text.
    outputs=["html", "text"],
    allow_flagging="never",
)
# Top-level app: both interfaces shown as tabs inside one Blocks container.
demo = gr.Blocks()
with demo:
    gr.TabbedInterface(
        [file_transcribe, yt_transcribe],
        ["Audio file", "YouTube"],
    )

# Start the web server when this file is executed.
demo.launch(debug=True)
|