# ytdlp-whisper / app.py
# Source: Hugging Face Space file view, uploaded by lanbogao.
# Commit 3424983 ("Update app.py"), file size 4.31 kB.
import asyncio
import inspect
import os
import re

import gradio as gr
import uvicorn
import whisper
import yt_dlp
from fastapi import FastAPI, Response, Request
from pytube import YouTube
# URL path under which the Gradio UI is mounted on the FastAPI application.
CUSTOM_PATH = "/"
app = FastAPI()

# Choices for the language dropdown: the "None" sentinel (no language forced)
# followed by Whisper's language names in alphabetical order.
langs = ["None"] + sorted(whisper.tokenizer.LANGUAGES.values())

# Names of the Whisper checkpoints offered in the model dropdown; obtained
# through the public helper rather than the private ``whisper._MODELS`` table.
model_size = whisper.available_models()
#async def get_subtitle(url: str):
# Download the subtitle with download_subtitle()
#subtitle_url = download_subtitle(url)
# Stream the subtitle as a response
#return StreamingResponse(requests.get(subtitle_url, stream=True).iter_content(chunk_size=1024))
@app.get("/subtitle")
async def get_subtitle(url, lang='en'):
    """Return the subtitles of a video as WebVTT text with inline tags removed.

    yt-dlp is asked to write only the subtitle track (no media download) to
    ``<video id>.<lang>.vtt`` in the current working directory; that file is
    then read back and every ``<...>`` tag is stripped from it.

    Args:
        url: Address of the video, passed to yt-dlp unchanged.
        lang: Subtitle language code requested from yt-dlp.

    Returns:
        The cleaned subtitle text, or ``None`` when no video id could be
        determined or no subtitle file was written for ``lang``.
    """
    # NOTE(review): the yt-dlp call below is synchronous although this is an
    # ``async def`` endpoint -- confirm that blocking here is acceptable.
    ydl_opts = {
        'writesubtitles': True,
        # The file name built below assumes WebVTT, so request it explicitly.
        'subtitlesformat': 'vtt',
        'outtmpl': '%(id)s.%(ext)s',
        'subtitleslangs': [lang],
        'skip_download': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)

    video_id = info_dict.get("id") if info_dict else None
    if video_id is None:
        return None

    subtitle_file = f"{video_id}.{lang}.vtt"
    try:
        with open(subtitle_file, 'r', encoding='utf-8') as f:
            subtitle_content = f.read()
    except FileNotFoundError:
        # The video has no subtitles in the requested language.
        return None

    # Drop inline markup such as <c> or <00:00:01.000> cue tags.
    return re.sub(r"<[^>]+>", "", subtitle_content)
def download_audio(video_url, quality: str = '128', speed: float = None):
    """Download the audio track of a video as MP3 and return the file path.

    Args:
        video_url: Address of the video, passed to yt-dlp unchanged.
        quality: Value for the extractor's ``preferredquality`` option.
        speed: Optional tempo factor; when truthy an ``atempo`` filter
            post-processor entry is added to the yt-dlp options.

    Returns:
        Path of the MP3 file, named after the video title in the current
        working directory.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': '%(title)s.%(ext)s',
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',  # 'opus',
            'preferredquality': quality,
        }],
    }
    if speed:
        # NOTE(review): confirm that 'FFmpegFilter' is a post-processor key
        # known to the installed yt-dlp; kept exactly as in the original.
        ydl_opts['postprocessors'].append({
            'key': 'FFmpegFilter',
            'filter_complex': f"atempo={speed}"
        })

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # One call both downloads the media and yields the metadata, instead
        # of downloading first and then fetching the metadata a second time.
        info = ydl.extract_info(video_url, download=True)
        downloaded_name = ydl.prepare_filename(info)

    # prepare_filename() reports the name with the source container's
    # extension, but the audio extractor re-encodes to MP3, so point at the
    # file that actually exists after post-processing.
    audio_file = os.path.splitext(downloaded_name)[0] + '.mp3'
    print('audio_file', audio_file)
    return audio_file
def get_audio(url):
    """Download the first audio-only stream of a video via pytube.

    The stream is saved under the fixed file name ``tmp.mp4`` and the value
    returned by pytube's ``download`` call is handed back to the caller
    (presumably the path of the saved file -- verify against pytube).
    """
    audio_only_streams = YouTube(url).streams.filter(only_audio=True)
    first_stream = audio_only_streams[0]
    return first_stream.download(filename="tmp.mp4")
def get_transcript(url, model_size, lang, format):
    """Return a transcript of the video, preferring published subtitles.

    When a language is selected, the video's own subtitles are requested
    first and returned as-is if present (``format`` is not applied to them).
    Otherwise the audio is downloaded and transcribed with Whisper.

    Args:
        url: Address of the video.
        model_size: Name of the Whisper model to load for the fallback.
        lang: Whisper language name, or the string ``"None"`` for no
            language preference.
        format: ``".srt"`` for numbered, time-stamped cues; any other value
            yields the plain transcript text.

    Returns:
        Subtitle text, SRT text, or plain text as described above.
    """
    if lang == "None":
        lang = None

    if lang is not None:
        # The dropdown holds Whisper language *names* ("english"), whereas
        # subtitle tracks are selected by language *code* ("en").
        subtitle_lang = whisper.tokenizer.TO_LANGUAGE_CODE.get(lang, lang)
        try:
            subtitle = get_subtitle(url, subtitle_lang)
            # get_subtitle is a coroutine function; run it to completion so a
            # string (not a coroutine object) is returned. This relies on the
            # calling thread having no running event loop.
            if inspect.iscoroutine(subtitle):
                subtitle = asyncio.run(subtitle)
        except (OSError, yt_dlp.utils.DownloadError):
            # Subtitles are best-effort; fall back to transcription.
            subtitle = None
        if subtitle:
            return subtitle

    model = whisper.load_model(model_size)
    result = model.transcribe(download_audio(url), fp16=False, language=lang)
    if format == ".srt":
        return format_to_srt(result["segments"])
    return result["text"]
def format_to_srt(segments):
    """Render transcription segments as SubRip (.srt) text.

    Each segment is read through its ``'start'``, ``'end'`` and ``'text'``
    keys and becomes one cue: a 1-based index line, a ``start --> end``
    timing line, the text, and a blank separator line.
    """
    cues = []
    for number, seg in enumerate(segments, start=1):
        begin = format_timestamp(seg['start'])
        finish = format_timestamp(seg['end'])
        cues.append(f"{number}\n{begin} --> {finish}\n{seg['text']}\n\n")
    return "".join(cues)
def format_timestamp(t):
    """Format a non-negative duration in seconds as an SRT timestamp.

    The value is rounded to the nearest millisecond *before* it is split
    into fields. Truncating the fractional part instead loses a millisecond
    to floating-point error (1.001 s would render as ``00:00:01,000``) and
    cannot carry a rounded-up fraction into the seconds field.

    Args:
        t: Duration in seconds (int or float).

    Returns:
        A string of the form ``HH:MM:SS,mmm``.
    """
    total_ms = int(round(t * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, ss = divmod(total_seconds, 60)
    hh, mm = divmod(total_minutes, 60)
    return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"
def gradio_interface():
    """Build the transcription UI and return it as a ``gr.Blocks`` object.

    The left column holds the URL box, the model / language / timestamp
    dropdowns, a timing hint and the button; the right column holds the
    output box. Clicking the button calls ``get_transcript`` with the four
    input values and writes its result to the output box.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    url = gr.Textbox(placeholder='Youtube video URL', label='URL')
                with gr.Row():
                    # The local names must differ from the module-level
                    # ``model_size`` list: assigning to that name here would
                    # make it local to this function and reading it as
                    # ``choices`` would raise UnboundLocalError.
                    model_dropdown = gr.Dropdown(choices=model_size, value='tiny', label="Model")
                    lang_dropdown = gr.Dropdown(choices=langs, value="None", label="Language (Optional)")
                    format_dropdown = gr.Dropdown(choices=["None", ".srt"], value="None", label="Timestamps? (Optional)")
                with gr.Row():
                    gr.Markdown("Larger models are more accurate, but slower. For 1min video, it'll take ~30s (tiny), ~1min (base), ~3min (small), ~5min (medium), etc.")
                transcribe_btn = gr.Button('Transcribe')
            with gr.Column():
                outputs = gr.Textbox(placeholder='Transcription of the video', label='Transcription')
        transcribe_btn.click(
            get_transcript,
            inputs=[url, model_dropdown, lang_dropdown, format_dropdown],
            outputs=outputs,
        )
    return demo


# The UI is not launched on its own; it is mounted on the FastAPI app so that
# the /subtitle endpoint and the Gradio page are served by the same process.
io = gradio_interface()
app = gr.mount_gradio_app(app, io, path=CUSTOM_PATH)
uvicorn.run(app, host="0.0.0.0", port=7860)