Spaces:
Sleeping
Sleeping
import gradio as gr | |
import whisper | |
from pytube import YouTube | |
from fastapi import FastAPI, Response, Request | |
import yt_dlp | |
import uvicorn | |
# URL path at which the Gradio UI is mounted on the FastAPI application.
CUSTOM_PATH = "/"

# FastAPI application that the Gradio UI gets mounted onto.
app = FastAPI()

# Choices for the language dropdown: the "None" placeholder first, then
# Whisper's language names in alphabetical order.
langs = ["None", *sorted(whisper.tokenizer.LANGUAGES.values())]

# Choices for the model dropdown: every model name Whisper knows about.
model_size = [*whisper._MODELS.keys()]
#async def get_subtitle(url: str): | |
# Download the subtitle with download_subtitle() | |
#subtitle_url = download_subtitle(url) | |
# Stream the subtitle as a response | |
#return StreamingResponse(requests.get(subtitle_url, stream=True).iter_content(chunk_size=1024)) | |
async def get_subtitle(url, lang='en'):
    """Fetch a video's subtitle track with yt-dlp and return its text.

    Args:
        url: Video URL handed to yt-dlp.
        lang: Subtitle language requested from yt-dlp (default ``'en'``).

    Returns:
        The contents of the downloaded ``<id>.<lang>.vtt`` file with every
        ``<...>`` tag removed, or ``None`` when yt-dlp reports no video id
        or no subtitle file exists for ``lang``.

    Note:
        Although declared ``async``, the body performs blocking I/O; callers
        must still await (or otherwise run) the returned coroutine.
    """
    # Local import: the file's import block does not bring ``re`` into scope.
    import re

    ydl_opts = {
        'writesubtitles': True,
        'outtmpl': '%(id)s.%(ext)s',
        'subtitleslangs': [lang],
        'skip_download': True,  # only the subtitle file is wanted, not the media
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)

    video_id = info_dict.get("id") if info_dict else None
    if video_id is None:
        return None

    # File name follows the 'outtmpl' above plus the language and .vtt suffix.
    subtitle_file = f"{video_id}.{lang}.vtt"
    try:
        with open(subtitle_file, 'r', encoding='utf-8') as f:
            subtitle_content = f.read()
    except FileNotFoundError:
        # No subtitle track was written for this language.
        return None

    # Strip inline markup such as <c> or <00:00:01.000> tags.
    return re.sub(r"<[^>]+>", "", subtitle_content)
def download_audio(video_url, quality: str = '128', speed: float = None):
    """Download a video's audio with yt-dlp, convert it to MP3, return its path.

    Args:
        video_url: URL handed to yt-dlp.
        quality: Value passed as ``preferredquality`` to the MP3 extraction
            postprocessor.
        speed: Optional tempo factor; when truthy an ``atempo`` filter
            postprocessor entry is appended.

    Returns:
        Path of the resulting ``.mp3`` file (named after the video title).
    """
    import os

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': '%(title)s.%(ext)s',
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': quality,
        }],
    }
    if speed:
        # NOTE(review): could not confirm that yt-dlp ships a postprocessor
        # registered under the key 'FFmpegFilter' - verify before relying on
        # the speed option.
        ydl_opts['postprocessors'].append({
            'key': 'FFmpegFilter',
            'filter_complex': f"atempo={speed}",
        })

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # One call both downloads and returns the metadata needed for the
        # file name, instead of fetching the metadata a second time.
        info = ydl.extract_info(video_url, download=True)
        downloaded = ydl.prepare_filename(info)

    # prepare_filename() yields the name of the originally downloaded stream;
    # the extraction postprocessor replaces that file with an .mp3, so the
    # extension has to be swapped to point at a file that actually exists.
    audio_file = os.path.splitext(downloaded)[0] + '.mp3'
    print('audio_file', audio_file)
    return audio_file
def get_audio(url):
    """Download the first audio-only stream of a YouTube video via pytube.

    The stream is saved under the file name ``tmp.mp4``; the value returned
    by pytube's ``download`` call is passed back to the caller.
    """
    audio_streams = YouTube(url).streams.filter(only_audio=True)
    first_stream = audio_streams[0]
    return first_stream.download(filename="tmp.mp4")
def get_transcript(url, model_size, lang, format):
    """Return a transcript for a video, preferring existing subtitles.

    When a language is selected, the video's subtitle track is looked up
    first; if one is found its text is returned as-is (regardless of
    ``format``). Otherwise the audio is downloaded and transcribed with
    Whisper.

    Args:
        url: Video URL.
        model_size: Whisper model name passed to ``whisper.load_model``.
        lang: Whisper language name, or the string ``"None"`` for no
            explicit language.
        format: ``"None"`` for plain text or ``".srt"`` for SRT output.

    Returns:
        The subtitle text, the Whisper transcript in the requested format,
        or ``None`` for an unrecognised ``format``.
    """
    import asyncio

    if lang == "None":
        lang = None

    if lang is not None:
        # The dropdown holds Whisper language *names* ("english"); yt-dlp
        # expects language *codes* ("en").
        sub_lang = whisper.tokenizer.TO_LANGUAGE_CODE.get(lang, lang)
        try:
            # get_subtitle is a coroutine function, so it must be driven to
            # completion rather than called like a plain function.
            # NOTE(review): asyncio.run() requires that no event loop is
            # already running in the calling thread - confirm for callers
            # other than the Gradio click handler.
            subtitle = asyncio.run(get_subtitle(url, sub_lang))
        except FileNotFoundError:
            # No subtitle file was produced for this language.
            subtitle = None
        if subtitle:
            return subtitle

    # No usable subtitles: fall back to speech recognition.
    model = whisper.load_model(model_size)
    result = model.transcribe(download_audio(url), fp16=False, language=lang)
    if format == "None":
        return result["text"]
    if format == ".srt":
        return format_to_srt(result["segments"])
    return None
def format_to_srt(segments):
    """Render transcription segments as SRT subtitle text.

    Args:
        segments: Iterable of mappings with ``'start'``, ``'end'`` and
            ``'text'`` keys.

    Returns:
        One SRT cue per segment (1-based counter line, timestamp range line,
        text line, blank line), concatenated; ``""`` for no segments.
    """
    # Build all cues in a single join instead of repeated string +=.
    return "".join(
        f"{index}\n"
        f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
        f"{segment['text']}\n\n"
        for index, segment in enumerate(segments, start=1)
    )
def format_timestamp(t):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        t: Offset in seconds (int or float).

    Returns:
        The timestamp string with zero-padded hours, minutes, seconds and
        milliseconds.
    """
    # Convert to whole milliseconds once, with rounding. Truncating the float
    # remainder instead turns e.g. 2.3 s into ",299" because 2.3 - 2 is
    # slightly below 0.3 in binary floating point.
    total_ms = int(round(t * 1000))
    hh, total_ms = divmod(total_ms, 3_600_000)
    mm, total_ms = divmod(total_ms, 60_000)
    ss, ms = divmod(total_ms, 1000)
    return f"{hh:02d}:{mm:02d}:{ss:02d},{ms:03d}"
def gradio_interface():
    """Build the transcription UI and return it as a ``gr.Blocks`` app.

    The left column holds the URL box, the model / language / format
    dropdowns and the "Transcribe" button; the right column shows the
    result. Clicking the button calls ``get_transcript`` with the four
    input values.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    url = gr.Textbox(placeholder='Youtube video URL', label='URL')
                with gr.Row():
                    # The widgets get their own names: assigning to
                    # ``model_size`` here would make it a local variable and
                    # the read of the module-level list in ``choices=`` would
                    # raise UnboundLocalError.
                    model_dropdown = gr.Dropdown(choices=model_size, value='tiny', label="Model")
                    lang_dropdown = gr.Dropdown(choices=langs, value="None", label="Language (Optional)")
                    format_dropdown = gr.Dropdown(choices=["None", ".srt"], value="None", label="Timestamps? (Optional)")
                with gr.Row():
                    gr.Markdown("Larger models are more accurate, but slower. For 1min video, it'll take ~30s (tiny), ~1min (base), ~3min (small), ~5min (medium), etc.")
                    transcribe_btn = gr.Button('Transcribe')
            with gr.Column():
                outputs = gr.Textbox(placeholder='Transcription of the video', label='Transcription')
        transcribe_btn.click(
            get_transcript,
            inputs=[url, model_dropdown, lang_dropdown, format_dropdown],
            outputs=outputs,
        )
    return demo


# Build the Blocks UI once, mount it on the FastAPI app at CUSTOM_PATH and
# serve it on port 7860.
io = gradio_interface()
app = gr.mount_gradio_app(app, io, path=CUSTOM_PATH)
uvicorn.run(app, host="0.0.0.0", port=7860)