"""Gradio app that transcribes audio files, video files and YouTube links with Whisper."""
import os
import subprocess
import sys
import tempfile
import time
from urllib.parse import parse_qs, quote, urlparse

import gradio as gr
import requests
import torch
import yt_dlp as youtube_dl
from playwright.sync_api import sync_playwright
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

from languages import get_language_names
from subtitle import subtitle_output, text_output

try:
    import spaces
    USING_SPACES = True
except ImportError:
    USING_SPACES = False

# Best-effort runtime installs. The environment is *extended* rather than
# replaced so that PATH/HOME and friends stay visible to pip, and the current
# interpreter is used so the packages land in the environment running this app.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)
subprocess.run([sys.executable, "-m", "playwright", "install"], check=False)

YT_LENGTH_LIMIT_S = 360
SPACES_GPU_DURATION = 90

device = 0 if torch.cuda.is_available() else "cpu"

MODEL_CHOICES = [
    "openai/whisper-tiny",
    "openai/whisper-base",
    "openai/whisper-small",
    "openai/whisper-medium",
    "openai/whisper-large",
    "openai/whisper-large-v1",
    "openai/whisper-large-v2",
    "distil-whisper/distil-large-v2",
    "openai/whisper-large-v3",
    "openai/whisper-large-v3-turbo",
    "distil-whisper/distil-large-v3",
    "xaviviro/whisper-large-v3-catalan-finetuned-v2",
]
DEFAULT_MODEL = "openai/whisper-large-v3-turbo"

# Hosts accepted as "youtube.com"-style URLs (a leading "www." is stripped first).
_YOUTUBE_HOSTS = ("youtube.com", "m.youtube.com", "music.youtube.com", "youtube-nocookie.com")


def gpu_decorator(duration=60):
    """Return a decorator that wraps a function in ``spaces.GPU`` when available.

    When the ``spaces`` package could not be imported the function is returned
    unchanged.
    """
    def actual_decorator(func):
        if USING_SPACES:
            return spaces.GPU(duration=duration)(func)
        return func

    return actual_decorator


def device_info():
    """Print disk, block-device, memory, CPU and GPU information for debugging."""
    try:
        for command in (["df", "-h"], ["lsblk"], ["free", "-h"], ["lscpu"], ["nvidia-smi"]):
            subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError: the tool (e.g. nvidia-smi) is not installed at all.
        print(f"Command failed: {e}")


def _build_pipeline(model, chunk_length_s, stride_length_s):
    """Load ``model`` and return an automatic-speech-recognition pipeline for it."""
    # Half precision is only used on GPU; CPU inference runs in float32.
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_gen = AutoModelForSpeechSeq2Seq.from_pretrained(
        model, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model_gen.to(device)
    processor = AutoProcessor.from_pretrained(model)
    tokenizer = WhisperTokenizer.from_pretrained(model)
    return pipeline(
        task="automatic-speech-recognition",
        model=model_gen,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_length_s,
        tokenizer=tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        # NOTE(review): the model is already instantiated above, so these
        # model_kwargs may have no effect -- confirm against the pipeline docs.
        model_kwargs={"attn_implementation": "flash_attention_2"},
        device=device,
    )


def _build_generate_kwargs(model, language, task):
    """Return generation kwargs; ``language``/``task`` are skipped for ``.en`` models."""
    generate_kwargs = {}
    if model.endswith(".en"):
        return generate_kwargs
    if language != "Automatic Detection":
        generate_kwargs["language"] = language
    generate_kwargs["task"] = task
    return generate_kwargs


@gpu_decorator(duration=SPACES_GPU_DURATION)
def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s, task, timestamp_mode, progress=gr.Progress(track_tqdm=True)):
    """Transcribe (or translate) an uploaded or recorded audio/video file.

    Args:
        inputs: Path of the media file supplied by the Gradio component.
        model: Hugging Face model id of the Whisper checkpoint to load.
        language: Language name, or "Automatic Detection".
        batch_size: Batch size passed to the pipeline call.
        chunk_length_s: Chunk length in seconds for long-form transcription.
        stride_length_s: Stride in seconds between chunks.
        task: "transcribe" or "translate".
        timestamp_mode: ``True``, ``False`` or "word"; forwarded as
            ``return_timestamps``.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        Whatever ``text_output`` (no timestamps) or ``subtitle_output``
        (with timestamps) returns for this input.

    Raises:
        gr.Error: If no input was given or anything fails while transcribing.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    try:
        pipe = _build_pipeline(model, chunk_length_s, stride_length_s)
        output = pipe(
            inputs,
            batch_size=batch_size,
            generate_kwargs=_build_generate_kwargs(model, language, task),
            return_timestamps=timestamp_mode,
        )
        print(output)
        print({"inputs": inputs, "model": model, "language": language, "batch_size": batch_size, "chunk_length_s": chunk_length_s, "stride_length_s": stride_length_s, "task": task, "timestamp_mode": timestamp_mode})
        if not timestamp_mode:
            return text_output(inputs, output['text'])
        return subtitle_output(inputs, output['chunks'])
    except Exception as e:
        # Top-level UI boundary: surface any failure to the user as a Gradio error.
        raise gr.Error(str(e), duration=10) from e


def _parse_duration_s(duration_string):
    """Convert a colon-separated duration ("S", "M:S", "H:M:S") into seconds."""
    total = 0
    for part in duration_string.split(":"):
        total = total * 60 + int(part)
    return total


def _download_yt_audio(yt_url, filename):
    """Download the audio of ``yt_url`` to ``filename``.

    Raises:
        gr.Error: If the metadata cannot be fetched, the duration is unknown,
            the video is longer than ``YT_LENGTH_LIMIT_S`` or the download fails.
    """
    info_loader = youtube_dl.YoutubeDL()
    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    file_length = info.get("duration_string")
    if not file_length:
        raise gr.Error("Video duration is unavailable.")
    try:
        file_length_s = _parse_duration_s(file_length)
    except ValueError as err:
        raise gr.Error("Video duration is unavailable.") from err

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.", duration=10)

    ydl_opts = {
        "outtmpl": filename,
        "format": "bestaudio[ext=m4a]/best",
    }
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])
    except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
        # Reuse the metadata fetched above and list only the format ids; the
        # full format dicts are far too large for a UI message.
        format_ids = [fmt.get("format_id") for fmt in info.get("formats") or []]
        raise gr.Error(f"Failed to download audio ({err}). Available formats: {format_ids}", duration=10) from err


def _return_yt_video_id(yt_url):
    """Extract the video id from a YouTube URL.

    Handles ``youtube.com/watch?v=ID``, ``youtu.be/ID`` and the
    ``/embed/ID``, ``/shorts/ID``, ``/live/ID`` and ``/v/ID`` path forms.
    Extra query parameters (``&t=10s`` ...) are ignored.

    Returns:
        The video id, or ``None`` when the URL is not a recognised YouTube URL.
    """
    if not yt_url:
        return None
    url = yt_url.strip()
    if "://" not in url:
        url = "https://" + url
    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    if host.startswith("www."):
        host = host[len("www."):]
    segments = [segment for segment in parsed.path.split("/") if segment]

    if host == "youtu.be":
        return segments[0] if segments else None
    if host in _YOUTUBE_HOSTS:
        if segments == ["watch"]:
            ids = parse_qs(parsed.query).get("v")
            return ids[0] if ids else None
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]
    return None


def _return_yt_html_embed(yt_url):
    """Return an HTML snippet embedding the YouTube player for ``yt_url``.

    Raises:
        gr.Error: If no video id can be extracted from the URL.
    """
    video_id = _return_yt_video_id(yt_url)
    if not video_id:
        raise gr.Error("Invalid YouTube URL: Unable to extract video ID.")
    # The id comes from user input, so it is percent-encoded before being
    # placed inside the HTML attribute.
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{quote(video_id, safe="")}"> </iframe>'
        " </center>"
    )
    return HTML_str


def _return_yt_thumbnail(yt_url):
    """Download the video thumbnail to a temporary ``.jpg`` file.

    Returns:
        Path of the temporary file, or ``None`` if no thumbnail could be fetched.

    Raises:
        ValueError: If no video id can be extracted from the URL.
    """
    video_id = _return_yt_video_id(yt_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL: Unable to extract video ID.")

    # Try the high-resolution thumbnail first, then a lower-resolution variant.
    for variant in ("maxresdefault", "hqdefault"):
        thumbnail_url = f"https://img.youtube.com/vi/{quote(video_id, safe='')}/{variant}.jpg"
        try:
            response = requests.get(thumbnail_url, timeout=10)
            if response.status_code != 200:
                print(f"Error occurred: Failed to retrieve thumbnail. Status code: {response.status_code}")
                continue
            # The file is only created once there is content to write, so a
            # failed request no longer leaves an empty temp file behind.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
                temp_file.write(response.content)
            return temp_file.name
        except (requests.RequestException, OSError) as e:
            print(f"Error occurred: {e}")
            return None
    return None


def _meta_content(page, name):
    """Return the ``content`` of ``<meta name=...>`` on ``page``, or ``None`` if absent."""
    element = page.query_selector(f"meta[name='{name}']")
    return element.get_attribute("content") if element is not None else None


def _info_textbox(label, value):
    """Build a textbox for one metadata field; it is hidden when the value is empty."""
    return gr.Textbox(label=label, visible=bool(value), value=value)


def _return_yt_info(yt_url):
    """Scrape title, description and keywords of the video with a headless browser.

    Returns:
        Three ``gr.Textbox`` components (title, description, keywords). Fields
        that could not be read are returned hidden; if scraping fails entirely
        all three are hidden.
    """
    hidden = (gr.Textbox(visible=False), gr.Textbox(visible=False), gr.Textbox(visible=False))
    video_id = _return_yt_video_id(yt_url)
    if not video_id:
        return hidden
    # Navigate to the canonical watch URL rather than the raw user-supplied
    # string, so the headless browser is never pointed at an arbitrary address.
    watch_url = f"https://www.youtube.com/watch?v={quote(video_id, safe='')}"
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(watch_url)
                page.wait_for_load_state("networkidle")
                title = page.title()
                description = _meta_content(page, "description")
                keywords = _meta_content(page, "keywords")
            finally:
                browser.close()
    except Exception as e:
        # Metadata is optional decoration: log the failure and hide the fields.
        print(e)
        return hidden
    return (
        _info_textbox("YouTube Title", title),
        _info_textbox("YouTube Description", description),
        _info_textbox("YouTube Keywords", keywords),
    )


def return_youtube(yt_url):
    """Build the embed, thumbnail and metadata components for a YouTube URL.

    Returns:
        Tuple ``(gr_html, gr_thumbnail, gr_title, gr_description, gr_keywords)``.
    """
    html_embed_str = _return_yt_html_embed(yt_url)
    thumbnail = _return_yt_thumbnail(yt_url)
    gr_html = gr.HTML(label="Youtube Video", visible=True, value=html_embed_str)
    gr_thumbnail = gr.Image(label="Youtube Thumbnail", visible=True, value=thumbnail)
    gr_title, gr_description, gr_keywords = _return_yt_info(yt_url)
    return gr_html, gr_thumbnail, gr_title, gr_description, gr_keywords


@gpu_decorator(duration=SPACES_GPU_DURATION)
def yt_transcribe(yt_url, model, language, batch_size, chunk_length_s, stride_length_s, task, timestamp_mode):
    """Download the audio of a YouTube video and transcribe (or translate) it.

    Parameters mirror :func:`transcribe`, with ``yt_url`` in place of ``inputs``.

    Returns:
        Tuple ``(subtitle, files, gr_title, gr_html, gr_thumbnail,
        gr_description, gr_keywords)``. If downloading or transcription fails,
        a Gradio warning is shown and the first two entries are hidden
        placeholder components.
    """
    gr_html, gr_thumbnail, gr_title, gr_description, gr_keywords = return_youtube(yt_url)
    try:
        # Download first so an over-long or unavailable video fails before the
        # (expensive) model load.
        with tempfile.TemporaryDirectory() as tmpdirname:
            filepath = os.path.join(tmpdirname, "video.mp4")
            _download_yt_audio(yt_url, filepath)
            with open(filepath, "rb") as f:
                audio_bytes = f.read()

        # The pipeline must exist before its sampling rate can be read.
        pipe = _build_pipeline(model, chunk_length_s, stride_length_s)
        sampling_rate = pipe.feature_extractor.sampling_rate
        inputs = {"array": ffmpeg_read(audio_bytes, sampling_rate), "sampling_rate": sampling_rate}

        output = pipe(
            inputs,
            batch_size=batch_size,
            generate_kwargs=_build_generate_kwargs(model, language, task),
            return_timestamps=timestamp_mode,
        )
        print(output)
        print({"inputs": yt_url, "model": model, "language": language, "batch_size": batch_size, "chunk_length_s": chunk_length_s, "stride_length_s": stride_length_s, "task": task, "timestamp_mode": timestamp_mode})
        # NOTE(review): `inputs` is the audio dict here, not a file path --
        # confirm that text_output/subtitle_output accept it.
        if not timestamp_mode:
            subtitle, files = text_output(inputs, output['text'])
        else:
            subtitle, files = subtitle_output(inputs, output['chunks'])
        return subtitle, files, gr_title, gr_html, gr_thumbnail, gr_description, gr_keywords
    except Exception as e:
        # Keep the video preview visible; only the transcription outputs are hidden.
        gr.Warning(str(e), duration=10)
        return gr.Textbox(visible=False), gr.File(visible=False), gr_title, gr_html, gr_thumbnail, gr_description, gr_keywords


def _common_inputs():
    """Create a fresh set of the model/decoding input components shared by all tabs."""
    return [
        gr.Dropdown(
            choices=MODEL_CHOICES,
            value=DEFAULT_MODEL,
            label="Model Name",
            allow_custom_value=True,
        ),
        gr.Dropdown(
            choices=["Automatic Detection"] + sorted(get_language_names()),
            value="Automatic Detection",
            label="Language",
            interactive=True,
        ),
        gr.Slider(label="Batch Size", minimum=1, maximum=32, value=16, step=1),
        gr.Slider(label="Chunk Length (s)", minimum=1, maximum=60, value=17.5, step=0.1),
        gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
        gr.Dropdown(choices=[True, False, "word"], value=True, label="Timestamp Mode"),
    ]


demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=['upload', 'microphone'], type="filepath", label="Audio file"),
        *_common_inputs(),
    ],
    outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
    title="Whisper: Transcribe Audio",
    flagging_mode="auto",
)

video_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(sources=["upload", "webcam"], label="Video file", show_label=False, show_download_button=False, show_share_button=False, streaming=True),
        *_common_inputs(),
    ],
    outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
    title="Whisper: Transcribe Video",
    flagging_mode="auto",
)

# NOTE: this rebinds the module-level name `yt_transcribe` from the function
# above to the Interface; `fn=` captures the function before the rebinding.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        *_common_inputs(),
    ],
    outputs=[
        gr.Textbox(label="Output"),
        gr.File(label="Download Files"),
        gr.Textbox(label="Youtube Title"),
        gr.HTML(label="Youtube Video"),
        gr.Image(label="Youtube Thumbnail"),
        gr.Textbox(label="Youtube Description"),
        gr.Textbox(label="Youtube Keywords"),
    ],
    title="Whisper: Transcribe YouTube",
    flagging_mode="auto",
)

with demo:
    gr.TabbedInterface(
        interface_list=[file_transcribe, video_transcribe, yt_transcribe],
        tab_names=["Audio", "Video", "YouTube"],
    )

if __name__ == "__main__":
    demo.queue().launch(ssr_mode=False)