"""Video caption app with Whisper auto-captioning and styling options."""
# Note: the ffmpeg/ffprobe binaries must be available on PATH; they are
# invoked via subprocess below (the ffmpeg-python package is not required).
import os
import logging
import re  # For parsing ASS/SRT
import shlex
import shutil
import subprocess
import tempfile

import gradio as gr
import torch
import whisper as openai_whisper  # Renamed to avoid potential conflicts
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Define fonts directory - adapt for Hugging Face environment if needed
FONTS_DIR = '/usr/share/fonts/truetype' # Common Linux font location
# Check common font locations for other OS if needed
if not os.path.exists(FONTS_DIR) and os.path.exists('/System/Library/Fonts'): # macOS
FONTS_DIR = '/System/Library/Fonts'
elif not os.path.exists(FONTS_DIR) and os.path.exists(r'C:\Windows\Fonts'): # Windows (raw string avoids invalid escape sequences)
    FONTS_DIR = r'C:\Windows\Fonts'
FONT_PATHS = {}
ACCEPTABLE_FONTS = ['Arial', 'Helvetica', 'Times New Roman'] # Start with common fallbacks
try:
if FONTS_DIR and os.path.exists(FONTS_DIR):
logger.info(f"Searching for fonts in: {FONTS_DIR}")
found_fonts = []
for root, dirs, files in os.walk(FONTS_DIR):
for file in files:
if file.lower().endswith(('.ttf', '.otf', '.ttc')):
font_path = os.path.join(root, file)
font_name = os.path.splitext(file)[0]
# Basic name cleanup
base_font_name = re.sub(r'[-_ ]?(bold|italic|regular|medium|light|condensed)?$', '', font_name, flags=re.IGNORECASE)
if base_font_name not in FONT_PATHS:
FONT_PATHS[base_font_name] = font_path
found_fonts.append(base_font_name)
if found_fonts:
ACCEPTABLE_FONTS = sorted(list(set(found_fonts + ACCEPTABLE_FONTS)))
logger.info(f"Found system fonts: {ACCEPTABLE_FONTS}")
else:
logger.warning(f"No font files found in {FONTS_DIR}. Using defaults.")
else:
logger.warning(f"Font directory {FONTS_DIR} not found. Using defaults: {ACCEPTABLE_FONTS}")
except Exception as e:
logger.warning(f"Could not load system fonts from {FONTS_DIR}: {e}. Using defaults: {ACCEPTABLE_FONTS}")
# Global variable for Whisper model to avoid reloading
whisper_model = None
def generate_style_line(options):
"""Generate ASS style line from options. Uses common defaults.
Ensure color format is correct (&HBBGGRRAA or &HAABBGGRR depending on FFmpeg build)
Using &HBBGGRR format for PrimaryColour based on common FFmpeg usage.
"""
    # Convert a hex colour picker value (#RRGGBB) to ASS &H00BBGGRR (opaque)
    def hex_to_ass_bgr(hex_color):
        hex_color = hex_color.lstrip('#')
        if len(hex_color) == 6:
            r, g, b = (int(hex_color[i:i+2], 16) for i in (0, 2, 4))
            return f"&H00{b:02X}{g:02X}{r:02X}"
        return '&H00FFFFFF' # Default to opaque white if the format is unexpected
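    # e.g. hex_to_ass_bgr('#FF0000') -> '&H000000FF' (opaque red)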
primary_color_ass = hex_to_ass_bgr(options.get('primary_color', '#FFFFFF'))
style_options = {
'Name': 'Default',
'Fontname': options.get('font_name', 'Arial'), # Ensure this font is accessible to FFmpeg
'Fontsize': options.get('font_size', 24),
'PrimaryColour': primary_color_ass,
'SecondaryColour': '&H000000FF', # Often unused, but good to define
'OutlineColour': '&H00000000', # Black outline
'BackColour': '&H80000000', # Semi-transparent black background/shadow
'Bold': 0, # Use -1 for True, 0 for False in ASS
'Italic': 0,
'Underline': 0,
'StrikeOut': 0,
'ScaleX': 100,
'ScaleY': 100,
'Spacing': 0,
'Angle': 0,
'BorderStyle': 1, # 1 = Outline + Shadow
'Outline': 2, # Outline thickness
'Shadow': 1, # Shadow distance
'Alignment': options.get('alignment', 2), # 2 = Bottom Center
'MarginL': 10,
'MarginR': 10,
'MarginV': 10, # Bottom margin
        'Encoding': 1 # 1 = default charset (0 would be ANSI)
}
logger.info(f"Generated ASS Style Options: {style_options}")
return f"Style: {','.join(map(str, style_options.values()))}"
def transcribe_audio(audio_path, progress=None):
"""Transcribe audio using Whisper ASR model."""
global whisper_model
logger.info(f"Starting transcription for: {audio_path}")
try:
if whisper_model is None:
safe_progress_update(progress, 0.1, "Loading Whisper model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device} for Whisper")
# Use a smaller model if only CPU is available to potentially speed things up
model_size = "base" if device == "cuda" else "tiny.en" # or "tiny"
logger.info(f"Loading Whisper model size: {model_size}")
whisper_model = openai_whisper.load_model(model_size, device=device)
safe_progress_update(progress, 0.3, "Model loaded, processing audio...")
result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
logger.info(f"Transcription result (first 100 chars): {str(result)[:100]}")
safe_progress_update(progress, 0.7, "Transcription complete, formatting captions...")
return result
except Exception as e:
logger.exception(f"Error transcribing audio: {audio_path}") # Use logger.exception to include traceback
raise
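# The Whisper result is a dict roughly of the form (fields abbreviated):
#   {'text': ' Hello world.', 'language': 'en',
#    'segments': [{'id': 0, 'start': 0.0, 'end': 2.0, 'text': ' Hello world.', ...}]}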
def format_time(seconds):
"""Format time in SRT/ASS format (H:MM:SS.ms)."""
# ASS format uses H:MM:SS.xx (hundredths of a second)
hundredths = int((seconds % 1) * 100)
s = int(seconds) % 60
m = int(seconds / 60) % 60
h = int(seconds / 3600)
return f"{h}:{m:02d}:{s:02d}.{hundredths:02d}"
def format_time_srt(seconds):
"""Format time in SRT format (HH:MM:SS,ms)."""
ms = int((seconds % 1) * 1000)
s = int(seconds) % 60
m = int(seconds / 60) % 60
h = int(seconds / 3600)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def generate_srt_from_transcript(segments):
"""Convert whisper segments to SRT format."""
srt_content = ""
for i, segment in enumerate(segments):
start_time = format_time_srt(segment["start"])
end_time = format_time_srt(segment["end"])
text = segment["text"].strip()
srt_content += f"{i+1}\n{start_time} --> {end_time}\n{text}\n\n"
logger.info(f"Generated SRT (first 200 chars): {srt_content[:200]}")
return srt_content.strip()
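# A segment {'start': 1.0, 'end': 5.0, 'text': 'Hello World'} renders as:
#   1
#   00:00:01,000 --> 00:00:05,000
#   Hello World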
def generate_ass_dialogue_line(segment, style_name='Default'):
"""Generate a single ASS dialogue line from a segment."""
start_time = format_time(segment["start"])
end_time = format_time(segment["end"])
text = segment["text"].strip().replace('\n', '\\N') # Replace newline with ASS newline
# Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
return f"Dialogue: 0,{start_time},{end_time},{style_name},,0,0,0,,{text}"
def generate_ass_from_transcript(segments, style_options):
"""Convert whisper segments to ASS format including style header."""
style_line = generate_style_line(style_options)
ass_header = f"""
[Script Info]
Title: Generated Captions
ScriptType: v4.00+
WrapStyle: 0
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
{style_line}
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
dialogue_lines = [generate_ass_dialogue_line(seg) for seg in segments]
full_ass_content = ass_header + "\n".join(dialogue_lines)
logger.info(f"Generated ASS (first 300 chars): {full_ass_content[:300]}")
return full_ass_content
def extract_audio(video_path, output_path):
"""Extract audio from video file using ffmpeg subprocess."""
logger.info(f"Attempting to extract audio from {video_path} to {output_path}")
try:
command = [
"ffmpeg", "-i", video_path,
"-vn", # No video
"-acodec", "pcm_s16le", # Standard WAV format
"-ac", "1", # Mono
"-ar", "16000", # 16kHz sample rate (common for ASR)
"-y", # Overwrite output
output_path
]
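        # Equivalent shell invocation (paths illustrative):
        #   ffmpeg -i input.mp4 -vn -acodec pcm_s16le -ac 1 -ar 16000 -y audio.wav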
logger.info(f"Running audio extraction command: {' '.join(map(shlex.quote, command))}")
process = subprocess.run(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding='utf-8', # Explicitly set encoding
check=False
)
if process.returncode != 0:
logger.error(f"FFmpeg audio extraction error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."
if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
logger.error(f"Audio extraction failed: Output file not created or empty. FFmpeg stderr: {process.stderr}")
return False, f"Output audio file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."
logger.info(f"Audio extracted successfully to {output_path}, size: {os.path.getsize(output_path)} bytes")
return True, ""
except Exception as e:
logger.exception(f"Exception during audio extraction from {video_path}")
return False, str(e)
def run_ffmpeg_with_subtitles(video_path, subtitle_path, output_path, style_options=None):
"""Burn subtitles into video using ffmpeg subprocess.
Args:
video_path: Path to input video
subtitle_path: Path to ASS subtitle file
output_path: Path to save output video
style_options: Optional style parameters (not directly used, but kept for consistency)
Returns:
tuple: (success, error_message)
"""
logger.info(f"Attempting to burn subtitles from {subtitle_path} into {video_path}")
# Check if the subtitle file exists and is not empty
if not os.path.exists(subtitle_path) or os.path.getsize(subtitle_path) == 0:
return False, f"Subtitle file {subtitle_path} does not exist or is empty"
# Check if the video file exists
if not os.path.exists(video_path):
return False, f"Video file {video_path} does not exist"
# Validate the video file using ffprobe
try:
probe_cmd = [
"ffprobe", "-v", "error",
"-select_streams", "v:0",
"-show_entries", "stream=codec_name,width,height",
"-of", "json",
video_path
]
probe_result = subprocess.run(
probe_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding='utf-8'
)
if probe_result.returncode != 0:
logger.error(f"FFprobe validation failed: {probe_result.stderr}")
return False, f"FFprobe validation failed: {probe_result.stderr[:200]}..."
except Exception as e:
logger.exception(f"Exception during video validation: {video_path}")
return False, f"Video validation failed: {str(e)}"
try:
        # Escape characters that are special inside an FFmpeg filter argument:
        # backslashes (Windows paths) and colons (drive letters / option separators)
        subtitle_path_esc = subtitle_path.replace('\\', '\\\\').replace(':', '\\:')
# Ensure paths are properly quoted for the shell command
command = [
"ffmpeg",
"-i", video_path,
"-vf", f"ass='{subtitle_path_esc}'",
"-c:v", "libx264", # Use H.264 codec for broad compatibility
"-preset", "medium", # Balance between speed and quality
"-crf", "23", # Reasonable quality setting (lower is better)
"-c:a", "aac", # Use AAC for audio
"-b:a", "128k", # Decent audio bitrate
"-movflags", "+faststart", # Optimize for web playback
"-y", # Overwrite output if exists
output_path
]
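        # Equivalent shell invocation (paths illustrative):
        #   ffmpeg -i input.mp4 -vf "ass='/tmp/captions_final.ass'" -c:v libx264 -preset medium \
        #          -crf 23 -c:a aac -b:a 128k -movflags +faststart -y output.mp4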
logger.info(f"Running subtitle burn command: {' '.join(map(shlex.quote, command))}")
process = subprocess.run(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding='utf-8',
check=False
)
if process.returncode != 0:
logger.error(f"FFmpeg subtitle burn error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."
# Verify output file was created and is not empty
if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
logger.error(f"Subtitle burning failed: Output file not created or empty. FFmpeg stderr: {process.stderr}")
return False, f"Output video file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."
logger.info(f"Subtitles burned successfully, output: {output_path}, size: {os.path.getsize(output_path)} bytes")
return True, ""
except Exception as e:
logger.exception(f"Exception during subtitle burning: {video_path}")
return False, str(e)
def safe_progress_update(progress_callback, value, desc=""):
    """Safely update progress; a None or failing callback must not crash processing."""
    if progress_callback is not None:
        try:
            progress_callback(value, desc)
        except Exception:
            # Silently ignore progress update errors to avoid flooding logs
            pass
def parse_srt_to_dialogue(srt_content):
"""Basic SRT parser to list of dialogue events for ASS conversion."""
dialogue = []
# Regex to find index, timecodes, and text blocks
# Allows comma or period for milliseconds separator
pattern = re.compile(
r'^\s*(\d+)\s*$\n?' # Index line
r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*' # Start time
r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*$\n' # End time
r'(.*?)(?=\n\s*\n\d+\s*$|\Z)', # Text block (non-greedy) until blank line and next index or end of string
re.DOTALL | re.MULTILINE
)
logger.info("Attempting to parse SRT/VTT content...")
matches_found = 0
last_index = 0
for match in pattern.finditer(srt_content):
matches_found += 1
try:
index = int(match.group(1))
sh, sm, ss, sms = map(int, match.group(2, 3, 4, 5))
eh, em, es, ems = map(int, match.group(6, 7, 8, 9))
start_sec = sh * 3600 + sm * 60 + ss + sms / 1000.0
end_sec = eh * 3600 + em * 60 + es + ems / 1000.0
text = match.group(10).strip().replace('\n', '\\N') # Replace newline with ASS \N
# Basic validation
if end_sec < start_sec:
logger.warning(f"SRT parse warning: End time {end_sec} before start time {start_sec} at index {index}. Skipping.")
continue
if not text:
logger.warning(f"SRT parse warning: Empty text content at index {index}. Skipping.")
continue
dialogue.append({'start': start_sec, 'end': end_sec, 'text': text})
last_index = match.end()
except Exception as e:
logger.warning(f"Could not parse SRT block starting near index {match.group(1)}: {e}")
# Check if parsing consumed a reasonable amount of the input
if matches_found > 0 and last_index < len(srt_content) * 0.8:
logger.warning(f"SRT parsing finished early. Found {matches_found} blocks, but stopped near character {last_index} of {len(srt_content)}. Input format might be inconsistent.")
elif matches_found == 0 and len(srt_content) > 10:
logger.error(f"SRT parsing failed. No dialogue blocks found in content starting with: {srt_content[:100]}...")
logger.info(f"Parsed {len(dialogue)} dialogue events from SRT/VTT content.")
return dialogue
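# e.g. parse_srt_to_dialogue("1\n00:00:01,000 --> 00:00:05,000\nHi\n\n")
#   -> [{'start': 1.0, 'end': 5.0, 'text': 'Hi'}]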
def parse_ass_to_dialogue(ass_content):
"""Basic ASS parser to extract dialogue events."""
dialogue = []
# Regex for ASS Dialogue line - make capturing groups non-optional where possible
# Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
pattern = re.compile(
r'^Dialogue:\s*'
r'(?P<layer>\d+),\s*'
r'(?P<start>\d+:\d{2}:\d{2}\.\d{2}),\s*'
r'(?P<end>\d+:\d{2}:\d{2}\.\d{2}),\s*'
r'(?P<style>[^,]*),\s*' # Style name
r'(?P<name>[^,]*),\s*' # Actor name
r'(?P<marginL>\d+),\s*'
r'(?P<marginR>\d+),\s*'
r'(?P<marginV>\d+),\s*'
r'(?P<effect>[^,]*),\s*' # Effect
r'(?P<text>.*?)$', # Text (rest of line)
re.IGNORECASE
)
# Helper to convert H:MM:SS.xx to seconds
def time_to_seconds(time_str):
try:
parts = time_str.split(':')
h = int(parts[0])
m = int(parts[1])
s_parts = parts[2].split('.')
s = int(s_parts[0])
cs = int(s_parts[1])
return h * 3600 + m * 60 + s + cs / 100.0
except Exception as e:
logger.error(f"Failed to parse time string '{time_str}': {e}")
return 0.0 # Return 0 on failure to avoid crashing, but log it
logger.info("Attempting to parse ASS content...")
lines_parsed = 0
for line in ass_content.splitlines():
line = line.strip()
if not line.lower().startswith('dialogue:'):
continue
match = pattern.match(line)
if match:
lines_parsed += 1
try:
start_sec = time_to_seconds(match.group('start'))
end_sec = time_to_seconds(match.group('end'))
text = match.group('text').strip() # Already handles \N from ASS spec
if end_sec < start_sec:
logger.warning(f"ASS parse warning: End time {end_sec} before start time {start_sec} in line: '{line}'. Skipping.")
continue
if not text:
logger.warning(f"ASS parse warning: Empty text content in line: '{line}'. Skipping.")
continue
dialogue.append({'start': start_sec, 'end': end_sec, 'text': text})
except Exception as e:
logger.warning(f"Could not parse ASS dialogue line: '{line}'. Error: {e}")
else:
logger.warning(f"ASS dialogue line did not match expected pattern: '{line}'")
if lines_parsed == 0 and len(ass_content) > 50: # Check if content was substantial
logger.error(f"ASS parsing failed. No dialogue lines matched the expected pattern in content starting with: {ass_content[:200]}...")
logger.info(f"Parsed {len(dialogue)} dialogue events from {lines_parsed} matched ASS lines.")
return dialogue
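# e.g. parse_ass_to_dialogue("Dialogue: 0,0:00:01.00,0:00:05.00,Default,,0,0,0,,Hi")
#   -> [{'start': 1.0, 'end': 5.0, 'text': 'Hi'}]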
def process_video_with_captions(video, captions, caption_type, font_name, font_size,
                                primary_color, alignment, auto_caption,
                                progress=gr.Progress(track_tqdm=True)):
    """Main processing function.

    `progress` is injected by Gradio because it is declared as a default-valued
    parameter; it is not part of the click() inputs.
    """
temp_dir = None
try:
progress(0, desc="Initializing...")
temp_dir = tempfile.mkdtemp()
logger.info(f"Created temp dir: {temp_dir}")
video_path = os.path.join(temp_dir, "input_video.mp4")
output_path = os.path.join(temp_dir, "output_video.mp4")
# Removed initial_subtitle_path, only need final
final_ass_path = os.path.join(temp_dir, "captions_final.ass")
# --- Handle Video Input ---
progress(0.05, desc="Saving video...")
        if hasattr(video, 'name') and video.name and os.path.exists(video.name):
            shutil.copy(video.name, video_path)
            logger.info(f"Copied input video from Gradio temp file {video.name} to {video_path}")
        elif isinstance(video, str) and os.path.exists(video):
            shutil.copy(video, video_path)
            logger.info(f"Copied input video from path {video} to {video_path}")
else:
raise gr.Error("Could not access uploaded video file. Please try uploading again.")
# --- Prepare Styles ---
progress(0.1, desc="Preparing styles...")
generated_captions_display_text = ""
alignment_map = {"Bottom Center": 2, "Bottom Left": 1, "Bottom Right": 3}
style_options = {
'font_name': font_name,
'font_size': font_size,
'primary_color': primary_color,
'alignment': alignment_map.get(alignment, 2)
}
# --- Auto-Generate or Process Provided Captions ---
dialogue_events = [] # To hold {'start': float, 'end': float, 'text': str}
if auto_caption:
logger.info("Auto-generating captions...")
progress(0.15, desc="Extracting audio...")
audio_path = os.path.join(temp_dir, "audio.wav")
success, error_msg = extract_audio(video_path, audio_path)
if not success: raise gr.Error(f"Audio extraction failed: {error_msg}")
progress(0.25, desc="Transcribing audio...")
transcript = transcribe_audio(audio_path, progress=progress)
if not transcript or not transcript.get("segments"): raise gr.Error("No speech detected.")
dialogue_events = transcript["segments"] # Use segments directly
progress(0.6, desc="Generating ASS captions...")
else: # Use provided captions
logger.info(f"Using provided {caption_type} captions.")
if not captions or captions.strip() == "": raise gr.Error("Caption input is empty.")
progress(0.6, desc=f"Processing {caption_type} captions...")
if caption_type.lower() == 'ass':
logger.info("Parsing provided ASS content.")
dialogue_events = parse_ass_to_dialogue(captions)
if not dialogue_events:
raise gr.Error("Could not parse dialogue lines from provided ASS content.")
elif caption_type.lower() in ['srt', 'vtt']:
logger.info(f"Parsing provided {caption_type} content.")
dialogue_events = parse_srt_to_dialogue(captions)
if not dialogue_events:
raise gr.Error(f"Could not parse provided {caption_type} content.")
else:
raise gr.Error(f"Unsupported caption type: {caption_type}")
# --- Generate Final ASS File ---
if not dialogue_events:
raise gr.Error("No caption dialogue events found or generated.")
logger.info(f"Generating final ASS file with {len(dialogue_events)} events and UI styles.")
final_ass_content = generate_ass_from_transcript(dialogue_events, style_options)
generated_captions_display_text = final_ass_content # Show the final generated ASS
with open(final_ass_path, 'w', encoding='utf-8') as f:
f.write(final_ass_content)
logger.info(f"Written final styled ASS to {final_ass_path}")
# Verify file creation
if not os.path.exists(final_ass_path) or os.path.getsize(final_ass_path) == 0:
raise gr.Error(f"Internal error: Failed to write final ASS file to {final_ass_path}")
# --- Burn Subtitles ---
progress(0.7, desc="Burning subtitles into video...")
success, error_msg = run_ffmpeg_with_subtitles(
video_path, final_ass_path, output_path, style_options
)
if not success:
logger.error(f"Subtitle burning failed. Video: {video_path}, ASS: {final_ass_path}")
raise gr.Error(f"FFmpeg failed to burn subtitles: {error_msg}")
progress(1.0, desc="Processing complete!")
logger.info(f"Output video generated: {output_path}")
return output_path, generated_captions_display_text
except Exception as e:
logger.exception(f"Error in process_video_with_captions")
if temp_dir and os.path.exists(temp_dir):
try:
files = os.listdir(temp_dir)
logger.error(f"Files in temp dir {temp_dir} during error: {files}")
except Exception as list_e:
logger.error(f"Could not list temp dir {temp_dir}: {list_e}")
if isinstance(e, gr.Error): raise e
else: raise gr.Error(f"An unexpected error occurred: {str(e)}")
# Function to toggle interactivity
def toggle_captions_input(auto_generate):
"""Toggle the interactivity of the captions input."""
return gr.update(interactive=not auto_generate)
# --- Gradio Interface ---
with gr.Blocks(title="Video Caption Generator") as app:
gr.Markdown("## Video Caption Generator")
gr.Markdown("Upload a video, choose styling, and add captions. Use auto-generation or provide your own SRT/ASS/VTT.")
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("**Input & Options**")
video_input = gr.Video(label="Upload Video")
            auto_caption = gr.Checkbox(label="Auto-generate captions (overrides manual captions below)", value=False)
captions_input = gr.Textbox(
label="Or Enter Captions Manually",
placeholder="1\n00:00:01,000 --> 00:00:05,000\nHello World\n\n2\n...",
lines=8,
interactive=True
)
caption_type = gr.Dropdown(
choices=["srt", "ass", "vtt"],
value="srt",
label="Format (if providing captions manually)"
)
gr.Markdown("**Caption Styling** (Applied to auto-generated or converted ASS)")
with gr.Row():
font_name = gr.Dropdown(
choices=ACCEPTABLE_FONTS,
value=ACCEPTABLE_FONTS[0] if ACCEPTABLE_FONTS else "Arial",
label="Font"
)
font_size = gr.Slider(minimum=10, maximum=60, value=24, step=1, label="Font Size")
with gr.Row():
primary_color = gr.ColorPicker(value="#FFFFFF", label="Text Color")
alignment = gr.Dropdown(
choices=["Bottom Center", "Bottom Left", "Bottom Right"],
value="Bottom Center",
label="Alignment"
)
process_btn = gr.Button("Generate Captioned Video", variant="primary")
with gr.Column(scale=1):
gr.Markdown("**Output**")
video_output = gr.Video(label="Captioned Video")
generated_captions_output = gr.Textbox(
label="Generated Captions (ASS format if auto-generated)",
lines=10,
interactive=False
)
# Link checkbox to captions input interactivity
auto_caption.change(
fn=toggle_captions_input,
inputs=[auto_caption],
outputs=[captions_input]
)
# Define the main processing function call for the button
process_btn.click(
fn=process_video_with_captions,
inputs=[
video_input,
captions_input,
caption_type,
font_name,
font_size,
primary_color,
alignment,
auto_caption
],
outputs=[video_output, generated_captions_output],
# api_name="generate_captions"
)
# Launch the app
if __name__ == "__main__":
app.launch(debug=True, share=False) # Enable debug for local testing