# Source: Hugging Face Space "hellorahulk/videocaptiontool" (status: Running; file size: 28,370 bytes)
import os
import tempfile
import gradio as gr
import ffmpeg
import logging
import whisper as openai_whisper  # Renamed to avoid potential conflicts
import numpy as np
import torch
import datetime
import subprocess
import shlex
from pathlib import Path
import re # For parsing ASS/SRT

# Configure logging
# basicConfig sets the root logger to INFO with a
# "timestamp - level - message" line format; `logger` is the module-level
# logger used throughout this file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define fonts directory - adapt for Hugging Face environment if needed.
# The Linux location is preferred; the macOS and Windows locations are only
# consulted when it does not exist. If none of them exist FONTS_DIR keeps the
# Linux path and the font scan below falls back to its defaults.
FONTS_DIR = '/usr/share/fonts/truetype'  # Common Linux font location
if not os.path.exists(FONTS_DIR):
    if os.path.exists('/System/Library/Fonts'):  # macOS
        FONTS_DIR = '/System/Library/Fonts'
    # Raw strings: '\W' and '\F' are invalid escape sequences in a normal
    # string literal and trigger a SyntaxWarning on recent Python versions.
    elif os.path.exists(r'C:\Windows\Fonts'):  # Windows
        FONTS_DIR = r'C:\Windows\Fonts'

# FONT_PATHS maps a cleaned-up font base name to the path of the first font
# file found with that name. ACCEPTABLE_FONTS is the list offered in the UI;
# it always contains the three fallbacks below, plus whatever the scan finds.
FONT_PATHS = {}
ACCEPTABLE_FONTS = ['Arial', 'Helvetica', 'Times New Roman'] # Start with common fallbacks
try:
    if FONTS_DIR and os.path.exists(FONTS_DIR):
        logger.info(f"Searching for fonts in: {FONTS_DIR}")
        found_fonts = []
        for root, _dirs, files in os.walk(FONTS_DIR):
            for file in files:
                if not file.lower().endswith(('.ttf', '.otf', '.ttc')):
                    continue
                font_path = os.path.join(root, file)
                font_name = os.path.splitext(file)[0]
                # Basic name cleanup: drop one trailing style word
                # (e.g. "Foo-Bold" -> "Foo").
                base_font_name = re.sub(r'[-_ ]?(bold|italic|regular|medium|light|condensed)?$', '', font_name, flags=re.IGNORECASE)
                if not base_font_name:
                    # A file named just "Bold.ttf" / "Regular.otf" cleans up
                    # to an empty string; never offer an empty font name.
                    continue
                if base_font_name not in FONT_PATHS:
                    FONT_PATHS[base_font_name] = font_path
                    found_fonts.append(base_font_name)
        if found_fonts:
            ACCEPTABLE_FONTS = sorted(set(found_fonts + ACCEPTABLE_FONTS))
            logger.info(f"Found system fonts: {ACCEPTABLE_FONTS}")
        else:
            logger.warning(f"No font files found in {FONTS_DIR}. Using defaults.")
    else:
        logger.warning(f"Font directory {FONTS_DIR} not found. Using defaults: {ACCEPTABLE_FONTS}")
except Exception as e:
    # Font discovery is best-effort: any failure leaves the defaults in place.
    logger.warning(f"Could not load system fonts from {FONTS_DIR}: {e}. Using defaults: {ACCEPTABLE_FONTS}")

# Global variable for Whisper model to avoid reloading.
# Starts as None; transcribe_audio() loads the model on first use and stores
# it here so later calls reuse the same instance.
whisper_model = None

def generate_style_line(options):
    """Build the ASS ``Style:`` line for the single 'Default' caption style.

    Args:
        options: Mapping of UI style settings. Recognised keys (all optional):
            'font_name' (default 'Arial'), 'font_size' (default 24),
            'primary_color' (default '#FFFFFF') and 'alignment'
            (default 2 = bottom centre).

    Returns:
        str: ``"Style: "`` followed by the comma-separated style values, in
        the order of the ``style_options`` keys below.
    """
    def hex_to_ass_bgr(color):
        """Convert a UI colour string to an ASS ``&HBBGGRR`` colour.

        Accepts '#RRGGBB', the '#RGB' shorthand and 'rgb(...)'/'rgba(...)'
        strings. Anything else (including None or non-hex digits) yields
        opaque white instead of raising.
        """
        fallback = '&H00FFFFFF'  # Default to white if format is wrong
        if not isinstance(color, str):
            return fallback
        color = color.strip()

        rgb_match = re.match(
            r'rgba?\(\s*([\d.]+)\s*,\s*([\d.]+)\s*,\s*([\d.]+)',
            color,
            flags=re.IGNORECASE,
        )
        if rgb_match:
            try:
                # Components may arrive as floats; round and clamp to a byte.
                r, g, b = (
                    min(255, max(0, int(round(float(component)))))
                    for component in rgb_match.groups()
                )
            except ValueError:
                return fallback
            return f"&H{b:02X}{g:02X}{r:02X}"

        digits = color.lstrip('#')
        if len(digits) == 3:
            # Expand '#RGB' shorthand to 'RRGGBB'.
            digits = ''.join(ch * 2 for ch in digits)
        if not re.fullmatch(r'[0-9a-fA-F]{6}', digits):
            return fallback
        r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
        # ASS stores colours blue-first.
        return f"&H{b:02X}{g:02X}{r:02X}"

    primary_color_ass = hex_to_ass_bgr(options.get('primary_color', '#FFFFFF'))

    # Insertion order matters: the values are emitted positionally and must
    # line up with the "Format:" line of the [V4+ Styles] section.
    style_options = {
        'Name': 'Default',
        'Fontname': options.get('font_name', 'Arial'), # Ensure this font is accessible to FFmpeg
        'Fontsize': options.get('font_size', 24),
        'PrimaryColour': primary_color_ass,
        'SecondaryColour': '&H000000FF', # Often unused, but good to define
        'OutlineColour': '&H00000000', # Black outline
        'BackColour': '&H80000000', # Semi-transparent black background/shadow
        'Bold': 0, # Use -1 for True, 0 for False in ASS
        'Italic': 0,
        'Underline': 0,
        'StrikeOut': 0,
        'ScaleX': 100,
        'ScaleY': 100,
        'Spacing': 0,
        'Angle': 0,
        'BorderStyle': 1, # 1 = Outline + Shadow
        'Outline': 2, # Outline thickness
        'Shadow': 1, # Shadow distance
        'Alignment': options.get('alignment', 2), # 2 = Bottom Center
        'MarginL': 10,
        'MarginR': 10,
        'MarginV': 10, # Bottom margin
        'Encoding': 1 # Default ANSI encoding
    }
    logger.info(f"Generated ASS Style Options: {style_options}")
    return f"Style: {','.join(map(str, style_options.values()))}"

def transcribe_audio(audio_path, progress=None):
    """Run Whisper speech recognition on an audio file.

    The model is loaded on the first call and cached in the module-level
    ``whisper_model``; later calls reuse it.

    Args:
        audio_path: Path of the audio file to transcribe.
        progress: Optional progress callback, invoked through
            ``safe_progress_update``.

    Returns:
        Whatever ``whisper_model.transcribe`` returns for the file.

    Raises:
        Exception: Any error from loading or running the model is logged
            with its traceback and re-raised unchanged.
    """
    global whisper_model
    logger.info(f"Starting transcription for: {audio_path}")
    try:
        model_missing = whisper_model is None
        if model_missing:
            safe_progress_update(progress, 0.1, "Loading Whisper model...")
            if torch.cuda.is_available():
                device = "cuda"
            else:
                device = "cpu"
            logger.info(f"Using device: {device} for Whisper")
            # CPU-only hosts get the smaller model to keep latency tolerable.
            model_size = "tiny.en" if device != "cuda" else "base"
            logger.info(f"Loading Whisper model size: {model_size}")
            whisper_model = openai_whisper.load_model(model_size, device=device)
            safe_progress_update(progress, 0.3, "Model loaded, processing audio...")

        # Half precision is requested only when CUDA is available.
        use_half_precision = torch.cuda.is_available()
        transcription = whisper_model.transcribe(audio_path, fp16=use_half_precision)
        logger.info(f"Transcription result (first 100 chars): {str(transcription)[:100]}")
        safe_progress_update(progress, 0.7, "Transcription complete, formatting captions...")
        return transcription
    except Exception:
        # logger.exception records the traceback before the error propagates.
        logger.exception(f"Error transcribing audio: {audio_path}")
        raise

def format_time(seconds):
    """Format a time offset as an ASS timestamp (``H:MM:SS.cc``).

    Args:
        seconds: Offset in seconds. Negative values are clamped to zero.

    Returns:
        str: Hours (unpadded), two-digit minutes and seconds, and hundredths
        of a second, e.g. ``"1:01:01.50"``.
    """
    # Work in integer centiseconds. Truncating the float fraction directly
    # loses a hundredth on values such as 0.57 (0.57 * 100 == 56.999...), and
    # rounding on the total lets 59.999 carry cleanly into the next minute.
    total_cs = max(0, int(round(float(seconds) * 100)))
    total_seconds, hundredths = divmod(total_cs, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h}:{m:02d}:{s:02d}.{hundredths:02d}"

def format_time_srt(seconds):
    """Format a time offset as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Offset in seconds. Negative values are clamped to zero.

    Returns:
        str: Zero-padded hours, minutes, seconds and milliseconds, e.g.
        ``"01:02:03,250"``.
    """
    # Work in integer milliseconds. Truncating the float fraction directly
    # turns 1.001 into ",000" because 1.001 % 1 is slightly below 0.001.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def generate_srt_from_transcript(segments):
    """Render transcript segments as SRT text.

    Args:
        segments: Iterable of mappings with "start", "end" and "text" keys.

    Returns:
        str: Numbered SRT cues (numbering starts at 1) separated by blank
        lines, with surrounding whitespace stripped from the whole result.
    """
    cue_blocks = []
    for number, seg in enumerate(segments, start=1):
        begin = format_time_srt(seg["start"])
        finish = format_time_srt(seg["end"])
        caption = seg["text"].strip()
        cue_blocks.append(f"{number}\n{begin} --> {finish}\n{caption}\n\n")
    srt_content = "".join(cue_blocks)
    logger.info(f"Generated SRT (first 200 chars): {srt_content[:200]}")
    return srt_content.strip()

def generate_ass_dialogue_line(segment, style_name='Default'):
    """Render one segment as an ASS ``Dialogue:`` event line.

    Args:
        segment: Mapping with "start", "end" and "text" keys.
        style_name: Name of the ASS style the event refers to.

    Returns:
        str: The event line, with layer 0, zero margins and empty Name and
        Effect fields.
    """
    begin, finish = (format_time(segment[key]) for key in ("start", "end"))
    # ASS marks a line break inside an event with the literal sequence \N.
    caption = "\\N".join(segment["text"].strip().split('\n'))
    # Field order: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
    return f"Dialogue: 0,{begin},{finish},{style_name},,0,0,0,,{caption}"

def generate_ass_from_transcript(segments, style_options):
    """Assemble a complete ASS document from segments and style settings.

    Args:
        segments: Iterable of mappings with "start", "end" and "text" keys.
        style_options: Style settings passed to ``generate_style_line``.

    Returns:
        str: The header ([Script Info], [V4+ Styles] and the [Events] format
        line) followed by one Dialogue line per segment.
    """
    style_line = generate_style_line(style_options)
    ass_header = f"""
[Script Info]
Title: Generated Captions
ScriptType: v4.00+
WrapStyle: 0
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
{style_line}

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    events_section = "\n".join(map(generate_ass_dialogue_line, segments))
    document = ass_header + events_section
    logger.info(f"Generated ASS (first 300 chars): {document[:300]}")
    return document

def extract_audio(video_path, output_path):
    """Extract the audio track of a video to a WAV file using ffmpeg.

    The output is 16-bit PCM, mono, 16 kHz.

    Args:
        video_path: Path of the input video.
        output_path: Path of the WAV file to write (overwritten if present).

    Returns:
        tuple: ``(True, "")`` on success, otherwise ``(False, message)``
        where the message describes the failure. No exception escapes.
    """
    logger.info(f"Attempting to extract audio from {video_path} to {output_path}")
    try:
        command = [
            "ffmpeg", "-i", video_path,
            "-vn", # No video
            "-acodec", "pcm_s16le", # Standard WAV format
            "-ac", "1", # Mono
            "-ar", "16000", # 16kHz sample rate (common for ASR)
            "-y", # Overwrite output
            output_path
        ]
        logger.info(f"Running audio extraction command: {' '.join(map(shlex.quote, command))}")
        process = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,  # ffmpeg must not read the server's stdin
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8', # Explicitly set encoding
            # ffmpeg echoes file metadata to stderr, which is not guaranteed
            # to be valid UTF-8; without this a decode error would be
            # reported as an extraction failure even if ffmpeg succeeded.
            errors='replace',
            check=False
        )

        if process.returncode != 0:
            logger.error(f"FFmpeg audio extraction error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
            return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."

        # A zero exit code alone does not prove usable output was written.
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            logger.error(f"Audio extraction failed: Output file not created or empty. FFmpeg stderr: {process.stderr}")
            return False, f"Output audio file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."

        logger.info(f"Audio extracted successfully to {output_path}, size: {os.path.getsize(output_path)} bytes")
        return True, ""
    except Exception as e:
        # Covers e.g. a missing ffmpeg binary; reported through the return value.
        logger.exception(f"Exception during audio extraction from {video_path}")
        return False, str(e)

def _escape_ffmpeg_filter_path(path):
    """Quote *path* so it can be used as a filter option value inside ``-vf``."""
    # First level (filter option value): backslash, colon and single quote
    # are special, so a Windows drive letter or a quote would otherwise split
    # or truncate the file name.
    value = path.replace('\\', '\\\\').replace(':', '\\:').replace("'", "\\'")
    # Second level (filtergraph description): wrap in single quotes. A literal
    # quote has to close the quoted run, be backslash-escaped, and reopen it.
    return "'" + value.replace("'", "'\\''") + "'"


def run_ffmpeg_with_subtitles(video_path, subtitle_path, output_path, style_options=None):
    """Burn subtitles into video using ffmpeg subprocess.

    The video is re-encoded with libx264 and AAC audio, with the ASS file
    rendered onto the picture by the ``ass`` filter.

    Args:
        video_path: Path to input video
        subtitle_path: Path to ASS subtitle file
        output_path: Path to save output video
        style_options: Optional style parameters (not directly used, but kept for consistency)

    Returns:
        tuple: (success, error_message). ``error_message`` is "" on success.
        No exception escapes.
    """
    logger.info(f"Attempting to burn subtitles from {subtitle_path} into {video_path}")

    # Check if the subtitle file exists and is not empty
    if not os.path.exists(subtitle_path) or os.path.getsize(subtitle_path) == 0:
        return False, f"Subtitle file {subtitle_path} does not exist or is empty"

    # Check if the video file exists
    if not os.path.exists(video_path):
        return False, f"Video file {video_path} does not exist"

    # Validate the video file using ffprobe
    try:
        probe_cmd = [
            "ffprobe", "-v", "error",
            "-select_streams", "v:0",
            "-show_entries", "stream=codec_name,width,height",
            "-of", "json",
            video_path
        ]
        probe_result = subprocess.run(
            probe_cmd,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='replace'  # tool output is not guaranteed to be valid UTF-8
        )

        if probe_result.returncode != 0:
            logger.error(f"FFprobe validation failed: {probe_result.stderr}")
            return False, f"FFprobe validation failed: {probe_result.stderr[:200]}..."
    except Exception as e:
        logger.exception(f"Exception during video validation: {video_path}")
        return False, f"Video validation failed: {str(e)}"

    try:
        # The command is passed as an argument list (no shell), so only
        # ffmpeg's own filtergraph escaping applies to the subtitle path.
        subtitle_path_esc = _escape_ffmpeg_filter_path(subtitle_path)

        command = [
            "ffmpeg",
            "-i", video_path,
            "-vf", f"ass={subtitle_path_esc}",
            "-c:v", "libx264",  # Use H.264 codec for broad compatibility
            "-preset", "medium", # Balance between speed and quality
            "-crf", "23",        # Reasonable quality setting (lower is better)
            "-c:a", "aac",       # Use AAC for audio
            "-b:a", "128k",      # Decent audio bitrate
            "-movflags", "+faststart", # Optimize for web playback
            "-y",                # Overwrite output if exists
            output_path
        ]

        logger.info(f"Running subtitle burn command: {' '.join(map(shlex.quote, command))}")

        process = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,  # ffmpeg must not read the server's stdin
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            # A decode error here would otherwise be reported as a burn
            # failure even though ffmpeg itself succeeded.
            errors='replace',
            check=False
        )

        if process.returncode != 0:
            logger.error(f"FFmpeg subtitle burn error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
            return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."

        # Verify output file was created and is not empty
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            logger.error(f"Subtitle burning failed: Output file not created or empty. FFmpeg stderr: {process.stderr}")
            return False, f"Output video file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."

        logger.info(f"Subtitles burned successfully, output: {output_path}, size: {os.path.getsize(output_path)} bytes")
        return True, ""

    except Exception as e:
        logger.exception(f"Exception during subtitle burning: {video_path}")
        return False, str(e)

def safe_progress_update(progress_callback, value, desc=""):
    """Report progress on a best-effort basis.

    Does nothing when ``progress_callback`` is None; otherwise calls
    ``progress_callback(value, desc)`` and discards any exception it raises,
    so a broken progress reporter can never abort the surrounding work.
    """
    if progress_callback is None:
        return
    try:
        progress_callback(value, desc)
    except Exception:
        # Intentionally silent: progress reporting is cosmetic and logging
        # every failed update would flood the logs.
        return

def parse_srt_to_dialogue(srt_content):
    """Parse SRT or WebVTT text into dialogue events for ASS conversion.

    The parser is driven by timing lines (``start --> end``), so numeric SRT
    indices, WebVTT cue identifiers, the ``WEBVTT`` header and cue settings
    after the end time are all optional and ignored. Hours may be omitted and
    the millisecond separator may be a comma or a period. A cue's text runs
    from its timing line to the next blank line or the next timing line.

    Args:
        srt_content: The subtitle text. None or "" yields an empty list.

    Returns:
        list: Dicts with 'start' and 'end' (float seconds) and 'text', where
        line breaks inside a cue are written as the ASS sequence ``\\N``.
        Cues that end before they start, or have no text, are skipped.
    """
    dialogue = []
    timestamp = r'(?:(\d+):)?(\d{1,2}):(\d{2})[,.](\d{3})'
    timing_line = re.compile(
        r'^\s*' + timestamp + r'\s*-->\s*' + timestamp + r'(?:\s.*)?$'
    )

    def to_seconds(hours, minutes, seconds, millis):
        # `hours` is None when the timestamp omits the hours field.
        return int(hours or 0) * 3600 + int(minutes) * 60 + int(seconds) + int(millis) / 1000.0

    logger.info("Attempting to parse SRT/VTT content...")
    content = srt_content or ""
    # Normalise line endings so CRLF input does not leave stray '\r' in text.
    lines = content.replace('\r\n', '\n').replace('\r', '\n').split('\n')

    cues_found = 0
    pos = 0
    while pos < len(lines):
        match = timing_line.match(lines[pos])
        pos += 1
        if match is None:
            continue  # header, index/identifier line, blank line or stray text

        cues_found += 1
        text_lines = []
        while pos < len(lines) and lines[pos].strip() and not timing_line.match(lines[pos]):
            text_lines.append(lines[pos])
            pos += 1
        # If the text ran straight into the next timing line (no blank line
        # between cues), a trailing digits-only line is that cue's index.
        if pos < len(lines) and lines[pos].strip() and text_lines and text_lines[-1].strip().isdigit():
            text_lines.pop()

        start_sec = to_seconds(*match.group(1, 2, 3, 4))
        end_sec = to_seconds(*match.group(5, 6, 7, 8))
        text = '\n'.join(text_lines).strip().replace('\n', '\\N') # Replace newline with ASS \N

        # Basic validation
        if end_sec < start_sec:
            logger.warning(f"SRT parse warning: End time {end_sec} before start time {start_sec} at index {cues_found}. Skipping.")
            continue
        if not text:
            logger.warning(f"SRT parse warning: Empty text content at index {cues_found}. Skipping.")
            continue

        dialogue.append({'start': start_sec, 'end': end_sec, 'text': text})

    if cues_found == 0 and len(content) > 10:
        logger.error(f"SRT parsing failed. No dialogue blocks found in content starting with: {content[:100]}...")

    logger.info(f"Parsed {len(dialogue)} dialogue events from SRT/VTT content.")
    return dialogue

def parse_ass_to_dialogue(ass_content):
    """Extract dialogue events from ASS subtitle text.

    Only lines beginning with ``Dialogue:`` (case-insensitive) are examined;
    everything else, including the header sections, is ignored.

    Args:
        ass_content: Full text of an ASS document.

    Returns:
        list: Dicts with 'start' and 'end' (float seconds) and 'text' (the
        event text as written). Events that end before they start, or have no
        text, are skipped.
    """
    # Field order: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
    event_pattern = re.compile(
        r'^Dialogue:\s*'
        r'(?P<layer>\d+),\s*'
        r'(?P<start>\d+:\d{2}:\d{2}\.\d{2}),\s*'
        r'(?P<end>\d+:\d{2}:\d{2}\.\d{2}),\s*'
        r'(?P<style>[^,]*),\s*'  # Style name
        r'(?P<name>[^,]*),\s*'  # Actor name
        r'(?P<marginL>\d+),\s*'
        r'(?P<marginR>\d+),\s*'
        r'(?P<marginV>\d+),\s*'
        r'(?P<effect>[^,]*),\s*'  # Effect
        r'(?P<text>.*?)$',  # Text (rest of line)
        re.IGNORECASE
    )

    def time_to_seconds(time_str):
        """Convert an H:MM:SS.cc timestamp to seconds (0.0 if unparsable)."""
        try:
            fields = time_str.split(':')
            hours, minutes = int(fields[0]), int(fields[1])
            sec_fields = fields[2].split('.')
            whole, hundredths = int(sec_fields[0]), int(sec_fields[1])
            return hours * 3600 + minutes * 60 + whole + hundredths / 100.0
        except Exception as e:
            logger.error(f"Failed to parse time string '{time_str}': {e}")
            return 0.0  # Logged above; keeps the caller running

    events = []
    lines_parsed = 0
    logger.info("Attempting to parse ASS content...")
    for line in (raw.strip() for raw in ass_content.splitlines()):
        if not line.lower().startswith('dialogue:'):
            continue

        match = event_pattern.match(line)
        if match is None:
            logger.warning(f"ASS dialogue line did not match expected pattern: '{line}'")
            continue

        lines_parsed += 1
        try:
            start_sec = time_to_seconds(match.group('start'))
            end_sec = time_to_seconds(match.group('end'))
            # The text is kept as written; any \N breaks are already ASS syntax.
            text = match.group('text').strip()

            if end_sec < start_sec:
                logger.warning(f"ASS parse warning: End time {end_sec} before start time {start_sec} in line: '{line}'. Skipping.")
            elif not text:
                logger.warning(f"ASS parse warning: Empty text content in line: '{line}'. Skipping.")
            else:
                events.append({'start': start_sec, 'end': end_sec, 'text': text})
        except Exception as e:
            logger.warning(f"Could not parse ASS dialogue line: '{line}'. Error: {e}")

    # Only complain when the input was substantial yet nothing matched.
    if lines_parsed == 0 and len(ass_content) > 50:
        logger.error(f"ASS parsing failed. No dialogue lines matched the expected pattern in content starting with: {ass_content[:200]}...")

    logger.info(f"Parsed {len(events)} dialogue events from {lines_parsed} matched ASS lines.")
    return events

def _remove_quietly(path):
    """Delete *path* if it exists; log (but never raise) on failure."""
    try:
        if os.path.exists(path):
            os.remove(path)
    except OSError as remove_err:
        logger.warning(f"Could not remove intermediate file {path}: {remove_err}")


def process_video_with_captions(video, captions, caption_type, font_name, font_size, 
                               primary_color, alignment, auto_caption):
    """Main processing function: produce a video with burned-in captions.

    Copies the uploaded video into a fresh temp directory, obtains caption
    events (transcribed with Whisper when ``auto_caption`` is set, otherwise
    parsed from ``captions``), writes them as a styled ASS file and burns
    that file into the video with ffmpeg.

    Args:
        video: Uploaded video, either a path string or an object whose
            ``name`` attribute is a path.
        captions: Caption text entered by the user (ignored when
            ``auto_caption`` is set).
        caption_type: Format of ``captions``: 'srt', 'ass' or 'vtt'.
        font_name: Font name written into the ASS style.
        font_size: Font size written into the ASS style.
        primary_color: Text colour string from the colour picker.
        alignment: One of "Bottom Center", "Bottom Left", "Bottom Right";
            anything else falls back to bottom centre.
        auto_caption: When true, captions are generated from the audio.

    Returns:
        tuple: (path of the captioned video, the ASS text that was burned in).

    Raises:
        gr.Error: For every failure; unexpected exceptions are wrapped.
    """
    # Function-scope import: shutil is only needed here.
    import shutil

    # NOTE(review): Gradio's documented pattern is a `progress=gr.Progress()`
    # default parameter - confirm that instantiating it here reports progress
    # on the deployed Gradio version.
    progress = gr.Progress(track_tqdm=True)
    temp_dir = None
    try:
        progress(0, desc="Initializing...")
        temp_dir = tempfile.mkdtemp()
        logger.info(f"Created temp dir: {temp_dir}")

        video_path = os.path.join(temp_dir, "input_video.mp4")
        output_path = os.path.join(temp_dir, "output_video.mp4")
        audio_path = os.path.join(temp_dir, "audio.wav")
        final_ass_path = os.path.join(temp_dir, "captions_final.ass")

        # --- Handle Video Input ---
        progress(0.05, desc="Saving video...")
        if hasattr(video, 'name') and video.name and os.path.exists(video.name):
            shutil.copy(video.name, video_path)
            logger.info(f"Copied input video from Gradio temp file {video.name} to {video_path}")
        elif isinstance(video, str) and os.path.exists(video):
            shutil.copy(video, video_path)
            logger.info(f"Copied input video from path {video} to {video_path}")
        else:
            raise gr.Error("Could not access uploaded video file. Please try uploading again.")

        # --- Prepare Styles ---
        progress(0.1, desc="Preparing styles...")
        alignment_map = {"Bottom Center": 2, "Bottom Left": 1, "Bottom Right": 3}
        style_options = {
            'font_name': font_name,
            'font_size': font_size,
            'primary_color': primary_color,
            'alignment': alignment_map.get(alignment, 2)
        }

        # --- Auto-Generate or Process Provided Captions ---
        dialogue_events = [] # To hold {'start': float, 'end': float, 'text': str}

        if auto_caption:
            logger.info("Auto-generating captions...")
            progress(0.15, desc="Extracting audio...")
            success, error_msg = extract_audio(video_path, audio_path)
            if not success:
                raise gr.Error(f"Audio extraction failed: {error_msg}")

            progress(0.25, desc="Transcribing audio...")
            transcript = transcribe_audio(audio_path, progress=progress)
            if not transcript or not transcript.get("segments"):
                raise gr.Error("No speech detected.")
            dialogue_events = transcript["segments"] # Use segments directly
            progress(0.6, desc="Generating ASS captions...")

        else: # Use provided captions
            logger.info(f"Using provided {caption_type} captions.")
            if not captions or captions.strip() == "":
                raise gr.Error("Caption input is empty.")

            progress(0.6, desc=f"Processing {caption_type} captions...")
            if caption_type.lower() == 'ass':
                logger.info("Parsing provided ASS content.")
                dialogue_events = parse_ass_to_dialogue(captions)
                if not dialogue_events:
                    raise gr.Error("Could not parse dialogue lines from provided ASS content.")
            elif caption_type.lower() in ['srt', 'vtt']:
                logger.info(f"Parsing provided {caption_type} content.")
                dialogue_events = parse_srt_to_dialogue(captions)
                if not dialogue_events:
                    raise gr.Error(f"Could not parse provided {caption_type} content.")
            else:
                raise gr.Error(f"Unsupported caption type: {caption_type}")

        # --- Generate Final ASS File ---
        if not dialogue_events:
            raise gr.Error("No caption dialogue events found or generated.")

        logger.info(f"Generating final ASS file with {len(dialogue_events)} events and UI styles.")
        # User-supplied captions are re-rendered too, so the UI styling
        # always replaces whatever style the input carried.
        final_ass_content = generate_ass_from_transcript(dialogue_events, style_options)
        generated_captions_display_text = final_ass_content # Show the final generated ASS

        with open(final_ass_path, 'w', encoding='utf-8') as f:
            f.write(final_ass_content)
        logger.info(f"Written final styled ASS to {final_ass_path}")

        # Verify file creation
        if not os.path.exists(final_ass_path) or os.path.getsize(final_ass_path) == 0:
            raise gr.Error(f"Internal error: Failed to write final ASS file to {final_ass_path}")

        # --- Burn Subtitles ---
        progress(0.7, desc="Burning subtitles into video...")
        success, error_msg = run_ffmpeg_with_subtitles(
            video_path, final_ass_path, output_path, style_options
        )
        if not success:
            logger.error(f"Subtitle burning failed. Video: {video_path}, ASS: {final_ass_path}")
            raise gr.Error(f"FFmpeg failed to burn subtitles: {error_msg}")

        # The input copy and extracted audio are no longer needed; only the
        # output video (returned below) has to stay on disk.
        _remove_quietly(video_path)
        _remove_quietly(audio_path)

        progress(1.0, desc="Processing complete!")
        logger.info(f"Output video generated: {output_path}")

        return output_path, generated_captions_display_text

    except Exception as e:
        logger.exception("Error in process_video_with_captions")
        if temp_dir and os.path.exists(temp_dir):
            try:
                files = os.listdir(temp_dir)
                logger.error(f"Files in temp dir {temp_dir} during error: {files}")
            except Exception as list_e:
                logger.error(f"Could not list temp dir {temp_dir}: {list_e}")
            # Nothing in the directory is returned on failure, so remove it
            # rather than leaking a copy of the upload on every error.
            shutil.rmtree(temp_dir, ignore_errors=True)
        if isinstance(e, gr.Error):
            raise
        raise gr.Error(f"An unexpected error occurred: {str(e)}") from e

# Function to toggle interactivity
def toggle_captions_input(auto_generate):
    """Return a Gradio update making the captions box editable only when
    auto-generation is off."""
    manual_entry_enabled = not auto_generate
    return gr.update(interactive=manual_entry_enabled)

# --- Gradio Interface ---
# Two-column layout: inputs and styling controls on the left, the captioned
# video and the generated ASS text on the right.
with gr.Blocks(title="Video Caption Generator") as app:
    gr.Markdown("## Video Caption Generator")
    gr.Markdown("Upload a video, choose styling, and add captions. Use auto-generation or provide your own SRT/ASS/VTT.")
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("**Input & Options**")
            video_input = gr.Video(label="Upload Video")
            auto_caption = gr.Checkbox(label="Auto-generate captions (Overrides below)", value=False)
            captions_input = gr.Textbox(
                label="Or Enter Captions Manually", 
                placeholder="1\n00:00:01,000 --> 00:00:05,000\nHello World\n\n2\n...",
                lines=8,
                interactive=True
            )
            caption_type = gr.Dropdown(
                choices=["srt", "ass", "vtt"], 
                value="srt", 
                label="Format (if providing captions manually)"
            )
            
            gr.Markdown("**Caption Styling** (Applied to auto-generated or converted ASS)")
            with gr.Row():
                 # Choices come from the font scan at import time; the first
                 # entry of that list is pre-selected.
                 font_name = gr.Dropdown(
                    choices=ACCEPTABLE_FONTS,
                    value=ACCEPTABLE_FONTS[0] if ACCEPTABLE_FONTS else "Arial",
                    label="Font"
                )
                 font_size = gr.Slider(minimum=10, maximum=60, value=24, step=1, label="Font Size")
            with gr.Row():
                primary_color = gr.ColorPicker(value="#FFFFFF", label="Text Color")
                # These labels are the keys of alignment_map in
                # process_video_with_captions.
                alignment = gr.Dropdown(
                    choices=["Bottom Center", "Bottom Left", "Bottom Right"],
                    value="Bottom Center",
                    label="Alignment"
                )
                
            process_btn = gr.Button("Generate Captioned Video", variant="primary")
        
        with gr.Column(scale=1):
             gr.Markdown("**Output**")
             video_output = gr.Video(label="Captioned Video")
             generated_captions_output = gr.Textbox(
                label="Generated Captions (ASS format if auto-generated)", 
                lines=10,
                interactive=False
             )
    
    # Link checkbox to captions input interactivity: ticking auto-generate
    # makes the manual captions box read-only.
    auto_caption.change(
        fn=toggle_captions_input,
        inputs=[auto_caption], 
        outputs=[captions_input]
    )
    
    # Define the main processing function call for the button.
    # The order of `inputs` must match the positional parameters of
    # process_video_with_captions.
    process_btn.click(
        fn=process_video_with_captions,
        inputs=[
            video_input,
            captions_input,
            caption_type,
            font_name,
            font_size,
            primary_color,
            alignment,
            auto_caption
        ],
        outputs=[video_output, generated_captions_output],
        # api_name="generate_captions"
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    app.launch(debug=True, share=False) # Enable debug for local testing