import logging
import os
import re  # For parsing ASS/SRT
import shlex
import shutil
import subprocess
import tempfile

import gradio as gr
import torch
import whisper as openai_whisper  # Renamed to avoid potential conflicts

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define fonts directory - adapt for Hugging Face environment if needed
FONTS_DIR = '/usr/share/fonts/truetype'  # Common Linux font location
# Check common font locations for other OSes if needed
if not os.path.exists(FONTS_DIR) and os.path.exists('/System/Library/Fonts'):  # macOS
    FONTS_DIR = '/System/Library/Fonts'
elif not os.path.exists(FONTS_DIR) and os.path.exists(r'C:\Windows\Fonts'):  # Windows (raw string: "\W" is not a valid escape)
    FONTS_DIR = r'C:\Windows\Fonts'

FONT_PATHS = {}
ACCEPTABLE_FONTS = ['Arial', 'Helvetica', 'Times New Roman']  # Start with common fallbacks

try:
    if FONTS_DIR and os.path.exists(FONTS_DIR):
        logger.info(f"Searching for fonts in: {FONTS_DIR}")
        found_fonts = []
        for root, dirs, files in os.walk(FONTS_DIR):
            for file in files:
                if file.lower().endswith(('.ttf', '.otf', '.ttc')):
                    font_path = os.path.join(root, file)
                    font_name = os.path.splitext(file)[0]
                    # Basic name cleanup: strip a trailing weight/style suffix
                    base_font_name = re.sub(r'[-_ ]?(bold|italic|regular|medium|light|condensed)?$', '', font_name, flags=re.IGNORECASE)
                    if base_font_name not in FONT_PATHS:
                        FONT_PATHS[base_font_name] = font_path
                        found_fonts.append(base_font_name)
        if found_fonts:
            ACCEPTABLE_FONTS = sorted(set(found_fonts + ACCEPTABLE_FONTS))
            logger.info(f"Found system fonts: {ACCEPTABLE_FONTS}")
        else:
            logger.warning(f"No font files found in {FONTS_DIR}. Using defaults.")
    else:
        logger.warning(f"Font directory {FONTS_DIR} not found. Using defaults: {ACCEPTABLE_FONTS}")
except Exception as e:
    logger.warning(f"Could not load system fonts from {FONTS_DIR}: {e}. Using defaults: {ACCEPTABLE_FONTS}")

# Global variable for the Whisper model so it is loaded only once per process
whisper_model = None

def generate_style_line(options):
    """Generate an ASS style line from options, using common defaults.

    libass expects V4+ colours as &HAABBGGRR (alpha, blue, green, red);
    an alpha of 00 is fully opaque.
    """
    # Convert a hex colour picker value (#RRGGBB) to ASS format (&HAABBGGRR)
    def hex_to_ass_bgr(hex_color):
        hex_color = hex_color.lstrip('#')
        if len(hex_color) == 6:
            r, g, b = tuple(int(hex_color[i:i + 2], 16) for i in (0, 2, 4))
            return f"&H00{b:02X}{g:02X}{r:02X}"
        return '&H00FFFFFF'  # Default to opaque white if the format is wrong
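
    # For illustration (note the blue-green-red byte order):
    #   hex_to_ass_bgr('#FF0000') -> '&H000000FF'  (red)
    #   hex_to_ass_bgr('#0000FF') -> '&H00FF0000'  (blue)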

    primary_color_ass = hex_to_ass_bgr(options.get('primary_color', '#FFFFFF'))
    style_options = {
        'Name': 'Default',
        'Fontname': options.get('font_name', 'Arial'),  # Must be a font FFmpeg/libass can resolve
        'Fontsize': options.get('font_size', 24),
        'PrimaryColour': primary_color_ass,
        'SecondaryColour': '&H000000FF',  # Often unused, but good to define
        'OutlineColour': '&H00000000',    # Black outline
        'BackColour': '&H80000000',       # Semi-transparent black background/shadow
        'Bold': 0,  # ASS uses -1 for True, 0 for False
        'Italic': 0,
        'Underline': 0,
        'StrikeOut': 0,
        'ScaleX': 100,
        'ScaleY': 100,
        'Spacing': 0,
        'Angle': 0,
        'BorderStyle': 1,  # 1 = outline + shadow
        'Outline': 2,      # Outline thickness
        'Shadow': 1,       # Shadow distance
        'Alignment': options.get('alignment', 2),  # 2 = bottom center
        'MarginL': 10,
        'MarginR': 10,
        'MarginV': 10,  # Bottom margin
        'Encoding': 1,  # Default ANSI encoding
    }
    logger.info(f"Generated ASS Style Options: {style_options}")
    return f"Style: {','.join(map(str, style_options.values()))}"

def transcribe_audio(audio_path, progress=None):
    """Transcribe audio using the Whisper ASR model."""
    global whisper_model
    logger.info(f"Starting transcription for: {audio_path}")
    try:
        if whisper_model is None:
            safe_progress_update(progress, 0.1, "Loading Whisper model...")
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"Using device: {device} for Whisper")
            # Use a smaller model on CPU to keep transcription times reasonable
            model_size = "base" if device == "cuda" else "tiny.en"  # or "tiny" for non-English audio
            logger.info(f"Loading Whisper model size: {model_size}")
            whisper_model = openai_whisper.load_model(model_size, device=device)
        safe_progress_update(progress, 0.3, "Model loaded, processing audio...")
        result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
        logger.info(f"Transcription result (first 100 chars): {str(result)[:100]}")
        safe_progress_update(progress, 0.7, "Transcription complete, formatting captions...")
        return result
    except Exception:
        logger.exception(f"Error transcribing audio: {audio_path}")  # Includes traceback
        raise
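
# Note: openai-whisper's transcribe() returns a dict whose "segments" entry is a list of
# {"start": float, "end": float, "text": str, ...} items; that is the only shape the
# caption pipeline below relies on.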

def format_time(seconds):
    """Format time in ASS format (H:MM:SS.cc, i.e. centiseconds)."""
    hundredths = int((seconds % 1) * 100)
    s = int(seconds) % 60
    m = int(seconds / 60) % 60
    h = int(seconds / 3600)
    return f"{h}:{m:02d}:{s:02d}.{hundredths:02d}"

def format_time_srt(seconds):
    """Format time in SRT format (HH:MM:SS,mmm, i.e. milliseconds)."""
    ms = int((seconds % 1) * 1000)
    s = int(seconds) % 60
    m = int(seconds / 60) % 60
    h = int(seconds / 3600)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def generate_srt_from_transcript(segments):
    """Convert Whisper segments to SRT format."""
    srt_content = ""
    for i, segment in enumerate(segments):
        start_time = format_time_srt(segment["start"])
        end_time = format_time_srt(segment["end"])
        text = segment["text"].strip()
        srt_content += f"{i + 1}\n{start_time} --> {end_time}\n{text}\n\n"
    logger.info(f"Generated SRT (first 200 chars): {srt_content[:200]}")
    return srt_content.strip()
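
# For illustration, a single segment {"start": 1.0, "end": 3.5, "text": " Hello World "}
# becomes the SRT block:
#   1
#   00:00:01,000 --> 00:00:03,500
#   Hello World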

def generate_ass_dialogue_line(segment, style_name='Default'):
    """Generate a single ASS dialogue line from a segment."""
    start_time = format_time(segment["start"])
    end_time = format_time(segment["end"])
    text = segment["text"].strip().replace('\n', '\\N')  # \N is the ASS hard line break
    # Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
    return f"Dialogue: 0,{start_time},{end_time},{style_name},,0,0,0,,{text}"

def generate_ass_from_transcript(segments, style_options):
    """Convert Whisper segments to ASS format, including the style header."""
    style_line = generate_style_line(style_options)
    # Note: the header lines must stay flush-left; ASS section names may not be indented.
    ass_header = f"""[Script Info]
Title: Generated Captions
ScriptType: v4.00+
WrapStyle: 0
PlayResX: 384
PlayResY: 288
ScaledBorderAndShadow: yes

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
{style_line}

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    dialogue_lines = [generate_ass_dialogue_line(seg) for seg in segments]
    full_ass_content = ass_header + "\n".join(dialogue_lines)
    logger.info(f"Generated ASS (first 300 chars): {full_ass_content[:300]}")
    return full_ass_content

def extract_audio(video_path, output_path):
    """Extract audio from a video file using an ffmpeg subprocess."""
    logger.info(f"Attempting to extract audio from {video_path} to {output_path}")
    try:
        command = [
            "ffmpeg", "-i", video_path,
            "-vn",                   # No video
            "-acodec", "pcm_s16le",  # Standard WAV format
            "-ac", "1",              # Mono
            "-ar", "16000",          # 16 kHz sample rate (common for ASR)
            "-y",                    # Overwrite output
            output_path,
        ]
        logger.info(f"Running audio extraction command: {' '.join(map(shlex.quote, command))}")
        process = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',  # Explicitly set encoding
            check=False,
        )
        if process.returncode != 0:
            logger.error(f"FFmpeg audio extraction error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
            return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            logger.error(f"Audio extraction failed: output file not created or empty. FFmpeg stderr: {process.stderr}")
            return False, f"Output audio file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."
        logger.info(f"Audio extracted successfully to {output_path}, size: {os.path.getsize(output_path)} bytes")
        return True, ""
    except Exception as e:
        logger.exception(f"Exception during audio extraction from {video_path}")
        return False, str(e)
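
# For reference, the command built above is equivalent to running:
#   ffmpeg -i <video> -vn -acodec pcm_s16le -ac 1 -ar 16000 -y <output.wav>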

def run_ffmpeg_with_subtitles(video_path, subtitle_path, output_path, style_options=None):
    """Burn subtitles into a video using an ffmpeg subprocess.

    Args:
        video_path: Path to the input video.
        subtitle_path: Path to the ASS subtitle file.
        output_path: Path to save the output video.
        style_options: Optional style parameters (not used directly; kept for consistency).

    Returns:
        tuple: (success, error_message)
    """
    logger.info(f"Attempting to burn subtitles from {subtitle_path} into {video_path}")
    # Check that the subtitle file exists and is not empty
    if not os.path.exists(subtitle_path) or os.path.getsize(subtitle_path) == 0:
        return False, f"Subtitle file {subtitle_path} does not exist or is empty"
    # Check that the video file exists
    if not os.path.exists(video_path):
        return False, f"Video file {video_path} does not exist"
    # Validate the video file using ffprobe
    try:
        probe_cmd = [
            "ffprobe", "-v", "error",
            "-select_streams", "v:0",
            "-show_entries", "stream=codec_name,width,height",
            "-of", "json",
            video_path,
        ]
        probe_result = subprocess.run(
            probe_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
        )
        if probe_result.returncode != 0:
            logger.error(f"FFprobe validation failed: {probe_result.stderr}")
            return False, f"FFprobe validation failed: {probe_result.stderr[:200]}..."
    except Exception as e:
        logger.exception(f"Exception during video validation: {video_path}")
        return False, f"Video validation failed: {str(e)}"
    try:
        # The subtitle path must be escaped for the filter graph; backslashes are
        # filter-graph escape characters (on Windows, drive colons may also need escaping)
        subtitle_path_esc = subtitle_path.replace('\\', '\\\\')
        command = [
            "ffmpeg",
            "-i", video_path,
            "-vf", f"ass='{subtitle_path_esc}'",
            "-c:v", "libx264",          # H.264 for broad compatibility
            "-preset", "medium",        # Balance between speed and quality
            "-crf", "23",               # Reasonable quality setting (lower is better)
            "-c:a", "aac",              # AAC audio
            "-b:a", "128k",             # Decent audio bitrate
            "-movflags", "+faststart",  # Optimize for web playback
            "-y",                       # Overwrite output if it exists
            output_path,
        ]
        logger.info(f"Running subtitle burn command: {' '.join(map(shlex.quote, command))}")
        process = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            check=False,
        )
        if process.returncode != 0:
            logger.error(f"FFmpeg subtitle burn error (Code {process.returncode}):\nSTDOUT:\n{process.stdout}\nSTDERR:\n{process.stderr}")
            return False, f"FFmpeg failed (Code {process.returncode}): {process.stderr[:500]}..."
        # Verify the output file was created and is not empty
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            logger.error(f"Subtitle burning failed: output file not created or empty. FFmpeg stderr: {process.stderr}")
            return False, f"Output video file not created or empty. FFmpeg stderr: {process.stderr[:500]}..."
        logger.info(f"Subtitles burned successfully, output: {output_path}, size: {os.path.getsize(output_path)} bytes")
        return True, ""
    except Exception as e:
        logger.exception(f"Exception during subtitle burning: {video_path}")
        return False, str(e)

def safe_progress_update(progress_callback, value, desc=""):
    """Safely update progress without crashing if progress_callback is None or fails."""
    if progress_callback is not None:
        try:
            # Pass desc as a keyword: gr.Progress.__call__ takes other positional
            # parameters before desc
            progress_callback(value, desc=desc)
        except Exception:
            # Silently ignore progress update errors to avoid flooding the logs
            pass

def parse_srt_to_dialogue(srt_content):
    """Basic SRT parser: convert SRT text to a list of dialogue events for ASS conversion."""
    dialogue = []
    # Regex to find index, timecodes, and text blocks.
    # Allows comma or period as the milliseconds separator (SRT vs. VTT style).
    pattern = re.compile(
        r'^\s*(\d+)\s*$\n?'                               # Index line
        r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*'  # Start time
        r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{3})\s*$\n'     # End time
        r'(.*?)(?=\n\s*\n\d+\s*$|\Z)',                    # Text block (non-greedy) up to the next blank line + index, or end of string
        re.DOTALL | re.MULTILINE
    )
    logger.info("Attempting to parse SRT/VTT content...")
    matches_found = 0
    last_index = 0
    for match in pattern.finditer(srt_content):
        matches_found += 1
        try:
            index = int(match.group(1))
            sh, sm, ss, sms = map(int, match.group(2, 3, 4, 5))
            eh, em, es, ems = map(int, match.group(6, 7, 8, 9))
            start_sec = sh * 3600 + sm * 60 + ss + sms / 1000.0
            end_sec = eh * 3600 + em * 60 + es + ems / 1000.0
            text = match.group(10).strip().replace('\n', '\\N')  # Replace newline with ASS \N
            # Basic validation
            if end_sec < start_sec:
                logger.warning(f"SRT parse warning: end time {end_sec} before start time {start_sec} at index {index}. Skipping.")
                continue
            if not text:
                logger.warning(f"SRT parse warning: empty text content at index {index}. Skipping.")
                continue
            dialogue.append({'start': start_sec, 'end': end_sec, 'text': text})
            last_index = match.end()
        except Exception as e:
            logger.warning(f"Could not parse SRT block starting near index {match.group(1)}: {e}")
    # Check whether parsing consumed a reasonable amount of the input
    if matches_found > 0 and last_index < len(srt_content) * 0.8:
        logger.warning(f"SRT parsing finished early. Found {matches_found} blocks, but stopped near character {last_index} of {len(srt_content)}. Input format might be inconsistent.")
    elif matches_found == 0 and len(srt_content) > 10:
        logger.error(f"SRT parsing failed. No dialogue blocks found in content starting with: {srt_content[:100]}...")
    logger.info(f"Parsed {len(dialogue)} dialogue events from SRT/VTT content.")
    return dialogue
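
# For illustration, the block "1\n00:00:01,000 --> 00:00:05,000\nHello World\n\n"
# parses to [{'start': 1.0, 'end': 5.0, 'text': 'Hello World'}].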

def parse_ass_to_dialogue(ass_content):
    """Basic ASS parser to extract dialogue events."""
    dialogue = []
    # Regex for an ASS Dialogue line; capturing groups are non-optional where possible.
    # Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
    pattern = re.compile(
        r'^Dialogue:\s*'
        r'(?P<layer>\d+),\s*'
        r'(?P<start>\d+:\d{2}:\d{2}\.\d{2}),\s*'
        r'(?P<end>\d+:\d{2}:\d{2}\.\d{2}),\s*'
        r'(?P<style>[^,]*),\s*'   # Style name
        r'(?P<name>[^,]*),\s*'    # Actor name
        r'(?P<marginL>\d+),\s*'
        r'(?P<marginR>\d+),\s*'
        r'(?P<marginV>\d+),\s*'
        r'(?P<effect>[^,]*),\s*'  # Effect
        r'(?P<text>.*?)$',        # Text (rest of line)
        re.IGNORECASE
    )

    # Helper to convert H:MM:SS.cc (centiseconds) to seconds
    def time_to_seconds(time_str):
        try:
            parts = time_str.split(':')
            h = int(parts[0])
            m = int(parts[1])
            s_parts = parts[2].split('.')
            s = int(s_parts[0])
            cs = int(s_parts[1])
            return h * 3600 + m * 60 + s + cs / 100.0
        except Exception as e:
            logger.error(f"Failed to parse time string '{time_str}': {e}")
            return 0.0  # Return 0 on failure to avoid crashing, but log it
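
    # For illustration: time_to_seconds("0:01:02.50") -> 62.5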
logger.info("Attempting to parse ASS content...") | |
lines_parsed = 0 | |
for line in ass_content.splitlines(): | |
line = line.strip() | |
if not line.lower().startswith('dialogue:'): | |
continue | |
match = pattern.match(line) | |
if match: | |
lines_parsed += 1 | |
try: | |
start_sec = time_to_seconds(match.group('start')) | |
end_sec = time_to_seconds(match.group('end')) | |
text = match.group('text').strip() # Already handles \N from ASS spec | |
if end_sec < start_sec: | |
logger.warning(f"ASS parse warning: End time {end_sec} before start time {start_sec} in line: '{line}'. Skipping.") | |
continue | |
if not text: | |
logger.warning(f"ASS parse warning: Empty text content in line: '{line}'. Skipping.") | |
continue | |
dialogue.append({'start': start_sec, 'end': end_sec, 'text': text}) | |
except Exception as e: | |
logger.warning(f"Could not parse ASS dialogue line: '{line}'. Error: {e}") | |
else: | |
logger.warning(f"ASS dialogue line did not match expected pattern: '{line}'") | |
if lines_parsed == 0 and len(ass_content) > 50: # Check if content was substantial | |
logger.error(f"ASS parsing failed. No dialogue lines matched the expected pattern in content starting with: {ass_content[:200]}...") | |
logger.info(f"Parsed {len(dialogue)} dialogue events from {lines_parsed} matched ASS lines.") | |
return dialogue | |

def process_video_with_captions(video, captions, caption_type, font_name, font_size,
                                primary_color, alignment, auto_caption,
                                progress=gr.Progress(track_tqdm=True)):
    """Main processing function."""
    # Note: gr.Progress must be declared as a default argument for Gradio to inject
    # live progress tracking into this event handler.
    temp_dir = None
    try:
        progress(0, desc="Initializing...")
        temp_dir = tempfile.mkdtemp()
        logger.info(f"Created temp dir: {temp_dir}")
        video_path = os.path.join(temp_dir, "input_video.mp4")
        output_path = os.path.join(temp_dir, "output_video.mp4")
        # Only the final (styled) subtitle file is needed
        final_ass_path = os.path.join(temp_dir, "captions_final.ass")
        # --- Handle Video Input ---
        progress(0.05, desc="Saving video...")
        if hasattr(video, 'name') and video.name and os.path.exists(video.name):
            shutil.copy(video.name, video_path)
            logger.info(f"Copied input video from Gradio temp file {video.name} to {video_path}")
        elif isinstance(video, str) and os.path.exists(video):
            shutil.copy(video, video_path)
            logger.info(f"Copied input video from path {video} to {video_path}")
        else:
            raise gr.Error("Could not access uploaded video file. Please try uploading again.")
        # --- Prepare Styles ---
        progress(0.1, desc="Preparing styles...")
        generated_captions_display_text = ""
        alignment_map = {"Bottom Center": 2, "Bottom Left": 1, "Bottom Right": 3}
        style_options = {
            'font_name': font_name,
            'font_size': font_size,
            'primary_color': primary_color,
            'alignment': alignment_map.get(alignment, 2),
        }
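        # The numeric alignment values follow the ASS v4+ "numpad" convention:
        # 1 = bottom left, 2 = bottom center, 3 = bottom right (4-9 cover the middle and top rows).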
        # --- Auto-Generate or Process Provided Captions ---
        dialogue_events = []  # Each event is {'start': float, 'end': float, 'text': str}
        if auto_caption:
            logger.info("Auto-generating captions...")
            progress(0.15, desc="Extracting audio...")
            audio_path = os.path.join(temp_dir, "audio.wav")
            success, error_msg = extract_audio(video_path, audio_path)
            if not success:
                raise gr.Error(f"Audio extraction failed: {error_msg}")
            progress(0.25, desc="Transcribing audio...")
            transcript = transcribe_audio(audio_path, progress=progress)
            if not transcript or not transcript.get("segments"):
                raise gr.Error("No speech detected.")
            dialogue_events = transcript["segments"]  # Whisper segments already carry start/end/text
            progress(0.6, desc="Generating ASS captions...")
        else:  # Use provided captions
            logger.info(f"Using provided {caption_type} captions.")
            if not captions or captions.strip() == "":
                raise gr.Error("Caption input is empty.")
            progress(0.6, desc=f"Processing {caption_type} captions...")
            if caption_type.lower() == 'ass':
                logger.info("Parsing provided ASS content.")
                dialogue_events = parse_ass_to_dialogue(captions)
                if not dialogue_events:
                    raise gr.Error("Could not parse dialogue lines from provided ASS content.")
            elif caption_type.lower() in ('srt', 'vtt'):
                logger.info(f"Parsing provided {caption_type} content.")
                dialogue_events = parse_srt_to_dialogue(captions)
                if not dialogue_events:
                    raise gr.Error(f"Could not parse provided {caption_type} content.")
            else:
                raise gr.Error(f"Unsupported caption type: {caption_type}")
        # --- Generate Final ASS File ---
        if not dialogue_events:
            raise gr.Error("No caption dialogue events found or generated.")
        logger.info(f"Generating final ASS file with {len(dialogue_events)} events and UI styles.")
        final_ass_content = generate_ass_from_transcript(dialogue_events, style_options)
        generated_captions_display_text = final_ass_content  # Show the final generated ASS
        with open(final_ass_path, 'w', encoding='utf-8') as f:
            f.write(final_ass_content)
        logger.info(f"Wrote final styled ASS to {final_ass_path}")
        # Verify file creation
        if not os.path.exists(final_ass_path) or os.path.getsize(final_ass_path) == 0:
            raise gr.Error(f"Internal error: failed to write final ASS file to {final_ass_path}")
        # --- Burn Subtitles ---
        progress(0.7, desc="Burning subtitles into video...")
        success, error_msg = run_ffmpeg_with_subtitles(
            video_path, final_ass_path, output_path, style_options
        )
        if not success:
            logger.error(f"Subtitle burning failed. Video: {video_path}, ASS: {final_ass_path}")
            raise gr.Error(f"FFmpeg failed to burn subtitles: {error_msg}")
        progress(1.0, desc="Processing complete!")
        logger.info(f"Output video generated: {output_path}")
        return output_path, generated_captions_display_text
    except Exception as e:
        logger.exception("Error in process_video_with_captions")
        if temp_dir and os.path.exists(temp_dir):
            try:
                files = os.listdir(temp_dir)
                logger.error(f"Files in temp dir {temp_dir} during error: {files}")
            except Exception as list_e:
                logger.error(f"Could not list temp dir {temp_dir}: {list_e}")
        if isinstance(e, gr.Error):
            raise
        raise gr.Error(f"An unexpected error occurred: {str(e)}")

# Toggle interactivity of the manual captions input
def toggle_captions_input(auto_generate):
    """Toggle the interactivity of the captions input."""
    return gr.update(interactive=not auto_generate)

# --- Gradio Interface ---
with gr.Blocks(title="Video Caption Generator") as app:
    gr.Markdown("## Video Caption Generator")
    gr.Markdown("Upload a video, choose styling, and add captions. Use auto-generation or provide your own SRT/ASS/VTT.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("**Input & Options**")
            video_input = gr.Video(label="Upload Video")
            auto_caption = gr.Checkbox(label="Auto-generate captions (Overrides below)", value=False)
            captions_input = gr.Textbox(
                label="Or Enter Captions Manually",
                placeholder="1\n00:00:01,000 --> 00:00:05,000\nHello World\n\n2\n...",
                lines=8,
                interactive=True,
            )
            caption_type = gr.Dropdown(
                choices=["srt", "ass", "vtt"],
                value="srt",
                label="Format (if providing captions manually)",
            )
            gr.Markdown("**Caption Styling** (Applied to auto-generated or converted ASS)")
            with gr.Row():
                font_name = gr.Dropdown(
                    choices=ACCEPTABLE_FONTS,
                    value=ACCEPTABLE_FONTS[0] if ACCEPTABLE_FONTS else "Arial",
                    label="Font",
                )
                font_size = gr.Slider(minimum=10, maximum=60, value=24, step=1, label="Font Size")
            with gr.Row():
                primary_color = gr.ColorPicker(value="#FFFFFF", label="Text Color")
                alignment = gr.Dropdown(
                    choices=["Bottom Center", "Bottom Left", "Bottom Right"],
                    value="Bottom Center",
                    label="Alignment",
                )
            process_btn = gr.Button("Generate Captioned Video", variant="primary")
        with gr.Column(scale=1):
            gr.Markdown("**Output**")
            video_output = gr.Video(label="Captioned Video")
            generated_captions_output = gr.Textbox(
                label="Generated Captions (ASS format if auto-generated)",
                lines=10,
                interactive=False,
            )
    # Link the checkbox to the captions input's interactivity
    auto_caption.change(
        fn=toggle_captions_input,
        inputs=[auto_caption],
        outputs=[captions_input],
    )
    # Wire the main processing function to the button
    process_btn.click(
        fn=process_video_with_captions,
        inputs=[
            video_input,
            captions_input,
            caption_type,
            font_name,
            font_size,
            primary_color,
            alignment,
            auto_caption,
        ],
        outputs=[video_output, generated_captions_output],
        # api_name="generate_captions",
    )

# Launch the app
if __name__ == "__main__":
    app.launch(debug=True, share=False)  # Enable debug for local testing