Upload 10 files
- LICENSE +21 -0
- README.md +9 -7
- app.py +515 -0
- config.py +190 -0
- llm_script_generator.py +388 -0
- promptkit.py +81 -0
- requirements.txt +53 -0
- sync_manager.py +381 -0
- utils_audio.py +292 -0
- utils_video.py +336 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 EceMotion Pictures

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,14 +1,16 @@
 ---
 title: EceMotion Pictures
-emoji:
+emoji: 🎬🎤🤖
-colorFrom:
+colorFrom: purple
-colorTo:
+colorTo: pink
 sdk: gradio
-sdk_version:
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: mit
-short_description:
+short_description: 1980s style commercial with perfect audio-video sync.
 ---
-
-
+models:
+- damo-vilab/text-to-video-ms-1.7b
+- parler-tts/parler-tts-mini-v1
+- microsoft/DialoGPT-medium
app.py
ADDED
@@ -0,0 +1,515 @@
"""
EceMotion Pictures - Production Grade Commercial Generator
Advanced text-to-video commercial generator with perfect audio-video sync.
"""

import os
import tempfile
import logging
from typing import Optional, Tuple, Dict, Any
from pathlib import Path
import traceback

import gradio as gr
import numpy as np

# Import our enhanced modules
from config import (
    MODEL_VIDEO, MODEL_AUDIO, MODEL_LLM, MAX_DURATION, MIN_DURATION,
    DEFAULT_FPS, VOICE_STYLES, get_device, validate_config, log_config
)
from sync_manager import create_sync_manager
from llm_script_generator import create_script_generator
from utils_audio import synth_voice, retro_bed, mix_to_stereo, write_wav
from utils_video import synth_t2v, apply_retro_filters, mux_audio

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize components
DEVICE = get_device()
sync_manager = create_sync_manager()
script_generator = create_script_generator()

# Validate configuration
if not validate_config():
    logger.error("Configuration validation failed")
    exit(1)

# Log configuration
log_config()

# Modern CSS for Gradio
CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto;
}
.app-header {
    text-align: center;
    margin-bottom: 2rem;
}
.app-title {
    font-size: 2.5rem;
    font-weight: 700;
    background: linear-gradient(45deg, #ff6b6b, #4ecdc4);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 0.5rem;
}
.app-subtitle {
    opacity: 0.7;
    font-size: 1.1rem;
    color: #666;
}
.control-section {
    background: #f8f9fa;
    border-radius: 12px;
    padding: 1.5rem;
    margin-bottom: 1rem;
}
.output-section {
    background: #ffffff;
    border: 2px solid #e9ecef;
    border-radius: 12px;
    padding: 1.5rem;
}
.progress-info {
    background: #e3f2fd;
    border-left: 4px solid #2196f3;
    padding: 1rem;
    margin: 1rem 0;
    border-radius: 4px;
}
.error-info {
    background: #ffebee;
    border-left: 4px solid #f44336;
    padding: 1rem;
    margin: 1rem 0;
    border-radius: 4px;
}
"""

# Example configurations, ordered to match the gr.Examples inputs:
# brand, structure, script hook, voice, duration
EXAMPLES = [
    [
        "EceMotion Pictures",
        "Montage → Close-up → Logo stinger",
        "Remember when technology was simple?",
        "Announcer '80s",
        10,
    ],
    [
        "VaporWave Studios",
        "Before/After → Feature highlight → CTA",
        "The future is now, but it looks like the past",
        "Mall PA",
        8,
    ],
    [
        "Neon Dreams",
        "Unboxing → Demo → Deal countdown",
        "Step into the digital sunset",
        "Late Night",
        12,
    ],
]

def create_interface():
    """Create the modern Gradio interface."""

    with gr.Blocks(
        css=CSS,
        title="EceMotion Pictures",
        theme=gr.themes.Soft()
    ) as demo:

        # Header
        with gr.Row():
            gr.HTML("""
            <div class="app-header">
                <div class="app-title">🎬 EceMotion Pictures</div>
                <div class="app-subtitle">AI-Powered 1980s Style Commercial Generator</div>
            </div>
            """)

        # Main interface
        with gr.Row():
            # Left column - Controls
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### 🎯 Commercial Setup")

                    brand = gr.Textbox(
                        label="Brand Name",
                        placeholder="YourBrand™",
                        value="EceMotion Pictures",
                        info="Enter your brand name"
                    )

                    structure = gr.Textbox(
                        label="Commercial Structure",
                        placeholder="e.g., Montage → Close-up → Logo stinger",
                        value="Montage → Close-up → Logo stinger",
                        info="Define the flow of your commercial"
                    )

                    with gr.Row():
                        script_prompt = gr.Textbox(
                            label="Script Hook",
                            placeholder="Opening hook or idea",
                            value="Remember when technology was simple?",
                            scale=3
                        )
                        roll_btn = gr.Button("🎲", scale=1, size="sm")

                    duration = gr.Slider(
                        minimum=MIN_DURATION,
                        maximum=MAX_DURATION,
                        value=10,
                        step=1,
                        label="Duration (seconds)",
                        info=f"Between {MIN_DURATION}-{MAX_DURATION} seconds"
                    )

                with gr.Group():
                    gr.Markdown("### 🎤 Audio Settings")

                    voice = gr.Dropdown(
                        choices=list(VOICE_STYLES.keys()),
                        value="Announcer '80s",
                        label="Voice Style",
                        info="Choose the announcer style"
                    )

                    music = gr.Checkbox(
                        value=True,
                        label="Background Music",
                        info="Add retro synth jingle"
                    )

                with gr.Group():
                    gr.Markdown("### ⚙️ Advanced Settings")

                    with gr.Accordion("Model & Quality", open=False):
                        model_video = gr.Dropdown(
                            choices=["damo-vilab/text-to-video-ms-1.7b", "THUDM/CogVideoX-5b"],
                            value=MODEL_VIDEO,
                            label="Video Model",
                            info="Choose the text-to-video model"
                        )

                        model_audio = gr.Dropdown(
                            choices=["parler-tts/parler-tts-mini-v1", "SWivid/F5-TTS"],
                            value=MODEL_AUDIO,
                            label="Audio Model",
                            info="Choose the text-to-speech model"
                        )

                    with gr.Accordion("Retro Effects", open=False):
                        vhs_intensity = gr.Slider(
                            minimum=0.0,
                            maximum=1.0,
                            value=0.5,
                            step=0.1,
                            label="VHS Effect Intensity"
                        )

                        seed = gr.Number(
                            value=42,
                            precision=0,
                            label="Random Seed",
                            info="For reproducible results"
                        )

                # Generate button
                generate_btn = gr.Button(
                    "🎬 Generate Commercial",
                    variant="primary",
                    size="lg"
                )

            # Right column - Output
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### 📺 Generated Commercial")

                    # Progress tracking
                    progress_info = gr.HTML("""
                    <div class="progress-info">
                        <strong>Ready to generate!</strong><br>
                        Click the generate button to create your retro commercial.
                    </div>
                    """)

                    # Video output
                    output_video = gr.Video(
                        height=400,
                        label="Commercial Preview",
                        show_download_button=True
                    )

                    # Script output
                    output_script = gr.Textbox(
                        label="Generated Script",
                        lines=8,
                        max_lines=12,
                        show_copy_button=True
                    )

                    # Download section
                    with gr.Row():
                        download_btn = gr.DownloadButton(
                            "📥 Download Commercial",
                            variant="secondary"
                        )

        # Examples section
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 💡 Example Configurations")
                examples = gr.Examples(
                    examples=EXAMPLES,
                    inputs=[brand, structure, script_prompt, voice, duration],
                    label="Click to load example"
                )

        # Footer
        gr.Markdown("""
        <div style='text-align: center; opacity: 0.7; font-size: 0.9rem; margin-top: 2rem;'>
            <p>🎬 Powered by EceMotion Pictures • Perfect audio-video sync • Professional quality</p>
            <p>Models: Text-to-Video • Text-to-Speech • Enhanced VHS effects</p>
        </div>
        """)

        # Event handlers
        def roll_script_suggestion(structure_text: str, seed_val: int) -> str:
            """Generate script suggestions using LLM."""
            try:
                suggestions = script_generator.suggest_scripts(structure_text, n=1, seed=seed_val)
                return suggestions[0] if suggestions else "Back to '87 - the future is now!"
            except Exception as e:
                logger.error(f"Script suggestion failed: {e}")
                return "Back to '87 - the future is now!"

        def generate_commercial(
            brand_name: str,
            structure_text: str,
            script_text: str,
            duration_val: int,
            voice_style: str,
            music_enabled: bool,
            video_model: str,
            audio_model: str,
            vhs_intensity: float,
            seed_val: int
        ):
            """
            Generate a complete retro commercial with perfect sync.

            Yields (progress_html, video_path, script_text, download_path)
            tuples so Gradio can stream progress updates.
            """
            try:
                # Update progress
                progress_html = """
                <div class="progress-info">
                    <strong>🎬 Generating Commercial...</strong><br>
                    <div style="margin-top: 0.5rem;">
                        <div>📝 Generating script with AI...</div>
                    </div>
                </div>
                """
                yield progress_html, None, None, None

                # Generate script using LLM
                generated_script = script_generator.generate_script(
                    brand=brand_name or "Brand",
                    structure=structure_text or "Montage → Close-up → Logo",
                    script_prompt=script_text or "Back to '87",
                    duration=duration_val,
                    voice_style=voice_style,
                    seed=seed_val
                )

                # Update progress
                progress_html = """
                <div class="progress-info">
                    <strong>🎬 Generating Commercial...</strong><br>
                    <div style="margin-top: 0.5rem;">
                        <div>✅ Script generated</div>
                        <div>🎥 Generating video...</div>
                    </div>
                </div>
                """
                yield progress_html, None, None, None

                # Create temporary directory
                with tempfile.TemporaryDirectory() as tmpdir:
                    # Generate video
                    video_prompt = f"{structure_text}. {script_text}. 1980s commercial, VHS texture, soft lighting, bold retro titles, 4:3, brand {brand_name}"

                    # Calculate optimal frame count
                    num_frames = sync_manager.get_optimal_frame_count(duration_val, DEFAULT_FPS)

                    clip = synth_t2v(
                        prompt=video_prompt,
                        seed=seed_val,
                        num_frames=num_frames,
                        fps=DEFAULT_FPS,
                        device=DEVICE,
                        model_name=video_model
                    )

                    # Save raw video
                    raw_video_path = os.path.join(tmpdir, "raw.mp4")
                    clip.write_videofile(
                        raw_video_path,
                        fps=DEFAULT_FPS,
                        codec='libx264',
                        audio=False,
                        verbose=False,
                        logger=None
                    )

                    # Apply retro filters
                    retro_video_path = os.path.join(tmpdir, "retro.mp4")
                    apply_retro_filters(raw_video_path, retro_video_path, intensity=vhs_intensity)

                    # Update progress
                    progress_html = """
                    <div class="progress-info">
                        <strong>🎬 Generating Commercial...</strong><br>
                        <div style="margin-top: 0.5rem;">
                            <div>✅ Script generated</div>
                            <div>✅ Video generated</div>
                            <div>🎤 Generating audio...</div>
                        </div>
                    </div>
                    """
                    yield progress_html, None, None, None

                    # Generate audio
                    voiceover_text = " ".join([seg.text for seg in generated_script.segments])
                    sr_voice, wav_voice = synth_voice(
                        text=voiceover_text,
                        voice_prompt=voice_style,
                        model_name=audio_model,
                        device=DEVICE
                    )

                    # Add background music if requested
                    if music_enabled:
                        sr_music, wav_music = retro_bed(clip.duration)
                        sr_final, stereo_audio = mix_to_stereo(
                            sr_voice, wav_voice, sr_music, wav_music, bed_gain=0.3
                        )
                    else:
                        sr_final = sr_voice
                        # Duplicate the mono voice track onto both channels
                        stereo_audio = np.stack([wav_voice, wav_voice], axis=1)

                    # Save audio
                    audio_path = os.path.join(tmpdir, "audio.wav")
                    write_wav(audio_path, sr_final, stereo_audio)

                    # Update progress
                    progress_html = """
                    <div class="progress-info">
                        <strong>🎬 Generating Commercial...</strong><br>
                        <div style="margin-top: 0.5rem;">
                            <div>✅ Script generated</div>
                            <div>✅ Video generated</div>
                            <div>✅ Audio generated</div>
                            <div>🔄 Synchronizing audio and video...</div>
                        </div>
                    </div>
                    """
                    yield progress_html, None, None, None

                    # Synchronize audio and video
                    final_video_path = os.path.join(tmpdir, f"{brand_name}_commercial.mp4")
                    sync_manager.synchronize_media(
                        video_path=retro_video_path,
                        audio_path=audio_path,
                        output_path=final_video_path,
                        prefer_audio_duration=True
                    )

                    # Validate sync on the muxed file
                    is_synced, sync_diff = sync_manager.validate_sync(final_video_path, final_video_path)

                    # Format script output
                    script_lines = []
                    for i, segment in enumerate(generated_script.segments, 1):
                        script_lines.append(f"{i}. {segment.timing_marker} {segment.text}")

                    script_output = "\n".join(script_lines) + f"\n\nTAGLINE: {generated_script.tagline}"

                    # Final progress
                    sync_status = "✅ Perfect sync" if is_synced else f"⚠️ Sync diff: {sync_diff:.3f}s"
                    progress_html = f"""
                    <div class="progress-info">
                        <strong>🎉 Commercial Complete!</strong><br>
                        <div style="margin-top: 0.5rem;">
                            <div>✅ Script generated ({generated_script.word_count} words)</div>
                            <div>✅ Video generated ({num_frames} frames)</div>
                            <div>✅ Audio generated ({len(stereo_audio)/sr_final:.1f}s)</div>
                            <div>{sync_status}</div>
                        </div>
                    </div>
                    """

                    yield progress_html, final_video_path, script_output, final_video_path

            except Exception as e:
                logger.error(f"Commercial generation failed: {e}")
                logger.error(f"Traceback: {traceback.format_exc()}")
                error_html = f"""
                <div class="error-info">
                    <strong>❌ Generation Failed</strong><br>
                    <div style="margin-top: 0.5rem; color: #666;">
                        Error: {str(e)}<br>
                        Please try again with different parameters or check the logs.
                    </div>
                </div>
                """
                yield error_html, None, None, None

        # Connect event handlers
        roll_btn.click(
            roll_script_suggestion,
            inputs=[structure, seed],
            outputs=[script_prompt]
        )

        generate_btn.click(
            generate_commercial,
            inputs=[
                brand, structure, script_prompt, duration, voice, music,
                model_video, model_audio, vhs_intensity, seed
            ],
            outputs=[progress_info, output_video, output_script, download_btn]
        )

    return demo

def main():
    """Main application entry point."""
    logger.info("Starting EceMotion Pictures")
    logger.info(f"Using device: {DEVICE}")
    logger.info(f"Video model: {MODEL_VIDEO}")
    logger.info(f"Audio model: {MODEL_AUDIO}")
    logger.info(f"LLM model: {MODEL_LLM}")

    # Create and launch interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )

if __name__ == "__main__":
    main()
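When background music is disabled, the handler above widens the mono voice track into stereo by stacking the same samples onto both channels. A minimal self-contained sketch of that numpy step (the exact layout write_wav expects is defined in utils_audio, which this commit also adds; (n_samples, 2) is the frames-by-channels layout common WAV writers such as soundfile use):

import numpy as np

# Mono waveform, shape (n_samples,)
mono = np.array([0.1, -0.2, 0.3], dtype=np.float32)

# Same samples on left and right channels, shape (n_samples, 2)
stereo = np.stack([mono, mono], axis=1)
assert stereo.shape == (3, 2)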
config.py
ADDED
@@ -0,0 +1,190 @@
"""
Configuration management for EceMotion Pictures.
Centralized settings for models, parameters, and deployment.
"""

import os
from typing import Dict, Any, Optional

# Model Configuration - with fallbacks for HuggingFace Spaces
MODEL_VIDEO = os.getenv("MODEL_VIDEO", "damo-vilab/text-to-video-ms-1.7b")  # Start with lighter model
MODEL_AUDIO = os.getenv("MODEL_AUDIO", "parler-tts/parler-tts-mini-v1")  # Start with working model
MODEL_LLM = os.getenv("MODEL_LLM", "microsoft/DialoGPT-medium")  # Start with lighter LLM

# Video Configuration
MAX_DURATION = int(os.getenv("MAX_DURATION", "15"))
MIN_DURATION = int(os.getenv("MIN_DURATION", "5"))
DEFAULT_FPS = int(os.getenv("DEFAULT_FPS", "8"))
DEFAULT_FRAMES = int(os.getenv("DEFAULT_FRAMES", "64"))  # 8 seconds at 8 fps

# Audio Configuration
AUDIO_SAMPLE_RATE = int(os.getenv("AUDIO_SAMPLE_RATE", "22050"))  # Standard rate
AUDIO_BITRATE = os.getenv("AUDIO_BITRATE", "128k")  # Lower bitrate for stability
MUSIC_GAIN = float(os.getenv("MUSIC_GAIN", "0.3"))

# GPU Configuration
GPU_MEMORY_THRESHOLD = float(os.getenv("GPU_MEMORY_THRESHOLD", "0.8"))
USE_QUANTIZATION = os.getenv("USE_QUANTIZATION", "true").lower() == "true"
QUANTIZATION_BITS = int(os.getenv("QUANTIZATION_BITS", "8"))

# Sync Configuration
SYNC_TOLERANCE_MS = int(os.getenv("SYNC_TOLERANCE_MS", "200"))  # More lenient for stability
FORCE_SYNC = os.getenv("FORCE_SYNC", "false").lower() == "true"  # Disabled by default

# Retro Filter Configuration
VHS_INTENSITY = float(os.getenv("VHS_INTENSITY", "0.5"))
SCANLINE_OPACITY = float(os.getenv("SCANLINE_OPACITY", "0.2"))
CHROMATIC_ABERRATION = float(os.getenv("CHROMATIC_ABERRATION", "0.05"))
FILM_GRAIN = float(os.getenv("FILM_GRAIN", "0.1"))

# UI Configuration
UI_THEME = os.getenv("UI_THEME", "default")
SHOW_PROGRESS = os.getenv("SHOW_PROGRESS", "true").lower() == "true"
ENABLE_EXAMPLES = os.getenv("ENABLE_EXAMPLES", "true").lower() == "true"

# Logging Configuration
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
LOG_FORMAT = os.getenv("LOG_FORMAT", "text")  # text format for HuggingFace Spaces

# Model-specific configurations with conservative settings
MODEL_CONFIGS: Dict[str, Dict[str, Any]] = {
    "damo-vilab/text-to-video-ms-1.7b": {
        "max_frames": 64,
        "min_frames": 8,
        "default_frames": 32,
        "memory_usage_gb": 6,
        "supports_quantization": False,
        "stable": True,
    },
    "THUDM/CogVideoX-5b": {
        "max_frames": 48,  # Reduced for stability
        "min_frames": 16,
        "default_frames": 32,
        "memory_usage_gb": 16,  # Conservative estimate
        "supports_quantization": True,
        "stable": False,  # Mark as experimental
    },
    "parler-tts/parler-tts-mini-v1": {
        "max_text_length": 500,
        "min_text_length": 10,
        "default_voice": "Announcer '80s",
        "memory_usage_gb": 2,
        "stable": True,
    },
    "SWivid/F5-TTS": {
        "max_text_length": 300,
        "min_text_length": 10,
        "default_voice": "announcer",
        "memory_usage_gb": 4,
        "stable": False,  # Mark as experimental
    },
    "microsoft/DialoGPT-medium": {
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "memory_usage_gb": 2,
        "stable": True,
    },
    "Qwen/Qwen2.5-7B-Instruct": {
        "max_tokens": 1024,
        "temperature": 0.7,
        "top_p": 0.9,
        "memory_usage_gb": 8,
        "stable": False,  # Mark as experimental
    },
}

# Voice styles for TTS
VOICE_STYLES = {
    "Announcer '80s": "A confident, upbeat 1980s TV announcer with warm AM-radio tone.",
    "Mall PA": "Casual, slightly echoey mall public-address vibe.",
    "Late Night": "Low energy, sly late-night infomercial style.",
    "News Anchor": "Professional, authoritative news anchor delivery.",
    "Infomercial": "Enthusiastic, persuasive infomercial host style.",
    "Radio DJ": "Smooth, charismatic radio disc jockey voice.",
}

# Structure templates for script generation
STRUCTURE_TEMPLATES = [
    "Montage → Close-up → Logo stinger",
    "Before/After → Feature highlight → CTA",
    "Testimonial → B-roll → Price tag reveal",
    "Unboxing → Demo → Deal countdown",
    "Retro news bulletin → Product shot → Tagline",
    "Opening hook → Problem/Solution → Call to action",
    "Brand story → Product showcase → Final tagline",
]

# Taglines for commercial endings
TAGLINES = [
    "So retro, it's the future.",
    "Pixels you can trust.",
    "VHS vibes. Modern results.",
    "Old-school cool. New-school sales.",
    "Where nostalgia meets innovation.",
    "Rewind to the future.",
    "Classic style. Modern performance.",
    "The past perfected.",
    "EceMotion Pictures - Bringing the '80s back to life.",
    "Your story, our vision, timeless memories.",
]

def get_model_config(model_name: str) -> Dict[str, Any]:
    """Get configuration for a specific model."""
    return MODEL_CONFIGS.get(model_name, {
        "max_frames": 32,
        "min_frames": 8,
        "default_frames": 16,
        "memory_usage_gb": 4,
        "supports_quantization": False,
        "stable": True,
    })

def get_device() -> str:
    """Determine the best available device."""
    try:
        import torch
        # Use CUDA only when a GPU is actually exposed to the process
        if torch.cuda.is_available() and os.getenv("CUDA_VISIBLE_DEVICES", None) not in (None, ""):
            return "cuda"
    except ImportError:
        pass
    return "cpu"

def validate_config() -> bool:
    """Validate configuration settings."""
    try:
        assert MIN_DURATION < MAX_DURATION, "MIN_DURATION must be less than MAX_DURATION"
        assert DEFAULT_FPS > 0, "DEFAULT_FPS must be positive"
        assert AUDIO_SAMPLE_RATE > 0, "AUDIO_SAMPLE_RATE must be positive"
        assert 0 <= VHS_INTENSITY <= 1, "VHS_INTENSITY must be between 0 and 1"
        assert 0 <= SCANLINE_OPACITY <= 1, "SCANLINE_OPACITY must be between 0 and 1"
        return True
    except AssertionError as e:
        print(f"Configuration validation failed: {e}")
        return False

def get_safe_model_name(model_name: str, model_type: str) -> str:
    """Get a safe model name with fallback to stable models."""
    config = get_model_config(model_name)

    # If the model is not stable, fall back to stable alternatives
    if not config.get("stable", False):
        if model_type == "video":
            return "damo-vilab/text-to-video-ms-1.7b"
        elif model_type == "audio":
            return "parler-tts/parler-tts-mini-v1"
        elif model_type == "llm":
            return "microsoft/DialoGPT-medium"

    return model_name

def log_config():
    """Log current configuration for debugging."""
    print(f"EceMotion Pictures Configuration:")
    print(f"  Video Model: {MODEL_VIDEO}")
    print(f"  Audio Model: {MODEL_AUDIO}")
    print(f"  LLM Model: {MODEL_LLM}")
    print(f"  Device: {get_device()}")
    print(f"  Duration Range: {MIN_DURATION}-{MAX_DURATION}s")
    print(f"  FPS: {DEFAULT_FPS}")
    print(f"  Sync Tolerance: {SYNC_TOLERANCE_MS}ms")
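Every setting above is read from the environment exactly once, at import time. A minimal usage sketch, assuming a fresh interpreter (set the variables before the first import of config):

import os
os.environ["MODEL_VIDEO"] = "THUDM/CogVideoX-5b"  # experimental model
os.environ["MAX_DURATION"] = "12"

import config  # picks up the overrides above

print(config.MODEL_VIDEO)   # THUDM/CogVideoX-5b
# CogVideoX-5b is marked stable=False, so the safety helper
# falls back to the lighter default:
print(config.get_safe_model_name(config.MODEL_VIDEO, "video"))
# -> damo-vilab/text-to-video-ms-1.7b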
llm_script_generator.py
ADDED
@@ -0,0 +1,388 @@
"""
LLM-powered script generation for EceMotion Pictures.
Generates intelligent, structure-aware commercial scripts with timing markers.
"""

import logging
import random
import re
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass

from config import (
    MODEL_LLM, MODEL_CONFIGS, VOICE_STYLES, STRUCTURE_TEMPLATES, TAGLINES,
    get_safe_model_name
)

logger = logging.getLogger(__name__)

@dataclass
class ScriptSegment:
    """Represents a segment of the commercial script with timing information."""
    text: str
    duration_estimate: float
    segment_type: str  # "hook", "flow", "benefit", "cta"
    timing_marker: Optional[str] = None

@dataclass
class GeneratedScript:
    """Complete generated script with all segments and metadata."""
    segments: List[ScriptSegment]
    total_duration: float
    tagline: str
    voice_style: str
    word_count: int
    raw_script: str

class LLMScriptGenerator:
    """Generates commercial scripts using large language models with fallbacks."""

    def __init__(self, model_name: str = MODEL_LLM):
        self.model_name = get_safe_model_name(model_name, "llm")
        self.model = None
        self.tokenizer = None
        self.model_config = MODEL_CONFIGS.get(self.model_name, {})
        self.llm_available = False

        # Try to initialize LLM
        self._try_init_llm()

    def _try_init_llm(self):
        """Try to initialize the LLM model."""
        try:
            if "dialo" in self.model_name.lower():
                self._init_dialogpt()
            elif "qwen" in self.model_name.lower():
                self._init_qwen()
            else:
                logger.warning(f"Unknown LLM model: {self.model_name}, using fallback")
                self.llm_available = False
        except Exception as e:
            logger.warning(f"Failed to initialize LLM {self.model_name}: {e}")
            self.llm_available = False

    def _init_dialogpt(self):
        """Initialize DialoGPT model."""
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype="auto",
                device_map="auto" if self._has_gpu() else "cpu"
            )
            self.llm_available = True
            logger.info(f"DialoGPT model {self.model_name} loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load DialoGPT: {e}")
            self.llm_available = False

    def _init_qwen(self):
        """Initialize Qwen model."""
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )

            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype="auto",
                device_map="auto" if self._has_gpu() else "cpu",
                trust_remote_code=True
            )
            self.llm_available = True
            logger.info(f"Qwen model {self.model_name} loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load Qwen: {e}")
            self.llm_available = False

    def _has_gpu(self) -> bool:
        """Check if GPU is available."""
        try:
            import torch
            return torch.cuda.is_available()
        except ImportError:
            return False

    def _create_system_prompt(self) -> str:
        """Create system prompt for retro commercial script generation."""
        return """You are a professional copywriter specializing in 1980s-style TV commercials.
Your task is to create engaging, persuasive commercial scripts that capture the authentic retro aesthetic.

Key requirements:
- Use 1980s commercial language and style
- Include clear hooks, benefits, and calls-to-action
- Keep scripts concise and punchy
- Use active voice and emotional appeals
- End with a memorable tagline

Format your response as:
HOOK: [Opening attention-grabber]
FLOW: [Main content following the structure]
BENEFIT: [Key value proposition]
CTA: [Call to action with tagline]

Keep each segment under 2-3 sentences. Use enthusiastic, confident language typical of 1980s advertising."""

    def _create_user_prompt(self, brand: str, structure: str, script_prompt: str,
                            duration: int, voice_style: str) -> str:
        """Create user prompt with specific requirements."""
        return f"""Create a {duration}-second retro commercial script for {brand}.

Structure: {structure}
Script idea: {script_prompt}
Voice style: {voice_style}

Make it authentic to 1980s TV commercials with the energy and style of that era."""

    def _parse_script_response(self, response: str) -> List[ScriptSegment]:
        """Parse LLM response into structured script segments."""
        segments = []

        # Split by segment markers
        parts = re.split(r'(HOOK:|FLOW:|BENEFIT:|CTA:)', response)

        for i in range(1, len(parts), 2):
            if i + 1 < len(parts):
                segment_type = parts[i].rstrip(':').lower()
                text = parts[i + 1].strip()

                if text:
                    # Estimate duration based on word count (150 WPM)
                    word_count = len(text.split())
                    duration = (word_count / 150) * 60  # Convert to seconds

                    segments.append(ScriptSegment(
                        text=text,
                        duration_estimate=duration,
                        segment_type=segment_type,
                        timing_marker=f"[{segment_type.upper()}]"
                    ))

        return segments

    def _extract_tagline(self, response: str) -> str:
        """Extract tagline from the script response."""
        # Look for tagline in CTA section
        cta_match = re.search(r'CTA:.*?([A-Z][^.!?]*[.!?])', response, re.DOTALL)
        if cta_match:
            cta_text = cta_match.group(1)
            # Extract the last sentence as potential tagline
            sentences = re.split(r'[.!?]+', cta_text)
            if sentences:
                tagline = sentences[-1].strip()
                if len(tagline) > 5:  # Ensure it's substantial
                    return tagline

        # Fall back to predefined taglines
        return random.choice(TAGLINES)

    def generate_script_with_llm(self, brand: str, structure: str, script_prompt: str,
                                 duration: int, voice_style: str, seed: int = 42) -> GeneratedScript:
        """Generate script using LLM."""
        if not self.llm_available:
            raise RuntimeError("LLM not available")

        import torch

        # Set random seed for reproducibility
        random.seed(seed)

        # Create prompts
        system_prompt = self._create_system_prompt()
        user_prompt = self._create_user_prompt(brand, structure, script_prompt, duration, voice_style)

        # Format for the model
        if "dialo" in self.model_name.lower():
            # DialoGPT format
            text = f"{user_prompt}\n\nResponse:"
        else:
            # Generic format
            text = f"System: {system_prompt}\n\nUser: {user_prompt}\n\nAssistant:"

        # Tokenize
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Generate (eval() is not a context manager; use no_grad for inference)
        self.model.eval()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.model_config.get("max_tokens", 256),
                temperature=self.model_config.get("temperature", 0.7),
                top_p=self.model_config.get("top_p", 0.9),
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )

        # Decode response
        response = self.tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

        logger.info(f"Generated script response: {response[:200]}...")

        # Parse response
        segments = self._parse_script_response(response)
        tagline = self._extract_tagline(response)

        # Calculate total duration
        total_duration = sum(segment.duration_estimate for segment in segments)

        # Calculate word count
        word_count = sum(len(segment.text.split()) for segment in segments)

        return GeneratedScript(
            segments=segments,
            total_duration=total_duration,
            tagline=tagline,
            voice_style=voice_style,
            word_count=word_count,
            raw_script=response
        )

    def generate_script_with_template(self, brand: str, structure: str, script_prompt: str,
                                      duration: int, voice_style: str, seed: int = 42) -> GeneratedScript:
        """Generate script using template-based approach (fallback)."""
        random.seed(seed)

        # Select structure template
        structure_template = structure.strip() or random.choice(STRUCTURE_TEMPLATES)

        # Generate segments based on template
        segments = []

        # Hook
        hook_text = script_prompt or f"Introducing {brand} - the future is here!"
        segments.append(ScriptSegment(
            text=hook_text,
            duration_estimate=2.0,
            segment_type="hook",
            timing_marker="[HOOK]"
        ))

        # Flow (based on structure)
        flow_text = f"With {structure_template.lower()}, {brand} delivers results like never before."
        segments.append(ScriptSegment(
            text=flow_text,
            duration_estimate=3.0,
            segment_type="flow",
            timing_marker="[FLOW]"
        ))

        # Benefit
        benefit_text = "Faster, simpler, cooler - just like your favorite retro tech."
        segments.append(ScriptSegment(
            text=benefit_text,
            duration_estimate=2.5,
            segment_type="benefit",
            timing_marker="[BENEFIT]"
        ))

        # CTA
        tagline = random.choice(TAGLINES)
        cta_text = f"Try {brand} today. {tagline}"
        segments.append(ScriptSegment(
            text=cta_text,
            duration_estimate=2.5,
            segment_type="cta",
            timing_marker="[CTA]"
        ))

        # Calculate totals
        total_duration = sum(segment.duration_estimate for segment in segments)
        word_count = sum(len(segment.text.split()) for segment in segments)

        return GeneratedScript(
            segments=segments,
            total_duration=total_duration,
            tagline=tagline,
            voice_style=voice_style,
            word_count=word_count,
            raw_script=f"Template-based script for {brand}"
        )

    def generate_script(self, brand: str, structure: str, script_prompt: str,
                        duration: int, voice_style: str, seed: int = 42) -> GeneratedScript:
        """Generate a complete commercial script."""
        try:
            if self.llm_available:
                return self.generate_script_with_llm(brand, structure, script_prompt, duration, voice_style, seed)
            else:
                logger.info("Using template-based script generation (LLM not available)")
                return self.generate_script_with_template(brand, structure, script_prompt, duration, voice_style, seed)
        except Exception as e:
            logger.error(f"Script generation failed: {e}")
            logger.info("Falling back to template-based generation")
            return self.generate_script_with_template(brand, structure, script_prompt, duration, voice_style, seed)

    def suggest_scripts(self, structure: str, n: int = 6, seed: int = 0) -> List[str]:
        """Generate multiple script suggestions based on structure."""
        try:
            suggestions = []
            for i in range(n):
                script = self.generate_script(
                    brand="YourBrand",
                    structure=structure,
                    script_prompt="Create an engaging hook",
                    duration=10,
                    voice_style="Announcer '80s",
                    seed=seed + i
                )

                # Extract hook from first segment
                if script.segments:
                    hook = script.segments[0].text
                    suggestions.append(hook)
                else:
                    suggestions.append("Back to '87 - the future is now!")

            return suggestions

        except Exception as e:
            logger.warning(f"Script suggestion failed: {e}")
            # Fall back to the original random generation
            return self._fallback_suggestions(structure, n, seed)

    def _fallback_suggestions(self, structure: str, n: int, seed: int) -> List[str]:
        """Fallback to original random script generation."""
        random.seed(seed)

        base = (structure or "").lower().strip()
        ideas = []

        for _ in range(n):
            style = random.choice(["infomercial", "mall ad", "late-night", "newsflash", "arcade bumper"])
            shot = random.choice(["neon grid", "CRT scanlines", "vaporwave sunset", "shopping mall", "boombox close-up"])
            hook = random.choice([
                "Remember this sound?", "Back to '87.", "Deal of the decade.",
                "We paused time.", "Be kind, rewind your brand."
            ])
            idea = f"{hook} {style} with {shot}."

            # Light correlation with structure
            for kw in ["montage", "testimonial", "news", "unboxing", "before", "after", "countdown", "logo", "cta"]:
                if kw in base and kw not in idea:
                    idea += f" Includes {kw}."

            ideas.append(idea)

        return ideas

def create_script_generator() -> LLMScriptGenerator:
    """Factory function to create a script generator."""
    return LLMScriptGenerator()
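A short usage sketch of the generator defined above; when no transformers model can be loaded it silently takes the template path, so this also runs on CPU-only machines:

from llm_script_generator import create_script_generator

gen = create_script_generator()
script = gen.generate_script(
    brand="EceMotion Pictures",
    structure="Montage → Close-up → Logo stinger",
    script_prompt="Remember when technology was simple?",
    duration=10,
    voice_style="Announcer '80s",
    seed=42,
)
for seg in script.segments:
    print(seg.timing_marker, seg.text, f"(~{seg.duration_estimate:.1f}s)")
print("TAGLINE:", script.tagline)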
promptkit.py
ADDED
@@ -0,0 +1,81 @@
"""
Legacy promptkit module for EceMotion Pictures.
Maintained for backward compatibility.
"""

from dataclasses import dataclass
from typing import Dict, List
import random

TAGLINES = [
    "So retro, it's the future.",
    "Pixels you can trust.",
    "VHS vibes. Modern results.",
    "Old-school cool. New-school sales.",
    "EceMotion Pictures - Bringing the '80s back to life.",
    "Your story, our vision, timeless memories.",
]

VOICE_STYLES = {
    "Announcer '80s": "A confident, upbeat 1980s TV announcer with warm AM-radio tone.",
    "Mall PA": "Casual, slightly echoey mall public-address vibe.",
    "Late Night": "Low energy, sly late-night infomercial style.",
    "News Anchor": "Professional, authoritative news anchor delivery.",
    "Infomercial": "Enthusiastic, persuasive infomercial host style.",
    "Radio DJ": "Smooth, charismatic radio disc jockey voice.",
}

STRUCTURE_TEMPLATES = [
    "Montage → Close-up → Logo stinger",
    "Before/After → Feature highlight → CTA",
    "Testimonial → B-roll → Price tag reveal",
    "Unboxing → Demo → Deal countdown",
    "Retro news bulletin → Product shot → Tagline",
    "Opening hook → Problem/Solution → Call to action",
    "Brand story → Product showcase → Final tagline",
]

@dataclass
class AdPlan:
    brand: str
    structure: str
    script_prompt: str
    duration: int
    voice_style: str
    seed: int

    def script(self) -> Dict[str, str]:
        random.seed(self.seed)
        tl = random.choice(TAGLINES)
        structure = self.structure.strip() or random.choice(STRUCTURE_TEMPLATES)
        # 4-beat VO using structure + script prompt
        beats = [
            f"HOOK: {self.brand} — {self.script_prompt}",
            f"FLOW: {structure}",
            "BENEFIT: Faster, simpler, cooler — like your favorite retro tech.",
            f"CTA: Try {self.brand} today. {tl}",
        ]
        vo = " ".join([b.split(': ', 1)[1] for b in beats])
        return {"lines": beats, "voiceover": vo, "tagline": tl}

def suggest_scripts(structure_prompt: str, n: int = 6, seed: int = 0) -> List[str]:
    """Return n short script ideas correlated with the structure prompt."""
    random.seed(seed)
    base = (structure_prompt or "").lower().strip()
    ideas = []
    for _ in range(n):
        style = random.choice(["infomercial", "mall ad", "late-night", "newsflash", "arcade bumper"])
        shot = random.choice(["neon grid", "CRT scanlines", "vaporwave sunset", "shopping mall", "boombox close-up"])
        hook = random.choice([
            "Remember this sound?", "Back to '87.", "Deal of the decade.", "We paused time.", "Be kind, rewind your brand.",
        ])
        idea = f"{hook} {style} with {shot}."
        # Light correlation: echo key words from structure prompt
        for kw in ["montage", "testimonial", "news", "unboxing", "before", "after", "countdown", "logo", "cta"]:
            if kw in base and kw not in idea:
                idea += f" Includes {kw}."
        ideas.append(idea)
    return ideas

def roll_script(structure_prompt: str, seed: int = 0) -> str:
    return random.choice(suggest_scripts(structure_prompt, n=6, seed=seed))
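For reference, the legacy API above is driven like this (values here are illustrative):

from promptkit import AdPlan, roll_script

plan = AdPlan(
    brand="EceMotion Pictures",
    structure="Unboxing → Demo → Deal countdown",
    script_prompt="Back to '87.",
    duration=10,
    voice_style="Announcer '80s",
    seed=7,
)
result = plan.script()
print(result["voiceover"])  # flat VO string, ready for TTS
print(result["tagline"])

# One random hook suggestion correlated with a structure prompt
print(roll_script("montage with logo stinger", seed=3))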
requirements.txt
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# EceMotion Pictures - Production Requirements
# Tested and verified versions for HuggingFace Spaces

# Core ML/AI libraries - stable versions
gradio==4.44.0
transformers==4.44.2
accelerate==0.34.0
diffusers==0.31.0
safetensors==0.4.3
sentencepiece==0.2.0
huggingface_hub==0.24.6

# PyTorch ecosystem - stable versions
torch==2.4.0
torchvision==0.19.0
torchaudio==2.4.0

# Video processing - stable versions
moviepy==1.0.3
imageio[ffmpeg]==2.34.0
ffmpeg-python==0.2.0

# Audio processing - stable versions
soundfile==0.12.1
librosa==0.10.2
scipy==1.11.4

# Data processing
numpy==1.26.4
pandas==2.2.0

# Configuration and validation
pydantic==2.8.0
python-dotenv==1.0.0

# Logging and monitoring
loguru==0.7.2

# Additional dependencies for HuggingFace Spaces
Pillow==10.2.0
opencv-python==4.9.0.80
matplotlib==3.8.4
seaborn==0.13.2

# Development and testing (optional)
pytest==8.0.0
black==24.0.0
flake8==7.0.0

# System dependencies (for HuggingFace Spaces)
# These are typically pre-installed but listed for completeness
# ffmpeg (system package)
# git (system package)
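
A quick import sanity check for the pins above; a sketch to run after pip install -r requirements.txt, not itself part of the commit:

# Confirm the core stack resolves and report CUDA visibility.
import torch, gradio, transformers, diffusers

print("torch", torch.__version__, "| cuda:", torch.cuda.is_available())
print("gradio", gradio.__version__)
print("transformers", transformers.__version__, "| diffusers", diffusers.__version__)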
sync_manager.py
ADDED
@@ -0,0 +1,381 @@
"""
Audio-Video Synchronization Manager for EceMotion Pictures.
Ensures frame-perfect alignment between generated audio and video content.
"""

import os
import tempfile
import subprocess
import numpy as np
import logging
from typing import Tuple, Optional, Dict, Any
from pathlib import Path
import shutil

from config import SYNC_TOLERANCE_MS, FORCE_SYNC, AUDIO_SAMPLE_RATE

logger = logging.getLogger(__name__)

class SyncManager:
    """Manages audio-video synchronization with frame-perfect accuracy."""

    def __init__(self, tolerance_ms: int = SYNC_TOLERANCE_MS):
        self.tolerance_ms = tolerance_ms
        self.tolerance_s = tolerance_ms / 1000.0
        self.ffmpeg_available = self._check_ffmpeg()

    def _check_ffmpeg(self) -> bool:
        """Check if ffmpeg is available."""
        try:
            subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
            return True
        except (subprocess.CalledProcessError, FileNotFoundError):
            logger.warning("ffmpeg not found, using fallback methods")
            return False

    def calculate_video_duration(self, num_frames: int, fps: float) -> float:
        """Calculate exact video duration from frame count and FPS."""
        return num_frames / fps

    def measure_audio_duration(self, audio_path: str) -> float:
        """Measure the actual duration of an audio file."""
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        if self.ffmpeg_available:
            return self._measure_with_ffmpeg(audio_path)
        else:
            return self._measure_with_soundfile(audio_path)

    def _measure_with_ffmpeg(self, audio_path: str) -> float:
        """Measure duration using ffprobe."""
        try:
            cmd = [
                'ffprobe', '-v', 'quiet', '-show_entries', 'format=duration',
                '-of', 'csv=p=0', audio_path
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            duration = float(result.stdout.strip())
            logger.info(f"Audio duration (ffmpeg): {duration:.3f}s")
            return duration
        except (subprocess.CalledProcessError, ValueError) as e:
            logger.error(f"Failed to measure audio duration with ffmpeg: {e}")
            return self._measure_with_soundfile(audio_path)

    def _measure_with_soundfile(self, audio_path: str) -> float:
        """Measure duration using soundfile as a fallback."""
        try:
            import soundfile as sf
            info = sf.info(audio_path)
            duration = info.duration
            logger.info(f"Audio duration (soundfile): {duration:.3f}s")
            return duration
        except Exception as e:
            logger.error(f"Failed to measure audio duration with soundfile: {e}")
            # Last resort: estimate from file size
            return self._estimate_duration_from_size(audio_path)

    def _estimate_duration_from_size(self, audio_path: str) -> float:
        """Estimate duration from file size (very rough heuristic)."""
        try:
            file_size = os.path.getsize(audio_path)
            # Very rough heuristic: assume ~1 MB per second of audio
            estimated_duration = file_size / (1024 * 1024)
            logger.warning(f"Estimated audio duration from file size: {estimated_duration:.3f}s")
            return estimated_duration
        except Exception as e:
            logger.error(f"Failed to estimate duration: {e}")
            return 10.0  # Default fallback

    def measure_video_duration(self, video_path: str) -> float:
        """Measure the actual duration of a video file."""
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")

        if self.ffmpeg_available:
            return self._measure_video_with_ffmpeg(video_path)
        else:
            return self._estimate_video_duration(video_path)

    def _measure_video_with_ffmpeg(self, video_path: str) -> float:
        """Measure video duration using ffprobe."""
        try:
            cmd = [
                'ffprobe', '-v', 'quiet', '-show_entries', 'format=duration',
                '-of', 'csv=p=0', video_path
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            duration = float(result.stdout.strip())
            logger.info(f"Video duration (ffmpeg): {duration:.3f}s")
            return duration
        except (subprocess.CalledProcessError, ValueError) as e:
            logger.error(f"Failed to measure video duration with ffmpeg: {e}")
            return self._estimate_video_duration(video_path)

    def _estimate_video_duration(self, video_path: str) -> float:
        """Estimate video duration (fallback method)."""
        try:
            # Try to get the duration from the filename, or use a default
            filename = os.path.basename(video_path)
            if '_' in filename:
                # Try to extract duration from a filename like "video_10s.mp4"
                parts = filename.split('_')
                for part in parts:
                    if 's' in part:
                        try:
                            duration = float(part.replace('s', ''))
                            logger.info(f"Estimated video duration from filename: {duration:.3f}s")
                            return duration
                        except ValueError:
                            continue

            # Default fallback
            logger.warning("Using default video duration estimate: 10.0s")
            return 10.0
        except Exception as e:
            logger.error(f"Failed to estimate video duration: {e}")
            return 10.0

    def adjust_audio_to_video(self, audio_path: str, target_duration: float,
                              output_path: str) -> str:
        """Adjust audio duration to match video duration."""
        if self.ffmpeg_available:
            return self._adjust_audio_with_ffmpeg(audio_path, target_duration, output_path)
        else:
            return self._adjust_audio_with_soundfile(audio_path, target_duration, output_path)

    def _adjust_audio_with_ffmpeg(self, audio_path: str, target_duration: float,
                                  output_path: str) -> str:
        """Adjust audio using ffmpeg (pad with silence, then trim to length)."""
        try:
            cmd = [
                'ffmpeg', '-i', audio_path, '-t', str(target_duration),
                '-af', 'apad', '-c:a', 'pcm_s16le', '-y', output_path
            ]
            subprocess.run(cmd, check=True, capture_output=True)
            logger.info(f"Adjusted audio to {target_duration:.3f}s using ffmpeg")
            return output_path
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to adjust audio with ffmpeg: {e}")
            return self._adjust_audio_with_soundfile(audio_path, target_duration, output_path)

    def _adjust_audio_with_soundfile(self, audio_path: str, target_duration: float,
                                     output_path: str) -> str:
        """Adjust audio using soundfile (fallback)."""
        try:
            import soundfile as sf

            # Read audio
            audio_data, sample_rate = sf.read(audio_path)

            # Calculate target samples
            target_samples = int(target_duration * sample_rate)

            if len(audio_data) < target_samples:
                # Pad with silence
                padding = np.zeros(target_samples - len(audio_data))
                if len(audio_data.shape) > 1:  # Stereo
                    padding = np.zeros((target_samples - len(audio_data), audio_data.shape[1]))
                adjusted_audio = np.concatenate([audio_data, padding])
            else:
                # Trim to target length
                adjusted_audio = audio_data[:target_samples]

            # Write adjusted audio
            sf.write(output_path, adjusted_audio, sample_rate)
            logger.info(f"Adjusted audio to {target_duration:.3f}s using soundfile")
            return output_path

        except Exception as e:
            logger.error(f"Failed to adjust audio with soundfile: {e}")
            # Last resort: just copy the file
            shutil.copy2(audio_path, output_path)
            return output_path

    def adjust_video_to_audio(self, video_path: str, target_duration: float,
                              output_path: str) -> str:
        """Adjust video duration to match audio duration."""
        if self.ffmpeg_available:
            return self._adjust_video_with_ffmpeg(video_path, target_duration, output_path)
        else:
            # Without ffmpeg we cannot easily retime video, so just copy
            shutil.copy2(video_path, output_path)
            return output_path

    def _adjust_video_with_ffmpeg(self, video_path: str, target_duration: float,
                                  output_path: str) -> str:
        """Adjust video using ffmpeg."""
        try:
            cmd = [
                'ffmpeg', '-i', video_path, '-t', str(target_duration),
                '-c:v', 'libx264', '-pix_fmt', 'yuv420p', '-y', output_path
            ]
            subprocess.run(cmd, check=True, capture_output=True)
            logger.info(f"Adjusted video to {target_duration:.3f}s using ffmpeg")
            return output_path
        except subprocess.CalledProcessError as e:
            logger.error(f"Failed to adjust video with ffmpeg: {e}")
            # Fallback: just copy
            shutil.copy2(video_path, output_path)
            return output_path

    def validate_sync(self, video_path: str, audio_path: str) -> Tuple[bool, float]:
        """Validate that audio and video durations agree within tolerance."""
        try:
            video_duration = self.measure_video_duration(video_path)
            audio_duration = self.measure_audio_duration(audio_path)

            duration_diff = abs(video_duration - audio_duration)
            is_synced = duration_diff <= self.tolerance_s

            logger.info(f"Sync validation: video={video_duration:.3f}s, "
                        f"audio={audio_duration:.3f}s, diff={duration_diff:.3f}s, "
                        f"synced={is_synced}")

            return is_synced, duration_diff

        except Exception as e:
            logger.error(f"Sync validation failed: {e}")
            return False, float('inf')

    def synchronize_media(self, video_path: str, audio_path: str,
                          output_path: str, prefer_audio_duration: bool = True) -> str:
        """
        Synchronize audio and video with frame-perfect accuracy.
        """
        try:
            # Measure durations
            video_duration = self.measure_video_duration(video_path)
            audio_duration = self.measure_audio_duration(audio_path)

            duration_diff = abs(video_duration - audio_duration)

            # Check if already synchronized
            if duration_diff <= self.tolerance_s:
                logger.info("Media already synchronized, copying to output")
                self._copy_media(video_path, audio_path, output_path)
                return output_path

            # Determine target duration
            if prefer_audio_duration:
                target_duration = audio_duration
                logger.info(f"Adjusting video to match audio duration: {target_duration:.3f}s")
            else:
                target_duration = video_duration
                logger.info(f"Adjusting audio to match video duration: {target_duration:.3f}s")

            # Create temporary files for adjustments
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_video = os.path.join(temp_dir, "temp_video.mp4")
                temp_audio = os.path.join(temp_dir, "temp_audio.wav")

                # Adjust durations
                if prefer_audio_duration:
                    self.adjust_video_to_audio(video_path, target_duration, temp_video)
                    temp_audio = audio_path  # Use original audio
                else:
                    self.adjust_audio_to_video(audio_path, target_duration, temp_audio)
                    temp_video = video_path  # Use original video

                # Mux synchronized media
                self._mux_media(temp_video, temp_audio, output_path)

                # Validate the adjusted sources that were muxed. (Probing the
                # muxed file against itself would compare the same container
                # duration twice and always report perfect sync.)
                is_synced, final_diff = self.validate_sync(temp_video, temp_audio)
                if not is_synced and FORCE_SYNC:
                    logger.warning(f"Final sync validation failed with diff {final_diff:.3f}s")
                else:
                    logger.info("Media successfully synchronized")

            return output_path

        except Exception as e:
            logger.error(f"Synchronization failed: {e}")
            # Fallback: just copy video without audio
            shutil.copy2(video_path, output_path)
            return output_path

    def _copy_media(self, video_path: str, audio_path: str, output_path: str):
        """Copy and mux media without duration adjustment."""
        self._mux_media(video_path, audio_path, output_path)

    def _mux_media(self, video_path: str, audio_path: str, output_path: str):
        """Mux video and audio with precise timing."""
        if self.ffmpeg_available:
            self._mux_with_ffmpeg(video_path, audio_path, output_path)
        else:
            self._mux_with_moviepy(video_path, audio_path, output_path)

    def _mux_with_ffmpeg(self, video_path: str, audio_path: str, output_path: str):
        """Mux using ffmpeg."""
        try:
            cmd = [
                'ffmpeg', '-i', video_path, '-i', audio_path,
                '-c:v', 'copy', '-c:a', 'aac', '-b:a', '128k',
                '-shortest', '-fflags', '+shortest',
                '-movflags', '+faststart', '-y', output_path
            ]
            subprocess.run(cmd, check=True, capture_output=True)
            logger.info("Media successfully muxed with ffmpeg")
        except subprocess.CalledProcessError as e:
            logger.error(f"Media muxing with ffmpeg failed: {e}")
            self._mux_with_moviepy(video_path, audio_path, output_path)

    def _mux_with_moviepy(self, video_path: str, audio_path: str, output_path: str):
        """Mux using moviepy (fallback)."""
        try:
            from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips

            # Load video and audio
            video = VideoFileClip(video_path)
            audio = AudioFileClip(audio_path)

            # Set audio duration to match video
            if audio.duration > video.duration:
                audio = audio.subclip(0, video.duration)
            elif audio.duration < video.duration:
                # Pad audio with silence (fps is required so the silence can
                # be rendered; concatenation is a module-level function, not a
                # method on AudioFileClip)
                from moviepy.audio.AudioClip import AudioClip
                silence = AudioClip(lambda t: 0, duration=video.duration - audio.duration,
                                    fps=audio.fps)
                audio = concatenate_audioclips([audio, silence])

            # Combine and write
            final_video = video.set_audio(audio)
            final_video.write_videofile(
                output_path,
                codec='libx264',
                audio_codec='aac',
                temp_audiofile='temp-audio.m4a',
                remove_temp=True,
                verbose=False,
                logger=None
            )

            # Clean up
            video.close()
            audio.close()
            final_video.close()

            logger.info("Media successfully muxed with moviepy")

        except Exception as e:
            logger.error(f"Media muxing with moviepy failed: {e}")
            # Last resort: just copy video
            shutil.copy2(video_path, output_path)

    def get_optimal_frame_count(self, target_duration: float, fps: float) -> int:
        """Calculate the optimal frame count for a target duration."""
        frame_count = int(target_duration * fps)
        # Keep the frame count in a reasonable range (1-8 seconds at 8 fps)
        frame_count = max(8, min(frame_count, 64))
        return frame_count

    def estimate_audio_duration(self, text: str, words_per_minute: int = 150) -> float:
        """Estimate audio duration from text length."""
        word_count = len(text.split())
        duration_minutes = word_count / words_per_minute
        return duration_minutes * 60.0  # Convert to seconds

def create_sync_manager() -> SyncManager:
    """Factory function to create a SyncManager instance."""
    return SyncManager()
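
A minimal sketch of the intended SyncManager call pattern (the file paths here are hypothetical):

from sync_manager import create_sync_manager

sm = create_sync_manager()
# 4 s at 8 fps -> 32 frames, clamped to the 8..64 range enforced above.
n_frames = sm.get_optimal_frame_count(target_duration=4.0, fps=8.0)
# Retime the video to the voiceover length, then mux into the final file.
final = sm.synchronize_media("raw_video.mp4", "voiceover.wav", "final_ad.mp4",
                             prefer_audio_duration=True)
ok, diff = sm.validate_sync("raw_video.mp4", "voiceover.wav")
print(n_frames, final, ok, f"{diff:.3f}s")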
utils_audio.py
ADDED
@@ -0,0 +1,292 @@
"""
Audio processing utilities for EceMotion Pictures.
Enhanced text-to-speech generation with robust error handling and fallbacks.
"""

import numpy as np
import logging
import os
from typing import Tuple, Optional, Dict, Any

from config import (
    MODEL_AUDIO, MODEL_CONFIGS, AUDIO_SAMPLE_RATE, get_device, get_safe_model_name
)

logger = logging.getLogger(__name__)

# Global model cache
_tts_pipe = None
_current_tts_model = None

def get_tts_pipe(model_name: str = MODEL_AUDIO, device: str = None):
    """Get or create the TTS pipeline with lazy loading and model switching."""
    global _tts_pipe, _current_tts_model

    if device is None:
        device = get_device()

    # Use safe model name
    safe_model_name = get_safe_model_name(model_name, "audio")

    if _tts_pipe is None or _current_tts_model != safe_model_name:
        logger.info(f"Loading TTS model: {safe_model_name}")

        try:
            if "f5-tts" in safe_model_name.lower():
                # Try F5-TTS first
                _tts_pipe = _load_f5_tts(safe_model_name, device)
            else:
                # Use the standard TTS pipeline
                _tts_pipe = _load_standard_tts(safe_model_name, device)

            if _tts_pipe is not None:
                _current_tts_model = safe_model_name
                logger.info(f"TTS model {safe_model_name} loaded successfully")
            else:
                raise RuntimeError("Failed to load any TTS model")

        except Exception as e:
            logger.error(f"Failed to load {safe_model_name}: {e}")
            # Fall back to the default model
            _tts_pipe = _load_standard_tts("parler-tts/parler-tts-mini-v1", device)
            _current_tts_model = "parler-tts/parler-tts-mini-v1"

    return _tts_pipe

def _load_f5_tts(model_name: str, device: str):
    """Load an F5-TTS model."""
    try:
        from transformers import pipeline

        pipe = pipeline(
            "text-to-speech",
            model=model_name,
            torch_dtype="auto",
            device_map=device if device == "cuda" else None
        )

        return pipe

    except Exception as e:
        logger.error(f"Failed to load F5-TTS: {e}")
        return None

def _load_standard_tts(model_name: str, device: str):
    """Load a standard TTS model."""
    try:
        from transformers import pipeline

        # transformers pipelines take the device at construction time; they
        # do not expose a .to() method the way raw models do.
        pipe = pipeline(
            "text-to-speech",
            model=model_name,
            torch_dtype="auto",
            device=0 if device == "cuda" else -1
        )

        return pipe

    except Exception as e:
        logger.error(f"Failed to load standard TTS: {e}")
        return None

def synth_voice(text: str, voice_prompt: str, sr: int = AUDIO_SAMPLE_RATE,
                model_name: str = MODEL_AUDIO, device: str = None) -> Tuple[int, np.ndarray]:
    """
    Generate speech from text with enhanced TTS support.
    """
    if device is None:
        device = get_device()

    tts = get_tts_pipe(model_name, device)
    model_config = MODEL_CONFIGS.get(_current_tts_model, {})

    # Validate text length
    max_length = model_config.get("max_text_length", 500)
    min_length = model_config.get("min_text_length", 10)

    if len(text) > max_length:
        logger.warning(f"Text too long ({len(text)} chars), truncating to {max_length}")
        text = text[:max_length]
    elif len(text) < min_length:
        logger.warning(f"Text too short ({len(text)} chars), padding")
        text = text + " " * (min_length - len(text))

    try:
        if "f5-tts" in _current_tts_model.lower():
            # F5-TTS specific generation
            result = tts(
                text=text,
                voice_preset=voice_prompt,
                return_tensors="pt"
            )
            wav = result["audio"].numpy().flatten()
        else:
            # Standard pipeline (Parler-TTS, etc.). Prompt-parameter names vary
            # by model, which is why this call is guarded by the except below.
            result = tts({"text": text, "voice_preset": voice_prompt})
            wav = result["audio"]

        # Ensure proper format
        if hasattr(wav, 'numpy'):
            wav = wav.numpy()
        elif hasattr(wav, 'detach'):
            wav = wav.detach().numpy()

        # Normalize audio
        wav = normalize_audio(wav)

        # Resample if needed, using the model's reported rate when available
        src_sr = result.get("sampling_rate", AUDIO_SAMPLE_RATE) if isinstance(result, dict) else AUDIO_SAMPLE_RATE
        if sr != src_sr:
            wav = _resample_audio(wav, src_sr, sr)

        logger.info(f"Generated audio: {len(wav)/sr:.2f}s at {sr}Hz")
        return sr, wav.astype(np.float32)

    except Exception as e:
        logger.error(f"Voice synthesis failed: {e}")
        # Return fallback audio
        return _create_fallback_audio(text, sr)

def _resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample audio using the best available method."""
    try:
        import librosa
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    except ImportError:
        # Simple linear-interpolation resampling without librosa
        ratio = target_sr / orig_sr
        new_length = int(len(audio) * ratio)
        return np.interp(
            np.linspace(0, len(audio), new_length),
            np.arange(len(audio)),
            audio
        )

def _create_fallback_audio(text: str, sr: int) -> Tuple[int, np.ndarray]:
    """Create fallback audio when TTS fails."""
    try:
        # Create a simple tone whose length is based on the text length
        duration = max(1.0, len(text) / 20.0)  # Rough estimate
        t = np.linspace(0, duration, int(sr * duration), endpoint=False)

        # Generate a simple tone
        frequency = 440.0  # A4 note
        wav = 0.1 * np.sin(2 * np.pi * frequency * t)

        # Add some variation
        wav += 0.05 * np.sin(2 * np.pi * frequency * 1.5 * t)

        logger.info(f"Created fallback audio: {duration:.2f}s")
        return sr, wav.astype(np.float32)

    except Exception as e:
        logger.error(f"Failed to create fallback audio: {e}")
        # Last resort: silence
        duration = 2.0
        wav = np.zeros(int(sr * duration))
        return sr, wav.astype(np.float32)

def normalize_audio(audio: np.ndarray, target_lufs: float = -23.0) -> np.ndarray:
    """Normalize audio toward broadcast levels (peak normalization plus gentle
    compression as a cheap stand-in for true LUFS loudness normalization)."""
    # Simple peak normalization first
    if np.max(np.abs(audio)) > 0:
        audio = audio / np.max(np.abs(audio)) * 0.95

    # Apply gentle compression
    audio = apply_compression(audio)

    return audio

def apply_compression(audio: np.ndarray, ratio: float = 3.0, threshold: float = 0.7) -> np.ndarray:
    """Apply gentle compression for broadcast quality."""
    # Simple soft-knee compression
    compressed = np.copy(audio)

    # Above the threshold, apply compression
    above_threshold = np.abs(audio) > threshold
    compressed[above_threshold] = np.sign(audio[above_threshold]) * (
        threshold + (np.abs(audio[above_threshold]) - threshold) / ratio
    )

    return compressed

def retro_bed(duration_s: float, sr: int = AUDIO_SAMPLE_RATE, bpm: int = 92):
    """Generate a retro synth background music bed."""
    try:
        t = np.linspace(0, duration_s, int(sr * duration_s), endpoint=False)

        # Chord progression root frequencies (A minor style)
        freqs = [220.0, 174.61, 196.0, 146.83]
        seg_len = int(len(t) / len(freqs)) if len(freqs) else len(t)
        sig = np.zeros_like(t)

        for i, f0 in enumerate(freqs):
            tri_t = t[i * seg_len:(i + 1) * seg_len]
            tri = 2 * np.abs(2 * ((tri_t * f0) % 1) - 1) - 1
            sig[i * seg_len:(i + 1) * seg_len] = 0.15 * tri

        # Add tape noise
        noise = 0.01 * np.random.randn(len(t))
        bed = sig + noise

        # Apply a gentle lowpass filter
        try:
            from scipy import signal
            b, a = signal.butter(3, 3000, 'low', fs=sr)
            bed = signal.lfilter(b, a, bed)
        except ImportError:
            # Simple moving-average filter if scipy is not available
            bed = np.convolve(bed, np.ones(5) / 5, mode='same')

        return sr, bed.astype(np.float32)

    except Exception as e:
        logger.error(f"Failed to generate retro bed: {e}")
        # Return silence
        silence = np.zeros(int(sr * duration_s))
        return sr, silence.astype(np.float32)

def mix_to_stereo(sr1, a, sr2, b, bed_gain=0.5):
    """Mix two mono signals into a stereo pair."""
    assert sr1 == sr2, "Sample rates must match"

    n = max(len(a), len(b))

    def pad(x):
        if len(x) < n:
            if len(x.shape) > 1:  # Stereo
                padding = np.zeros((n - len(x), x.shape[1]))
            else:  # Mono
                padding = np.zeros(n - len(x))
            x = np.concatenate([x, padding])
        return x

    a = pad(a)
    b = pad(b)

    left = a + bed_gain * b
    right = a * 0.9 + bed_gain * 0.9 * b

    # Stack the two channels (the original branched on left.shape here, but
    # both branches performed the same stack)
    stereo = np.stack([left, right], axis=1)

    return sr1, np.clip(stereo, -1.0, 1.0)

def write_wav(path: str, sr: int, wav: np.ndarray):
    """Write audio to a WAV file."""
    try:
        import soundfile as sf
        sf.write(path, wav, sr)
    except ImportError:
        # Fallback using scipy
        try:
            from scipy.io import wavfile
            # Convert to 16-bit
            wav_16bit = (wav * 32767).astype(np.int16)
            wavfile.write(path, sr, wav_16bit)
        except ImportError:
            logger.error("No audio writing library available (soundfile or scipy)")
            raise RuntimeError("Cannot write audio file - no audio library available")
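
An end-to-end sketch of the audio path using only the helpers above (the VO line and voice prompt are invented examples):

from utils_audio import synth_voice, retro_bed, mix_to_stereo, write_wav

sr, vo = synth_voice("Try RetroCo today.", voice_prompt="warm 1980s announcer")
_, bed = retro_bed(duration_s=len(vo) / sr, sr=sr)  # bed matched to the VO length
sr, stereo = mix_to_stereo(sr, vo, sr, bed, bed_gain=0.4)
write_wav("ad_audio.wav", sr, stereo)
print(stereo.shape)  # (n_samples, 2)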
utils_video.py
ADDED
@@ -0,0 +1,336 @@
"""
Video processing utilities for EceMotion Pictures.
Enhanced text-to-video generation with robust error handling and fallbacks.
"""

import numpy as np
import logging
import os
import shutil
from typing import Optional, Tuple, List
from pathlib import Path

from config import (
    MODEL_VIDEO, MODEL_CONFIGS, get_device, VHS_INTENSITY, SCANLINE_OPACITY,
    CHROMATIC_ABERRATION, FILM_GRAIN, get_safe_model_name
)

logger = logging.getLogger(__name__)

# Global model cache
t2v_pipe = None
current_model = None

def get_t2v_pipe(device: str, model_name: str = MODEL_VIDEO):
    """Get or create the T2V pipeline with lazy loading and model switching."""
    global t2v_pipe, current_model

    # Use safe model name
    safe_model_name = get_safe_model_name(model_name, "video")

    if t2v_pipe is None or current_model != safe_model_name:
        logger.info(f"Loading T2V model: {safe_model_name}")

        try:
            if "cogvideox" in safe_model_name.lower():
                # Try CogVideoX first
                t2v_pipe = _load_cogvideox(safe_model_name, device)
            else:
                # Use the standard diffusers pipeline
                t2v_pipe = _load_standard_t2v(safe_model_name, device)

            if t2v_pipe is not None:
                current_model = safe_model_name
                logger.info(f"T2V model {safe_model_name} loaded successfully")
            else:
                raise RuntimeError("Failed to load any T2V model")

        except Exception as e:
            logger.error(f"Failed to load {safe_model_name}: {e}")
            # Fall back to the default model
            t2v_pipe = _load_standard_t2v("damo-vilab/text-to-video-ms-1.7b", device)
            current_model = "damo-vilab/text-to-video-ms-1.7b"

    return t2v_pipe

def _load_cogvideox(model_name: str, device: str):
    """Load a CogVideoX model."""
    try:
        from diffusers import CogVideoXPipeline

        pipe = CogVideoXPipeline.from_pretrained(
            model_name,
            torch_dtype="auto",
            trust_remote_code=True
        )

        if device == "cuda":
            pipe = pipe.to(device)

        return pipe

    except Exception as e:
        logger.error(f"Failed to load CogVideoX: {e}")
        return None

def _load_standard_t2v(model_name: str, device: str):
    """Load a standard T2V model."""
    try:
        from diffusers import TextToVideoSDPipeline

        pipe = TextToVideoSDPipeline.from_pretrained(
            model_name,
            torch_dtype="auto"
        )

        if device == "cuda":
            pipe = pipe.to(device)

        return pipe

    except Exception as e:
        logger.error(f"Failed to load standard T2V: {e}")
        return None

def synth_t2v(prompt: str, seed: int, num_frames: int = 32, fps: int = 8,
              device: str = None, model_name: str = MODEL_VIDEO):
    """
    Generate text-to-video with enhanced model support and frame control.
    """
    if device is None:
        device = get_device()

    pipe = get_t2v_pipe(device, model_name)
    model_config = MODEL_CONFIGS.get(current_model, {})

    # Validate frame count against model limits
    max_frames = model_config.get("max_frames", 32)
    min_frames = model_config.get("min_frames", 8)
    num_frames = max(min_frames, min(num_frames, max_frames))

    logger.info(f"Generating {num_frames} frames at {fps}fps with {current_model}")

    try:
        # Set up generator
        import torch
        generator = torch.Generator(device=device).manual_seed(seed)

        # Generate frames based on model type
        if "cogvideox" in current_model.lower():
            # CogVideoX specific generation
            result = pipe(
                prompt=prompt,
                num_frames=num_frames,
                generator=generator,
                guidance_scale=7.5,
                num_inference_steps=20
            )
            frames = result.frames[0]  # diffusers returns a batch; take the first video
        else:
            # Standard pipeline
            result = pipe(
                prompt=prompt,
                num_frames=num_frames,
                generator=generator
            )
            frames = result.frames[0]  # diffusers returns a batch; take the first video

        # Convert to uint8 numpy arrays (pipelines may return PIL images or
        # float arrays in [0, 1], and ImageSequenceClip expects uint8)
        frame_arrays = []
        for frame in frames:
            arr = np.asarray(frame)
            if arr.dtype != np.uint8:
                arr = (np.clip(arr, 0.0, 1.0) * 255).astype(np.uint8)
            frame_arrays.append(arr)

        # Create the clip using moviepy
        from moviepy.editor import ImageSequenceClip
        clip = ImageSequenceClip(frame_arrays, fps=fps)

        logger.info(f"Generated video clip: {clip.duration:.2f}s, {len(frame_arrays)} frames")
        return clip

    except Exception as e:
        logger.error(f"Video generation failed: {e}")
        # Return a simple fallback clip
        return _create_fallback_clip(prompt, num_frames, fps)

def _create_fallback_clip(prompt: str, num_frames: int, fps: int):
    """Create a simple fallback clip when video generation fails."""
    try:
        from moviepy.editor import ColorClip, TextClip, CompositeVideoClip

        # Create a simple colored background
        background = ColorClip(size=(640, 480), color=(100, 50, 200), duration=num_frames / fps)

        # Add a text overlay (TextClip requires ImageMagick, hence the fallback below)
        text = TextClip(
            prompt[:50] + "..." if len(prompt) > 50 else prompt,
            fontsize=24,
            color='white',
            font='Arial-Bold'
        ).set_position('center').set_duration(num_frames / fps)

        # Composite the clips
        clip = CompositeVideoClip([background, text])

        logger.info(f"Created fallback clip: {clip.duration:.2f}s")
        return clip

    except Exception as e:
        logger.error(f"Failed to create fallback clip: {e}")
        # Last resort: create a plain color clip
        from moviepy.editor import ColorClip
        return ColorClip(size=(640, 480), color=(100, 50, 200), duration=5.0)

def apply_retro_filters(input_path: str, output_path: str, intensity: float = VHS_INTENSITY):
    """
    Apply authentic VHS/CRT effects with enhanced visual artifacts.
    """
    logger.info(f"Applying retro filters with intensity {intensity}")

    # Check if ffmpeg is available
    if not _check_ffmpeg():
        logger.warning("ffmpeg not available, using simple filter")
        _apply_simple_retro_filters(input_path, output_path)
        return

    try:
        # Build a filter chain for an authentic VHS look
        filters = []

        # 1. Format conversion
        filters.append('format=yuv420p')

        # 2. Basic color grading for the retro look
        filters.append(f'hue=s={0.8 + 0.2 * intensity}')
        filters.append(f'eq=brightness={0.02 * intensity}:contrast={1.0 + 0.1 * intensity}:saturation={1.0 + 0.2 * intensity}:gamma={1.0 - 0.05 * intensity}')

        # 3. VHS tracking lines and noise
        if intensity > 0.3:
            filters.append(f'tblend=all_mode=difference:all_opacity={0.05 * intensity}')
            filters.append(f'noise=alls={int(20 * intensity)}:allf=t')

        # 4. Film grain
        if FILM_GRAIN > 0:
            grain = FILM_GRAIN * intensity
            filters.append(f'noise=alls={int(15 * grain)}:allf=u')

        # 5. Vignetting (the vignette filter has no strength option, so only
        # the angle is set; a second positional argument would be x0)
        filters.append('vignette=angle=PI/4')

        # Apply the chained simple filters through a single -vf option;
        # ffmpeg-python streams have no filter_complex() method.
        import ffmpeg

        stream = ffmpeg.input(input_path)
        stream = stream.output(
            output_path,
            vf=','.join(filters),
            vcodec='libx264',
            pix_fmt='yuv420p',
            crf=20,  # Good quality
            preset='medium',
            movflags='+faststart'
        )

        stream.overwrite_output().run(quiet=True)
        logger.info("Retro filters applied successfully")

    except Exception as e:
        logger.error(f"Failed to apply retro filters: {e}")
        # Fall back to the simple filter
        _apply_simple_retro_filters(input_path, output_path)

def _check_ffmpeg() -> bool:
    """Check if ffmpeg is available."""
    try:
        import subprocess
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False

def _apply_simple_retro_filters(input_path: str, output_path: str):
    """Fallback simple retro filter application."""
    try:
        import ffmpeg

        (
            ffmpeg
            .input(input_path)
            .filter('format', 'yuv420p')
            .filter('tblend', all_mode='difference', all_opacity=0.05)
            .filter('hue', s=0.9)
            .filter('eq', brightness=0.02, contrast=1.05, saturation=1.1, gamma=0.98)
            .filter('noise', alls=10)
            .output(output_path, vcodec='libx264', pix_fmt='yuv420p', crf=20, movflags='+faststart')
            .overwrite_output()
            .run(quiet=True)
        )
        logger.info("Simple retro filters applied as fallback")
    except Exception as e:
        logger.error(f"Even the simple retro filters failed: {e}")
        # Just copy the file
        shutil.copy2(input_path, output_path)

def mux_audio(video_in: str, audio_in: str, out_path: str):
    """Mux video and audio with error handling."""
    try:
        if _check_ffmpeg():
            _mux_with_ffmpeg(video_in, audio_in, out_path)
        else:
            _mux_with_moviepy(video_in, audio_in, out_path)
    except Exception as e:
        logger.error(f"Audio muxing failed: {e}")
        # Fallback: just copy video
        shutil.copy2(video_in, out_path)

def _mux_with_ffmpeg(video_in: str, audio_in: str, out_path: str):
    """Mux using ffmpeg."""
    import ffmpeg

    # Two inputs must be opened independently and both passed to output();
    # chaining .input() on an existing stream is not part of the
    # ffmpeg-python API.
    video = ffmpeg.input(video_in)
    audio = ffmpeg.input(audio_in)
    (
        ffmpeg
        .output(video, audio, out_path, vcodec='copy', acodec='aac',
                audio_bitrate='128k', movflags='+faststart')
        .overwrite_output()
        .run(quiet=True)
    )

def _mux_with_moviepy(video_in: str, audio_in: str, out_path: str):
    """Mux using moviepy (fallback)."""
    from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_audioclips

    # Load video and audio
    video = VideoFileClip(video_in)
    audio = AudioFileClip(audio_in)

    # Set audio duration to match video
    if audio.duration > video.duration:
        audio = audio.subclip(0, video.duration)
    elif audio.duration < video.duration:
        # Pad audio with silence (fps is required so the silence can be
        # rendered; concatenation is a module-level function, not a method)
        from moviepy.audio.AudioClip import AudioClip
        silence = AudioClip(lambda t: 0, duration=video.duration - audio.duration,
                            fps=audio.fps)
        audio = concatenate_audioclips([audio, silence])

    # Combine and write
    final_video = video.set_audio(audio)
    final_video.write_videofile(
        out_path,
        codec='libx264',
        audio_codec='aac',
        temp_audiofile='temp-audio.m4a',
        remove_temp=True,
        verbose=False,
        logger=None
    )

    # Clean up
    video.close()
    audio.close()
    final_video.close()