File size: 15,451 Bytes
ef107f6
 
8b05224
 
 
76e06c0
8b05224
 
76e06c0
 
 
8b05224
 
 
 
 
 
 
 
 
 
449544a
 
 
 
 
 
 
8b05224
76e06c0
8b05224
 
76e06c0
 
 
 
 
 
8b05224
449544a
 
 
 
a5854e9
d269828
449544a
ac57303
449544a
 
ac57303
449544a
 
ac57303
449544a
ac57303
 
 
 
 
449544a
76e06c0
 
8b05224
 
76e06c0
8b05224
 
 
 
 
76e06c0
 
 
 
 
 
8b05224
76e06c0
8b05224
76e06c0
 
8b05224
 
 
 
 
 
76e06c0
8b05224
 
 
 
 
76e06c0
8b05224
 
 
0940c19
 
 
8b05224
449544a
8b05224
449544a
 
76e06c0
 
 
 
 
 
 
 
 
 
 
8b05224
 
76e06c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0940c19
 
 
8b05224
 
 
76e06c0
 
8b05224
0940c19
8b05224
0940c19
 
 
76e06c0
8b05224
76e06c0
 
 
 
8b05224
0940c19
76e06c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b05224
 
76e06c0
8b05224
 
76e06c0
8b05224
76e06c0
 
 
 
 
8b05224
76e06c0
 
 
 
 
8b05224
76e06c0
 
8b05224
76e06c0
 
8b05224
 
 
 
 
76e06c0
8b05224
76e06c0
8b05224
 
76e06c0
8b05224
76e06c0
 
 
 
8b05224
76e06c0
 
 
 
 
8b05224
76e06c0
 
 
 
 
 
 
 
 
 
 
 
 
8b05224
76e06c0
 
8b05224
 
 
76e06c0
 
8b05224
76e06c0
 
 
 
 
8b05224
76e06c0
 
8b05224
76e06c0
 
 
 
 
8b05224
76e06c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b05224
 
76e06c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b05224
76e06c0
9326a98
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#app.py
from __future__ import annotations 
import gradio as gr
import os
import shutil
import datetime
from typing import List, Optional

# ──────────────────────────────────────────────────────────────────────────────
# Import project‑specific helpers β€” unchanged from initial version
# ──────────────────────────────────────────────────────────────────────────────
from scripts.generate_scripts import generate_script, generate_title, generate_description
from scripts.generate_voice import generate_voice
from scripts.get_footage import get_video_montage_from_folder
from scripts.edit_video import edit_video
from scripts.generate_subtitles import (
    transcribe_audio_to_subs,
    chunk_text_by_words,
    add_subtitles_to_video,
)

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

# ──────────────────────────────────────────────────────────────────────────────
# Constants & utilities
# ──────────────────────────────────────────────────────────────────────────────
WORDS_PER_SECOND = 2.3  # ≃ 140 wpm; used to size the generated script to the target duration
ASSETS_DIRS = (
    "./assets/audio",
    "./assets/backgrounds",
    "./assets/output",
    "./assets/video_music",
)


# ────────────────────────────────────────────────────────
#   CONFIGURATION
# ────────────────────────────────────────────────────────
# Model is overridable via the MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-4B")
DTYPE    = torch.float16                          # half-precision compute dtype (bfloat16 is a common alternative)

print(f"πŸ”„ Loading {MODEL_ID} (dtype = {DTYPE}) …")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# NOTE(review): BitsAndBytesConfig is imported above but never used here —
# the model loads in plain fp16 with no quantization. Confirm that is intended.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    trust_remote_code=True,
)
model.to("cuda" if torch.cuda.is_available() else "cpu")  # single-device move

# Device actually holding the weights, derived from the model itself.
DEVICE = next(model.parameters()).device
print(f"βœ… Model ready on {DEVICE}.")

# Ensure every asset folder exists before any pipeline step writes into it.
for d in ASSETS_DIRS:
    os.makedirs(d, exist_ok=True)

def safe_copy(src: str, dst: str) -> str:
    """Copy *src* to *dst* and return the resulting path.

    When both arguments resolve to the same absolute path the copy is
    skipped and *src* is returned untouched.
    """
    same_location = os.path.abspath(src) == os.path.abspath(dst)
    if same_location:
        return src
    shutil.copy(src, dst)
    return dst

# Wrapper util to timestamp generated files so different runs don't overwrite each other

def timestamped_filename(prefix: str, ext: str) -> str:
    """Build a per-run output path: ./assets/output/<prefix>_<YYYYmmdd_HHMMSS>.<ext>."""
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{prefix}_{stamp}.{ext}"
    return os.path.join("./assets/output", filename)

# ──────────────────────────────────────────────────────────────────────────────
# Independent functional endpoints (Gradio callbacks)
# ──────────────────────────────────────────────────────────────────────────────

def cb_generate_script(
    context: str,
    instruction: str,
    target_duration: int,
    script_mode: str,
    custom_script: Optional[str],
):
    """Produce the working script plus an AI-generated title and description.

    In "Use my script" mode the user's text is taken verbatim; otherwise the
    LLM writes a script sized to *target_duration* seconds at
    WORDS_PER_SECOND words per second.
    """
    approx_words = int(target_duration * WORDS_PER_SECOND)

    if script_mode == "Use my script":
        if not (custom_script and custom_script.strip()):
            raise gr.Error("❌ You selected 'Use my script' but the script field is empty!")
        script = custom_script.strip()
    else:
        prompt = (
            f"You are a video creation expert. Here is the context: {context.strip()}\n"
            f"Instruction: {instruction.strip()}\n"
            f"πŸ”΄ Strict target duration: {target_duration}s β€” β‰ˆ {approx_words} words (must be respected)."
        )
        script = generate_script(model, tokenizer, prompt)

    title = generate_title(model, tokenizer, script)
    description = generate_description(model, tokenizer, script)
    # The trailing copy of the script refreshes the shared Gradio state.
    return script, title, description, script


def cb_generate_voice(script: str):
    """Synthesize narration audio for *script*.

    Returns the MP3 path twice: once for the audio player widget and once
    to update the shared voice state.
    """
    has_text = bool(script and script.strip())
    if not has_text:
        raise gr.Error("❌ Script text is empty – generate or paste a script first.")

    voice_path = timestamped_filename("voice", "mp3")
    generate_voice(script, voice_path)
    return voice_path, voice_path


def accumulate_files(new: List[str], state: List[str] | None):
    """Merge newly uploaded paths into *state*.

    Only existing, not-yet-listed ``.mp4`` files are kept; everything else
    (non-strings, duplicates, missing files, other extensions) is ignored.
    """
    collected = state or []
    for candidate in new or []:
        if not isinstance(candidate, str):
            continue
        if not candidate.lower().endswith(".mp4"):
            continue
        if candidate in collected or not os.path.isfile(candidate):
            continue
        collected.append(candidate)
    return collected


def cb_create_montage(
    accumulated_videos: List[str],
    voice_path: str,
    lum: float,
    contrast: float,
    gamma: float,
    show_bar: bool,
):
    """Create the background-video montage synced to the narration audio.

    Clears ./assets/backgrounds, copies the selected clips into it with
    stable zero-padded names, then delegates assembly to
    ``get_video_montage_from_folder``.

    Returns the montage path twice: once for the preview widget and once
    to update the shared montage state.
    """
    if not accumulated_videos:
        raise gr.Error("❌ Please upload at least one background video (.mp4) before generating the montage.")
    if not voice_path or not os.path.isfile(voice_path):
        raise gr.Error("❌ A narration audio file (.mp3) is required – generate or upload one first.")

    # Clean previous backgrounds, then copy new ones
    for f in os.listdir("./assets/backgrounds"):
        if f.lower().endswith(".mp4"):
            os.remove(os.path.join("./assets/backgrounds", f))
    for idx, v in enumerate(accumulated_videos):
        safe_copy(v, os.path.join("./assets/backgrounds", f"video_{idx:03d}.mp4"))

    # Fix: the previous version pre-computed a timestamped path that was
    # immediately overwritten — get_video_montage_from_folder saves the file
    # itself and returns its path, so capture that return value directly.
    montage_path = get_video_montage_from_folder(
        folder_path="./assets/backgrounds",
        audio_path=voice_path,
        output_dir="./assets/video_music",
        lum=lum,
        contrast=contrast,
        gamma=gamma,
        show_progress_bar=show_bar,
    )
    return montage_path, montage_path


def cb_mix_audio(
    montage_path: str,
    voice_path: str,
    music_file: Optional[str] = None,
):
    """Mux the montage video with the narration track and optional music.

    Returns the mixed-video path twice: once for the preview widget and
    once to update the shared state.
    """
    montage_ok = bool(montage_path) and os.path.isfile(montage_path)
    if not montage_ok:
        raise gr.Error("❌ Please generate a montage video first.")
    voice_ok = bool(voice_path) and os.path.isfile(voice_path)
    if not voice_ok:
        raise gr.Error("❌ Narration audio missing – generate or upload it.")

    # Music is strictly optional — fall back to None when absent or invalid.
    music_path = None
    if music_file and os.path.isfile(music_file):
        music_path = music_file

    final_no_subs = timestamped_filename("final_no_subs", "mp4")
    edit_video(montage_path, voice_path, music_path, final_no_subs)
    return final_no_subs, final_no_subs


def cb_add_subtitles(final_no_subs: str, voice_path: str):
    """Burn word-chunked subtitles (3 words per cue) onto the mixed video."""
    if not (final_no_subs and os.path.isfile(final_no_subs)):
        raise gr.Error("❌ Mixed video not found – run the 'Mix Audio/Video' step first.")
    if not (voice_path and os.path.isfile(voice_path)):
        raise gr.Error("❌ Narration audio missing.")

    # Transcribe the narration, then regroup the transcript into short cues.
    segments = transcribe_audio_to_subs(voice_path)
    cues = chunk_text_by_words(segments, max_words=3)
    output_path = timestamped_filename("final_with_subs", "mp4")
    add_subtitles_to_video(final_no_subs, cues, output_path)
    return output_path

# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI – one tab per function
# ──────────────────────────────────────────────────────────────────────────────

# Top-level UI definition: one tab per pipeline step, with gr.State objects
# carrying intermediate artifacts (script text, file paths) between tabs.
demo = gr.Blocks(theme="gradio/soft")

with demo:
    gr.Markdown("# 🎬 Modular AI Video Toolkit")
    gr.Markdown(
        "Each tab exposes **one single processing step** so you can mix & match them as you like. πŸ’‘"
    )

    # Shared state across tabs
    script_state = gr.State("")          # latest generated/pasted script text
    voice_state = gr.State("")           # path to the latest narration MP3
    montage_state = gr.State("")         # path to the latest montage MP4
    final_no_subs_state = gr.State("")   # path to the latest mixed (no-subtitle) MP4

    # ───────────────────────── Script generation ─────────────────────────
    with gr.Tab("1️⃣ Generate Script"):
        with gr.Row():
            context_in = gr.Textbox(label="🧠 Context", lines=4)
            instruction_in = gr.Textbox(label="🎯 Instruction", lines=4)
        duration_slider = gr.Slider(5, 120, 1, 60, label="⏱️ Target duration (s)")
        script_mode = gr.Radio([
            "Generate script with AI",
            "Use my script",
        ], value="Generate script with AI", label="Script mode")
        # Editable only when the user picks "Use my script" (see _toggle below).
        custom_script_in = gr.Textbox(label="✍️ My script", lines=8, interactive=False)

        def _toggle(mode):
            # Enable the manual-script textbox only in "Use my script" mode.
            return gr.update(interactive=(mode == "Use my script"))

        script_mode.change(_toggle, inputs=script_mode, outputs=custom_script_in)

        gen_script_btn = gr.Button("πŸ“ Create Script", variant="primary")
        script_out = gr.Textbox(label="Script", lines=8, interactive=False)
        title_out = gr.Textbox(label="Title", lines=1, interactive=False)
        desc_out = gr.Textbox(label="Description", lines=3, interactive=False)

        # Fourth output feeds script_state so later tabs can reuse the script.
        gen_script_btn.click(
            cb_generate_script,
            [context_in, instruction_in, duration_slider, script_mode, custom_script_in],
            [script_out, title_out, desc_out, script_state],
        )

    # ───────────────────────── Voice generation ─────────────────────────
    with gr.Tab("2️⃣ Generate Voice"):
        script_in_voice = gr.Textbox(label="Script (paste or use from previous step)", lines=8)
        gen_voice_btn = gr.Button("πŸ”ˆ Synthesize Voice", variant="primary")
        voice_audio = gr.Audio(label="Generated voice", interactive=False)

        gen_voice_btn.click(
            cb_generate_voice,
            inputs=[script_in_voice],
            outputs=[voice_audio, voice_state],
        )
        # Auto‑populate script textbox with state when it updates
        script_state.change(lambda s: s, script_state, script_in_voice, queue=False)

    # ───────────────────────── Montage creation ─────────────────────────
    with gr.Tab("3️⃣ Create Montage"):
        videos_dropzone = gr.Files(label="🎞️ Background videos (MP4)", file_types=[".mp4"], type="filepath")
        videos_state = gr.State([])
        # Each upload event appends the new files into videos_state (dedup + validation in accumulate_files).
        videos_dropzone.upload(accumulate_files, [videos_dropzone, videos_state], videos_state, queue=False)
        videos_display = gr.Textbox(label="Selected videos", interactive=False)
        videos_state.change(lambda s: "\n".join(os.path.basename(f) for f in s), videos_state, videos_display, queue=False)

        with gr.Accordion("🎨 Visual settings", open=False):
            lum_slider = gr.Slider(0, 20, 6, step=0.5, label="Brightness (0–20)")
            contrast_slider = gr.Slider(0.5, 2.0, 1.0, step=0.05, label="Contrast (0.5–2.0)")
            gamma_slider = gr.Slider(0.5, 2.0, 1.0, step=0.05, label="Gamma (0.5–2.0)")
        show_bar = gr.Checkbox(label="Show progress bar", value=True)

        create_montage_btn = gr.Button("🎞️ Build Montage", variant="primary")
        montage_video = gr.Video(label="Montage Preview")

        # Uses voice_state from tab 2 — the montage length is synced to the narration.
        create_montage_btn.click(
            cb_create_montage,
            [videos_state, voice_state, lum_slider, contrast_slider, gamma_slider, show_bar],
            [montage_video, montage_state],
        )

    # ───────────────────────── Mixing (voice + music) ─────────────────────────
    with gr.Tab("4️⃣ Mix Audio / Video"):
        voice_in = gr.File(label="Narration MP3 (optional – leave empty to use state)", file_types=[".mp3"], type="filepath")
        montage_in = gr.File(label="Montage MP4 (optional – leave empty to use state)", file_types=[".mp4"], type="filepath")
        music_in = gr.File(label="Background music (MP3 – optional)", file_types=[".mp3"], type="filepath")

        def _use_state(file, state):
            # Prefer an explicitly uploaded file over the value carried in state.
            return file if file else state

        mix_btn = gr.Button("🎚️ Mix", variant="primary")
        final_no_subs_vid = gr.Video(label="Mixed video (no subtitles)")

        # Lambda resolves upload-vs-state for both inputs before calling cb_mix_audio.
        mix_btn.click(
            lambda montage, voice, music, montage_state_val, voice_state_val: cb_mix_audio(
                _use_state(montage, montage_state_val),
                _use_state(voice, voice_state_val),
                music,
            ),
            [montage_in, voice_in, music_in, montage_state, voice_state],
            [final_no_subs_vid, final_no_subs_state],
        )

    # ───────────────────────── Subtitles ─────────────────────────
    with gr.Tab("5️⃣ Add Subtitles"):
        video_in_sub = gr.File(label="Video MP4 (optional – defaults to last mixed video)", type="filepath", file_types=[".mp4"])
        voice_in_sub = gr.File(label="Narration MP3 (optional – defaults to last generated voice)", type="filepath", file_types=[".mp3"])
        add_subs_btn = gr.Button("πŸ”€ Add Subtitles", variant="primary")
        final_subs_video = gr.Video(label="Final video with subtitles")

        add_subs_btn.click(
            lambda v_in, a_in, v_state, a_state: cb_add_subtitles(
                v_in if v_in else v_state,
                a_in if a_in else a_state,
            ),
            [video_in_sub, voice_in_sub, final_no_subs_state, voice_state],
            final_subs_video,
        )

    # Startup
    # NOTE(review): launch() is called inside the `with demo:` context and at
    # module import time (no `if __name__ == "__main__":` guard) — confirm
    # this is intentional for the deployment target.
    demo.launch()