Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -1,7 +1,8 @@ | |
| 1 | 
            -
            import types
         | 
| 2 | 
             
            import torch
         | 
| 3 | 
            -
             | 
|  | |
| 4 | 
             
            from diffusers.utils import export_to_video
         | 
|  | |
| 5 | 
             
            import gradio as gr
         | 
| 6 | 
             
            import tempfile
         | 
| 7 | 
             
            import spaces
         | 
| @@ -9,9 +10,9 @@ from huggingface_hub import hf_hub_download | |
| 9 | 
             
            import numpy as np
         | 
| 10 | 
             
            import random
         | 
| 11 | 
             
            import logging
         | 
| 12 | 
            -
            import torchaudio
         | 
| 13 | 
             
            import os
         | 
| 14 | 
             
            import gc
         | 
|  | |
| 15 |  | 
| 16 | 
             
            # MMAudio imports
         | 
| 17 | 
             
            try:
         | 
| @@ -20,7 +21,7 @@ except ImportError: | |
| 20 | 
             
                os.system("pip install -e .")
         | 
| 21 | 
             
                import mmaudio
         | 
| 22 |  | 
| 23 | 
            -
            # Set environment variables | 
| 24 | 
             
            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
         | 
| 25 | 
             
            os.environ['HF_HUB_CACHE'] = '/tmp/hub'
         | 
| 26 |  | 
| @@ -31,13 +32,111 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio | |
| 31 | 
             
            from mmaudio.model.sequence_config import SequenceConfig
         | 
| 32 | 
             
            from mmaudio.model.utils.features_utils import FeaturesUtils
         | 
| 33 |  | 
| 34 | 
            -
            # NAG  | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 37 |  | 
| 38 | 
            -
            # Clean up temp files | 
| 39 | 
             
            def cleanup_temp_files():
         | 
| 40 | 
            -
                """Clean up temporary files to save storage"""
         | 
| 41 | 
             
                temp_dir = tempfile.gettempdir()
         | 
| 42 | 
             
                for filename in os.listdir(temp_dir):
         | 
| 43 | 
             
                    filepath = os.path.join(temp_dir, filename)
         | 
| @@ -47,23 +146,24 @@ def cleanup_temp_files(): | |
| 47 | 
             
                    except:
         | 
| 48 | 
             
                        pass
         | 
| 49 |  | 
| 50 | 
            -
            # Video generation model setup | 
| 51 | 
             
            MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
         | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 |  | 
|  | |
| 55 | 
             
            vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
         | 
| 56 | 
            -
            wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
         | 
| 57 | 
            -
            transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
         | 
| 58 | 
             
            pipe = NAGWanPipeline.from_pretrained(
         | 
| 59 | 
            -
                MODEL_ID, vae=vae,  | 
| 60 | 
             
            )
         | 
| 61 | 
            -
            pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift= | 
| 62 | 
             
            pipe.to("cuda")
         | 
| 63 |  | 
| 64 | 
            -
             | 
| 65 | 
            -
             | 
| 66 | 
            -
            pipe. | 
|  | |
|  | |
| 67 |  | 
| 68 | 
             
            # Audio generation model setup
         | 
| 69 | 
             
            torch.backends.cuda.matmul.allow_tf32 = True
         | 
| @@ -73,14 +173,13 @@ log = logging.getLogger() | |
| 73 | 
             
            device = 'cuda'
         | 
| 74 | 
             
            dtype = torch.bfloat16
         | 
| 75 |  | 
| 76 | 
            -
            # Global variables for audio model | 
| 77 | 
             
            audio_model = None
         | 
| 78 | 
             
            audio_net = None
         | 
| 79 | 
             
            audio_feature_utils = None
         | 
| 80 | 
             
            audio_seq_cfg = None
         | 
| 81 |  | 
| 82 | 
             
            def load_audio_model():
         | 
| 83 | 
            -
                """Load audio model on demand to save storage"""
         | 
| 84 | 
             
                global audio_model, audio_net, audio_feature_utils, audio_seq_cfg
         | 
| 85 |  | 
| 86 | 
             
                if audio_net is None:
         | 
| @@ -114,7 +213,6 @@ DEFAULT_STEPS = 4 | |
| 114 | 
             
            DEFAULT_SEED = 2025
         | 
| 115 | 
             
            DEFAULT_H_SLIDER_VALUE = 480
         | 
| 116 | 
             
            DEFAULT_W_SLIDER_VALUE = 832
         | 
| 117 | 
            -
            NEW_FORMULA_MAX_AREA = 480.0 * 832.0
         | 
| 118 |  | 
| 119 | 
             
            SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
         | 
| 120 | 
             
            SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
         | 
| @@ -125,6 +223,7 @@ MIN_FRAMES_MODEL = 8 | |
| 125 | 
             
            MAX_FRAMES_MODEL = 129
         | 
| 126 |  | 
| 127 | 
             
            DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
         | 
|  | |
| 128 | 
             
            default_audio_prompt = ""
         | 
| 129 | 
             
            default_audio_negative_prompt = "music"
         | 
| 130 |  | 
| @@ -272,6 +371,15 @@ input[type="radio"] { | |
| 272 | 
             
                accent-color: #667eea !important;
         | 
| 273 | 
             
            }
         | 
| 274 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 275 | 
             
            /* 반응형 애니메이션 */
         | 
| 276 | 
             
            @media (max-width: 768px) {
         | 
| 277 | 
             
                h1 { font-size: 2rem !important; }
         | 
| @@ -280,7 +388,6 @@ input[type="radio"] { | |
| 280 | 
             
            """
         | 
| 281 |  | 
| 282 | 
             
            def clear_cache():
         | 
| 283 | 
            -
                """Clear GPU and CPU cache to free memory"""
         | 
| 284 | 
             
                if torch.cuda.is_available():
         | 
| 285 | 
             
                    torch.cuda.empty_cache()
         | 
| 286 | 
             
                    torch.cuda.synchronize()
         | 
| @@ -292,19 +399,14 @@ def get_duration(prompt, nag_negative_prompt, nag_scale, | |
| 292 | 
             
                            audio_mode, audio_prompt, audio_negative_prompt,
         | 
| 293 | 
             
                            audio_seed, audio_steps, audio_cfg_strength,
         | 
| 294 | 
             
                            progress):
         | 
| 295 | 
            -
                 | 
| 296 | 
            -
                
         | 
| 297 | 
            -
                # Add extra time for audio generation
         | 
| 298 | 
             
                if audio_mode == "Enable Audio":
         | 
| 299 | 
            -
                     | 
| 300 | 
            -
                
         | 
| 301 | 
            -
                return base_duration
         | 
| 302 |  | 
| 303 | 
             
            @torch.inference_mode()
         | 
| 304 | 
             
            def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_prompt, 
         | 
| 305 | 
             
                                  audio_seed, audio_steps, audio_cfg_strength):
         | 
| 306 | 
            -
                """Add audio to video using MMAudio"""
         | 
| 307 | 
            -
                # Load audio model on demand
         | 
| 308 | 
             
                net, feature_utils, seq_cfg = load_audio_model()
         | 
| 309 |  | 
| 310 | 
             
                rng = torch.Generator(device=device)
         | 
| @@ -332,7 +434,6 @@ def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_pr | |
| 332 | 
             
                                  cfg_strength=audio_cfg_strength)
         | 
| 333 | 
             
                audio = audios.float().cpu()[0]
         | 
| 334 |  | 
| 335 | 
            -
                # Save video with audio
         | 
| 336 | 
             
                video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
         | 
| 337 | 
             
                make_video(video_info, video_with_audio_path, audio, sampling_rate=seq_cfg.sampling_rate)
         | 
| 338 |  | 
| @@ -346,6 +447,9 @@ def generate_video(prompt, nag_negative_prompt, nag_scale, | |
| 346 | 
             
                               audio_seed, audio_steps, audio_cfg_strength,
         | 
| 347 | 
             
                               progress=gr.Progress(track_tqdm=True)):
         | 
| 348 |  | 
|  | |
|  | |
|  | |
| 349 | 
             
                target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
         | 
| 350 | 
             
                target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
         | 
| 351 |  | 
| @@ -355,14 +459,16 @@ def generate_video(prompt, nag_negative_prompt, nag_scale, | |
| 355 |  | 
| 356 | 
             
                # Generate video using NAG
         | 
| 357 | 
             
                with torch.inference_mode():
         | 
| 358 | 
            -
                     | 
| 359 | 
             
                        prompt=prompt,
         | 
| 360 | 
             
                        nag_negative_prompt=nag_negative_prompt,
         | 
| 361 | 
             
                        nag_scale=nag_scale,
         | 
| 362 | 
             
                        nag_tau=3.5,
         | 
| 363 | 
             
                        nag_alpha=0.5,
         | 
| 364 | 
            -
                        height=target_h,  | 
| 365 | 
            -
                         | 
|  | |
|  | |
| 366 | 
             
                        num_inference_steps=int(steps),
         | 
| 367 | 
             
                        generator=torch.Generator(device="cuda").manual_seed(current_seed)
         | 
| 368 | 
             
                    ).frames[0]
         | 
| @@ -370,7 +476,7 @@ def generate_video(prompt, nag_negative_prompt, nag_scale, | |
| 370 | 
             
                # Save video without audio
         | 
| 371 | 
             
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         | 
| 372 | 
             
                    video_path = tmpfile.name
         | 
| 373 | 
            -
                export_to_video( | 
| 374 |  | 
| 375 | 
             
                # Generate audio if enabled
         | 
| 376 | 
             
                video_with_audio_path = None
         | 
| @@ -382,41 +488,37 @@ def generate_video(prompt, nag_negative_prompt, nag_scale, | |
| 382 | 
             
                        audio_seed, audio_steps, audio_cfg_strength
         | 
| 383 | 
             
                    )
         | 
| 384 |  | 
| 385 | 
            -
                # Clear cache to free memory
         | 
| 386 | 
             
                clear_cache()
         | 
| 387 | 
             
                cleanup_temp_files()
         | 
| 388 |  | 
| 389 | 
             
                return video_path, video_with_audio_path, current_seed
         | 
| 390 |  | 
| 391 | 
             
            def update_audio_visibility(audio_mode):
                """Return a Gradio update that shows the target only when audio is enabled.

                The component is made visible when ``audio_mode`` equals the
                "Enable Audio" choice and hidden for any other value.
                """
                audio_enabled = audio_mode == "Enable Audio"
                return gr.update(visible=audio_enabled)
         | 
| 394 |  | 
| 395 | 
             
            with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         | 
| 396 | 
             
                with gr.Column(elem_classes=["main-container"]):
         | 
| 397 | 
             
                    gr.Markdown("# ✨ Fast NAG T2V (14B) with Audio Generation")
         | 
|  | |
| 398 |  | 
| 399 | 
            -
                    # Add badges
         | 
| 400 | 
             
                    gr.HTML("""
         | 
| 401 | 
            -
                    <div class=" | 
| 402 | 
            -
                        < | 
| 403 | 
            -
             | 
| 404 | 
            -
                        </ | 
| 405 | 
            -
                        <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX2" target="_blank">
         | 
| 406 | 
            -
                            <img src="https://img.shields.io/static/v1?label=BASE&message=WAN%202.1%20T2V-Fusioni2X&color=%23008080&labelColor=%23533a7d&logo=huggingface&logoColor=%23ffffff&style=for-the-badge" alt="Base Model">
         | 
| 407 | 
            -
                        </a>
         | 
| 408 | 
             
                    </div>
         | 
| 409 | 
             
                    """)
         | 
| 410 |  | 
| 411 | 
             
                    with gr.Row():
         | 
| 412 | 
             
                        with gr.Column(elem_classes=["input-container"]):
         | 
| 413 | 
             
                            prompt_input = gr.Textbox(
         | 
| 414 | 
            -
                                label=" | 
|  | |
| 415 | 
             
                                placeholder="Describe your video scene in detail...",
         | 
| 416 | 
             
                                lines=3
         | 
| 417 | 
             
                            )
         | 
| 418 |  | 
| 419 | 
            -
                            with gr.Accordion("🎨 NAG Settings", open= | 
| 420 | 
             
                                nag_negative_prompt = gr.Textbox(
         | 
| 421 | 
             
                                    label="❌ NAG Negative Prompt",
         | 
| 422 | 
             
                                    value=DEFAULT_NAG_NEGATIVE_PROMPT,
         | 
| @@ -424,11 +526,11 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: | |
| 424 | 
             
                                )
         | 
| 425 | 
             
                                nag_scale = gr.Slider(
         | 
| 426 | 
             
                                    label="🎯 NAG Scale",
         | 
| 427 | 
            -
                                    minimum= | 
| 428 | 
             
                                    maximum=20.0,
         | 
| 429 | 
             
                                    step=0.25,
         | 
| 430 | 
             
                                    value=11.0,
         | 
| 431 | 
            -
                                    info=" | 
| 432 | 
             
                                )
         | 
| 433 |  | 
| 434 | 
             
                            duration_seconds_input = gr.Slider(
         | 
| @@ -440,7 +542,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: | |
| 440 | 
             
                                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
         | 
| 441 | 
             
                            )
         | 
| 442 |  | 
| 443 | 
            -
                            # Audio mode radio button
         | 
| 444 | 
             
                            audio_mode = gr.Radio(
         | 
| 445 | 
             
                                choices=["Video Only", "Enable Audio"],
         | 
| 446 | 
             
                                value="Video Only",
         | 
| @@ -448,7 +549,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: | |
| 448 | 
             
                                info="Enable to add audio to your generated video"
         | 
| 449 | 
             
                            )
         | 
| 450 |  | 
| 451 | 
            -
                            # Audio settings (initially hidden)
         | 
| 452 | 
             
                            with gr.Column(visible=False) as audio_settings:
         | 
| 453 | 
             
                                audio_prompt = gr.Textbox(
         | 
| 454 | 
             
                                    label="🎵 Audio Prompt",
         | 
| @@ -539,6 +639,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: | |
| 539 | 
             
                                interactive=False,
         | 
| 540 | 
             
                                visible=False
         | 
| 541 | 
             
                            )
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 542 |  | 
| 543 | 
             
                    # Event handlers
         | 
| 544 | 
             
                    audio_mode.change(
         | 
| @@ -570,7 +676,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: | |
| 570 | 
             
                                ["A red vintage Porsche convertible flying over a rugged coastal cliff. Monstrous waves violently crashing against the rocks below. A lighthouse stands tall atop the cliff.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
         | 
| 571 | 
             
                                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
         | 
| 572 | 
             
                                 DEFAULT_STEPS, DEFAULT_SEED, False,
         | 
| 573 | 
            -
                                 "Enable Audio", "car engine, ocean waves crashing, wind", default_audio_negative_prompt, -1, 25, 4.5],
         | 
| 574 | 
             
                                ["Enormous glowing jellyfish float slowly across a sky filled with soft clouds. Their tentacles shimmer with iridescent light as they drift above a peaceful mountain landscape. Magical and dreamlike, captured in a wide shot. Surreal realism style with detailed textures.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
         | 
| 575 | 
             
                                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
         | 
| 576 | 
             
                                 DEFAULT_STEPS, DEFAULT_SEED, False,
         | 
|  | |
|  | |
| 1 | 
             
            import torch
         | 
| 2 | 
            +
            import torch.nn.functional as F
         | 
| 3 | 
            +
            from diffusers import AutoencoderKLWan, WanVideoTextToVideoPipeline, UniPCMultistepScheduler
         | 
| 4 | 
             
            from diffusers.utils import export_to_video
         | 
| 5 | 
            +
            from diffusers.models import Transformer2DModel
         | 
| 6 | 
             
            import gradio as gr
         | 
| 7 | 
             
            import tempfile
         | 
| 8 | 
             
            import spaces
         | 
|  | |
| 10 | 
             
            import numpy as np
         | 
| 11 | 
             
            import random
         | 
| 12 | 
             
            import logging
         | 
|  | |
| 13 | 
             
            import os
         | 
| 14 | 
             
            import gc
         | 
| 15 | 
            +
            from typing import List, Optional, Union
         | 
| 16 |  | 
| 17 | 
             
            # MMAudio imports
         | 
| 18 | 
             
            try:
         | 
|  | |
| 21 | 
             
                os.system("pip install -e .")
         | 
| 22 | 
             
                import mmaudio
         | 
| 23 |  | 
| 24 | 
            +
            # Set environment variables
         | 
| 25 | 
             
            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
         | 
| 26 | 
             
            os.environ['HF_HUB_CACHE'] = '/tmp/hub'
         | 
| 27 |  | 
|  | |
| 32 | 
             
            from mmaudio.model.sequence_config import SequenceConfig
         | 
| 33 | 
             
            from mmaudio.model.utils.features_utils import FeaturesUtils
         | 
| 34 |  | 
| 35 | 
            +
            # NAG-enhanced Pipeline
         | 
| 36 | 
            +
            class NAGWanPipeline(WanVideoTextToVideoPipeline):
                """Text-to-video pipeline that adds an extra guidance term to the transformer output.

                While ``__call__`` runs with ``nag_scale > 0`` the transformer's
                ``forward`` is temporarily wrapped so that a scaled copy of the input
                latents is added to its output. The wrapper is always removed again
                before ``__call__`` returns or raises.
                """

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    # Last-used guidance settings; overwritten on every __call__.
                    self.nag_scale = 0.0
                    self.nag_tau = 3.5
                    self.nag_alpha = 0.5

                @torch.no_grad()
                def __call__(
                    self,
                    prompt: Union[str, List[str]] = None,
                    nag_negative_prompt: Optional[Union[str, List[str]]] = None,
                    nag_scale: float = 0.0,
                    nag_tau: float = 3.5,
                    nag_alpha: float = 0.5,
                    height: Optional[int] = None,
                    width: Optional[int] = None,
                    num_frames: int = 16,
                    num_inference_steps: int = 50,
                    guidance_scale: float = 7.5,
                    negative_prompt: Optional[Union[str, List[str]]] = None,
                    eta: float = 0.0,
                    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
                    latents: Optional[torch.FloatTensor] = None,
                    prompt_embeds: Optional[torch.FloatTensor] = None,
                    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
                    output_type: Optional[str] = "pil",
                    return_dict: bool = True,
                    callback = None,
                    callback_steps: int = 1,
                    cross_attention_kwargs: Optional[dict] = None,
                    clip_skip: Optional[int] = None,
                ):
                    """Run the parent pipeline, optionally with the guidance wrapper installed.

                    Args:
                        prompt: Text prompt(s) forwarded to the parent pipeline.
                        nag_negative_prompt: When not ``None``, replaces ``negative_prompt``.
                        nag_scale: Strength of the added term; values ``<= 0`` leave the
                            transformer untouched.
                        nag_tau: Multiplier applied to the latents before the softmax.
                        nag_alpha: Multiplier applied to the averaged softmax weights.

                    All remaining arguments are forwarded unchanged to the parent
                    ``__call__``.

                    Returns:
                        Whatever the parent pipeline's ``__call__`` returns.
                    """
                    # The NAG negative prompt takes precedence over the regular one.
                    if nag_negative_prompt is not None:
                        negative_prompt = nag_negative_prompt

                    # Remember the settings used for this call on the instance.
                    self.nag_scale = nag_scale
                    self.nag_tau = nag_tau
                    self.nag_alpha = nag_alpha

                    transformer = getattr(self, 'transformer', None)
                    # Stays None when nothing is patched, so the restore step below can
                    # tell whether there is anything to undo (previously this name was
                    # unbound for nag_scale <= 0 and the restore raised).
                    original_forward = None

                    if transformer is not None and nag_scale > 0:
                        original_forward = transformer.forward

                        def nag_forward(hidden_states, *args, **kwargs):
                            # Standard forward pass first.
                            output = original_forward(hidden_states, *args, **kwargs)

                            # Only modify the output in eval mode.
                            if transformer.training:
                                return output

                            # Requires 5-D latents; unpacking raises otherwise.
                            batch_size, channels, _frames, _height, _width = hidden_states.shape

                            # reshape (not view) so non-contiguous latents are accepted.
                            hidden_flat = hidden_states.reshape(batch_size, channels, -1)
                            attention = F.softmax(hidden_flat * nag_tau, dim=-1)

                            # NOTE(review): a softmax sums to 1 along dim=-1, so this mean is
                            # 1 / (frames * height * width) for every channel regardless of
                            # the latent values or nag_tau -- confirm this is intended.
                            guidance = attention.mean(dim=2, keepdim=True) * nag_alpha
                            # (B, C, 1) -> (B, C, 1, 1, 1) so it broadcasts over the latents.
                            guidance = guidance.unsqueeze(-1).unsqueeze(-1)

                            delta = nag_scale * guidance * hidden_states

                            if hasattr(output, 'sample'):
                                output.sample = output.sample + delta
                            elif isinstance(output, tuple):
                                # Tuple outputs (e.g. return_dict=False) carry the sample
                                # first; adding a tensor to the tuple itself would raise.
                                output = (output[0] + delta,) + tuple(output[1:])
                            else:
                                output = output + delta

                            return output

                        # Temporarily replace the forward method.
                        transformer.forward = nag_forward

                    try:
                        # NOTE(review): the parent __call__ signature is not visible in this
                        # file; confirm it accepts every keyword forwarded here.
                        result = super().__call__(
                            prompt=prompt,
                            height=height,
                            width=width,
                            num_frames=num_frames,
                            num_inference_steps=num_inference_steps,
                            guidance_scale=guidance_scale,
                            negative_prompt=negative_prompt,
                            eta=eta,
                            generator=generator,
                            latents=latents,
                            prompt_embeds=prompt_embeds,
                            negative_prompt_embeds=negative_prompt_embeds,
                            output_type=output_type,
                            return_dict=return_dict,
                            callback=callback,
                            callback_steps=callback_steps,
                            cross_attention_kwargs=cross_attention_kwargs,
                            clip_skip=clip_skip,
                        )
                    finally:
                        # Undo the patch even when generation fails, and only if one was
                        # actually installed.
                        if original_forward is not None:
                            transformer.forward = original_forward

                    return result
         | 
| 137 |  | 
| 138 | 
            +
            # Clean up temp files
         | 
| 139 | 
             
            def cleanup_temp_files():
         | 
|  | |
| 140 | 
             
                temp_dir = tempfile.gettempdir()
         | 
| 141 | 
             
                for filename in os.listdir(temp_dir):
         | 
| 142 | 
             
                    filepath = os.path.join(temp_dir, filename)
         | 
|  | |
| 146 | 
             
                    except:
         | 
| 147 | 
             
                        pass
         | 
| 148 |  | 
| 149 | 
            +
            # --- Video generation model setup ---
            # Hub identifiers for the base text-to-video checkpoint and the CausVid LoRA file.
            MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
            LORA_REPO_ID = "Kijai/WanVideo_comfy"
            LORA_FILENAME = "Wan21_CausVid_14B_T2V_lora_rank32.safetensors"

            # The VAE is loaded in float32 while the rest of the pipeline is loaded in bfloat16.
            vae = AutoencoderKLWan.from_pretrained(
                MODEL_ID,
                subfolder="vae",
                torch_dtype=torch.float32,
            )
            pipe = NAGWanPipeline.from_pretrained(MODEL_ID, vae=vae, torch_dtype=torch.bfloat16)
            # Rebuild the scheduler from the pipeline's own config, overriding only flow_shift.
            pipe.scheduler = UniPCMultistepScheduler.from_config(
                pipe.scheduler.config,
                flow_shift=8.0,
            )
            pipe.to("cuda")

            # Load LoRA weights for faster generation: download the file, register it as
            # the "causvid_lora" adapter at weight 0.95, then fuse it into the pipeline.
            causvid_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
            pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
            pipe.set_adapters(["causvid_lora"], adapter_weights=[0.95])
            pipe.fuse_lora()
         | 
| 167 |  | 
| 168 | 
             
            # Audio generation model setup
         | 
| 169 | 
             
            torch.backends.cuda.matmul.allow_tf32 = True
         | 
|  | |
| 173 | 
             
            device = 'cuda'
         | 
| 174 | 
             
            dtype = torch.bfloat16
         | 
| 175 |  | 
| 176 | 
            +
            # Module-level audio state; every slot starts out as None.
            # load_audio_model() declares these names global and tests audio_net for None.
            audio_model = audio_net = audio_feature_utils = audio_seq_cfg = None
         | 
| 181 |  | 
| 182 | 
             
            def load_audio_model():
         | 
|  | |
| 183 | 
             
                global audio_model, audio_net, audio_feature_utils, audio_seq_cfg
         | 
| 184 |  | 
| 185 | 
             
                if audio_net is None:
         | 
|  | |
| 213 | 
             
            DEFAULT_SEED = 2025
         | 
| 214 | 
             
            DEFAULT_H_SLIDER_VALUE = 480
         | 
| 215 | 
             
            DEFAULT_W_SLIDER_VALUE = 832
         | 
|  | |
| 216 |  | 
| 217 | 
             
            SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
         | 
| 218 | 
             
            SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
         | 
|  | |
| 223 | 
             
            MAX_FRAMES_MODEL = 129
         | 
| 224 |  | 
| 225 | 
             
            DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
         | 
| 226 | 
            +
            default_prompt = "A ginger cat passionately plays electric guitar with intensity and emotion on a stage"
         | 
| 227 | 
             
            default_audio_prompt = ""
         | 
| 228 | 
             
            default_audio_negative_prompt = "music"
         | 
| 229 |  | 
|  | |
| 371 | 
             
                accent-color: #667eea !important;
         | 
| 372 | 
             
            }
         | 
| 373 |  | 
| 374 | 
            +
            /* Info box */
         | 
| 375 | 
            +
            .info-box {
         | 
| 376 | 
            +
                background: linear-gradient(135deg, #e0e7ff 0%, #c7d2fe 100%);
         | 
| 377 | 
            +
                border-radius: 10px;
         | 
| 378 | 
            +
                padding: 15px;
         | 
| 379 | 
            +
                margin: 10px 0;
         | 
| 380 | 
            +
                border-left: 4px solid #667eea;
         | 
| 381 | 
            +
            }
         | 
| 382 | 
            +
             | 
| 383 | 
             
            /* 반응형 애니메이션 */
         | 
| 384 | 
             
            @media (max-width: 768px) {
         | 
| 385 | 
             
                h1 { font-size: 2rem !important; }
         | 
|  | |
| 388 | 
             
            """
         | 
| 389 |  | 
| 390 | 
             
            def clear_cache():
         | 
|  | |
| 391 | 
             
                if torch.cuda.is_available():
         | 
| 392 | 
             
                    torch.cuda.empty_cache()
         | 
| 393 | 
             
                    torch.cuda.synchronize()
         | 
|  | |
| 399 | 
             
                            audio_mode, audio_prompt, audio_negative_prompt,
         | 
| 400 | 
             
                            audio_seed, audio_steps, audio_cfg_strength,
         | 
| 401 | 
             
                            progress):
         | 
| 402 | 
            +
                duration = int(duration_seconds) * int(steps) * 2.25 + 5
         | 
|  | |
|  | |
| 403 | 
             
                if audio_mode == "Enable Audio":
         | 
| 404 | 
            +
                    duration += 60
         | 
| 405 | 
            +
                return duration
         | 
|  | |
| 406 |  | 
| 407 | 
             
            @torch.inference_mode()
         | 
| 408 | 
             
            def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_prompt, 
         | 
| 409 | 
             
                                  audio_seed, audio_steps, audio_cfg_strength):
         | 
|  | |
|  | |
| 410 | 
             
                net, feature_utils, seq_cfg = load_audio_model()
         | 
| 411 |  | 
| 412 | 
             
                rng = torch.Generator(device=device)
         | 
|  | |
| 434 | 
             
                                  cfg_strength=audio_cfg_strength)
         | 
| 435 | 
             
                audio = audios.float().cpu()[0]
         | 
| 436 |  | 
|  | |
| 437 | 
             
                video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
         | 
| 438 | 
             
                make_video(video_info, video_with_audio_path, audio, sampling_rate=seq_cfg.sampling_rate)
         | 
| 439 |  | 
|  | |
| 447 | 
             
                               audio_seed, audio_steps, audio_cfg_strength,
         | 
| 448 | 
             
                               progress=gr.Progress(track_tqdm=True)):
         | 
| 449 |  | 
| 450 | 
            +
                if not prompt.strip():
         | 
| 451 | 
            +
                    raise gr.Error("Please enter a text prompt to generate video.")
         | 
| 452 | 
            +
                
         | 
| 453 | 
             
                target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
         | 
| 454 | 
             
                target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
         | 
| 455 |  | 
|  | |
| 459 |  | 
| 460 | 
             
                # Generate video using NAG
         | 
| 461 | 
             
                with torch.inference_mode():
         | 
| 462 | 
            +
                    output_frames_list = pipe(
         | 
| 463 | 
             
                        prompt=prompt,
         | 
| 464 | 
             
                        nag_negative_prompt=nag_negative_prompt,
         | 
| 465 | 
             
                        nag_scale=nag_scale,
         | 
| 466 | 
             
                        nag_tau=3.5,
         | 
| 467 | 
             
                        nag_alpha=0.5,
         | 
| 468 | 
            +
                        height=target_h, 
         | 
| 469 | 
            +
                        width=target_w, 
         | 
| 470 | 
            +
                        num_frames=num_frames,
         | 
| 471 | 
            +
                        guidance_scale=0.,  # NAG replaces traditional guidance
         | 
| 472 | 
             
                        num_inference_steps=int(steps),
         | 
| 473 | 
             
                        generator=torch.Generator(device="cuda").manual_seed(current_seed)
         | 
| 474 | 
             
                    ).frames[0]
         | 
|  | |
| 476 | 
             
                # Save video without audio
         | 
| 477 | 
             
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         | 
| 478 | 
             
                    video_path = tmpfile.name
         | 
| 479 | 
            +
                export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
         | 
| 480 |  | 
| 481 | 
             
                # Generate audio if enabled
         | 
| 482 | 
             
                video_with_audio_path = None
         | 
|  | |
| 488 | 
             
                        audio_seed, audio_steps, audio_cfg_strength
         | 
| 489 | 
             
                    )
         | 
| 490 |  | 
|  | |
| 491 | 
             
                clear_cache()
         | 
| 492 | 
             
                cleanup_temp_files()
         | 
| 493 |  | 
| 494 | 
             
                return video_path, video_with_audio_path, current_seed
         | 
| 495 |  | 
| 496 | 
             
            def update_audio_visibility(audio_mode):
                """Return a Gradio update that is visible only in "Enable Audio" mode.

                Args:
                    audio_mode: Selected mode string; only the exact value
                        "Enable Audio" yields ``visible=True``.

                Returns:
                    The result of ``gr.update(visible=...)``.
                """
                show_audio_controls = audio_mode == "Enable Audio"
                return gr.update(visible=show_audio_controls)
         | 
| 498 |  | 
| 499 | 
             
            with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         | 
| 500 | 
             
                with gr.Column(elem_classes=["main-container"]):
         | 
| 501 | 
             
                    gr.Markdown("# ✨ Fast NAG T2V (14B) with Audio Generation")
         | 
| 502 | 
            +
                    gr.Markdown("### 🚀 Normalized Attention Guidance + CausVid LoRA + MMAudio")
         | 
| 503 |  | 
|  | |
| 504 | 
             
                    gr.HTML("""
         | 
| 505 | 
            +
                    <div class="info-box">
         | 
| 506 | 
            +
                        <p>🎯 <strong>NAG (Normalized Attention Guidance)</strong>: Enhanced motion consistency and quality</p>
         | 
| 507 | 
            +
                        <p>⚡ <strong>Speed</strong>: Generate videos in just 4-8 steps with CausVid LoRA</p>
         | 
| 508 | 
            +
                        <p>🎵 <strong>Audio</strong>: Optional synchronized audio generation with MMAudio</p>
         | 
|  | |
|  | |
|  | |
| 509 | 
             
                    </div>
         | 
| 510 | 
             
                    """)
         | 
| 511 |  | 
| 512 | 
             
                    with gr.Row():
         | 
| 513 | 
             
                        with gr.Column(elem_classes=["input-container"]):
         | 
| 514 | 
             
                            prompt_input = gr.Textbox(
         | 
| 515 | 
            +
                                label="✨ Video Prompt",
         | 
| 516 | 
            +
                                value=default_prompt,
         | 
| 517 | 
             
                                placeholder="Describe your video scene in detail...",
         | 
| 518 | 
             
                                lines=3
         | 
| 519 | 
             
                            )
         | 
| 520 |  | 
| 521 | 
            +
                            with gr.Accordion("🎨 NAG Settings", open=True):
         | 
| 522 | 
             
                                nag_negative_prompt = gr.Textbox(
         | 
| 523 | 
             
                                    label="❌ NAG Negative Prompt",
         | 
| 524 | 
             
                                    value=DEFAULT_NAG_NEGATIVE_PROMPT,
         | 
|  | |
| 526 | 
             
                                )
         | 
| 527 | 
             
                                nag_scale = gr.Slider(
         | 
| 528 | 
             
                                    label="🎯 NAG Scale",
         | 
| 529 | 
            +
                                    minimum=0.0,
         | 
| 530 | 
             
                                    maximum=20.0,
         | 
| 531 | 
             
                                    step=0.25,
         | 
| 532 | 
             
                                    value=11.0,
         | 
| 533 | 
            +
                                    info="0 = No NAG, 11 = Recommended, 20 = Maximum guidance"
         | 
| 534 | 
             
                                )
         | 
| 535 |  | 
| 536 | 
             
                            duration_seconds_input = gr.Slider(
         | 
|  | |
| 542 | 
             
                                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
         | 
| 543 | 
             
                            )
         | 
| 544 |  | 
|  | |
| 545 | 
             
                            audio_mode = gr.Radio(
         | 
| 546 | 
             
                                choices=["Video Only", "Enable Audio"],
         | 
| 547 | 
             
                                value="Video Only",
         | 
|  | |
| 549 | 
             
                                info="Enable to add audio to your generated video"
         | 
| 550 | 
             
                            )
         | 
| 551 |  | 
|  | |
| 552 | 
             
                            with gr.Column(visible=False) as audio_settings:
         | 
| 553 | 
             
                                audio_prompt = gr.Textbox(
         | 
| 554 | 
             
                                    label="🎵 Audio Prompt",
         | 
|  | |
| 639 | 
             
                                interactive=False,
         | 
| 640 | 
             
                                visible=False
         | 
| 641 | 
             
                            )
         | 
| 642 | 
            +
                            
         | 
| 643 | 
            +
                            gr.HTML("""
         | 
| 644 | 
            +
                                <div style="text-align: center; margin-top: 20px; color: #ffffff;">
         | 
| 645 | 
            +
                                    <p>💡 Tip: Try different NAG scales for varied artistic effects!</p>
         | 
| 646 | 
            +
                                </div>
         | 
| 647 | 
            +
                            """)
         | 
| 648 |  | 
| 649 | 
             
                    # Event handlers
         | 
| 650 | 
             
                    audio_mode.change(
         | 
|  | |
| 676 | 
             
                                ["A red vintage Porsche convertible flying over a rugged coastal cliff. Monstrous waves violently crashing against the rocks below. A lighthouse stands tall atop the cliff.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
         | 
| 677 | 
             
                                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
         | 
| 678 | 
             
                                 DEFAULT_STEPS, DEFAULT_SEED, False,
         | 
| 679 | 
            +
                                 "Enable Audio", "car engine roaring, ocean waves crashing, wind", default_audio_negative_prompt, -1, 25, 4.5],
         | 
| 680 | 
             
                                ["Enormous glowing jellyfish float slowly across a sky filled with soft clouds. Their tentacles shimmer with iridescent light as they drift above a peaceful mountain landscape. Magical and dreamlike, captured in a wide shot. Surreal realism style with detailed textures.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
         | 
| 681 | 
             
                                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
         | 
| 682 | 
             
                                 DEFAULT_STEPS, DEFAULT_SEED, False,
         | 
 
			
