FramePack_rotate_landscape

Running

File size: 44,977 Bytes

493eaa3
 
 
 
 
 
 
 
 
 
 
 
50f328c
438653e
 
 
 
 
 
 
6dfa0f3
438653e
493eaa3
6dfa0f3
438653e
 
 
 
6dfa0f3
438653e
 
 
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
6dfa0f3
493eaa3
 
6dfa0f3
 
438653e
6dfa0f3
438653e
 
493eaa3
 
6dfa0f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493eaa3
 
6dfa0f3
 
 
 
 
 
 
 
 
 
 
493eaa3
 
 
 
438653e
 
 
6dfa0f3
493eaa3
 
 
 
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
6dfa0f3
493eaa3
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
6dfa0f3
 
493eaa3
6dfa0f3
438653e
493eaa3
6dfa0f3
 
438653e
 
6dfa0f3
 
 
438653e
493eaa3
438653e
6dfa0f3
438653e
493eaa3
6dfa0f3
 
438653e
6dfa0f3
 
 
438653e
 
493eaa3
6dfa0f3
493eaa3
6dfa0f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493eaa3
6dfa0f3
 
493eaa3
 
 
 
6dfa0f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493eaa3
6dfa0f3
 
 
 
 
 
493eaa3
 
 
 
 
 
 
 
 
 
438653e
 
6dfa0f3
493eaa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438653e
493eaa3
 
 
 
 
 
 
 
 
6dfa0f3
493eaa3
6dfa0f3
 
438653e
6dfa0f3
 
438653e
 
 
 
 
 
 
 
 
493eaa3
438653e
6dfa0f3
 
438653e
 
6dfa0f3
438653e
 
 
 
 
6dfa0f3
493eaa3
 
 
 
 
 
 
 
 
 
6dfa0f3
493eaa3
 
 
 
6dfa0f3
493eaa3
 
6dfa0f3
 
 
493eaa3
 
438653e
6dfa0f3
 
 
 
493eaa3
6dfa0f3
438653e
493eaa3
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
 
493eaa3
 
 
 
 
 
 
6dfa0f3
 
 
493eaa3
6dfa0f3
 
 
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
 
493eaa3
 
 
438653e
 
493eaa3
6dfa0f3
84eafe0
 
 
 
 
 
 
 
6dfa0f3
 
84eafe0
 
 
 
 
6dfa0f3
 
84eafe0
 
 
6dfa0f3
84eafe0
 
 
493eaa3
 
438653e
6dfa0f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493eaa3
438653e
6dfa0f3
 
5fc911a
493eaa3
 
6dfa0f3
 
 
 
 
493eaa3
 
 
6dfa0f3
 
 
 
 
 
 
 
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
438653e
 
6dfa0f3
 
 
 
 
438653e
6dfa0f3
438653e
6dfa0f3
 
 
493eaa3
 
 
 
 
438653e
493eaa3
438653e
 
6dfa0f3
438653e
493eaa3
 
 
 
 
 
 
6dfa0f3
438653e
6dfa0f3
493eaa3
6dfa0f3
438653e
493eaa3
 
 
 
438653e
493eaa3
 
 
 
 
 
 
 
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
 
6dfa0f3
84eafe0
6dfa0f3
493eaa3
6dfa0f3
 
 
84eafe0
6dfa0f3
 
493eaa3
 
 
 
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
6dfa0f3
438653e
493eaa3
 
 
 
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
 
 
 
6dfa0f3
ffb7037
493eaa3
 
 
 
 
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
 
 
 
 
 
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
6dfa0f3
493eaa3
 
 
5354773
493eaa3
6dfa0f3
 
 
 
493eaa3
 
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
493eaa3
 
 
 
 
6dfa0f3
 
 
493eaa3
6dfa0f3
493eaa3
 
 
6dfa0f3
438653e
493eaa3
 
6dfa0f3
 
 
 
 
 
 
 
 
493eaa3
 
6dfa0f3
 
493eaa3
6dfa0f3
 
493eaa3
 
 
 
 
 
 
 
6dfa0f3
 
 
493eaa3
6dfa0f3
493eaa3
 
 
 
 
6dfa0f3
493eaa3
 
 
 
 
 
 
 
6dfa0f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493eaa3
6dfa0f3
 
493eaa3
438653e
493eaa3
6dfa0f3
493eaa3
 
 
 
6dfa0f3
 
493eaa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6dfa0f3
 
 
 
 
493eaa3
 
6dfa0f3
493eaa3
 
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
 
 
6dfa0f3
493eaa3
 
 
6dfa0f3
 
493eaa3
6dfa0f3
 
493eaa3
 
438653e
 
493eaa3
 
 
 
 
6dfa0f3
 
493eaa3
 
 
6dfa0f3
493eaa3
 
 
 
 
6dfa0f3
 
 
493eaa3
 
6dfa0f3
438653e
493eaa3
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
 
 
438653e
493eaa3
 
 
 
 
 
 
 
 
 
79a9771
493eaa3
 
 
6dfa0f3
493eaa3
 
 
6dfa0f3
 
493eaa3
 
 
 
438653e
6dfa0f3
438653e
493eaa3
 
 
 
 
6dfa0f3
 
493eaa3
 
6dfa0f3
 
493eaa3
6dfa0f3
493eaa3
438653e
 
6dfa0f3
493eaa3
 
47a17d8
493eaa3
6dfa0f3
438653e
6dfa0f3
47a17d8
 
 
 
 
 
 
493eaa3
 
 
6dfa0f3
 
 
 
 
 
493eaa3
 
 
 
 
 
 
 
 
 
 
 
6dfa0f3
493eaa3
 
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
 
 
3fd62c7
493eaa3
 
 
 
6dfa0f3
 
 
493eaa3
3fd62c7
493eaa3
3fd62c7
493eaa3
 
6dfa0f3
493eaa3
3fd62c7
493eaa3
 
 
47a17d8
493eaa3
6dfa0f3
438653e
47a17d8
 
 
 
 
 
 
493eaa3
 
 
6dfa0f3
 
 
 
 
 
493eaa3
 
 
 
 
 
 
 
 
 
 
 
6dfa0f3
493eaa3
 
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
 
 
3fd62c7
493eaa3
 
 
 
6dfa0f3
 
 
493eaa3
3fd62c7
493eaa3
3fd62c7
493eaa3
 
6dfa0f3
493eaa3
3fd62c7
6dfa0f3
438653e
493eaa3
6dfa0f3
 
 
 
 
493eaa3
 
 
6dfa0f3
493eaa3
6dfa0f3
493eaa3
 
6dfa0f3
493eaa3
6dfa0f3
493eaa3
6dfa0f3
493eaa3
438653e
 
493eaa3
6dfa0f3
493eaa3
438653e
 
6dfa0f3
 
 
 
 
 
493eaa3
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
 
 
 
 
6dfa0f3
 
493eaa3
 
 
 
6dfa0f3
493eaa3
6dfa0f3
 
 
493eaa3
6dfa0f3
 
 
493eaa3
6dfa0f3
 
493eaa3
6dfa0f3
 
 
 
493eaa3
 
6dfa0f3
 
493eaa3
 
6dfa0f3
 
 
493eaa3
 
 
 
 
6dfa0f3
493eaa3
 
6dfa0f3
 
 
 
 
 
493eaa3
6dfa0f3
 
 
 
 
 
 
 
 
 
493eaa3
438653e
6dfa0f3
493eaa3
438653e
6dfa0f3
438653e
 
6dfa0f3
493eaa3
 
6dfa0f3
438653e
6dfa0f3
 
 
438653e
 
6dfa0f3
438653e
6dfa0f3
 
2a8ac85
6dfa0f3
493eaa3
6dfa0f3
 
 
438653e
2a8ac85
6dfa0f3
 
 
 
 
 
 
493eaa3
 
 
6dfa0f3
 
493eaa3
 
 
6dfa0f3
 
493eaa3
 
 
6dfa0f3
 
 
 
 
 
 
 
 
 
 
493eaa3
6dfa0f3
493eaa3
6dfa0f3
 
493eaa3
 
 
 
6dfa0f3
 
493eaa3
6dfa0f3
 
 
493eaa3
6dfa0f3
 
 
 
 
8498895
493eaa3
6dfa0f3
 
 
493eaa3
 
6dfa0f3
cc74d7f
6dfa0f3
 
 
 
 
493eaa3
 
6dfa0f3

from diffusers_helper.hf_login import login

import os
import threading
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json

# Point HF_HOME at the 'hf_download' directory that sits next to this file
# (resolved to an absolute, symlink-free path).
# NOTE(review): presumably this must run before the Hugging Face libraries
# below are imported so that they pick up the location -- confirm.
os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))

import gradio as gr
import torch
import traceback
import einops
import safetensors.torch as sf
import numpy as np
import math

# Check if running in Hugging Face Space (the SPACE_ID environment variable is set there)
IN_HF_SPACE = os.environ.get('SPACE_ID') is not None

# Track GPU availability
GPU_AVAILABLE = False      # becomes True once a CUDA device has been detected
GPU_INITIALIZED = False    # set to True by the model loaders after a successful load
last_update_time = time.time()  # timestamp of the most recent progress update

# If running in a HF Space, import spaces
if IN_HF_SPACE:
    try:
        import spaces
        print("Running inside a Hugging Face Space, 'spaces' module imported.")
        
        try:
            GPU_AVAILABLE = torch.cuda.is_available()
            print(f"GPU available: {GPU_AVAILABLE}")
            if GPU_AVAILABLE:
                print(f"GPU device name: {torch.cuda.get_device_name(0)}")
                print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")
                
                # Small GPU operation test
                test_tensor = torch.zeros(1, device='cuda') + 1
                del test_tensor
                print("GPU test operation succeeded.")
            else:
                # This branch runs when torch reports that CUDA is NOT available.
                print("Warning: CUDA is not available; no GPU device was detected.")
        except Exception as e:
            # Any failure while probing the device is treated as "no GPU".
            GPU_AVAILABLE = False
            print(f"Error checking GPU: {e}")
            print("Falling back to CPU mode.")
    except ImportError:
        print("Could not import 'spaces' module. Possibly not in a HF Space.")
        GPU_AVAILABLE = torch.cuda.is_available()
else:
    # Outside a Space the flag must still reflect the real hardware; otherwise
    # it stays False and every local run is forced into CPU fallback mode.
    GPU_AVAILABLE = torch.cuda.is_available()

from PIL import Image
from diffusers import AutoencoderKLHunyuanVideo
from transformers import (
    LlamaModel, 
    CLIPTextModel, 
    LlamaTokenizerFast, 
    CLIPTokenizer, 
    SiglipImageProcessor, 
    SiglipVisionModel
)
from diffusers_helper.hunyuan import (
    encode_prompt_conds, 
    vae_decode, 
    vae_encode, 
    vae_decode_fake
)
from diffusers_helper.utils import (
    save_bcthw_as_mp4, 
    crop_or_pad_yield_mask, 
    soft_append_bcthw, 
    resize_and_center_crop, 
    generate_timestamp
)
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
from diffusers_helper.memory import (
    cpu,
    gpu,
    get_cuda_free_memory_gb,
    move_model_to_device_with_memory_preservation,
    offload_model_from_device_for_memory_preservation,
    fake_diffusers_current_device,
    DynamicSwapInstaller,
    unload_complete_models,
    load_model_as_complete
)
from diffusers_helper.thread_utils import AsyncStream, async_run
from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
from diffusers_helper.clip_vision import hf_clip_vision_encode

# Directory that receives every image/video file written by this app.
outputs_folder = './outputs/'
os.makedirs(outputs_folder, exist_ok=True)

# Estimate the GPU memory budget and decide whether the high-VRAM path applies.
if IN_HF_SPACE:
    print("Using default memory settings in a HF Space.")
    # Start from the no-GPU defaults; they are replaced only when a GPU is usable.
    free_mem_gb = 6.0
    high_vram = False
    try:
        if GPU_AVAILABLE:
            # Budget 90% of the device's total memory.
            free_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 * 0.9
            high_vram = free_mem_gb > 10
    except Exception as e:
        print(f"Error retrieving GPU memory: {e}")
        free_mem_gb = 6.0
        high_vram = False
    print(f'GPU mem: {free_mem_gb:.2f} GB, high_vram={high_vram}')
else:
    # Manage GPU memory if not in HF Space: query the free VRAM directly.
    free_mem_gb = 6.0
    try:
        if torch.cuda.is_available():
            free_mem_gb = get_cuda_free_memory_gb(gpu)
            print(f'Free VRAM: {free_mem_gb} GB')
        else:
            print("CUDA not available, using default memory setting.")
    except Exception as e:
        free_mem_gb = 6.0
        print(f"Error getting CUDA memory: {e}, using default=6GB")
    high_vram = free_mem_gb > 60
    print(f'High-VRAM mode: {high_vram}')

# Registry of loaded pipeline components; stays empty until a load succeeds.
models = {}
# Start in CPU fallback mode whenever no GPU was detected.
cpu_fallback_mode = not GPU_AVAILABLE


def _load_pipeline_components(dtype, transformer_dtype):
    """
    Instantiate every pipeline component from its pretrained checkpoint on the CPU.

    Args:
        dtype: torch dtype passed as ``torch_dtype`` when loading the two text
            encoders, the VAE and the image encoder.
        transformer_dtype: torch dtype passed as ``torch_dtype`` when loading
            the transformer.

    Returns:
        dict with the keys 'text_encoder', 'text_encoder_2', 'tokenizer',
        'tokenizer_2', 'vae', 'feature_extractor', 'image_encoder' and
        'transformer'.

    Raises:
        Whatever the underlying ``from_pretrained`` / ``.to`` calls raise;
        nothing is caught here so the caller can decide how to recover.
    """
    hunyuan_repo = "hunyuanvideo-community/HunyuanVideo"
    redux_repo = "lllyasviel/flux_redux_bfl"

    text_encoder = LlamaModel.from_pretrained(
        hunyuan_repo,
        subfolder='text_encoder',
        torch_dtype=dtype
    ).to('cpu')
    text_encoder_2 = CLIPTextModel.from_pretrained(
        hunyuan_repo,
        subfolder='text_encoder_2',
        torch_dtype=dtype
    ).to('cpu')
    tokenizer = LlamaTokenizerFast.from_pretrained(
        hunyuan_repo,
        subfolder='tokenizer'
    )
    tokenizer_2 = CLIPTokenizer.from_pretrained(
        hunyuan_repo,
        subfolder='tokenizer_2'
    )
    vae = AutoencoderKLHunyuanVideo.from_pretrained(
        hunyuan_repo,
        subfolder='vae',
        torch_dtype=dtype
    ).to('cpu')

    feature_extractor = SiglipImageProcessor.from_pretrained(
        redux_repo,
        subfolder='feature_extractor'
    )
    image_encoder = SiglipVisionModel.from_pretrained(
        redux_repo,
        subfolder='image_encoder',
        torch_dtype=dtype
    ).to('cpu')

    # Use a custom rotating-landscape model (for example)
    transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained(
        "tori29umai/FramePackI2V_HY_rotate_landscape",
        torch_dtype=transformer_dtype
    ).to('cpu')

    return {
        'text_encoder': text_encoder,
        'text_encoder_2': text_encoder_2,
        'tokenizer': tokenizer,
        'tokenizer_2': tokenizer_2,
        'vae': vae,
        'feature_extractor': feature_extractor,
        'image_encoder': image_encoder,
        'transformer': transformer
    }


def load_models():
    """
    Load the entire pipeline models (VAE, text encoders, image encoder, transformer).

    The components are first loaded with reduced-precision dtypes when the GPU
    is in use; if that attempt raises, loading is retried once in float32 and
    ``cpu_fallback_mode`` is switched on. Afterwards the modules are put in
    eval mode, gradients are disabled, and (GPU mode only) they are either
    wrapped by ``DynamicSwapInstaller`` or moved to the device, depending on
    ``high_vram``.

    Side effects:
        On success the module-level ``models`` dict is updated and
        ``GPU_INITIALIZED`` is set to True. ``cpu_fallback_mode`` is set to
        True whenever loading or device placement fails.

    Returns:
        The module-level ``models`` dict on success (also when the models were
        already loaded by an earlier call), or an empty dict if an unexpected
        error occurred. Exceptions are logged, never propagated.
    """
    global models, cpu_fallback_mode, GPU_INITIALIZED
    
    if GPU_INITIALIZED:
        print("Models are already loaded. Skipping duplicate loading.")
        return models
    
    print("Starting model load...")

    try:
        # Derive device AND dtypes from the same condition, so that a call made
        # after CPU fallback was forced does not load fp16/bf16 weights for CPU use.
        use_gpu = GPU_AVAILABLE and not cpu_fallback_mode
        device = 'cuda' if use_gpu else 'cpu'

        dtype = torch.float16 if use_gpu else torch.float32
        transformer_dtype = torch.bfloat16 if use_gpu else torch.float32

        print(f"Device: {device}, VAE/encoders dtype={dtype}, transformer dtype={transformer_dtype}")

        try:
            components = _load_pipeline_components(dtype, transformer_dtype)
            print("All models loaded successfully.")
        except Exception as e:
            print(f"Error loading models: {e}")
            print("Retry with float32 on CPU.")
            dtype = torch.float32
            transformer_dtype = torch.float32
            cpu_fallback_mode = True

            components = _load_pipeline_components(dtype, transformer_dtype)
            print("Models loaded in CPU-only fallback mode.")

        text_encoder = components['text_encoder']
        text_encoder_2 = components['text_encoder_2']
        vae = components['vae']
        image_encoder = components['image_encoder']
        transformer = components['transformer']

        vae.eval()
        text_encoder.eval()
        text_encoder_2.eval()
        image_encoder.eval()
        transformer.eval()

        # Sliced/tiled VAE processing is enabled whenever memory is considered tight.
        if not high_vram or cpu_fallback_mode:
            vae.enable_slicing()
            vae.enable_tiling()

        transformer.high_quality_fp32_output_for_inference = True
        print("transformer.high_quality_fp32_output_for_inference = True")

        if not cpu_fallback_mode:
            transformer.to(dtype=transformer_dtype)
            vae.to(dtype=dtype)
            image_encoder.to(dtype=dtype)
            text_encoder.to(dtype=dtype)
            text_encoder_2.to(dtype=dtype)

        vae.requires_grad_(False)
        text_encoder.requires_grad_(False)
        text_encoder_2.requires_grad_(False)
        image_encoder.requires_grad_(False)
        transformer.requires_grad_(False)

        if torch.cuda.is_available() and not cpu_fallback_mode:
            try:
                if not high_vram:
                    # Low-VRAM path: only the transformer and the first text
                    # encoder are handed to DynamicSwapInstaller.
                    DynamicSwapInstaller.install_model(transformer, device=device)
                    DynamicSwapInstaller.install_model(text_encoder, device=device)
                else:
                    text_encoder.to(device)
                    text_encoder_2.to(device)
                    image_encoder.to(device)
                    vae.to(device)
                    transformer.to(device)
                print(f"Successfully moved models to {device}")
            except Exception as e:
                print(f"Error moving models to {device}: {e}")
                print("Falling back to CPU.")
                cpu_fallback_mode = True

        GPU_INITIALIZED = True
        models.update(components)
        print(f"Model load complete. Mode: {'CPU' if cpu_fallback_mode else 'GPU'}")
        return models
    except Exception as e:
        print(f"Unexpected error in load_models(): {e}")
        traceback.print_exc()
        cpu_fallback_mode = True
        return {}


# Use GPU decorator if in HF Space
if IN_HF_SPACE and 'spaces' in globals() and GPU_AVAILABLE:
    try:
        @spaces.GPU
        def initialize_models():
            """
            Load the models from inside a @spaces.GPU-decorated call.

            Returns the dict produced by load_models(). If an exception
            escapes the first attempt, CPU fallback mode is switched on and
            loading is retried once.
            """
            global GPU_INITIALIZED, cpu_fallback_mode
            try:
                result = load_models()
                # load_models() reports failure by returning an empty dict
                # rather than raising, so only flag success when components
                # were actually returned; otherwise later calls would
                # short-circuit on an empty registry.
                if result:
                    GPU_INITIALIZED = True
                return result
            except Exception as e:
                print(f"Error in @spaces.GPU model init: {e}")
                cpu_fallback_mode = True
                return load_models()
    except Exception as e:
        # Applying the decorator itself failed; use the plain loader instead.
        print(f"Error creating spaces.GPU decorator: {e}")
        def initialize_models():
            """Load the models directly (decorator could not be applied)."""
            return load_models()
else:
    def initialize_models():
        """Load the models directly (not running on a GPU-enabled HF Space)."""
        return load_models()


def get_models():
    """
    Return the shared model registry, triggering a load when it is still empty.

    A module-level flag ("__model_loading__" stored in globals()) marks a load
    in progress. A caller that finds the flag set polls every 0.5 s, for at
    most 60 s, until the registry is filled or the flag disappears; if the
    registry is still empty afterwards it starts a load itself.

    Returns:
        The module-level ``models`` dict. It is empty when loading failed.
    """
    global models
    loading_flag = "__model_loading__"

    # Fast path: something has already populated the registry.
    if models:
        return models

    if loading_flag in globals():
        print("Models are loading. Please wait.")
        wait_started = time.time()
        while (not models) and (loading_flag in globals()):
            time.sleep(0.5)
            if time.time() - wait_started > 60:
                print("Timed out waiting for model load.")
                break
        if models:
            return models

    try:
        globals()[loading_flag] = True
        via_spaces_gpu = (
            IN_HF_SPACE and 'spaces' in globals() and GPU_AVAILABLE and not cpu_fallback_mode
        )
        if not via_spaces_gpu:
            loaded = load_models()
            models.update(loaded)
        else:
            try:
                print("Loading models via @spaces.GPU")
                loaded = initialize_models()
                models.update(loaded)
            except Exception as e:
                print(f"GPU decorator load error: {e}, fallback to direct load.")
                loaded = load_models()
                models.update(loaded)
    except Exception as e:
        print(f"Unexpected error while loading models: {e}")
        models.clear()
    finally:
        # Always drop the in-progress marker, whether or not loading worked.
        globals().pop(loading_flag, None)
    return models


# Predefined resolutions for a rotating-landscape scenario.
# Each entry is a pair of pixel dimensions; the list runs from the narrowest
# first value (416) to the widest (960) and is symmetric, i.e. every pair
# also appears with its two values swapped.
PREDEFINED_RESOLUTIONS = [
    (416, 960),
    (448, 864),
    (480, 832),
    (512, 768),
    (544, 704),
    (576, 672),
    (608, 640),
    (640, 608),
    (672, 576),
    (704, 544),
    (768, 512),
    (832, 480),
    (864, 448),
    (960, 416),
]

def find_closest_aspect_ratio(width, height, target_resolutions):
    """
    Choose the entry of 'target_resolutions' whose first/second ratio is
    nearest (by absolute difference) to width/height.

    Args:
        width: Width of the source image.
        height: Height of the source image; a value of 0 raises
            ZeroDivisionError.
        target_resolutions: Iterable of two-element pairs to choose from.

    Returns:
        The best-matching pair as a tuple. When several pairs are equally
        close the earliest one wins. None is returned if no pair has a
        finite distance (for example when the iterable is empty).
    """
    source_ratio = width / height
    unbounded = float('inf')

    # Pair every candidate with its distance from the source aspect ratio.
    scored = (
        (abs(source_ratio - cand_w / cand_h), (cand_w, cand_h))
        for cand_w, cand_h in target_resolutions
    )
    # Only distances strictly below infinity are eligible for selection.
    eligible = (entry for entry in scored if entry[0] < unbounded)

    # min() keeps the first of several equal minima.
    best = min(eligible, key=lambda entry: entry[0], default=None)
    if best is None:
        return None
    return best[1]


# Module-level AsyncStream shared with worker(): the worker pushes
# ('progress' | 'file' | 'error' | 'end', payload) tuples onto
# stream.output_queue and checks stream.input_queue.top() for an 'end' request.
stream = AsyncStream()

@torch.no_grad()
def worker(
    input_image, 
    prompt, 
    n_prompt, 
    seed, 
    total_second_length, 
    latent_window_size, 
    steps, 
    cfg, 
    gs, 
    rs, 
    gpu_memory_preservation, 
    use_teacache
):
    """
    Background worker that performs the actual generation.
    """
    global last_update_time
    last_update_time = time.time()

    # For demonstration, limit max length to 3 seconds
    total_second_length = min(total_second_length, 3.0)
    
    try:
        models_local = get_models()
        if not models_local:
            err_msg = "Failed to load models. Check logs for details."
            print(err_msg)
            stream.output_queue.push(('error', err_msg))
            stream.output_queue.push(('end', None))
            return
        
        text_encoder = models_local['text_encoder']
        text_encoder_2 = models_local['text_encoder_2']
        tokenizer = models_local['tokenizer']
        tokenizer_2 = models_local['tokenizer_2']
        vae = models_local['vae']
        feature_extractor = models_local['feature_extractor']
        image_encoder = models_local['image_encoder']
        transformer = models_local['transformer']
    except Exception as e:
        err = f"Error retrieving models: {e}"
        print(err)
        traceback.print_exc()
        stream.output_queue.push(('error', err))
        stream.output_queue.push(('end', None))
        return

    device = 'cuda' if (GPU_AVAILABLE and not cpu_fallback_mode) else 'cpu'
    print(f"Inference device: {device}")

    # Adjust parameters if in CPU fallback
    if cpu_fallback_mode:
        print("CPU fallback mode: using smaller parameters for performance.")
        latent_window_size = min(latent_window_size, 5)
        steps = min(steps, 15)
        total_second_length = min(total_second_length, 2.0)

    total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
    total_latent_sections = int(max(round(total_latent_sections), 1))

    job_id = generate_timestamp()
    last_output_filename = None
    history_pixels = None
    history_latents = None
    total_generated_latent_frames = 0

    stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))

    try:
        if not high_vram and not cpu_fallback_mode:
            try:
                unload_complete_models(
                    text_encoder, text_encoder_2, image_encoder, vae, transformer
                )
            except Exception as e:
                print(f"Error unloading models: {e}")

        # Text encode
        last_update_time = time.time()
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Encoding text ...'))))

        try:
            if not high_vram and not cpu_fallback_mode:
                fake_diffusers_current_device(text_encoder, device)
                load_model_as_complete(text_encoder_2, target_device=device)

            llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
            if cfg == 1:
                llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
            else:
                llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)

            llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
            llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
        except Exception as e:
            err = f"Text encoding error: {e}"
            print(err)
            traceback.print_exc()
            stream.output_queue.push(('error', err))
            stream.output_queue.push(('end', None))
            return

        # Process input image
        try:
            H, W, C = input_image.shape
            target_w, target_h = find_closest_aspect_ratio(W, H, PREDEFINED_RESOLUTIONS)
            
            # If CPU fallback, scale down
            if cpu_fallback_mode:
                scale_factor = min(320 / target_h, 320 / target_w)
                target_h = int(target_h * scale_factor)
                target_w = int(target_w * scale_factor)
            
            print(f"Original image: {W}x{H}, resizing to: {target_w}x{target_h}")
            input_image_np = resize_and_center_crop(input_image, target_width=target_w, target_height=target_h)
            Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))

            input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
            input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
        except Exception as e:
            err = f"Image processing error: {e}"
            print(err)
            traceback.print_exc()
            stream.output_queue.push(('error', err))
            stream.output_queue.push(('end', None))
            return

        # VAE encode
        last_update_time = time.time()
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))

        try:
            if not high_vram and not cpu_fallback_mode:
                load_model_as_complete(vae, target_device=device)
            start_latent = vae_encode(input_image_pt, vae)
        except Exception as e:
            err = f"VAE encode error: {e}"
            print(err)
            traceback.print_exc()
            stream.output_queue.push(('error', err))
            stream.output_queue.push(('end', None))
            return

        # CLIP Vision
        last_update_time = time.time()
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))

        try:
            if not high_vram and not cpu_fallback_mode:
                load_model_as_complete(image_encoder, target_device=device)
            image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
            image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
        except Exception as e:
            err = f"CLIP Vision encode error: {e}"
            print(err)
            traceback.print_exc()
            stream.output_queue.push(('error', err))
            stream.output_queue.push(('end', None))
            return

        # Convert dtype
        try:
            llama_vec = llama_vec.to(transformer.dtype)
            llama_vec_n = llama_vec_n.to(transformer.dtype)
            clip_l_pooler = clip_l_pooler.to(transformer.dtype)
            clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
            image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
        except Exception as e:
            err = f"Data type conversion error: {e}"
            print(err)
            traceback.print_exc()
            stream.output_queue.push(('error', err))
            stream.output_queue.push(('end', None))
            return

        # Sampling
        last_update_time = time.time()
        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting sampling...'))))

        rnd = torch.Generator("cpu").manual_seed(seed)
        num_frames = latent_window_size * 4 - 3

        try:
            history_latents = torch.zeros(
                size=(1, 16, 1 + 2 + 16, target_h // 8, target_w // 8), 
                dtype=torch.float32
            ).cpu()
            history_pixels = None
            total_generated_latent_frames = 0
        except Exception as e:
            err = f"Error initializing history latents: {e}"
            print(err)
            traceback.print_exc()
            stream.output_queue.push(('error', err))
            stream.output_queue.push(('end', None))
            return

        latent_paddings = list(reversed(range(total_latent_sections)))
        if total_latent_sections > 4:
            latent_paddings = [3] + [2]*(total_latent_sections - 3) + [1, 0]

        for latent_padding in latent_paddings:
            last_update_time = time.time()
            is_last_section = (latent_padding == 0)
            latent_padding_size = latent_padding * latent_window_size

            if stream.input_queue.top() == 'end':
                if history_pixels is not None and total_generated_latent_frames > 0:
                    try:
                        final_name = os.path.join(outputs_folder, f'{job_id}_final_{total_generated_latent_frames}.mp4')
                        save_bcthw_as_mp4(history_pixels, final_name, fps=30, crf=18)
                        stream.output_queue.push(('file', final_name))
                    except Exception as e:
                        print(f"Error saving final partial video: {e}")
                stream.output_queue.push(('end', None))
                return

            print(f'latent_padding_size = {latent_padding_size}, is_last_section={is_last_section}')

            try:
                indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
                (
                    cidx_pre,
                    blank_indices,
                    latent_indices,
                    cidx_post,
                    cidx_2x,
                    cidx_4x
                ) = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
                clean_latent_indices = torch.cat([cidx_pre, cidx_post], dim=1)

                clean_latents_pre = start_latent.to(history_latents)
                c_latents_post, c_latents_2x, c_latents_4x = history_latents[:, :, :1 + 2 + 16].split([1, 2, 16], dim=2)
                clean_latents = torch.cat([clean_latents_pre, c_latents_post], dim=2)
            except Exception as e:
                err = f"Error preparing sampling data: {e}"
                print(err)
                traceback.print_exc()
                if last_output_filename:
                    stream.output_queue.push(('file', last_output_filename))
                continue

            if not high_vram and not cpu_fallback_mode:
                try:
                    unload_complete_models()
                    move_model_to_device_with_memory_preservation(
                        transformer, target_device=device, preserved_memory_gb=gpu_memory_preservation
                    )
                except Exception as e:
                    print(f"Error moving transformer to GPU: {e}")

            if use_teacache and not cpu_fallback_mode:
                try:
                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
                except Exception as e:
                    print(f"Error initializing TeaCache: {e}")
                    transformer.initialize_teacache(enable_teacache=False)
            else:
                transformer.initialize_teacache(enable_teacache=False)

            def callback(d):
                global last_update_time
                last_update_time = time.time()
                try:
                    if stream.input_queue.top() == 'end':
                        stream.output_queue.push(('end', None))
                        raise KeyboardInterrupt('User requested stop.')
                    preview_latents = d['denoised']
                    preview_latents = vae_decode_fake(preview_latents)
                    preview_img = (preview_latents * 255.0).cpu().numpy().clip(0,255).astype(np.uint8)
                    preview_img = einops.rearrange(preview_img, 'b c t h w -> (b h) (t w) c')

                    curr_step = d['i'] + 1
                    percentage = int(100.0 * curr_step / steps)
                    hint = f'Sampling {curr_step}/{steps}'
                    desc = f'Generated frames so far: {int(max(0, total_generated_latent_frames * 4 - 3))}'
                    bar_html = make_progress_bar_html(percentage, hint)
                    stream.output_queue.push(('progress', (preview_img, desc, bar_html)))
                except KeyboardInterrupt:
                    raise
                except Exception as exc:
                    print(f"Error in sampling callback: {exc}")
                return

            try:
                print(f"Sampling: device={device}, dtype={transformer.dtype}, teacache={use_teacache}")
                try:
                    generated_latents = sample_hunyuan(
                        transformer=transformer,
                        sampler='unipc',
                        width=target_w,
                        height=target_h,
                        frames=num_frames,
                        real_guidance_scale=cfg,
                        distilled_guidance_scale=gs,
                        guidance_rescale=rs,
                        num_inference_steps=steps,
                        generator=rnd,
                        prompt_embeds=llama_vec,
                        prompt_embeds_mask=llama_attention_mask,
                        prompt_poolers=clip_l_pooler,
                        negative_prompt_embeds=llama_vec_n,
                        negative_prompt_embeds_mask=llama_attention_mask_n,
                        negative_prompt_poolers=clip_l_pooler_n,
                        device=device,
                        dtype=transformer.dtype,
                        image_embeddings=image_encoder_last_hidden_state,
                        latent_indices=latent_indices,
                        clean_latents=clean_latents,
                        clean_latent_indices=clean_latent_indices,
                        clean_latents_2x=c_latents_2x,
                        clean_latent_2x_indices=cidx_2x,
                        clean_latents_4x=c_latents_4x,
                        clean_latent_4x_indices=cidx_4x,
                        callback=callback
                    )
                except KeyboardInterrupt as e:
                    print(f"User interrupt: {e}")
                    if last_output_filename:
                        stream.output_queue.push(('file', last_output_filename))
                        err_msg = "User stopped generation; partial video returned."
                    else:
                        err_msg = "User stopped generation; no video produced."
                    stream.output_queue.push(('error', err_msg))
                    stream.output_queue.push(('end', None))
                    return
            except Exception as e:
                print(f"Error during sampling: {e}")
                traceback.print_exc()
                if last_output_filename:
                    stream.output_queue.push(('file', last_output_filename))
                    err_msg = f"Sampling error; partial video returned: {e}"
                    stream.output_queue.push(('error', err_msg))
                else:
                    err_msg = f"Sampling error; no video produced: {e}"
                    stream.output_queue.push(('error', err_msg))
                stream.output_queue.push(('end', None))
                return

            try:
                if is_last_section:
                    generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
                total_generated_latent_frames += int(generated_latents.shape[2])
                history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
            except Exception as e:
                err = f"Error merging latent outputs: {e}"
                print(err)
                traceback.print_exc()
                if last_output_filename:
                    stream.output_queue.push(('file', last_output_filename))
                stream.output_queue.push(('error', err))
                stream.output_queue.push(('end', None))
                return

            if not high_vram and not cpu_fallback_mode:
                try:
                    offload_model_from_device_for_memory_preservation(
                        transformer, target_device=device, preserved_memory_gb=8
                    )
                    load_model_as_complete(vae, target_device=device)
                except Exception as e:
                    print(f"Error managing model memory: {e}")

            try:
                real_history_latents = history_latents[:, :, :total_generated_latent_frames]
            except Exception as e:
                err = f"Error slicing latents history: {e}"
                print(err)
                if last_output_filename:
                    stream.output_queue.push(('file', last_output_filename))
                continue

            try:
                if history_pixels is None:
                    history_pixels = vae_decode(real_history_latents, vae).cpu()
                else:
                    section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
                    overlapped_frames = latent_window_size * 4 - 3
                    current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
                    history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)

                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
                save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=18)
                last_output_filename = output_filename
                stream.output_queue.push(('file', output_filename))
            except Exception as e:
                print(f"Error decoding/saving video: {e}")
                traceback.print_exc()
                if last_output_filename:
                    stream.output_queue.push(('file', last_output_filename))
                err = f"Error decoding/saving video: {e}"
                stream.output_queue.push(('error', err))
                continue

            if is_last_section:
                break
    except Exception as e:
        print(f"Outer error: {e}, type={type(e)}")
        traceback.print_exc()
        if not high_vram and not cpu_fallback_mode:
            try:
                unload_complete_models(
                    text_encoder, text_encoder_2, image_encoder, vae, transformer
                )
            except Exception as ue:
                print(f"Unload error: {ue}")
        if last_output_filename:
            stream.output_queue.push(('file', last_output_filename))
        err = f"Error in worker: {e}"
        stream.output_queue.push(('error', err))

    print("Worker finished, pushing end.")
    stream.output_queue.push(('end', None))


# Create a processing function with or without the HF Spaces GPU decorator.
# Both variants delegate to one shared generator so they cannot drift apart.
def _run_generation(input_image, prompt, n_prompt, seed, total_second_length, use_teacache):
    """Start the worker and relay its queue messages to the Gradio UI.

    This is a generator. Every yielded tuple has exactly six items, in the
    order of the Generate button's outputs:
    ``(result_video, preview_image, progress_desc, progress_bar,
    start_button, end_button)``.

    The worker reports through ``stream.output_queue`` using these messages:
    ``('file', path)``, ``('progress', (preview, desc, bar_html))``,
    ``('error', message)`` and ``('end', None)``.

    Args:
        input_image: Uploaded image; must not be None.
        prompt: Positive prompt text.
        n_prompt: Negative prompt text.
        seed: Random seed forwarded to the worker.
        total_second_length: Requested video length in seconds.
        use_teacache: Whether the worker should enable TeaCache.

    Yields:
        Six-item tuples of component values/updates as described above.

    Raises:
        gr.Error: If ``input_image`` is None.
    """
    global stream
    # Imported locally so this block needs no new top-of-file import.
    from html import escape as escape_html

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if input_image is None:
        raise gr.Error("No input image provided.")

    # Fix certain parameters for simplicity
    latent_window_size = 9
    steps = 25
    cfg = 1.0
    gs = 10.0
    rs = 0.0
    gpu_memory_preservation = 6

    # Give up once the queue has kept failing for this long without any message.
    stall_timeout_seconds = 60

    def running_buttons():
        # While generating: Generate disabled, Stop enabled.
        return gr.update(interactive=False), gr.update(interactive=True)

    def idle_buttons():
        # When finished: Generate enabled, Stop disabled.
        return gr.update(interactive=True), gr.update(interactive=False)

    def error_html(message):
        # Escaped because the text may contain arbitrary exception output.
        return f'<div class="error-message">{escape_html(str(message))}</div>'

    yield (None, None, '', '', *running_buttons())

    output_filename = None
    try:
        stream = AsyncStream()
        async_run(
            worker,
            input_image, prompt, n_prompt, seed,
            total_second_length, latent_window_size, steps,
            cfg, gs, rs, gpu_memory_preservation, use_teacache
        )

        error_message = None
        # Tracked locally: time the last message was received from the worker.
        last_update_time = time.time()

        while True:
            try:
                flag, data = stream.output_queue.next()
                last_update_time = time.time()
                if flag == 'file':
                    output_filename = data
                    yield (output_filename, gr.update(), gr.update(), '', *running_buttons())
                elif flag == 'progress':
                    preview, desc, bar_html = data
                    yield (
                        gr.update(), gr.update(visible=True, value=preview),
                        desc, bar_html, *running_buttons()
                    )
                elif flag == 'error':
                    # Remember the message; it is shown when 'end' arrives.
                    error_message = data
                    print(f"Received error: {error_message}")
                elif flag == 'end':
                    status = error_html(error_message) if error_message else ''
                    yield (
                        output_filename, gr.update(visible=False),
                        gr.update(), status, *idle_buttons()
                    )
                    break
            except Exception as e:
                print(f"Error processing output: {e}")
                stalled_for = time.time() - last_update_time
                if stalled_for > stall_timeout_seconds:
                    print(f"No updates for {stalled_for:.1f}s, likely hung.")
                    yield (
                        output_filename, gr.update(visible=False), gr.update(),
                        error_html(error_message or f"Generation stalled: {e}"),
                        *idle_buttons()
                    )
                    break
                # Avoid a tight retry loop while the queue keeps failing.
                time.sleep(0.1)
    except Exception as e:
        print(f"Error starting process: {e}")
        traceback.print_exc()
        yield (
            output_filename, gr.update(visible=False), gr.update(),
            error_html(f"Error starting process: {e}"), *idle_buttons()
        )


if IN_HF_SPACE and 'spaces' in globals():
    @spaces.GPU
    def process_with_gpu(input_image, prompt, n_prompt, seed, total_second_length, use_teacache):
        """GPU-decorated entry point; see ``_run_generation`` for the contract."""
        yield from _run_generation(
            input_image, prompt, n_prompt, seed, total_second_length, use_teacache
        )

    process = process_with_gpu
else:
    def process(input_image, prompt, n_prompt, seed, total_second_length, use_teacache):
        """Plain entry point; see ``_run_generation`` for the contract."""
        yield from _run_generation(
            input_image, prompt, n_prompt, seed, total_second_length, use_teacache
        )


def end_process():
    """
    Request a stop by pushing 'end' onto the current stream's input queue.

    Best effort: problems while inspecting or pushing to the queue are logged
    instead of raised, and nothing is pushed when no stream exists yet.
    Always returns None.
    """
    print("User clicked the stop button, sending 'end' signal...")

    # A missing global and a global set to None are treated the same way.
    active_stream = globals().get('stream')
    if active_stream is None:
        print("Warning: 'stream' is not initialized; cannot stop.")
        return None

    # Diagnostic only: report what currently sits on top of the queue.
    try:
        print(f"Queue top signal: {active_stream.input_queue.top()}")
    except Exception as e:
        print(f"Error checking queue status: {e}")

    try:
        active_stream.input_queue.push('end')
    except Exception as e:
        print(f"Error pushing 'end' signal: {e}")
    else:
        print("Successfully pushed 'end' signal.")
    return None


# One-click prompt suggestions; each prompt is wrapped in its own single-item row.
quick_prompts = [
    [prompt_text]
    for prompt_text in (
        "The camera smoothly orbits around the center of the scene, keeping the center point fixed and always in view",
    )
]

def make_custom_css():
    """Return the page stylesheet: the progress-bar CSS followed by the app's own rules."""
    page_rules = """
    body {
        background: #f9fafb !important;
        font-family: "Noto Sans", sans-serif;
    }
    #app-container {
        max-width: 1200px;
        margin: 0 auto;
        padding: 1rem;
        position: relative;
    }
    h1 {
        font-size: 2rem;
        text-align: center;
        margin-bottom: 1rem;
        color: #2d3748;
        font-weight: 700;
    }
    .start-btn, .stop-btn {
        min-height: 45px;
        font-size: 1rem;
        font-weight: 600;
    }
    .start-btn {
        background-color: #3182ce !important;
        color: #fff !important;
    }
    .stop-btn {
        background-color: #e53e3e !important;
        color: #fff !important;
    }
    .button-container button:hover {
        filter: brightness(0.95);
    }
    .preview-container, .video-container {
        border: 1px solid #cbd5e0;
        border-radius: 8px;
        overflow: hidden;
    }
    .progress-container {
        margin-top: 15px;
        margin-bottom: 15px;
    }
    .error-message {
        background-color: #fff5f5;
        border: 1px solid #fed7d7;
        color: #e53e3e;
        padding: 10px;
        border-radius: 4px;
        margin-top: 10px;
    }
    .error-icon {
        color: #e53e3e;
        margin-right: 8px;
    }
    #error-message {
        color: #ff4444;
        font-weight: bold;
        padding: 10px;
        border-radius: 4px;
        margin-top: 10px;
    }
    @media (max-width: 768px) {
        #app-container {
            padding: 0.5rem;
        }
        .mobile-full-width {
            flex-direction: column !important;
        }
        .mobile-full-width > .gr-block {
            width: 100% !important;
        }
    }
    """
    # Progress-bar rules come first; the app-specific rules are appended after them.
    return make_progress_bar_css() + page_rules

css = make_custom_css()

# Page layout: inputs and controls on the left, preview/result/progress on the right.
block = gr.Blocks(css=css).queue()
with block:
    gr.HTML("<h1>FramePack Rotate-Landscape - Generate Rotating Landscape Video</h1>")

    with gr.Row(elem_classes="mobile-full-width"):
        # Left column: source image, prompt and generation settings.
        with gr.Column(scale=1):
            input_image = gr.Image(sources='upload', type="numpy", label="Upload Image", height=320)

            prompt = gr.Textbox(
                label="Prompt",
                value='The camera smoothly orbits around the center of the scene...',
            )

            # Clicking a quick prompt copies its text into the prompt box.
            example_quick_prompts = gr.Dataset(
                samples=quick_prompts, label="Quick Prompts", samples_per_page=1000, components=[prompt]
            )
            example_quick_prompts.click(
                lambda sample: sample[0],
                inputs=[example_quick_prompts], outputs=prompt,
                show_progress=False, queue=False
            )

            with gr.Row(elem_classes="button-container"):
                start_button = gr.Button(value="Generate", elem_classes="start-btn", variant="primary")
                # Stop starts out disabled.
                end_button = gr.Button(value="Stop", elem_classes="stop-btn", interactive=False)

            use_teacache = gr.Checkbox(
                label="Use TeaCache", value=True,
                info="Faster speed, but possibly worse finger/hand generation."
            )
            # The negative prompt is kept as a hidden input.
            n_prompt = gr.Textbox(label="Negative Prompt", value="", visible=False)
            seed = gr.Number(label="Seed", value=31337, precision=0)
            total_second_length = gr.Slider(
                label="Video length (max 3 seconds)", minimum=0.5, maximum=3, value=1.0, step=0.1
            )

        # Right column: live preview, final video and progress readout.
        with gr.Column(scale=1):
            preview_image = gr.Image(
                label="Preview", height=200, visible=False, elem_classes="preview-container"
            )
            result_video = gr.Video(
                label="Generated Video", autoplay=True, loop=True,
                show_share_button=True, height=512, elem_classes="video-container"
            )
            gr.HTML("""
            <div>
                Note: Due to reversed sampling, ending actions may appear before starting actions. If the start action is missing, please wait for further frames.
            </div>
            """)

            with gr.Group(elem_classes="progress-container"):
                progress_desc = gr.Markdown('')
                progress_bar = gr.HTML('')

            error_message = gr.HTML('', elem_id='error-message', visible=True)

    # Event wiring: Generate streams results into six components; Stop signals the worker.
    ips = [input_image, prompt, n_prompt, seed, total_second_length, use_teacache]
    start_button.click(
        fn=process, inputs=ips,
        outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button]
    )
    end_button.click(fn=end_process)

block.launch()