Update handler.py
handler.py  +57 -7
@@ -13,7 +13,6 @@ import base64
 from hyvideo.utils.file_utils import save_videos_grid
 from hyvideo.inference import HunyuanVideoSampler
 from hyvideo.constants import NEGATIVE_PROMPT, VAE_PATH, TEXT_ENCODER_PATH, TOKENIZER_PATH
-from hyvideo.modules.attenion import get_attention_modes
 
 try:
     import triton
@@ -37,8 +36,45 @@ DEFAULT_NB_FRAMES = (4 * 30) + 1  # or 129 (note: hunyuan requires an extra +1 frame)
 DEFAULT_NB_STEPS = 22  # Default for standard model
 DEFAULT_FPS = 24
 
+def get_attention_modes():
+    """Get available attention modes - fallback if the module function isn't available"""
+    modes = ["sdpa"]  # Always available
+
+    try:
+        import torch
+        if hasattr(torch.nn.functional, 'scaled_dot_product_attention') and "sdpa" not in modes:
+            modes.append("sdpa")
+    except ImportError:
+        pass
+
+    try:
+        import flash_attn
+        modes.append("flash")
+    except ImportError:
+        pass
+
+    try:
+        import sageattention
+        modes.append("sage")
+        if hasattr(sageattention, 'efficient_attention_v2'):
+            modes.append("sage2")
+    except ImportError:
+        pass
+
+    try:
+        import xformers
+        modes.append("xformers")
+    except ImportError:
+        pass
+
+    return modes
+
 # Get supported attention modes
-attention_modes_supported = get_attention_modes()
+try:
+    from hyvideo.modules.attenion import get_attention_modes
+    attention_modes_supported = get_attention_modes()
+except Exception:
+    attention_modes_supported = get_attention_modes()
 
 def setup_vae_path(vae_path: Path) -> Path:
     """Create a temporary directory with correctly named VAE config file"""
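The prober above only checks importability; it does not confirm that a backend actually runs on the deployed GPU. If the handler lets callers pick an attention mode, it is worth validating the request against attention_modes_supported before initialization. A minimal sketch (the resolve_attention_mode helper is hypothetical, not part of this commit):

    def resolve_attention_mode(requested_mode: str, supported: list) -> str:
        """Fall back to 'sdpa' when the requested backend is not importable."""
        if requested_mode in supported:
            return requested_mode
        return "sdpa"  # always present in the list built by get_attention_modes()

    # e.g. prefer sage2 but degrade gracefully on machines without sageattention
    attention_mode = resolve_attention_mode("sage2", attention_modes_supported)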
@@ -317,10 +353,20 @@ class EndpointHandler:
         try:
             logger.info("Attempting to initialize HunyuanVideoSampler...")
 
-            # …
-            …
+            # Extract necessary paths
+            transformer_path = str(self.args.dit_weight)
+            text_encoder_path = str(Path(self.args.model_base) / "text_encoder")
 
-            …
+            logger.info(f"Transformer path: {transformer_path}")
+            logger.info(f"Text encoder path: {text_encoder_path}")
+
+            # Initialize the model using the exact signature from gradio_server.py
+            self.model = HunyuanVideoSampler.from_pretrained(
+                transformer_path,
+                text_encoder_path,
+                attention_mode=self.attention_mode,
+                args=self.args
+            )
 
             # Set attention mode for transformer blocks
             if hasattr(self.model, 'pipeline') and hasattr(self.model.pipeline, 'transformer'):
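The new initialization assumes self.args exposes at least dit_weight (the transformer checkpoint) and model_base (the directory containing text_encoder). A sketch of the shape of that object, with placeholder paths and argparse.Namespace standing in for however the handler really assembles its args:

    from argparse import Namespace
    from pathlib import Path

    # Hypothetical values; the real handler derives these from its deployment config.
    args = Namespace(
        dit_weight="/repository/ckpts/transformer/mp_rank_00_model_states.pt",
        model_base="/repository/ckpts",
        enable_riflex=False,
    )

    transformer_path = str(args.dit_weight)
    text_encoder_path = str(Path(args.model_base) / "text_encoder")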
@@ -362,7 +408,7 @@ class EndpointHandler:
             logger.error(f"Error initializing model: {str(e)}")
             raise
 
-    def __call__(self, data: Dict[str, Any]) -> …
+    def __call__(self, data: Dict[str, Any]) -> str:
         """Process a single request"""
         # Log incoming request
         logger.info(f"Processing request with data: {data}")
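The -> str annotation matches the data-URI string returned at the end of the method. Under the usual Hugging Face custom-handler convention the class is constructed once and then called per request; a sketch, where the constructor signature and the prompt key are assumptions, since neither appears in this diff:

    handler = EndpointHandler(path="/repository")
    video_data_uri = handler({"inputs": "A panda drumming in the rain"})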
@@ -385,6 +431,7 @@
         flow_shift = float(data.pop("flow_shift", 7.0))
         embedded_guidance_scale = float(data.pop("embedded_guidance_scale", 6.0))
         enable_riflex = data.pop("enable_riflex", self.args.enable_riflex)
+        tea_cache = float(data.pop("tea_cache", 0.0))
 
         logger.info(f"Processing with parameters: width={width}, height={height}, "
                     f"video_length={video_length}, seed={seed}, "
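tea_cache rides along in the request body like the other generation parameters, and the default of 0.0 leaves the cache off. A hedged example payload (only the keys popped in this method are grounded in the diff; the prompt key and the concrete values are illustrative):

    payload = {
        "inputs": "A cat walks on the grass, realistic style.",
        "width": 848,
        "height": 480,
        "video_length": 129,
        "seed": 42,
        "num_inference_steps": 22,
        "flow_shift": 7.0,
        "embedded_guidance_scale": 6.0,
        "tea_cache": 0.15,  # relative-L1 threshold; 0.0 disables TeaCache
    }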
@@ -392,10 +439,12 @@
 
         try:
             # Set up TeaCache for this generation if enabled
-            if hasattr(self.model.pipeline, 'transformer') and …
+            if hasattr(self.model.pipeline, 'transformer') and tea_cache > 0:
                 transformer = self.model.pipeline.transformer
+                transformer.enable_teacache = True
                 transformer.num_steps = num_inference_steps
                 transformer.cnt = 0
+                transformer.rel_l1_thresh = tea_cache
                 transformer.accumulated_rel_l1_distance = 0
                 transformer.previous_modulated_input = None
                 transformer.previous_residual = None
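These fields are the per-generation state TeaCache consumes inside the transformer's forward pass: it accumulates the relative L1 change of the modulated input across denoising steps and, while the running total stays under rel_l1_thresh, skips the blocks and reuses previous_residual. A minimal sketch of that decision under those assumptions (the published TeaCache additionally rescales the distance with a fitted polynomial, omitted here; this is not the repo's actual forward code):

    import torch

    def should_skip_step(t, modulated_input: torch.Tensor) -> bool:
        """Return True when the transformer blocks can be skipped and
        t.previous_residual reused (TeaCache-style heuristic)."""
        skip = False
        if t.cnt == 0 or t.cnt == t.num_steps - 1:
            t.accumulated_rel_l1_distance = 0  # always compute first and last steps
        else:
            prev = t.previous_modulated_input
            rel_l1 = ((modulated_input - prev).abs().mean() / prev.abs().mean()).item()
            t.accumulated_rel_l1_distance += rel_l1
            if t.accumulated_rel_l1_distance < t.rel_l1_thresh:
                skip = True  # drift still small; reuse the cached residual
            else:
                t.accumulated_rel_l1_distance = 0  # recompute and reset
        t.previous_modulated_input = modulated_input
        t.cnt += 1
        return skip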
@@ -450,6 +499,7 @@
 
             logger.info("Successfully generated and encoded video")
 
+            # Return exactly what demo.py expects
             return video_data_uri
 
         except Exception as e: