jbilcke-hf (HF staff) committed
Commit ef15707
1 parent: e349e43

Update handler.py

Files changed (1):
    handler.py  (+108 −38)
handler.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, Optional
+from typing import Dict, Any, Union, Optional, Tuple
 import torch
 from diffusers import LTXPipeline, LTXImageToVideoPipeline
 from PIL import Image
@@ -15,6 +15,19 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 class EndpointHandler:
+    # Default configuration
+    DEFAULT_FPS = 24
+    DEFAULT_DURATION = 4  # seconds
+    DEFAULT_NUM_FRAMES = (DEFAULT_DURATION * DEFAULT_FPS) + 1  # 97 frames
+    DEFAULT_NUM_STEPS = 25
+    DEFAULT_WIDTH = 768
+    DEFAULT_HEIGHT = 512
+
+    # Constraints
+    MAX_WIDTH = 1280
+    MAX_HEIGHT = 720
+    MAX_FRAMES = 257
+
     def __init__(self, path: str = ""):
         """Initialize the LTX Video handler with both text-to-video and image-to-video pipelines.

@@ -35,11 +48,55 @@ class EndpointHandler:
         # Enable memory optimizations
         self.text_to_video.enable_model_cpu_offload()
         self.image_to_video.enable_model_cpu_offload()
+
+    def _validate_and_adjust_resolution(self, width: int, height: int) -> Tuple[int, int]:
+        """Validate and adjust resolution to meet constraints.
+
+        Args:
+            width (int): Requested width
+            height (int): Requested height
+
+        Returns:
+            Tuple[int, int]: Adjusted (width, height)
+        """
+        # Round to nearest multiple of 32
+        width = round(width / 32) * 32
+        height = round(height / 32) * 32
+
+        # Enforce maximum dimensions
+        width = min(width, self.MAX_WIDTH)
+        height = min(height, self.MAX_HEIGHT)
+
+        # Enforce minimum dimensions
+        width = max(width, 32)
+        height = max(height, 32)
+
+        return width, height
+
+    def _validate_and_adjust_frames(self, num_frames: Optional[int] = None, fps: Optional[int] = None) -> Tuple[int, int]:
+        """Validate and adjust frame count and FPS to meet constraints.
+
+        Args:
+            num_frames (Optional[int]): Requested number of frames
+            fps (Optional[int]): Requested frames per second
+
+        Returns:
+            Tuple[int, int]: Adjusted (num_frames, fps)
+        """
+        # Use defaults if not provided
+        fps = fps or self.DEFAULT_FPS
+        num_frames = num_frames or self.DEFAULT_NUM_FRAMES
+
+        # Adjust frames to be in format 8k + 1
+        k = (num_frames - 1) // 8
+        num_frames = (k * 8) + 1
+
+        # Enforce maximum frame count
+        num_frames = min(num_frames, self.MAX_FRAMES)

-        # Set default FPS
-        self.fps = 24
+        return num_frames, fps

-    def _create_video_file(self, frames: torch.Tensor, fps: int = 24) -> bytes:
+    def _create_video_file(self, frames: torch.Tensor, fps: int = DEFAULT_FPS) -> bytes:
         """Convert frames to an MP4 video file.

         Args:
@@ -50,11 +107,11 @@
             bytes: MP4 video file content
         """
         # Log frame information
-        num_frames = frames.shape[1]  # Shape should be [1, num_frames, channels, height, width]
+        num_frames = frames.shape[1]
         duration = num_frames / fps
         logger.info(f"Creating video with {num_frames} frames at {fps} FPS (duration: {duration:.2f} seconds)")

-        # Convert tensor to numpy array - remove batch dimension and rearrange to [num_frames, height, width, channels]
+        # Convert tensor to numpy array
         video_np = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float().numpy()
         video_np = (video_np * 255).astype(np.uint8)

@@ -68,8 +125,7 @@
         try:
             # Create video clip and write to file
             clip = ImageSequenceClip(list(video_np), fps=fps)
-            resized = clip.resize((width, height))
-            resized.write_videofile(output_path, codec="libx264", audio=False)
+            clip.write_videofile(output_path, codec="libx264", audio=False)

             # Read the video file
             with open(output_path, "rb") as f:
@@ -93,60 +149,66 @@
             data (Dict[str, Any]): Input data containing:
                 - prompt (str): Text description for video generation
                 - image (Optional[str]): Base64 encoded image for image-to-video generation
-                - num_frames (Optional[int]): Number of frames to generate (default: 24)
+                - width (Optional[int]): Video width (default: 768)
+                - height (Optional[int]): Video height (default: 512)
+                - num_frames (Optional[int]): Number of frames (default: 97)
                 - fps (Optional[int]): Frames per second (default: 24)
+                - num_inference_steps (Optional[int]): Number of inference steps (default: 25)
                 - guidance_scale (Optional[float]): Guidance scale (default: 7.5)
-                - num_inference_steps (Optional[int]): Number of inference steps (default: 50)

         Returns:
             Dict[str, Any]: Dictionary containing:
                 - video: Base64 encoded MP4 video
                 - content-type: MIME type of the video
+                - metadata: Dictionary with actual values used for generation
         """
-        # Extract parameters
+        # Extract and validate prompt
         prompt = data.get("prompt")
         if not prompt:
             raise ValueError("'prompt' is required in the input data")

-        # Get optional parameters with defaults
-        num_frames = data.get("num_frames", 24)
-        fps = data.get("fps", self.fps)
+        # Get and validate resolution
+        width = data.get("width", self.DEFAULT_WIDTH)
+        height = data.get("height", self.DEFAULT_HEIGHT)
+        width, height = self._validate_and_adjust_resolution(width, height)
+
+        # Get and validate frames and FPS
+        num_frames = data.get("num_frames", self.DEFAULT_NUM_FRAMES)
+        fps = data.get("fps", self.DEFAULT_FPS)
+        num_frames, fps = self._validate_and_adjust_frames(num_frames, fps)
+
+        # Get other parameters with defaults
         guidance_scale = data.get("guidance_scale", 7.5)
-        num_inference_steps = data.get("num_inference_steps", 50)
+        num_inference_steps = data.get("num_inference_steps", self.DEFAULT_NUM_STEPS)

         logger.info(f"Generating video with prompt: '{prompt}'")
-        logger.info(f"Parameters: num_frames={num_frames}, fps={fps}, guidance_scale={guidance_scale}, num_inference_steps={num_inference_steps}")
+        logger.info(f"Parameters: size={width}x{height}, num_frames={num_frames}, fps={fps}")
+        logger.info(f"Additional params: guidance_scale={guidance_scale}, num_inference_steps={num_inference_steps}")

-        # Check if image is provided for image-to-video generation
-        image_data = data.get("image")
-
         try:
             with torch.no_grad():
+                generation_kwargs = {
+                    "prompt": prompt,
+                    "height": height,
+                    "width": width,
+                    "num_frames": num_frames,
+                    "guidance_scale": guidance_scale,
+                    "num_inference_steps": num_inference_steps,
+                    "output_type": "pt"
+                }
+
+                # Check if image is provided for image-to-video generation
+                image_data = data.get("image")
                 if image_data:
                     # Decode base64 image
                     image_bytes = base64.b64decode(image_data)
                     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                     logger.info("Using image-to-video generation mode")
-
-                    # Generate video from image
-                    output = self.image_to_video(
-                        prompt=prompt,
-                        image=image,
-                        num_frames=num_frames,
-                        guidance_scale=guidance_scale,
-                        num_inference_steps=num_inference_steps,
-                        output_type="pt"
-                    ).frames  # Remove [0] to keep all frames
+                    generation_kwargs["image"] = image
+                    output = self.image_to_video(**generation_kwargs).frames
                 else:
                     logger.info("Using text-to-video generation mode")
-                    # Generate video from text only
-                    output = self.text_to_video(
-                        prompt=prompt,
-                        num_frames=num_frames,
-                        guidance_scale=guidance_scale,
-                        num_inference_steps=num_inference_steps,
-                        output_type="pt"
-                    ).frames  # Remove [0] to keep all frames
+                    output = self.text_to_video(**generation_kwargs).frames

             # Convert frames to video file
             video_content = self._create_video_file(output, fps=fps)
@@ -156,7 +218,15 @@

             return {
                 "video": video_base64,
-                "content-type": "video/mp4"
+                "content-type": "video/mp4",
+                "metadata": {
+                    "width": width,
+                    "height": height,
+                    "num_frames": num_frames,
+                    "fps": fps,
+                    "duration": num_frames / fps,
+                    "num_inference_steps": num_inference_steps
+                }
             }

         except Exception as e:
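
For quick reference, here is a minimal standalone sketch of the sizing rules this commit introduces: resolution is rounded to the nearest multiple of 32 and clamped between 32 and the 1280x720 maximum, and the frame count is snapped down onto the 8k + 1 grid and capped at 257, so the default request of 4 seconds at 24 FPS comes out to 97 frames. The function names below are hypothetical stand-ins for the handler's private methods, shown outside the class purely for illustration, with the class constants hard-coded.

    # Hypothetical mirror of _validate_and_adjust_resolution / _validate_and_adjust_frames
    def adjust_resolution(width: int, height: int) -> tuple:
        width = round(width / 32) * 32      # round to nearest multiple of 32
        height = round(height / 32) * 32
        width = max(min(width, 1280), 32)   # clamp to [32, MAX_WIDTH]
        height = max(min(height, 720), 32)  # clamp to [32, MAX_HEIGHT]
        return width, height

    def adjust_frames(num_frames: int = 97, fps: int = 24) -> tuple:
        num_frames = ((num_frames - 1) // 8) * 8 + 1  # snap down onto the 8k + 1 grid
        return min(num_frames, 257), fps              # cap at MAX_FRAMES

    print(adjust_resolution(1000, 512))  # -> (992, 512)
    print(adjust_frames(100))            # -> (97, 24)
    print(adjust_frames(4 * 24 + 1))     # -> (97, 24): the default 4 s at 24 FPS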
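
The frame-to-MP4 path in _create_video_file can also be exercised on its own. Below is a sketch with a dummy tensor standing in for pipeline output; the [1, num_frames, channels, height, width] layout comes from the comment removed in this diff, and the MoviePy 1.x import path is an assumption, since the import lines fall outside the shown hunks.

    import numpy as np
    import torch
    from moviepy.editor import ImageSequenceClip  # assumed import; not shown in the diff

    # Dummy pipeline output: [batch, num_frames, channels, height, width], values in 0..1
    frames = torch.rand(1, 25, 3, 64, 96)

    # Same conversion as _create_video_file: drop batch dim, go to [frames, height, width, channels]
    video_np = frames.squeeze(0).permute(0, 2, 3, 1).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)

    # MoviePy takes one uint8 array per frame and writes an H.264 MP4
    clip = ImageSequenceClip(list(video_np), fps=24)
    clip.write_videofile("sample.mp4", codec="libx264", audio=False)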
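
Finally, a hedged example of invoking the updated handler locally. On Inference Endpoints the runtime constructs and calls EndpointHandler itself; the payload keys and the metadata field follow the docstring in this commit, while the prompt, the path argument, and the output filename are placeholders.

    import base64
    from handler import EndpointHandler  # assumes this script sits next to handler.py

    handler = EndpointHandler(path=".")  # placeholder path; the endpoint runtime supplies the real one

    result = handler({
        "prompt": "a red panda walking through fresh snow, cinematic",  # placeholder prompt
        "width": 768,               # the class defaults, passed explicitly for clarity
        "height": 512,
        "num_frames": 97,           # 4 s at 24 FPS, already on the 8k + 1 grid
        "fps": 24,
        "num_inference_steps": 25,
        "guidance_scale": 7.5,
    })

    print(result["content-type"])   # video/mp4
    print(result["metadata"])       # actual width/height/num_frames/fps/duration used

    with open("output.mp4", "wb") as f:
        f.write(base64.b64decode(result["video"]))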