Rename LTX-Video/ltx_video/pipelines/pipeline_ltx_video (3).py to LTX-Video/ltx_video/pipelines/pipeline_ltx_video.py
LTX-Video/ltx_video/pipelines/{pipeline_ltx_video (3).py → pipeline_ltx_video.py}
RENAMED
@@ -59,8 +59,24 @@ logging.set_verbosity_debug()
 
 
 #logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+import logging
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", message=".*")
+from huggingface_hub import logging as ll
+ll.set_verbosity_error()
+ll.set_verbosity_warning()
+ll.set_verbosity_info()
+ll.set_verbosity_debug()
 
+from utils.debug_utils import log_function_io
+logger = logging.getLogger("AducDebug")
+logging.basicConfig(level=logging.DEBUG)
+logger.setLevel(logging.DEBUG)
 
+
+@log_function_io
 class SpyLatent:
 
     """
@@ -120,7 +136,7 @@ class SpyLatent:
         # --- Convert to 5D if necessary ---
         tensor_5d = self._to_5d(tensor, reference_shape_5d)
         if tensor_5d is not None and tensor.ndim == 3:
-            self._print_stats("Convertido para 5D", tensor_5d)
+            #self._print_stats("Convertido para 5D", tensor_5d)
 
         # --- Visualization with the VAE ---
         if save_visual and self.vae is not None and tensor_5d is not None:
@@ -129,7 +145,7 @@ class SpyLatent:
 
             frame_idx_to_viz = min(1, tensor_5d.shape[2] - 1)
             if frame_idx_to_viz < 0:
-                print("  VISUALIZAÇÃO (VAE): Tensor não tem frames para visualizar.")
+                #print("  VISUALIZAÇÃO (VAE): Tensor não tem frames para visualizar.")
             else:
                 #print(f"  VISUALIZAÇÃO (VAE): Usando frame de índice {frame_idx_to_viz}.")
                 latent_slice = tensor_5d[:, :, frame_idx_to_viz:frame_idx_to_viz+1, :, :]
@@ -163,7 +179,7 @@ class SpyLatent:
         std = tensor.std().item()
         min_val = tensor.min().item()
         max_val = tensor.max().item()
-        print(f"  {prefix}: {tensor.shape}")
+        #print(f"  {prefix}: {tensor.shape}")
 
 
 
@@ -240,7 +256,7 @@ ASPECT_RATIO_512_BIN = {
     "4.0": [1024.0, 256.0],
 }
 
-
+@log_function_io
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,
@@ -308,9 +324,10 @@ def retrieve_timesteps(
         num_inference_steps = len(timesteps)
 
     try:
-        print(f"[LTX]
+        print(f"[LTX]timesteps {timesteps}")
+        print(f"[LTX]num_inference_steps {num_inference_steps}")
     except Exception:
-
+        pass
 
 
    return timesteps, num_inference_steps
@@ -334,8 +351,9 @@ class ConditioningItem:
     conditioning_strength: float
     media_x: Optional[int] = None
     media_y: Optional[int] = None
+
 
-
+@log_function_io
 class LTXVideoPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using LTX-Video.
@@ -387,6 +405,7 @@ class LTXVideoPipeline(DiffusionPipeline):
     ]
     model_cpu_offload_seq = "prompt_enhancer_image_caption_model->prompt_enhancer_llm_model->text_encoder->transformer->vae"
 
+    @log_function_io
     def __init__(
         self,
         tokenizer: T5Tokenizer,
@@ -425,6 +444,7 @@ class LTXVideoPipeline(DiffusionPipeline):
 
         self.spy = SpyLatent(vae=vae)
 
+    @log_function_io
     def mask_text_embeddings(self, emb, mask):
         if emb.shape[0] == 1:
             keep_index = mask.sum().item()
@@ -434,6 +454,8 @@ class LTXVideoPipeline(DiffusionPipeline):
         return masked_feature, emb.shape[2]
 
     # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt
+
+    @log_function_io
     def encode_prompt(
         self,
         prompt: Union[str, List[str]],
@@ -628,6 +650,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
 
+    @log_function_io
     def check_inputs(
         self,
         prompt,
@@ -714,6 +737,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             self.prompt_enhancer_llm_tokenizer is not None
         ), "Text prompt enhancer tokenizer must be initialized if enhance_prompt is True"
 
+    @log_function_io
     def _text_preprocessing(self, text):
         if not isinstance(text, (tuple, list)):
             text = [text]
@@ -725,6 +749,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return [process(t) for t in text]
 
     @staticmethod
+    @log_function_io
     def add_noise_to_image_conditioning_latents(
         t: float,
         init_latents: torch.Tensor,
@@ -751,6 +776,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return latents
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    @log_function_io
     def prepare_latents(
         self,
         latents: torch.Tensor | None,
@@ -832,6 +858,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return latents
 
     @staticmethod
+    @log_function_io
     def classify_height_width_bin(
         height: int, width: int, ratios: dict
     ) -> Tuple[int, int]:
@@ -842,6 +869,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return int(default_hw[0]), int(default_hw[1])
 
     @staticmethod
+    @log_function_io
     def resize_and_crop_tensor(
         samples: torch.Tensor, new_width: int, new_height: int
     ) -> torch.Tensor:
@@ -868,6 +896,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return samples
 
     @staticmethod
+    @log_function_io
     def resize_tensor(media_items, height, width):
         n_frames = media_items.shape[2]
         if media_items.shape[-2:] != (height, width):
@@ -882,6 +911,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return media_items
 
     @torch.no_grad()
+    @log_function_io
     def __call__(
         self,
         height: int,
@@ -1087,7 +1117,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
 
         try:
-            print(f"[LTX2]
+            print(f"[LTX2]timesteps {timesteps}")
         except Exception:
             pass
 
@@ -1160,7 +1190,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
 
         try:
-            print(f"[LTX3]LATENTS {
+            print(f"[LTX3]LATENTS {prompt}")
         except Exception:
             pass
 
@@ -1230,7 +1260,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
 
         try:
-            print(f"[LTX4]
+            print(f"[LTX4]media_items {media_items}")
             original_shape = latents
         except Exception:
             pass
@@ -1252,17 +1282,17 @@ class LTXVideoPipeline(DiffusionPipeline):
         init_latents = latents.clone()  # Used for image_cond_noise_update
 
         try:
-            print(f"[LTXCond]
+            print(f"[LTXCond]conditioning_items {conditioning_items.shape}")
         except Exception:
             pass
 
         try:
-            print(f"[LTXCond]
+            print(f"[LTXCond]num_frames {num_frames}")
         except Exception:
             pass
 
         try:
-            print(f"[LTXCond]
+            print(f"[LTXCond]width {width}")
         except Exception:
             pass
 
@@ -1274,7 +1304,7 @@ class LTXVideoPipeline(DiffusionPipeline):
 
 
         try:
-            print(f"[LTX5]
+            print(f"[LTX5]width {width}")
         except Exception:
             pass
 
@@ -1283,6 +1313,12 @@ class LTXVideoPipeline(DiffusionPipeline):
             len(timesteps) - num_inference_steps * self.scheduler.order, 0
         )
 
+        try:
+            print(f"[LTX5]num_warmup_steps {num_warmup_steps}")
+        except Exception:
+            pass
+
+
         orig_conditioning_mask = conditioning_mask
 
         # Before compiling this code please be aware:
@@ -1337,11 +1373,7 @@ class LTXVideoPipeline(DiffusionPipeline):
                 generator,
             )
 
-            try:
-                print(f"[LTX6]LATENTS {latents.shape}")
-                self.spy.inspect(latents, "LTX6_After_Patchify", reference_shape_5d=original_shape)
-            except Exception:
-                pass
+
 
 
 
@@ -1352,11 +1384,7 @@ class LTXVideoPipeline(DiffusionPipeline):
                     latent_model_input, t
                 )
 
-                try:
-                    print(f"[LTX7]LATENTS {latent_model_input.shape}")
-                    self.spy.inspect(latents, "LTX7_After_Patchify", reference_shape_5d=original_shape)
-                except Exception:
-                    pass
+
 
                 current_timestep = t
                 if not torch.is_tensor(current_timestep):
@@ -1473,11 +1501,7 @@ class LTXVideoPipeline(DiffusionPipeline):
                     stochastic_sampling=stochastic_sampling,
                 )
 
-                try:
-                    print(f"[LTX8]LATENTS {latents.shape}")
-                    self.spy.inspect(latents, "LTX8_After_Patchify", reference_shape_5d=original_shape)
-                except Exception:
-                    pass
+
 
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or (
@@ -1504,15 +1528,10 @@ class LTXVideoPipeline(DiffusionPipeline):
             torch.cuda.empty_cache()
 
         # Remove the added conditioning latents
-        latents = latents[:, num_cond_latents:]
+        #latents = latents[:, num_cond_latents:]
+
 
 
-        try:
-            print(f"[LTX10]LATENTS {latents.shape}")
-            self.spy.inspect(latents, "LTX10_After_Patchify", reference_shape_5d=original_shape)
-        except Exception:
-            pass
-
         latents = self.patchifier.unpatchify(
             latents=latents,
             output_height=latent_height,
@@ -1549,7 +1568,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
 
         try:
-            print(f"[LTX11]
+            print(f"[LTX11]LATENTSfim {latents.shape}")
         except Exception:
             pass
 
@@ -1566,6 +1585,7 @@ class LTXVideoPipeline(DiffusionPipeline):
 
         return ImagePipelineOutput(images=image)
 
+    @log_function_io
     def denoising_step(
         self,
         latents: torch.Tensor,
@@ -1601,6 +1621,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         tokens_to_denoise_mask = (t - t_eps < (1.0 - conditioning_mask)).unsqueeze(-1)
         return torch.where(tokens_to_denoise_mask, denoised_latents, latents)
 
+    @log_function_io
     def prepare_conditioning(
         self,
         conditioning_items: Optional[List[ConditioningItem]],
@@ -1808,6 +1829,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
 
     @staticmethod
+    @log_function_io
     def _resize_conditioning_item(
         conditioning_item: ConditioningItem,
         height: int,
@@ -1823,6 +1845,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
         return new_conditioning_item
 
+    @log_function_io
     def _get_latent_spatial_position(
         self,
         latents: torch.Tensor,
@@ -1871,6 +1894,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return latents, x_start // scale, y_start // scale
 
     @staticmethod
+    @log_function_io
     def _handle_non_first_conditioning_sequence(
         init_latents: torch.Tensor,
         init_conditioning_mask: torch.Tensor,
@@ -1946,6 +1970,7 @@ class LTXVideoPipeline(DiffusionPipeline):
             latents,
         )
 
+    @log_function_io
     def trim_conditioning_sequence(
         self, start_frame: int, sequence_num_frames: int, target_num_frames: int
     ):
@@ -1967,6 +1992,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return num_frames
 
     @staticmethod
+    @log_function_io
     def tone_map_latents(
         latents: torch.Tensor,
         compression: float,
@@ -2008,6 +2034,7 @@ class LTXVideoPipeline(DiffusionPipeline):
         return filtered
 
 
+@log_function_io
 def adain_filter_latent(
     latents: torch.Tensor, reference_latents: torch.Tensor, factor=1.0
 ):
@@ -2038,7 +2065,7 @@ def adain_filter_latent(
     result = torch.lerp(latents, result, factor)
     return result
 
-
+@log_function_io
 class LTXMultiScalePipeline:
     def _upsample_latents(
         self, latest_upsampler: LatentUpsampler, latents: torch.Tensor
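The central change in this diff is wrapping nearly every pipeline entry point in @log_function_io, imported from utils.debug_utils. That decorator's implementation is not part of this commit; the following is only a minimal sketch of what such an input/output-logging decorator could look like, assuming it merely records argument summaries and return types on the AducDebug logger configured at the top of the file (the repository's actual code may differ):

# Hypothetical sketch -- utils.debug_utils.log_function_io is not shown in this diff.
import functools
import logging

logger = logging.getLogger("AducDebug")


def log_function_io(obj):
    # The diff also decorates whole classes (SpyLatent, LTXVideoPipeline,
    # LTXMultiScalePipeline), so a class must be returned unchanged here;
    # otherwise the decorator would replace the class with a plain function.
    if isinstance(obj, type):
        return obj

    @functools.wraps(obj)
    def wrapper(*args, **kwargs):
        # Log a compact summary of the call and its result at DEBUG level.
        logger.debug("CALL %s: %d positional args, kwargs=%s",
                     obj.__qualname__, len(args), sorted(kwargs))
        result = obj(*args, **kwargs)
        logger.debug("RETURN %s -> %s", obj.__qualname__, type(result).__name__)
        return result

    return wrapper

Applied as in the diff (a @log_function_io line directly above each def), a decorator of this shape traces every call through the pipeline on the AducDebug logger without changing any return values.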