Spaces:
Sleeping
Sleeping
| # Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py | |
| # (with our own modification, for science!) | |
| import copy | |
| import inspect | |
| import math | |
| import re | |
| from contextlib import nullcontext | |
| from dataclasses import dataclass | |
| from typing import Any, Callable, Dict, List, Optional, Tuple, Union | |
| import torch | |
| import torch.nn.functional as F | |
| from diffusers.image_processor import VaeImageProcessor | |
| from diffusers.models import AutoencoderKL | |
| from diffusers.pipelines.pipeline_utils import DiffusionPipeline, ImagePipelineOutput | |
| from diffusers.schedulers import DPMSolverMultistepScheduler | |
| from diffusers.utils import deprecate, logging | |
| from diffusers.utils.torch_utils import randn_tensor | |
| from einops import rearrange | |
| from transformers import ( | |
| T5EncoderModel, | |
| T5Tokenizer, | |
| AutoModelForCausalLM, | |
| AutoProcessor, | |
| AutoTokenizer, | |
| ) | |
| from ltx_video.models.autoencoders.causal_video_autoencoder import ( | |
| CausalVideoAutoencoder, | |
| ) | |
| from ltx_video.models.autoencoders.vae_encode import ( | |
| get_vae_size_scale_factor, | |
| latent_to_pixel_coords, | |
| vae_decode, | |
| vae_encode, | |
| ) | |
| from ltx_video.models.transformers.symmetric_patchifier import Patchifier | |
| from ltx_video.models.transformers.transformer3d import Transformer3DModel | |
| from ltx_video.schedulers.rf import TimestepShifter | |
| from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy | |
| from ltx_video.utils.prompt_enhance_utils import generate_cinematic_prompt | |
| from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler | |
| from ltx_video.models.autoencoders.vae_encode import ( | |
| un_normalize_latents, | |
| normalize_latents, | |
| ) | |
| # ... (All of the file's initial code is unchanged and omitted here, including ASPECT_RATIO_BINS, retrieve_timesteps, ConditioningItem, etc.) | |
| # ... (Skipping ahead to the LTXVideoPipeline class to keep this excerpt focused) | |
| class LTXVideoPipeline(DiffusionPipeline): | |
| # Excerpt of the pipeline class: only the modified part of __call__ is shown. | |
| # Everything marked "# ..." is code that exists in the full file but is omitted here. | |
| # ... (__init__ and the other methods such as encode_prompt, check_inputs, etc. are unchanged and omitted) | |
| # ... (Skipping ahead to __call__, where our modification is made) | |
| # | |
| # __call__ -- visible behavior in this excerpt: | |
| #   1. Remembers the caller's prompt so it can be logged later. | |
| #   2. If enhance_prompt is true, moves the two prompt-enhancer models to the | |
| #      execution device, calls generate_cinematic_prompt(...), prints the original | |
| #      and the enhanced prompt to stdout, and replaces `prompt` with the result. | |
| #   3. Moves the text encoder (when present) to the execution device and calls | |
| #      self.encode_prompt(...) to obtain prompt / negative-prompt embeddings and masks. | |
| # Most parameters below are consumed only by omitted code; the ones read in the | |
| # visible part are prompt, negative_prompt, conditioning_items, enhance_prompt | |
| # and text_encoder_max_tokens. | |
| # The annotated return type is Union[ImagePipelineOutput, Tuple]; the return | |
| # statement itself is in the omitted part of the method. | |
| def __call__( | |
| self, | |
| height: int, | |
| width: int, | |
| num_frames: int, | |
| frame_rate: float, | |
| prompt: Union[str, List[str]] = None, | |
| negative_prompt: str = "", | |
| num_inference_steps: int = 20, | |
| skip_initial_inference_steps: int = 0, | |
| skip_final_inference_steps: int = 0, | |
| timesteps: List[int] = None, | |
| guidance_scale: Union[float, List[float]] = 4.5, | |
| cfg_star_rescale: bool = False, | |
| skip_layer_strategy: Optional[SkipLayerStrategy] = None, | |
| skip_block_list: Optional[Union[List[List[int]], List[int]]] = None, | |
| stg_scale: Union[float, List[float]] = 1.0, | |
| rescaling_scale: Union[float, List[float]] = 0.7, | |
| guidance_timesteps: Optional[List[int]] = None, | |
| num_images_per_prompt: Optional[int] = 1, | |
| eta: float = 0.0, | |
| generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, | |
| latents: Optional[torch.FloatTensor] = None, | |
| prompt_embeds: Optional[torch.FloatTensor] = None, | |
| prompt_attention_mask: Optional[torch.FloatTensor] = None, | |
| negative_prompt_embeds: Optional[torch.FloatTensor] = None, | |
| negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, | |
| output_type: Optional[str] = "pil", | |
| return_dict: bool = True, | |
| callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, | |
| conditioning_items: Optional[List[ConditioningItem]] = None, | |
| decode_timestep: Union[List[float], float] = 0.0, | |
| decode_noise_scale: Optional[List[float]] = None, | |
| mixed_precision: bool = False, | |
| offload_to_cpu: bool = False, | |
| enhance_prompt: bool = False, | |
| text_encoder_max_tokens: int = 256, | |
| stochastic_sampling: bool = False, | |
| media_items: Optional[torch.Tensor] = None, | |
| tone_map_compression_ratio: float = 0.0, | |
| **kwargs, | |
| ) -> Union[ImagePipelineOutput, Tuple]: | |
| # --- [OUR MODIFICATION] Keep the original prompt for logging, because `prompt` is reassigned below when enhance_prompt is set --- | |
| original_prompt_for_logging = prompt | |
| # ... (The rest of the initial code of __call__ is unchanged and omitted) ... | |
| # ... (check_inputs, default height/width, etc.) | |
| # Prompt enhancement is opt-in. Both enhancer models are moved to the | |
| # pipeline's execution device before they are used. | |
| # NOTE(review): nothing in this excerpt moves them back afterwards -- confirm | |
| # whether the omitted code offloads them. | |
| if enhance_prompt: | |
| self.prompt_enhancer_image_caption_model = ( | |
| self.prompt_enhancer_image_caption_model.to(self._execution_device) | |
| ) | |
| self.prompt_enhancer_llm_model = self.prompt_enhancer_llm_model.to( | |
| self._execution_device | |
| ) | |
| # Call the "assistant director" (the prompt enhancer) | |
| enhanced_prompt = generate_cinematic_prompt( | |
| self.prompt_enhancer_image_caption_model, | |
| self.prompt_enhancer_image_caption_processor, | |
| self.prompt_enhancer_llm_model, | |
| self.prompt_enhancer_llm_tokenizer, | |
| prompt, | |
| conditioning_items, | |
| # The enhancer's generation budget is taken from text_encoder_max_tokens. | |
| max_new_tokens=text_encoder_max_tokens, | |
| ) | |
| # --- [OUR SECRET WIRETAP, FOR SCIENCE!] --- | |
| # Prints the original and the enhanced prompt to stdout on every enhanced call. | |
| # NOTE(review): this is unconditional print() output containing user prompts; | |
| # consider routing it through a logger if this leaves the experiment stage. | |
| print("\n" + "="*50) | |
| print("--- [LOG DO DIRETOR ASSISTENTE (PROMPT ENHANCER)] ---") | |
| print(f"Prompt Original do Maestro: {original_prompt_for_logging}") | |
| print(f"PROMPT FINAL APERFEIÇOADO (enviado para o LTX): {enhanced_prompt}") | |
| print("--- [FIM DO LOG DO DIRETOR ASSISTENTE] ---") | |
| print("="*50 + "\n") | |
| # --- [END OF WIRETAP] --- | |
| # Replace the prompt used by the rest of the function with the enhanced one | |
| prompt = enhanced_prompt | |
| # ... (The rest of __call__ continues from here, using either the enhanced or the original `prompt`) | |
| # ... (encode_prompt, prepare_latents, denoising loop, etc.) | |
| # 3. Encode input prompt | |
| # The text encoder is moved to the execution device only when one is attached. | |
| # NOTE(review): indentation is lost in this excerpt, so it cannot be told from | |
| # here whether the encode_prompt call below is inside this `if` -- confirm | |
| # against the full file. | |
| if self.text_encoder is not None: | |
| self.text_encoder = self.text_encoder.to(self._execution_device) | |
| ( | |
| prompt_embeds, | |
| prompt_attention_mask, | |
| negative_prompt_embeds, | |
| negative_prompt_attention_mask, | |
| ) = self.encode_prompt( | |
| prompt, | |
| # NOTE(review): second positional argument; presumably the classifier-free | |
| # guidance switch -- verify against encode_prompt's signature. | |
| True, | |
| negative_prompt=negative_prompt, | |
| # ... (remaining arguments omitted) | |
| ) | |
| # ... (the whole rest of the file, with no further modifications) ... | |
| # ... (denoising_step, prepare_conditioning, etc.) |