CalamitousFelicitousness committed on
Commit df9529d · verified · 1 Parent(s): 6943141

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
llm_adapter/config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "_class_name": "AnimaLLMAdapter",
+   "_diffusers_version": "0.37.0",
+   "source_dim": 1024,
+   "target_dim": 1024,
+   "model_dim": 1024,
+   "num_layers": 6,
+   "num_heads": 16,
+   "mlp_ratio": 4.0,
+   "vocab_size": 32128,
+   "use_self_attn": true
+ }
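
Note: for reference, a minimal sketch of how this config is consumed when loading the adapter through the diffusers ModelMixin API. The repo id "your-namespace/anima" is a placeholder (the real repository name is not shown in this commit), and the import assumes modeling_llm_adapter.py from this commit is on the Python path.

# Sketch: loading the adapter declared by llm_adapter/config.json.
import torch
from modeling_llm_adapter import AnimaLLMAdapter  # module added in this commit

adapter = AnimaLLMAdapter.from_pretrained(
    "your-namespace/anima",      # placeholder repo id
    subfolder="llm_adapter",
    torch_dtype=torch.bfloat16,
)
print(adapter.config.source_dim, adapter.config.vocab_size)  # 1024, 32128
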
llm_adapter/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:149d3c0ae9a1b76c5a02a722288a7eadeec306769e2a60f5b34513155c8a2105
+ size 269339368
llm_adapter/modeling_llm_adapter.py ADDED
@@ -0,0 +1,215 @@
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.models.modeling_utils import ModelMixin
+
+
+ def rotate_half(x):
+     x1 = x[..., : x.shape[-1] // 2]
+     x2 = x[..., x.shape[-1] // 2 :]
+     return torch.cat((-x2, x1), dim=-1)
+
+
+ def apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=1):
+     cos = cos.unsqueeze(unsqueeze_dim)
+     sin = sin.unsqueeze(unsqueeze_dim)
+     return (x * cos) + (rotate_half(x) * sin)
+
+
+ class RotaryEmbedding(nn.Module):
+     def __init__(self, head_dim):
+         super().__init__()
+         self.rope_theta = 10000
+         inv_freq = 1.0 / (
+             self.rope_theta
+             ** (torch.arange(0, head_dim, 2, dtype=torch.int64).to(dtype=torch.float) / head_dim)
+         )
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+     @torch.no_grad()
+     def forward(self, x, position_ids):
+         inv_freq_expanded = (
+             self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+         )
+         position_ids_expanded = position_ids[:, None, :].float()
+
+         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+         with torch.autocast(device_type=device_type, enabled=False):
+             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+             emb = torch.cat((freqs, freqs), dim=-1)
+             cos = emb.cos()
+             sin = emb.sin()
+         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+ class Attention(nn.Module):
+     def __init__(self, query_dim, context_dim, n_heads, head_dim):
+         super().__init__()
+         inner_dim = head_dim * n_heads
+         self.n_heads = n_heads
+         self.head_dim = head_dim
+
+         self.q_proj = nn.Linear(query_dim, inner_dim, bias=False)
+         self.q_norm = nn.RMSNorm(head_dim, eps=1e-6)
+         self.k_proj = nn.Linear(context_dim, inner_dim, bias=False)
+         self.k_norm = nn.RMSNorm(head_dim, eps=1e-6)
+         self.v_proj = nn.Linear(context_dim, inner_dim, bias=False)
+         self.o_proj = nn.Linear(inner_dim, query_dim, bias=False)
+
+     def forward(self, x, mask=None, context=None, position_embeddings=None, position_embeddings_context=None):
+         context = x if context is None else context
+         input_shape = x.shape[:-1]
+         q_shape = (*input_shape, self.n_heads, self.head_dim)
+         context_shape = context.shape[:-1]
+         kv_shape = (*context_shape, self.n_heads, self.head_dim)
+
+         query_states = self.q_norm(self.q_proj(x).view(q_shape)).transpose(1, 2)
+         key_states = self.k_norm(self.k_proj(context).view(kv_shape)).transpose(1, 2)
+         value_states = self.v_proj(context).view(kv_shape).transpose(1, 2)
+
+         if position_embeddings is not None:
+             assert position_embeddings_context is not None
+             cos, sin = position_embeddings
+             query_states = apply_rotary_pos_emb(query_states, cos, sin)
+             cos, sin = position_embeddings_context
+             key_states = apply_rotary_pos_emb(key_states, cos, sin)
+
+         attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=mask)
+         attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
+         return self.o_proj(attn_output)
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, source_dim, model_dim, num_heads=16, mlp_ratio=4.0, use_self_attn=True):
+         super().__init__()
+         self.use_self_attn = use_self_attn
+
+         if self.use_self_attn:
+             self.norm_self_attn = nn.RMSNorm(model_dim, eps=1e-6)
+             self.self_attn = Attention(
+                 query_dim=model_dim,
+                 context_dim=model_dim,
+                 n_heads=num_heads,
+                 head_dim=model_dim // num_heads,
+             )
+
+         self.norm_cross_attn = nn.RMSNorm(model_dim, eps=1e-6)
+         self.cross_attn = Attention(
+             query_dim=model_dim,
+             context_dim=source_dim,
+             n_heads=num_heads,
+             head_dim=model_dim // num_heads,
+         )
+
+         self.norm_mlp = nn.RMSNorm(model_dim, eps=1e-6)
+         self.mlp = nn.Sequential(
+             nn.Linear(model_dim, int(model_dim * mlp_ratio)),
+             nn.GELU(),
+             nn.Linear(int(model_dim * mlp_ratio), model_dim),
+         )
+
+     def forward(
+         self,
+         x,
+         context,
+         target_attention_mask=None,
+         source_attention_mask=None,
+         position_embeddings=None,
+         position_embeddings_context=None,
+     ):
+         if self.use_self_attn:
+             normed = self.norm_self_attn(x)
+             attn_out = self.self_attn(
+                 normed,
+                 mask=target_attention_mask,
+                 position_embeddings=position_embeddings,
+                 position_embeddings_context=position_embeddings,
+             )
+             x = x + attn_out
+
+         normed = self.norm_cross_attn(x)
+         attn_out = self.cross_attn(
+             normed,
+             mask=source_attention_mask,
+             context=context,
+             position_embeddings=position_embeddings,
+             position_embeddings_context=position_embeddings_context,
+         )
+         x = x + attn_out
+         x = x + self.mlp(self.norm_mlp(x))
+         return x
+
+
+ class AnimaLLMAdapter(ModelMixin, ConfigMixin):
+     @register_to_config
+     def __init__(
+         self,
+         source_dim: int = 1024,
+         target_dim: int = 1024,
+         model_dim: int = 1024,
+         num_layers: int = 6,
+         num_heads: int = 16,
+         mlp_ratio: float = 4.0,
+         vocab_size: int = 32128,
+         use_self_attn: bool = True,
+     ):
+         super().__init__()
+
+         self.embed = nn.Embedding(vocab_size, target_dim)
+         if model_dim != target_dim:
+             self.in_proj = nn.Linear(target_dim, model_dim)
+         else:
+             self.in_proj = nn.Identity()
+         self.rotary_emb = RotaryEmbedding(model_dim // num_heads)
+         self.blocks = nn.ModuleList(
+             [
+                 TransformerBlock(
+                     source_dim,
+                     model_dim,
+                     num_heads=num_heads,
+                     mlp_ratio=mlp_ratio,
+                     use_self_attn=use_self_attn,
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+         self.out_proj = nn.Linear(model_dim, target_dim)
+         self.norm = nn.RMSNorm(target_dim, eps=1e-6)
+
+     def forward(
+         self,
+         source_hidden_states: torch.Tensor,
+         target_input_ids: torch.Tensor,
+         target_attention_mask: torch.Tensor = None,
+         source_attention_mask: torch.Tensor = None,
+     ) -> torch.Tensor:
+         if target_attention_mask is not None:
+             target_attention_mask = target_attention_mask.to(torch.bool)
+             if target_attention_mask.ndim == 2:
+                 target_attention_mask = target_attention_mask.unsqueeze(1).unsqueeze(1)
+
+         if source_attention_mask is not None:
+             source_attention_mask = source_attention_mask.to(torch.bool)
+             if source_attention_mask.ndim == 2:
+                 source_attention_mask = source_attention_mask.unsqueeze(1).unsqueeze(1)
+
+         x = self.in_proj(self.embed(target_input_ids))
+         context = source_hidden_states
+
+         position_ids = torch.arange(x.shape[1], device=x.device).unsqueeze(0)
+         position_ids_context = torch.arange(context.shape[1], device=x.device).unsqueeze(0)
+         position_embeddings = self.rotary_emb(x, position_ids)
+         position_embeddings_context = self.rotary_emb(x, position_ids_context)
+
+         for block in self.blocks:
+             x = block(
+                 x,
+                 context,
+                 target_attention_mask=target_attention_mask,
+                 source_attention_mask=source_attention_mask,
+                 position_embeddings=position_embeddings,
+                 position_embeddings_context=position_embeddings_context,
+             )
+
+         return self.norm(self.out_proj(x))
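
Note: a minimal shape-level sketch of the adapter's forward pass with random inputs, using the dimensions from llm_adapter/config.json. No pretrained weights are loaded; it assumes torch >= 2.4 for nn.RMSNorm and that modeling_llm_adapter.py is importable.

# Sketch: T5 token ids (target) cross-attend to Qwen3 hidden states (source).
import torch
from modeling_llm_adapter import AnimaLLMAdapter

adapter = AnimaLLMAdapter()                       # defaults mirror the shipped config
qwen_hidden = torch.randn(1, 20, 1024)            # stands in for Qwen3 last_hidden_state
t5_ids = torch.randint(0, 32128, (1, 17))         # stands in for T5 input_ids

with torch.no_grad():
    out = adapter(source_hidden_states=qwen_hidden, target_input_ids=t5_ids)

print(out.shape)  # torch.Size([1, 17, 1024]): one 1024-d embedding per T5 token
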
model_index.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_class_name": [
+     "pipeline",
+     "AnimaTextToImagePipeline"
+   ],
+   "_diffusers_version": "0.37.0",
+   "text_encoder": [
+     "transformers",
+     "Qwen3Model"
+   ],
+   "tokenizer": [
+     "transformers",
+     "PreTrainedTokenizerFast"
+   ],
+   "t5_tokenizer": [
+     "transformers",
+     "T5TokenizerFast"
+   ],
+   "llm_adapter": [
+     "modeling_llm_adapter",
+     "AnimaLLMAdapter"
+   ],
+   "transformer": [
+     "diffusers",
+     "CosmosTransformer3DModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKLWan"
+   ],
+   "scheduler": [
+     "diffusers",
+     "FlowMatchEulerDiscreteScheduler"
+   ]
+ }
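
Note: model_index.json maps each component to a (library or local module, class) pair and a subfolder of the same name; pipeline.py and modeling_llm_adapter.py are the custom pieces. A plausible loading sketch follows; the exact flags for remote-code pipelines depend on the diffusers version, and "your-namespace/anima" is a placeholder repo id.

# Sketch: loading the full pipeline described by this index (flags may vary by version).
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "your-namespace/anima",                   # placeholder repo id
    custom_pipeline="your-namespace/anima",   # pulls pipeline.py from the repo
    trust_remote_code=True,                   # allows modeling_llm_adapter.AnimaLLMAdapter
    torch_dtype=torch.bfloat16,
)
print(type(pipe).__name__)  # AnimaTextToImagePipeline
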
pipeline.py ADDED
@@ -0,0 +1,371 @@
+ import inspect
+ from typing import Callable, Dict, List, Optional, Union
+
+ import numpy as np
+ import torch
+ from transformers import PreTrainedModel, PreTrainedTokenizerFast
+
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+ from diffusers.models import AutoencoderKLWan, CosmosTransformer3DModel
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+ from diffusers.utils import logging
+ from diffusers.utils.torch_utils import randn_tensor
+ from diffusers.video_processor import VideoProcessor
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+ from diffusers.pipelines.cosmos.pipeline_output import CosmosImagePipelineOutput
+
+ logger = logging.get_logger(__name__)
+
+
+ def retrieve_timesteps(scheduler, num_inference_steps=None, device=None, timesteps=None, sigmas=None, **kwargs):
+     if timesteps is not None and sigmas is not None:
+         raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
+     if timesteps is not None:
+         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
+         timesteps = scheduler.timesteps
+         num_inference_steps = len(timesteps)
+     elif sigmas is not None:
+         scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+         timesteps = scheduler.timesteps
+         num_inference_steps = len(timesteps)
+     else:
+         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
+         timesteps = scheduler.timesteps
+     return timesteps, num_inference_steps
+
+
+ class AnimaTextToImagePipeline(DiffusionPipeline):
+     """Pipeline for text-to-image generation using the Anima model.
+
+     Anima uses a Cosmos Predict2 backbone with a Qwen3 text encoder and an LLM adapter
+     that cross-attends T5 token embeddings to Qwen3 hidden states.
+     """
+
+     model_cpu_offload_seq = "text_encoder->llm_adapter->transformer->vae"
+     _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+     def __init__(
+         self,
+         text_encoder: PreTrainedModel,
+         tokenizer: PreTrainedTokenizerFast,
+         t5_tokenizer: PreTrainedTokenizerFast,
+         llm_adapter,
+         transformer: CosmosTransformer3DModel,
+         vae: AutoencoderKLWan,
+         scheduler: FlowMatchEulerDiscreteScheduler,
+     ):
+         super().__init__()
+
+         self.register_modules(
+             text_encoder=text_encoder,
+             tokenizer=tokenizer,
+             t5_tokenizer=t5_tokenizer,
+             llm_adapter=llm_adapter,
+             transformer=transformer,
+             vae=vae,
+             scheduler=scheduler,
+         )
+
+         self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4
+         self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
+         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+     def _encode_prompt(
+         self,
+         prompt: Union[str, List[str]],
+         device: torch.device,
+         dtype: torch.dtype,
+         max_sequence_length: int = 512,
+     ):
+         """Encode prompt through Qwen3 and run LLM adapter with T5 token IDs."""
+         prompt = [prompt] if isinstance(prompt, str) else prompt
+         batch_size = len(prompt)
+
+         # Check for empty prompts - return zero embeddings directly
+         all_empty = all(p.strip() == "" for p in prompt)
+         if all_empty:
+             return torch.zeros(batch_size, 512, self.llm_adapter.config.target_dim, device=device, dtype=dtype)
+
+         # Tokenize with Qwen3 tokenizer
+         qwen_inputs = self.tokenizer(
+             prompt,
+             padding=True,
+             truncation=True,
+             max_length=max_sequence_length,
+             return_tensors="pt",
+         )
+         qwen_input_ids = qwen_inputs.input_ids.to(device)
+         qwen_attention_mask = qwen_inputs.attention_mask.to(device)
+
+         # Get Qwen3 hidden states
+         qwen_outputs = self.text_encoder(
+             input_ids=qwen_input_ids,
+             attention_mask=qwen_attention_mask,
+         )
+         qwen_hidden_states = qwen_outputs.last_hidden_state.to(dtype=dtype)
+
+         # Tokenize with T5 tokenizer (we only need the IDs for the adapter embedding)
+         t5_inputs = self.t5_tokenizer(
+             prompt,
+             padding=True,
+             truncation=True,
+             max_length=max_sequence_length,
+             return_tensors="pt",
+         )
+         t5_input_ids = t5_inputs.input_ids.to(device)
+
+         # Run LLM adapter: T5 token embeddings attend to Qwen3 hidden states
+         adapted_embeds = self.llm_adapter(
+             source_hidden_states=qwen_hidden_states,
+             target_input_ids=t5_input_ids,
+         )
+
+         # Pad to 512 sequence length if shorter
+         if adapted_embeds.shape[1] < 512:
+             adapted_embeds = torch.nn.functional.pad(
+                 adapted_embeds, (0, 0, 0, 512 - adapted_embeds.shape[1])
+             )
+
+         return adapted_embeds
+
+     def encode_prompt(
+         self,
+         prompt: Union[str, List[str]],
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         do_classifier_free_guidance: bool = True,
+         num_images_per_prompt: int = 1,
+         prompt_embeds: Optional[torch.Tensor] = None,
+         negative_prompt_embeds: Optional[torch.Tensor] = None,
+         max_sequence_length: int = 512,
+         device: Optional[torch.device] = None,
+         dtype: Optional[torch.dtype] = None,
+     ):
+         device = device or self._execution_device
+         dtype = dtype or self.text_encoder.dtype
+         prompt = [prompt] if isinstance(prompt, str) else prompt
+
+         if prompt is not None:
+             batch_size = len(prompt)
+         else:
+             batch_size = prompt_embeds.shape[0]
+
+         if prompt_embeds is None:
+             prompt_embeds = self._encode_prompt(prompt, device, dtype, max_sequence_length)
+         _, seq_len, _ = prompt_embeds.shape
+         prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
+         prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+         if do_classifier_free_guidance and negative_prompt_embeds is None:
+             negative_prompt = negative_prompt or ""
+             negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+             negative_prompt_embeds = self._encode_prompt(negative_prompt, device, dtype, max_sequence_length)
+             _, seq_len, _ = negative_prompt_embeds.shape
+             negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
+             negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
+
+         return prompt_embeds, negative_prompt_embeds
+
+     def prepare_latents(
+         self,
+         batch_size: int,
+         num_channels_latents: int,
+         height: int,
+         width: int,
+         num_frames: int = 1,
+         dtype: torch.dtype = None,
+         device: torch.device = None,
+         generator=None,
+         latents: torch.Tensor = None,
+     ):
+         num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+         latent_height = height // self.vae_scale_factor_spatial
+         latent_width = width // self.vae_scale_factor_spatial
+
+         if latents is not None:
+             return latents.to(device=device, dtype=dtype)
+
+         shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
+         latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+         return latents
+
+     def check_inputs(self, prompt, height, width, prompt_embeds=None):
+         if height % 16 != 0 or width % 16 != 0:
+             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+         if prompt is not None and prompt_embeds is not None:
+             raise ValueError("Cannot forward both `prompt` and `prompt_embeds`.")
+         elif prompt is None and prompt_embeds is None:
+             raise ValueError("Provide either `prompt` or `prompt_embeds`.")
+
+     @property
+     def guidance_scale(self):
+         return self._guidance_scale
+
+     @property
+     def do_classifier_free_guidance(self):
+         return self._guidance_scale > 1.0
+
+     @property
+     def num_timesteps(self):
+         return self._num_timesteps
+
+     @property
+     def interrupt(self):
+         return self._interrupt
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         prompt: Union[str, List[str]] = None,
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         height: int = 768,
+         width: int = 1360,
+         num_inference_steps: int = 35,
+         guidance_scale: float = 7.0,
+         num_images_per_prompt: Optional[int] = 1,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         latents: Optional[torch.Tensor] = None,
+         prompt_embeds: Optional[torch.Tensor] = None,
+         negative_prompt_embeds: Optional[torch.Tensor] = None,
+         output_type: Optional[str] = "pil",
+         return_dict: bool = True,
+         callback_on_step_end: Optional[
+             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+         ] = None,
+         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+         max_sequence_length: int = 512,
+     ):
+         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+         num_frames = 1
+
+         self.check_inputs(prompt, height, width, prompt_embeds)
+         self._guidance_scale = guidance_scale
+         self._current_timestep = None
+         self._interrupt = False
+
+         device = self._execution_device
+
+         if prompt is not None and isinstance(prompt, str):
+             batch_size = 1
+         elif prompt is not None and isinstance(prompt, list):
+             batch_size = len(prompt)
+         else:
+             batch_size = prompt_embeds.shape[0]
+
+         # Encode prompt
+         prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+             prompt=prompt,
+             negative_prompt=negative_prompt,
+             do_classifier_free_guidance=self.do_classifier_free_guidance,
+             num_images_per_prompt=num_images_per_prompt,
+             prompt_embeds=prompt_embeds,
+             negative_prompt_embeds=negative_prompt_embeds,
+             device=device,
+             max_sequence_length=max_sequence_length,
+         )
+
+         # Prepare timesteps - use default descending schedule (1→0)
+         timesteps, num_inference_steps = retrieve_timesteps(
+             self.scheduler, num_inference_steps=num_inference_steps, device=device
+         )
+
+         # Prepare latents
+         transformer_dtype = self.transformer.dtype
+         num_channels_latents = self.transformer.config.in_channels
+         latents = self.prepare_latents(
+             batch_size * num_images_per_prompt,
+             num_channels_latents,
+             height,
+             width,
+             num_frames,
+             torch.float32,
+             device,
+             generator,
+             latents,
+         )
+
+         padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
+
+         # Denoising loop using CONST preconditioning (flow matching velocity model):
+         # - c_in = 1.0 (no input scaling)
+         # - timestep = sigma (passed directly)
+         # - model output is the velocity: denoised = x - velocity * sigma
+         # - CFG applied to velocity (equivalent to applying to denoised for linear preconditioning)
+         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+         self._num_timesteps = len(timesteps)
+
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 if self.interrupt:
+                     continue
+
+                 self._current_timestep = t
+                 sigma = self.scheduler.sigmas[i]
+
+                 # Pass sigma directly as timestep (CONST preconditioning)
+                 timestep = sigma.expand(latents.shape[0]).to(transformer_dtype)
+                 latent_model_input = latents.to(transformer_dtype)
+
+                 # Model predicts velocity (raw output IS the velocity for CONST)
+                 velocity = self.transformer(
+                     hidden_states=latent_model_input,
+                     timestep=timestep,
+                     encoder_hidden_states=prompt_embeds,
+                     padding_mask=padding_mask,
+                     return_dict=False,
+                 )[0].float()
+
+                 if self.do_classifier_free_guidance:
+                     velocity_uncond = self.transformer(
+                         hidden_states=latent_model_input,
+                         timestep=timestep,
+                         encoder_hidden_states=negative_prompt_embeds,
+                         padding_mask=padding_mask,
+                         return_dict=False,
+                     )[0].float()
+                     velocity = velocity_uncond + self.guidance_scale * (velocity - velocity_uncond)
+
+                 # Euler step: scheduler computes x_next = x + (sigma_next - sigma) * velocity
+                 latents = self.scheduler.step(velocity, t, latents, return_dict=False)[0]
+
+                 if callback_on_step_end is not None:
+                     callback_kwargs = {}
+                     for k in callback_on_step_end_tensor_inputs:
+                         callback_kwargs[k] = locals()[k]
+                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                     latents = callback_outputs.pop("latents", latents)
+                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+
+         self._current_timestep = None
+
+         if not output_type == "latent":
+             latents_mean = (
+                 torch.tensor(self.vae.config.latents_mean)
+                 .view(1, self.vae.config.z_dim, 1, 1, 1)
+                 .to(latents.device, latents.dtype)
+             )
+             latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                 latents.device, latents.dtype
+             )
+             latents = latents / latents_std + latents_mean
+             video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
+             video = self.video_processor.postprocess_video(video, output_type=output_type)
+             image = [batch[0] for batch in video]
+             if isinstance(video, torch.Tensor):
+                 image = torch.stack(image)
+             elif isinstance(video, np.ndarray):
+                 image = np.stack(image)
+         else:
+             image = latents[:, :, 0]
+
+         self.maybe_free_model_hooks()
+
+         if not return_dict:
+             return (image,)
+
+         return CosmosImagePipelineOutput(images=image)
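
Note: a short usage sketch of the pipeline call, with arguments taken straight from the __call__ signature above. It assumes the `pipe` object from the loading sketch after model_index.json and a CUDA device; the prompt text is only an example.

# Sketch: generating one 768x1360 image with the defaults used by __call__.
import torch

pipe = pipe.to("cuda")  # `pipe` comes from the earlier loading sketch
image = pipe(
    prompt="a watercolor fox in a snowy forest",
    negative_prompt="blurry, low quality",
    height=768,
    width=1360,
    num_inference_steps=35,
    guidance_scale=7.0,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("anima_fox.png")
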
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_class_name": "FlowMatchEulerDiscreteScheduler",
+   "_diffusers_version": "0.37.0",
+   "num_train_timesteps": 1000,
+   "shift": 3.0
+ }
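
Note: a toy illustration of what shift=3.0 does to the sigma schedule and of the Euler update the pipeline's denoising loop relies on. The shift formula shown is the static time shift commonly used by flow-match schedulers; treat it as an assumption about the library internals, not a quote of them.

# Sketch: shifted sigmas concentrate more steps at high noise; one Euler step on a scalar.
import numpy as np

num_inference_steps, shift = 35, 3.0
sigmas = np.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps)
shifted = shift * sigmas / (1 + (shift - 1) * sigmas)   # assumed static time-shift

x, velocity = 1.0, -0.8
sigma, sigma_next = shifted[0], shifted[1]
x_next = x + (sigma_next - sigma) * velocity            # x_next = x + (sigma_next - sigma) * v
print(round(float(shifted[0]), 3), round(float(x_next), 3))
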
t5_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
t5_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,113 @@
+ {
+   "backend": "tokenizers",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "extra_ids": 100,
+   "extra_special_tokens": [
+     "<extra_id_0>",
+     "<extra_id_1>",
+     "<extra_id_2>",
+     "<extra_id_3>",
+     "<extra_id_4>",
+     "<extra_id_5>",
+     "<extra_id_6>",
+     "<extra_id_7>",
+     "<extra_id_8>",
+     "<extra_id_9>",
+     "<extra_id_10>",
+     "<extra_id_11>",
+     "<extra_id_12>",
+     "<extra_id_13>",
+     "<extra_id_14>",
+     "<extra_id_15>",
+     "<extra_id_16>",
+     "<extra_id_17>",
+     "<extra_id_18>",
+     "<extra_id_19>",
+     "<extra_id_20>",
+     "<extra_id_21>",
+     "<extra_id_22>",
+     "<extra_id_23>",
+     "<extra_id_24>",
+     "<extra_id_25>",
+     "<extra_id_26>",
+     "<extra_id_27>",
+     "<extra_id_28>",
+     "<extra_id_29>",
+     "<extra_id_30>",
+     "<extra_id_31>",
+     "<extra_id_32>",
+     "<extra_id_33>",
+     "<extra_id_34>",
+     "<extra_id_35>",
+     "<extra_id_36>",
+     "<extra_id_37>",
+     "<extra_id_38>",
+     "<extra_id_39>",
+     "<extra_id_40>",
+     "<extra_id_41>",
+     "<extra_id_42>",
+     "<extra_id_43>",
+     "<extra_id_44>",
+     "<extra_id_45>",
+     "<extra_id_46>",
+     "<extra_id_47>",
+     "<extra_id_48>",
+     "<extra_id_49>",
+     "<extra_id_50>",
+     "<extra_id_51>",
+     "<extra_id_52>",
+     "<extra_id_53>",
+     "<extra_id_54>",
+     "<extra_id_55>",
+     "<extra_id_56>",
+     "<extra_id_57>",
+     "<extra_id_58>",
+     "<extra_id_59>",
+     "<extra_id_60>",
+     "<extra_id_61>",
+     "<extra_id_62>",
+     "<extra_id_63>",
+     "<extra_id_64>",
+     "<extra_id_65>",
+     "<extra_id_66>",
+     "<extra_id_67>",
+     "<extra_id_68>",
+     "<extra_id_69>",
+     "<extra_id_70>",
+     "<extra_id_71>",
+     "<extra_id_72>",
+     "<extra_id_73>",
+     "<extra_id_74>",
+     "<extra_id_75>",
+     "<extra_id_76>",
+     "<extra_id_77>",
+     "<extra_id_78>",
+     "<extra_id_79>",
+     "<extra_id_80>",
+     "<extra_id_81>",
+     "<extra_id_82>",
+     "<extra_id_83>",
+     "<extra_id_84>",
+     "<extra_id_85>",
+     "<extra_id_86>",
+     "<extra_id_87>",
+     "<extra_id_88>",
+     "<extra_id_89>",
+     "<extra_id_90>",
+     "<extra_id_91>",
+     "<extra_id_92>",
+     "<extra_id_93>",
+     "<extra_id_94>",
+     "<extra_id_95>",
+     "<extra_id_96>",
+     "<extra_id_97>",
+     "<extra_id_98>",
+     "<extra_id_99>"
+   ],
+   "is_local": false,
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "tokenizer_class": "T5Tokenizer",
+   "unk_token": "<unk>"
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "architectures": [
+     "Qwen3Model"
+   ],
+   "model_type": "qwen3",
+   "vocab_size": 151936,
+   "hidden_size": 1024,
+   "intermediate_size": 3072,
+   "num_hidden_layers": 28,
+   "num_attention_heads": 16,
+   "num_key_value_heads": 8,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "max_position_embeddings": 32768,
+   "rms_norm_eps": 1e-06,
+   "rope_theta": 1000000.0,
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "use_cache": false,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16"
+ }
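
Note: the text encoder is a 28-layer, 1024-wide Qwen3 base model used only for its last hidden state. A sketch of the raw encoding step that pipeline._encode_prompt performs, assuming a transformers version that ships Qwen3 (roughly >= 4.51) and a placeholder repo id.

# Sketch: producing the (batch, seq_len, 1024) hidden states fed to the LLM adapter.
import torch
from transformers import AutoTokenizer, Qwen3Model

repo = "your-namespace/anima"  # placeholder repo id
tokenizer = AutoTokenizer.from_pretrained(repo, subfolder="tokenizer")
text_encoder = Qwen3Model.from_pretrained(repo, subfolder="text_encoder", torch_dtype=torch.bfloat16)

inputs = tokenizer(["a watercolor fox"], padding=True, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    hidden = text_encoder(**inputs).last_hidden_state
print(hidden.shape)  # (1, seq_len, 1024)
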
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d10aa56a4da8a95d954d99228d9e20e27f96ac5fc8aa41b89a41532b16bb4817
+ size 1192135064
tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,89 @@
+ {%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+ {%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+ {%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+ {%- endfor %}
+ {%- for message in messages %}
+ {%- if message.content is string %}
+ {%- set content = message.content %}
+ {%- else %}
+ {%- set content = '' %}
+ {%- endif %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is string %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '<tool_call>\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+ {%- endif %}
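
Note: this is the stock Qwen-style chat template (tool calls plus <think> handling). The pipeline itself calls the tokenizer directly on the prompt and never applies this template, but for completeness a sketch of what applying it would produce follows; the repo id is a placeholder.

# Sketch: rendering the chat template via transformers (not used by the pipeline).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-namespace/anima", subfolder="tokenizer")
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Describe a watercolor fox."}],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,   # emits the empty <think>...</think> block per the template
)
print(text)
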
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+ size 11422650
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "add_prefix_space": false,
+   "backend": "tokenizers",
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "is_local": false,
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
transformer/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_class_name": "CosmosTransformer3DModel",
+   "_diffusers_version": "0.37.0",
+   "in_channels": 16,
+   "out_channels": 16,
+   "num_attention_heads": 16,
+   "attention_head_dim": 128,
+   "num_layers": 28,
+   "mlp_ratio": 4.0,
+   "text_embed_dim": 1024,
+   "adaln_lora_dim": 256,
+   "max_size": [
+     128,
+     240,
+     240
+   ],
+   "patch_size": [
+     1,
+     2,
+     2
+   ],
+   "rope_scale": [
+     1.0,
+     4.0,
+     4.0
+   ],
+   "concat_padding_mask": true,
+   "extra_pos_embed_type": null
+ }
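
Note: a worked example of the latent/patch geometry implied by this config together with the VAE's spatial stride of 8, for the pipeline defaults height=768, width=1360 and a single frame. Plain arithmetic, not library output.

# Sketch: latent grid and token count seen by CosmosTransformer3DModel.
height, width, frames = 768, 1360, 1
vae_stride = 8            # spatial downsampling of the Wan VAE
pt, ph, pw = 1, 2, 2      # "patch_size" above

lat_f, lat_h, lat_w = frames, height // vae_stride, width // vae_stride   # 1, 96, 170
tokens = (lat_f // pt) * (lat_h // ph) * (lat_w // pw)                    # 1 * 48 * 85
print(lat_h, lat_w, tokens)  # 96 170 4080
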
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e9c0b348c119e44dcc26589102ad5ca64d26ac84d5db3b743d29f0fa2fc2f8b2
+ size 3912877072
vae/config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "_class_name": "AutoencoderKLWan",
+   "_diffusers_version": "0.33.0.dev0",
+   "attn_scales": [],
+   "base_dim": 96,
+   "dim_mult": [
+     1,
+     2,
+     4,
+     4
+   ],
+   "dropout": 0.0,
+   "latents_mean": [
+     -0.7571,
+     -0.7089,
+     -0.9113,
+     0.1075,
+     -0.1745,
+     0.9653,
+     -0.1517,
+     1.5508,
+     0.4134,
+     -0.0715,
+     0.5517,
+     -0.3632,
+     -0.1922,
+     -0.9497,
+     0.2503,
+     -0.2921
+   ],
+   "latents_std": [
+     2.8184,
+     1.4541,
+     2.3275,
+     2.6558,
+     1.2196,
+     1.7708,
+     2.6052,
+     2.0743,
+     3.2687,
+     2.1526,
+     2.8652,
+     1.5579,
+     1.6382,
+     1.1253,
+     2.8251,
+     1.916
+   ],
+   "num_res_blocks": 2,
+   "temperal_downsample": [
+     false,
+     true,
+     true
+   ],
+   "z_dim": 16
+ }
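
Note: a small sketch of how the pipeline's __init__ derives its scale factors from "temperal_downsample", and of the latent un-scaling it applies before vae.decode. Pure arithmetic mirroring the pipeline code; the mean/std tensors here are stand-ins for the config lists above, and no model is loaded.

# Sketch: scale factors and latent de-normalization for AutoencoderKLWan.
import torch

temperal_downsample = [False, True, True]
vae_scale_factor_temporal = 2 ** sum(temperal_downsample)   # 4
vae_scale_factor_spatial = 2 ** len(temperal_downsample)    # 8

latents = torch.randn(1, 16, 1, 96, 170)                    # z_dim=16, one frame, 768x1360 image
latents_mean = torch.zeros(1, 16, 1, 1, 1)                  # stand-in for "latents_mean"
latents_std_inv = 1.0 / torch.ones(1, 16, 1, 1, 1)          # stand-in for 1 / "latents_std"
decode_input = latents / latents_std_inv + latents_mean     # i.e. latents * std + mean, as in the pipeline
print(vae_scale_factor_temporal, vae_scale_factor_spatial, decode_input.shape)
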
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b5bf326a6c4f66fb2b2250687fdccd1f126ee7c977d2f0170cb56fdacc70a9a
+ size 253806934