sippycoder committed
Commit 785ed35 (verified)
1 Parent(s): 303a617

Upload folder using huggingface_hub

Files changed (40)
  1. model_index.json +20 -0
  2. modeling_nucleusmoe.py +859 -0
  3. pipeline_nucleusmoe.py +717 -0
  4. pipeline_output.py +20 -0
  5. scheduler/scheduler_config.json +18 -0
  6. text_encoder/README.md +192 -0
  7. text_encoder/chat_template.json +3 -0
  8. text_encoder/config.json +62 -0
  9. text_encoder/generation_config.json +14 -0
  10. text_encoder/merges.txt +0 -0
  11. text_encoder/model-00001-of-00004.safetensors +3 -0
  12. text_encoder/model-00002-of-00004.safetensors +3 -0
  13. text_encoder/model-00003-of-00004.safetensors +3 -0
  14. text_encoder/model-00004-of-00004.safetensors +3 -0
  15. text_encoder/model.safetensors.index.json +757 -0
  16. text_encoder/preprocessor_config.json +21 -0
  17. text_encoder/tokenizer.json +0 -0
  18. text_encoder/tokenizer_config.json +239 -0
  19. text_encoder/video_preprocessor_config.json +21 -0
  20. text_encoder/vocab.json +0 -0
  21. transformer/config.json +61 -0
  22. transformer/diffusion_pytorch_model-00001-of-00007.safetensors +3 -0
  23. transformer/diffusion_pytorch_model-00002-of-00007.safetensors +3 -0
  24. transformer/diffusion_pytorch_model-00003-of-00007.safetensors +3 -0
  25. transformer/diffusion_pytorch_model-00004-of-00007.safetensors +3 -0
  26. transformer/diffusion_pytorch_model-00005-of-00007.safetensors +3 -0
  27. transformer/diffusion_pytorch_model-00006-of-00007.safetensors +3 -0
  28. transformer/diffusion_pytorch_model-00007-of-00007.safetensors +3 -0
  29. transformer/diffusion_pytorch_model.safetensors.index.json +0 -0
  30. transformer/model-00001-of-00007.safetensors +3 -0
  31. transformer/model-00002-of-00007.safetensors +3 -0
  32. transformer/model-00003-of-00007.safetensors +3 -0
  33. transformer/model-00004-of-00007.safetensors +3 -0
  34. transformer/model-00005-of-00007.safetensors +3 -0
  35. transformer/model-00006-of-00007.safetensors +3 -0
  36. transformer/model-00007-of-00007.safetensors +3 -0
  37. transformer/model.safetensors.index.json +0 -0
  38. transformer/modeling_nucleusmoe.py +859 -0
  39. vae/config.json +56 -0
  40. vae/diffusion_pytorch_model.safetensors +3 -0
model_index.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "_class_name": ["pipeline_nucleusmoe", "NucleusMoEImagePipeline"],
3
+ "_diffusers_version": "0.36.0",
4
+ "scheduler": [
5
+ "diffusers",
6
+ "FlowMatchEulerDiscreteScheduler"
7
+ ],
8
+ "text_encoder": [
9
+ "transformers",
10
+ "Qwen3VLForConditionalGeneration"
11
+ ],
12
+ "transformer": [
13
+ "modeling_nucleusmoe",
14
+ "NucleusMoEImageTransformer2DModel"
15
+ ],
16
+ "vae": [
17
+ "diffusers",
18
+ "AutoencoderKLQwenImage"
19
+ ]
20
+ }
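
A minimal loading sketch for the component layout above, assuming the custom `pipeline_nucleusmoe`/`modeling_nucleusmoe` classes are resolved from this repository (e.g. via `trust_remote_code=True`); the repo id is taken from the example docstring in pipeline_nucleusmoe.py:

```py
import torch
from diffusers import DiffusionPipeline

# model_index.json maps each subfolder to a (library/module, class) pair:
#   scheduler    -> diffusers.FlowMatchEulerDiscreteScheduler
#   text_encoder -> transformers.Qwen3VLForConditionalGeneration
#   transformer  -> modeling_nucleusmoe.NucleusMoEImageTransformer2DModel (custom code in this repo)
#   vae          -> diffusers.AutoencoderKLQwenImage
pipe = DiffusionPipeline.from_pretrained(
    "NucleusAI/Nucleus-MoE-Image",   # repo id taken from the pipeline's example docstring
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,          # assumption: needed so the custom pipeline/transformer classes load
)
pipe.to("cuda")
```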
modeling_nucleusmoe.py ADDED
@@ -0,0 +1,859 @@
1
+ # Copyright 2025 Nucleus-Image Team, The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import functools
16
+ import math
17
+ from typing import Any, List
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
26
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
27
+ from diffusers.models.attention import AttentionMixin, FeedForward
28
+ from diffusers.models.attention_dispatch import dispatch_attention_fn
29
+ from diffusers.models.attention_processor import Attention
30
+ from diffusers.models.cache_utils import CacheMixin
31
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
32
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
33
+ from diffusers.models.modeling_utils import ModelMixin
34
+ from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
35
+
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+
40
+ def get_timestep_embedding(
41
+ timesteps: torch.Tensor,
42
+ embedding_dim: int,
43
+ flip_sin_to_cos: bool = False,
44
+ downscale_freq_shift: float = 1,
45
+ scale: float = 1,
46
+ max_period: int = 10000,
47
+ ) -> torch.Tensor:
48
+ """
49
+ Create sinusoidal timestep embeddings, matching the implementation in Denoising Diffusion Probabilistic Models.
50
+
51
+ Args:
52
+ timesteps (torch.Tensor):
53
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
54
+ embedding_dim (int):
55
+ the dimension of the output.
56
+ flip_sin_to_cos (bool):
57
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
58
+ downscale_freq_shift (float):
59
+ Controls the delta between frequencies between dimensions
60
+ scale (float):
61
+ Scaling factor applied to the embeddings.
62
+ max_period (int):
63
+ Controls the maximum frequency of the embeddings
64
+ Returns:
65
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
66
+ """
67
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
68
+
69
+ half_dim = embedding_dim // 2
70
+ exponent = -math.log(max_period) * torch.arange(
71
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
72
+ )
73
+ exponent = exponent / (half_dim - downscale_freq_shift)
74
+
75
+ emb = torch.exp(exponent).to(timesteps.dtype)
76
+ emb = timesteps[:, None].float() * emb[None, :]
77
+
78
+ # scale embeddings
79
+ emb = scale * emb
80
+
81
+ # concat sine and cosine embeddings
82
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
83
+
84
+ # flip sine and cosine embeddings
85
+ if flip_sin_to_cos:
86
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
87
+
88
+ # zero pad
89
+ if embedding_dim % 2 == 1:
90
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
91
+ return emb
92
+
93
+
94
+ def apply_rotary_emb_nucleus(
95
+ x: torch.Tensor,
96
+ freqs_cis: torch.Tensor | tuple[torch.Tensor],
97
+ use_real: bool = True,
98
+ use_real_unbind_dim: int = -1,
99
+ ) -> tuple[torch.Tensor, torch.Tensor]:
100
+ """
101
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
102
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
103
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
104
+ tensors contain rotary embeddings and are returned as real tensors.
105
+
106
+ Args:
107
+ x (`torch.Tensor`):
108
+ Query or key tensor to apply rotary embeddings to, of shape [B, S, H, D].
109
+ freqs_cis (`tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
110
+
111
+ Returns:
112
+ tuple[torch.Tensor, torch.Tensor]: tuple of modified query tensor and key tensor with rotary embeddings.
113
+ """
114
+ if use_real:
115
+ cos, sin = freqs_cis # [S, D]
116
+ cos = cos[None, None]
117
+ sin = sin[None, None]
118
+ cos, sin = cos.to(x.device), sin.to(x.device)
119
+
120
+ if use_real_unbind_dim == -1:
121
+ # Used for flux, cogvideox, hunyuan-dit
122
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
123
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
124
+ elif use_real_unbind_dim == -2:
125
+ # Used for Stable Audio, OmniGen, CogView4 and Cosmos
126
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
127
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
128
+ else:
129
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
130
+
131
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
132
+
133
+ return out
134
+ else:
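+ # Complex path (use_real=False): channel pairs are viewed as complex numbers and rotated by freqs_cis.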
135
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
136
+ freqs_cis = freqs_cis.unsqueeze(1)
137
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
138
+
139
+ return x_out.type_as(x)
140
+
141
+
142
+ def compute_text_seq_len_from_mask(
143
+ encoder_hidden_states: torch.Tensor, encoder_hidden_states_mask: torch.Tensor | None
144
+ ) -> tuple[int, torch.Tensor | None, torch.Tensor | None]:
145
+ """
146
+ Compute text sequence length without assuming contiguous masks. Returns length for RoPE and a normalized bool mask.
147
+ """
148
+ batch_size, text_seq_len = encoder_hidden_states.shape[:2]
149
+ if encoder_hidden_states_mask is None:
150
+ return text_seq_len, None, None
151
+
152
+ if encoder_hidden_states_mask.shape[:2] != (batch_size, text_seq_len):
153
+ raise ValueError(
154
+ f"`encoder_hidden_states_mask` shape {encoder_hidden_states_mask.shape} must match "
155
+ f"(batch_size, text_seq_len)=({batch_size}, {text_seq_len})."
156
+ )
157
+
158
+ if encoder_hidden_states_mask.dtype != torch.bool:
159
+ encoder_hidden_states_mask = encoder_hidden_states_mask.to(torch.bool)
160
+
161
+ position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
162
+ active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
163
+ has_active = encoder_hidden_states_mask.any(dim=1)
164
+ per_sample_len = torch.where(
165
+ has_active,
166
+ active_positions.max(dim=1).values + 1,
167
+ torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
168
+ )
169
+ return text_seq_len, per_sample_len, encoder_hidden_states_mask
170
+
171
+
172
+ class NucleusTimestepProjEmbeddings(nn.Module):
173
+ def __init__(self, embedding_dim, use_additional_t_cond=False):
174
+ super().__init__()
175
+
176
+ self.time_proj = Timesteps(num_channels=embedding_dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
177
+ self.timestep_embedder = TimestepEmbedding(
178
+ in_channels=embedding_dim, time_embed_dim=4 * embedding_dim, out_dim=embedding_dim
179
+ )
180
+ self.norm = RMSNorm(embedding_dim, eps=1e-6)
181
+ self.use_additional_t_cond = use_additional_t_cond
182
+ if use_additional_t_cond:
183
+ self.addition_t_embedding = nn.Embedding(2, embedding_dim)
184
+
185
+ def forward(self, timestep, hidden_states, addition_t_cond=None):
186
+ timesteps_proj = self.time_proj(timestep)
187
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype)) # (N, D)
188
+
189
+ conditioning = timesteps_emb
190
+ if self.use_additional_t_cond:
191
+ if addition_t_cond is None:
192
+ raise ValueError("When additional_t_cond is True, addition_t_cond must be provided.")
193
+ addition_t_emb = self.addition_t_embedding(addition_t_cond)
194
+ addition_t_emb = addition_t_emb.to(dtype=hidden_states.dtype)
195
+ conditioning = conditioning + addition_t_emb
196
+
197
+ return self.norm(conditioning)
198
+
199
+
200
+ class NucleusEmbedRope(nn.Module):
201
+ def __init__(self, theta: int, axes_dim: list[int], scale_rope=False):
202
+ super().__init__()
203
+ self.theta = theta
204
+ self.axes_dim = axes_dim
205
+ pos_index = torch.arange(4096)
206
+ neg_index = torch.arange(4096).flip(0) * -1 - 1
207
+ self.pos_freqs = torch.cat(
208
+ [
209
+ self.rope_params(pos_index, self.axes_dim[0], self.theta),
210
+ self.rope_params(pos_index, self.axes_dim[1], self.theta),
211
+ self.rope_params(pos_index, self.axes_dim[2], self.theta),
212
+ ],
213
+ dim=1,
214
+ )
215
+ self.neg_freqs = torch.cat(
216
+ [
217
+ self.rope_params(neg_index, self.axes_dim[0], self.theta),
218
+ self.rope_params(neg_index, self.axes_dim[1], self.theta),
219
+ self.rope_params(neg_index, self.axes_dim[2], self.theta),
220
+ ],
221
+ dim=1,
222
+ )
223
+
224
+ # Do not use register_buffer here: it would cause the complex frequency tensors to lose their imaginary part.
225
+ self.scale_rope = scale_rope
226
+
227
+ def rope_params(self, index, dim, theta=10000):
228
+ """
229
+ Args:
230
+ index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
231
+ """
232
+ assert dim % 2 == 0
233
+ freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
234
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
235
+ return freqs
236
+
237
+ def forward(
238
+ self,
239
+ video_fhw: tuple[int, int, int] | list[tuple[int, int, int]],
240
+ txt_seq_lens: list[int] | None = None,
241
+ device: torch.device = None,
242
+ max_txt_seq_len: int | torch.Tensor | None = None,
243
+ ) -> tuple[torch.Tensor, torch.Tensor]:
244
+ """
245
+ Args:
246
+ video_fhw (`tuple[int, int, int]` or `list[tuple[int, int, int]]`):
247
+ A `(frame, height, width)` tuple, or a list of such tuples, describing the latent shape(s).
248
+ txt_seq_lens (`list[int]`, *optional*, **Deprecated**):
249
+ Deprecated parameter. Use `max_txt_seq_len` instead. If provided, the maximum value will be used.
250
+ device: (`torch.device`, *optional*):
251
+ The device on which to perform the RoPE computation.
252
+ max_txt_seq_len (`int` or `torch.Tensor`, *optional*):
253
+ The maximum text sequence length for RoPE computation. This should match the encoder hidden states
254
+ sequence length. Can be either an int or a scalar tensor (for torch.compile compatibility).
255
+ """
256
+ # Handle deprecated txt_seq_lens parameter
257
+ if txt_seq_lens is not None:
258
+ deprecate(
259
+ "txt_seq_lens",
260
+ "0.39.0",
261
+ "Passing `txt_seq_lens` is deprecated and will be removed in version 0.39.0. "
262
+ "Please use `max_txt_seq_len` instead. "
263
+ "The new parameter accepts a single int or tensor value representing the maximum text sequence length.",
264
+ standard_warn=False,
265
+ )
266
+ if max_txt_seq_len is None:
267
+ # Use max of txt_seq_lens for backward compatibility
268
+ max_txt_seq_len = max(txt_seq_lens) if isinstance(txt_seq_lens, list) else txt_seq_lens
269
+
270
+ if max_txt_seq_len is None:
271
+ raise ValueError("Either `max_txt_seq_len` or `txt_seq_lens` (deprecated) must be provided.")
272
+
273
+ # Validate batch inference with variable-sized images
274
+ if isinstance(video_fhw, list) and len(video_fhw) > 1:
275
+ # Check if all instances have the same size
276
+ first_fhw = video_fhw[0]
277
+ if not all(fhw == first_fhw for fhw in video_fhw):
278
+ logger.warning(
279
+ "Batch inference with variable-sized images is not currently supported in NucleusEmbedRope. "
280
+ "All images in the batch should have the same dimensions (frame, height, width). "
281
+ f"Detected sizes: {video_fhw}. Using the first image's dimensions {first_fhw} "
282
+ "for RoPE computation, which may lead to incorrect results for other images in the batch."
283
+ )
284
+
285
+ if isinstance(video_fhw, list):
286
+ video_fhw = video_fhw[0]
287
+ if not isinstance(video_fhw, list):
288
+ video_fhw = [video_fhw]
289
+
290
+ vid_freqs = []
291
+ max_vid_index = 0
292
+ for idx, fhw in enumerate(video_fhw):
293
+ frame, height, width = fhw
294
+ # RoPE frequencies are cached via a lru_cache decorator on _compute_video_freqs
295
+ video_freq = self._compute_video_freqs(frame, height, width, idx, device)
296
+ vid_freqs.append(video_freq)
297
+
298
+ if self.scale_rope:
299
+ max_vid_index = max(height // 2, width // 2, max_vid_index)
300
+ else:
301
+ max_vid_index = max(height, width, max_vid_index)
302
+
303
+ max_txt_seq_len_int = int(max_txt_seq_len)
304
+ # Create device-specific copy for text freqs without modifying self.pos_freqs
305
+ txt_freqs = self.pos_freqs.to(device)[max_vid_index : max_vid_index + max_txt_seq_len_int, ...]
306
+ vid_freqs = torch.cat(vid_freqs, dim=0)
307
+
308
+ return vid_freqs, txt_freqs
309
+
310
+ @functools.lru_cache(maxsize=128)
311
+ def _compute_video_freqs(
312
+ self, frame: int, height: int, width: int, idx: int = 0, device: torch.device = None
313
+ ) -> torch.Tensor:
314
+ seq_lens = frame * height * width
315
+ pos_freqs = self.pos_freqs.to(device) if device is not None else self.pos_freqs
316
+ neg_freqs = self.neg_freqs.to(device) if device is not None else self.neg_freqs
317
+
318
+ freqs_pos = pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
319
+ freqs_neg = neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
320
+
321
+ freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
322
+ if self.scale_rope:
323
+ freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
324
+ freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
325
+ freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
326
+ freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
327
+ else:
328
+ freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
329
+ freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
330
+
331
+ freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
332
+ return freqs.clone().contiguous()
333
+
334
+
335
+ class NucleusMoEAttnProcessor2_0:
336
+ """
337
+ Attention processor for the Nucleus MoE architecture. Image queries attend to concatenated image+text keys/values
338
+ (cross-attention style, no text query). Supports grouped-query attention (GQA) when num_key_value_heads is set on
339
+ the Attention module.
340
+ """
341
+
342
+ _attention_backend = None
343
+ _parallel_config = None
344
+
345
+ def __init__(self):
346
+ if not hasattr(F, "scaled_dot_product_attention"):
347
+ raise ImportError(
348
+ "NucleusMoEAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
349
+ )
350
+
351
+ def __call__(
352
+ self,
353
+ attn: Attention,
354
+ hidden_states: torch.FloatTensor,
355
+ encoder_hidden_states: torch.FloatTensor = None,
356
+ attention_mask: torch.FloatTensor | None = None,
357
+ image_rotary_emb: torch.Tensor | None = None,
358
+ ) -> torch.FloatTensor:
359
+ head_dim = attn.inner_dim // attn.heads
360
+ num_kv_heads = attn.inner_kv_dim // head_dim
361
+ num_kv_groups = attn.heads // num_kv_heads
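+ # Grouped-query attention: the joint K/V are repeat_interleaved by num_kv_groups below to match the query head count.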
362
+
363
+ img_query = attn.to_q(hidden_states).unflatten(-1, (attn.heads, -1))
364
+ img_key = attn.to_k(hidden_states).unflatten(-1, (num_kv_heads, -1))
365
+ img_value = attn.to_v(hidden_states).unflatten(-1, (num_kv_heads, -1))
366
+
367
+ if attn.norm_q is not None:
368
+ img_query = attn.norm_q(img_query)
369
+ if attn.norm_k is not None:
370
+ img_key = attn.norm_k(img_key)
371
+
372
+ if image_rotary_emb is not None:
373
+ img_freqs, txt_freqs = image_rotary_emb
374
+ img_query = apply_rotary_emb_nucleus(img_query, img_freqs, use_real=False)
375
+ img_key = apply_rotary_emb_nucleus(img_key, img_freqs, use_real=False)
376
+
377
+ if encoder_hidden_states is not None:
378
+ txt_key = attn.add_k_proj(encoder_hidden_states).unflatten(-1, (num_kv_heads, -1))
379
+ txt_value = attn.add_v_proj(encoder_hidden_states).unflatten(-1, (num_kv_heads, -1))
380
+
381
+ if attn.norm_added_k is not None:
382
+ txt_key = attn.norm_added_k(txt_key)
383
+
384
+ if image_rotary_emb is not None:
385
+ txt_key = apply_rotary_emb_nucleus(txt_key, txt_freqs, use_real=False)
386
+
387
+ joint_key = torch.cat([img_key, txt_key], dim=1)
388
+ joint_value = torch.cat([img_value, txt_value], dim=1)
389
+ else:
390
+ joint_key = img_key
391
+ joint_value = img_value
392
+
393
+ if num_kv_groups > 1:
394
+ joint_key = joint_key.repeat_interleave(num_kv_groups, dim=2)
395
+ joint_value = joint_value.repeat_interleave(num_kv_groups, dim=2)
396
+
397
+ hidden_states = dispatch_attention_fn(
398
+ img_query,
399
+ joint_key,
400
+ joint_value,
401
+ attn_mask=attention_mask,
402
+ dropout_p=0.0,
403
+ is_causal=False,
404
+ backend=self._attention_backend,
405
+ parallel_config=self._parallel_config,
406
+ )
407
+
408
+ hidden_states = hidden_states.flatten(2, 3)
409
+ hidden_states = hidden_states.to(img_query.dtype)
410
+
411
+ hidden_states = attn.to_out[0](hidden_states)
412
+ if len(attn.to_out) > 1:
413
+ hidden_states = attn.to_out[1](hidden_states)
414
+
415
+ return hidden_states
416
+
417
+
418
+ def _is_moe_layer(strategy: str, layer_idx: int, num_layers: int) -> bool:
419
+ if strategy == "leave_first_three_and_last_block_dense":
420
+ return layer_idx >= 3 and layer_idx < num_layers - 1
421
+ elif strategy == "leave_first_three_blocks_dense":
422
+ return layer_idx >= 3
423
+ elif strategy == "leave_first_block_dense":
424
+ return layer_idx >= 1
425
+ elif strategy == "all_moe":
426
+ return True
427
+ elif strategy == "all_dense":
428
+ return False
429
+ return True
430
+
431
+
432
+ class NucleusMoELayer(nn.Module):
433
+ """
434
+ Mixture-of-Experts layer with expert-choice routing and a shared expert.
435
+
436
+ Each expert is a separate ``FeedForward`` module stored in an ``nn.ModuleList``.
437
+ The router concatenates a timestep embedding with the (unmodulated) hidden state
438
+ to produce per-token affinity scores, then selects the top-C tokens per expert
439
+ (expert-choice routing). A shared expert processes all tokens in parallel and its
440
+ output is combined with the routed expert outputs via scatter-add.
441
+ """
442
+
443
+ def __init__(
444
+ self,
445
+ hidden_size: int,
446
+ moe_intermediate_dim: int,
447
+ num_experts: int,
448
+ capacity_factor: float,
449
+ use_sigmoid: bool,
450
+ route_scale: float,
451
+ ):
452
+ super().__init__()
453
+ self.num_experts = num_experts
454
+ self.capacity_factor = capacity_factor
455
+ self.use_sigmoid = use_sigmoid
456
+ self.route_scale = route_scale
457
+
458
+ self.gate = nn.Linear(hidden_size * 2, num_experts, bias=False)
459
+ self.experts = nn.ModuleList(
460
+ [
461
+ FeedForward(
462
+ dim=hidden_size, dim_out=hidden_size,
463
+ inner_dim=moe_intermediate_dim, activation_fn="swiglu", bias=False,
464
+ )
465
+ for _ in range(num_experts)
466
+ ]
467
+ )
468
+ self.shared_expert = FeedForward(
469
+ dim=hidden_size, dim_out=hidden_size,
470
+ inner_dim=moe_intermediate_dim, activation_fn="swiglu", bias=False,
471
+ )
472
+
473
+ def forward(
474
+ self,
475
+ hidden_states: torch.Tensor,
476
+ hidden_states_unmodulated: torch.Tensor,
477
+ timestep: torch.Tensor | None = None,
478
+ ) -> torch.Tensor:
479
+ bs, slen, dim = hidden_states.shape
480
+
481
+ if timestep is not None:
482
+ timestep_expanded = timestep.unsqueeze(1).expand(-1, slen, -1)
483
+ router_input = torch.cat([timestep_expanded, hidden_states_unmodulated], dim=-1)
484
+ else:
485
+ router_input = hidden_states_unmodulated
486
+
487
+ logits = self.gate(router_input)
488
+
489
+ if self.use_sigmoid:
490
+ scores = torch.sigmoid(logits.float()).to(logits.dtype)
491
+ else:
492
+ scores = F.softmax(logits.float(), dim=-1).to(logits.dtype)
493
+
494
+ affinity = scores.transpose(1, 2) # (B, E, S)
495
+ capacity = max(1, math.ceil(self.capacity_factor * slen / self.num_experts))
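+ # Expert-choice routing: each expert selects its own top-C tokens, with C = ceil(capacity_factor * S / E).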
496
+
497
+ topk = torch.topk(affinity, k=capacity, dim=-1)
498
+ top_indices = topk.indices # (B, E, C)
499
+ gating = affinity.gather(dim=-1, index=top_indices) # (B, E, C)
500
+
501
+ batch_offsets = torch.arange(bs, device=hidden_states.device, dtype=torch.long).view(bs, 1, 1) * slen
502
+ global_token_indices = (batch_offsets + top_indices).transpose(0, 1).reshape(self.num_experts, -1).reshape(-1)
503
+ gating_flat = gating.transpose(0, 1).reshape(self.num_experts, -1).reshape(-1)
504
+
505
+ token_score_sums = torch.zeros(bs * slen, device=hidden_states.device, dtype=gating_flat.dtype)
506
+ token_score_sums.scatter_add_(0, global_token_indices, gating_flat)
507
+ gating_flat = gating_flat / (token_score_sums[global_token_indices] + 1e-12)
508
+ gating_flat = gating_flat * self.route_scale
509
+
510
+ x_flat = hidden_states.reshape(bs * slen, dim)
511
+ routed_input = x_flat[global_token_indices]
512
+
513
+ tokens_per_expert = bs * capacity
514
+ routed_output_parts = []
515
+ for i, expert in enumerate(self.experts):
516
+ start = i * tokens_per_expert
517
+ end = start + tokens_per_expert
518
+ expert_out = expert(routed_input[start:end])
519
+ routed_output_parts.append(expert_out)
520
+
521
+ routed_output = torch.cat(routed_output_parts, dim=0)
522
+ routed_output = (routed_output.float() * gating_flat.unsqueeze(-1)).to(hidden_states.dtype)
523
+
524
+ out = self.shared_expert(hidden_states).reshape(bs * slen, dim)
525
+
526
+ scatter_idx = global_token_indices.reshape(-1, 1).expand(-1, dim)
527
+ out = out.scatter_add(dim=0, index=scatter_idx, src=routed_output)
528
+ out = out.reshape(bs, slen, dim)
529
+
530
+ return out
531
+
532
+
533
+ @maybe_allow_in_graph
534
+ class NucleusMoEImageTransformerBlock(nn.Module):
535
+ """
536
+ Single-stream DiT block with optional Mixture-of-Experts MLP, matching the DiTBlock
537
+ architecture from model_v2. Only the image stream receives adaptive modulation;
538
+ the text context is projected per-block and used as cross-attention keys/values.
539
+ """
540
+
541
+ def __init__(
542
+ self,
543
+ dim: int,
544
+ num_attention_heads: int,
545
+ attention_head_dim: int,
546
+ num_key_value_heads: int | None = None,
547
+ joint_attention_dim: int = 3584,
548
+ qk_norm: str = "rms_norm",
549
+ eps: float = 1e-6,
550
+ mlp_ratio: float = 4.0,
551
+ moe_enabled: bool = False,
552
+ num_experts: int = 128,
553
+ moe_intermediate_dim: int = 1344,
554
+ capacity_factor: float = 8.0,
555
+ use_sigmoid: bool = False,
556
+ route_scale: float = 2.5,
557
+ ):
558
+ super().__init__()
559
+ self.dim = dim
560
+ self.moe_enabled = moe_enabled
561
+
562
+ self.img_mod = nn.Sequential(
563
+ nn.SiLU(),
564
+ nn.Linear(dim, 4 * dim, bias=True),
565
+ )
566
+
567
+ self.encoder_proj = nn.Linear(joint_attention_dim, dim)
568
+
569
+ self.pre_attn_norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
570
+ self.attn = Attention(
571
+ query_dim=dim,
572
+ heads=num_attention_heads,
573
+ kv_heads=num_key_value_heads,
574
+ dim_head=attention_head_dim,
575
+ added_kv_proj_dim=dim,
576
+ added_proj_bias=False,
577
+ out_dim=dim,
578
+ out_bias=False,
579
+ bias=False,
580
+ processor=NucleusMoEAttnProcessor2_0(),
581
+ qk_norm=qk_norm,
582
+ eps=eps,
583
+ context_pre_only=None,
584
+ )
585
+
586
+ self.pre_mlp_norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
587
+
588
+ if moe_enabled:
589
+ self.img_mlp = NucleusMoELayer(
590
+ hidden_size=dim,
591
+ moe_intermediate_dim=moe_intermediate_dim,
592
+ num_experts=num_experts,
593
+ capacity_factor=capacity_factor,
594
+ use_sigmoid=use_sigmoid,
595
+ route_scale=route_scale,
596
+ )
597
+ else:
598
+ mlp_inner_dim = int(dim * mlp_ratio * 2 / 3) // 128 * 128
599
+ self.img_mlp = FeedForward(
600
+ dim=dim, dim_out=dim, inner_dim=mlp_inner_dim,
601
+ activation_fn="swiglu", bias=False,
602
+ )
603
+
604
+ def forward(
605
+ self,
606
+ hidden_states: torch.Tensor,
607
+ encoder_hidden_states: torch.Tensor,
608
+ temb: torch.Tensor,
609
+ image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
610
+ attention_kwargs: dict[str, Any] | None = None,
611
+ ) -> torch.Tensor:
612
+ scale1, gate1, scale2, gate2 = self.img_mod(temb).unsqueeze(1).chunk(4, dim=-1)
613
+ scale1, scale2 = 1 + scale1, 1 + scale2
614
+
615
+ gate1 = gate1.clamp(min=-2.0, max=2.0)
616
+ gate2 = gate2.clamp(min=-2.0, max=2.0)
617
+
618
+ context = self.encoder_proj(encoder_hidden_states)
619
+
620
+ img_normed = self.pre_attn_norm(hidden_states)
621
+ img_modulated = img_normed * scale1
622
+
623
+ attention_kwargs = attention_kwargs or {}
624
+ img_attn_output = self.attn(
625
+ hidden_states=img_modulated,
626
+ encoder_hidden_states=context,
627
+ image_rotary_emb=image_rotary_emb,
628
+ **attention_kwargs,
629
+ )
630
+
631
+ hidden_states = hidden_states + gate1.tanh() * img_attn_output
632
+
633
+ img_normed2 = self.pre_mlp_norm(hidden_states)
634
+ img_modulated2 = img_normed2 * scale2
635
+
636
+ if self.moe_enabled:
637
+ img_mlp_output = self.img_mlp(img_modulated2, img_normed2, timestep=temb)
638
+ else:
639
+ img_mlp_output = self.img_mlp(img_modulated2)
640
+
641
+ hidden_states = hidden_states + gate2.tanh() * img_mlp_output
642
+
643
+ if hidden_states.dtype == torch.float16:
644
+ hidden_states = hidden_states.clip(-65504, 65504)
645
+
646
+ return hidden_states
647
+
648
+
649
+ class NucleusMoEImageTransformer2DModel(
650
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
651
+ ):
652
+ """
653
+ Nucleus MoE Transformer for image generation. Single-stream DiT with
654
+ cross-attention to text and optional Mixture-of-Experts feed-forward layers.
655
+
656
+ Args:
657
+ patch_size (`int`, defaults to `2`):
658
+ Patch size to turn the input data into small patches.
659
+ in_channels (`int`, defaults to `64`):
660
+ The number of channels in the input.
661
+ out_channels (`int`, *optional*, defaults to `None`):
662
+ The number of channels in the output. If not specified, it defaults to `in_channels`.
663
+ num_layers (`int`, defaults to `24`):
664
+ The number of transformer blocks.
665
+ attention_head_dim (`int`, defaults to `128`):
666
+ The number of dimensions to use for each attention head.
667
+ num_attention_heads (`int`, defaults to `16`):
668
+ The number of attention heads to use.
669
+ num_key_value_heads (`int`, *optional*):
670
+ The number of key/value heads for grouped-query attention. Defaults to `num_attention_heads`.
671
+ joint_attention_dim (`int`, defaults to `3584`):
672
+ The embedding dimension of the encoder hidden states (text).
673
+ axes_dims_rope (`tuple[int]`, defaults to `(16, 56, 56)`):
674
+ The dimensions to use for the rotary positional embeddings.
675
+ use_layer3d_rope (`bool`, defaults to `False`):
676
+ Whether to use the Layer3D variant of RoPE.
677
+ mlp_ratio (`float`, defaults to `4.0`):
678
+ Multiplier for the MLP hidden dimension in dense (non-MoE) blocks.
679
+ moe_enabled (`bool`, defaults to `True`):
680
+ Whether to use Mixture-of-Experts layers.
681
+ dense_moe_strategy (`str`, defaults to ``"leave_first_three_and_last_block_dense"``):
682
+ Strategy for choosing which layers are MoE vs dense.
683
+ num_experts (`int`, defaults to `128`):
684
+ Number of experts per MoE layer.
685
+ moe_intermediate_dim (`int`, defaults to `1344`):
686
+ Hidden dimension inside each expert.
687
+ capacity_factor (`float`, defaults to `8.0`):
688
+ Expert-choice capacity factor.
689
+ use_sigmoid (`bool`, defaults to `False`):
690
+ Use sigmoid instead of softmax for routing scores.
691
+ route_scale (`float`, defaults to `2.5`):
692
+ Scaling factor applied to routing weights.
693
+ """
694
+
695
+ _supports_gradient_checkpointing = True
696
+ _no_split_modules = ["NucleusMoEImageTransformerBlock"]
697
+ _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
698
+ _repeated_blocks = ["NucleusMoEImageTransformerBlock"]
699
+
700
+ @register_to_config
701
+ def __init__(
702
+ self,
703
+ patch_size: int = 2,
704
+ in_channels: int = 64,
705
+ out_channels: int | None = None,
706
+ num_layers: int = 24,
707
+ attention_head_dim: int = 128,
708
+ num_attention_heads: int = 16,
709
+ num_key_value_heads: int | None = None,
710
+ joint_attention_dim: int = 3584,
711
+ axes_dims_rope: tuple[int, int, int] = (16, 56, 56),
712
+ mlp_ratio: float = 4.0,
713
+ moe_enabled: bool = True,
714
+ dense_moe_strategy: str = "leave_first_three_and_last_block_dense",
715
+ num_experts: int = 128,
716
+ moe_intermediate_dim: int = 1344,
717
+ capacity_factors: List[float] = [8.0] * 24,
718
+ use_sigmoid: bool = False,
719
+ route_scale: float = 2.5,
720
+ ):
721
+ super().__init__()
722
+ self.out_channels = out_channels or in_channels
723
+ self.inner_dim = num_attention_heads * attention_head_dim
724
+
725
+ self.pos_embed = NucleusEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
726
+
727
+ self.time_text_embed = NucleusTimestepProjEmbeddings(embedding_dim=self.inner_dim)
728
+
729
+ self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
730
+ self.img_in = nn.Linear(in_channels, self.inner_dim)
731
+
732
+ self.transformer_blocks = nn.ModuleList(
733
+ [
734
+ NucleusMoEImageTransformerBlock(
735
+ dim=self.inner_dim,
736
+ num_attention_heads=num_attention_heads,
737
+ attention_head_dim=attention_head_dim,
738
+ num_key_value_heads=num_key_value_heads,
739
+ joint_attention_dim=joint_attention_dim,
740
+ mlp_ratio=mlp_ratio,
741
+ moe_enabled=moe_enabled and _is_moe_layer(dense_moe_strategy, idx, num_layers),
742
+ num_experts=num_experts,
743
+ moe_intermediate_dim=moe_intermediate_dim,
744
+ capacity_factor=capacity_factors[idx],
745
+ use_sigmoid=use_sigmoid,
746
+ route_scale=route_scale,
747
+ )
748
+ for idx in range(num_layers)
749
+ ]
750
+ )
751
+
752
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
753
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)
754
+
755
+ self.gradient_checkpointing = False
756
+
757
+ def forward(
758
+ self,
759
+ hidden_states: torch.Tensor,
760
+ img_shapes: list[tuple[int, int, int]] | None = None,
761
+ encoder_hidden_states: torch.Tensor = None,
762
+ encoder_hidden_states_mask: torch.Tensor = None,
763
+ timestep: torch.LongTensor = None,
764
+ txt_seq_lens: list[int] | None = None,
765
+ attention_kwargs: dict[str, Any] | None = None,
766
+ return_dict: bool = True,
767
+ ) -> torch.Tensor | Transformer2DModelOutput:
768
+ """
769
+ The [`NucleusMoEImageTransformer2DModel`] forward method.
770
+
771
+ Args:
772
+ hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
773
+ Input `hidden_states`.
774
+ img_shapes (`list[tuple[int, int, int]]`, *optional*):
775
+ Image shapes ``(frame, height, width)`` for RoPE computation.
776
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
777
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
778
+ encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`, *optional*):
779
+ Boolean mask for the encoder hidden states.
780
+ timestep (`torch.LongTensor`):
781
+ Used to indicate denoising step.
782
+ txt_seq_lens (`list[int]`, *optional*, **Deprecated**):
783
+ Deprecated. Use ``encoder_hidden_states_mask`` instead.
784
+ attention_kwargs (`dict`, *optional*):
785
+ Extra kwargs forwarded to the attention processor.
786
+ return_dict (`bool`, *optional*, defaults to `True`):
787
+ Whether to return a [`~models.transformer_2d.Transformer2DModelOutput`].
788
+
789
+ Returns:
790
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
791
+ `tuple` where the first element is the sample tensor.
792
+ """
793
+ if txt_seq_lens is not None:
794
+ deprecate(
795
+ "txt_seq_lens",
796
+ "0.39.0",
797
+ "Passing `txt_seq_lens` is deprecated and will be removed in version 0.39.0. "
798
+ "Please use `encoder_hidden_states_mask` instead.",
799
+ standard_warn=False,
800
+ )
801
+
802
+ if attention_kwargs is not None:
803
+ attention_kwargs = attention_kwargs.copy()
804
+ lora_scale = attention_kwargs.pop("scale", 1.0)
805
+ else:
806
+ lora_scale = 1.0
807
+
808
+ if USE_PEFT_BACKEND:
809
+ scale_lora_layers(self, lora_scale)
810
+
811
+ hidden_states = self.img_in(hidden_states)
812
+ timestep = timestep.to(hidden_states.dtype)
813
+
814
+ encoder_hidden_states = self.txt_norm(encoder_hidden_states)
815
+
816
+ text_seq_len, _, encoder_hidden_states_mask = compute_text_seq_len_from_mask(
817
+ encoder_hidden_states, encoder_hidden_states_mask
818
+ )
819
+
820
+ temb = self.time_text_embed(timestep, hidden_states)
821
+
822
+ image_rotary_emb = self.pos_embed(img_shapes, max_txt_seq_len=text_seq_len, device=hidden_states.device)
823
+
824
+ block_attention_kwargs = attention_kwargs.copy() if attention_kwargs is not None else {}
825
+ if encoder_hidden_states_mask is not None:
826
+ batch_size, image_seq_len = hidden_states.shape[:2]
827
+ image_mask = torch.ones((batch_size, image_seq_len), dtype=torch.bool, device=hidden_states.device)
828
+ joint_attention_mask = torch.cat([image_mask, encoder_hidden_states_mask], dim=1)
829
+ block_attention_kwargs["attention_mask"] = joint_attention_mask
830
+
831
+ for index_block, block in enumerate(self.transformer_blocks):
832
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
833
+ hidden_states = self._gradient_checkpointing_func(
834
+ block,
835
+ hidden_states,
836
+ encoder_hidden_states,
837
+ temb,
838
+ image_rotary_emb,
839
+ block_attention_kwargs,
840
+ )
841
+ else:
842
+ hidden_states = block(
843
+ hidden_states=hidden_states,
844
+ encoder_hidden_states=encoder_hidden_states,
845
+ temb=temb,
846
+ image_rotary_emb=image_rotary_emb,
847
+ attention_kwargs=block_attention_kwargs,
848
+ )
849
+
850
+ hidden_states = self.norm_out(hidden_states, temb)
851
+ output = self.proj_out(hidden_states)
852
+
853
+ if USE_PEFT_BACKEND:
854
+ unscale_lora_layers(self, lora_scale)
855
+
856
+ if not return_dict:
857
+ return (output,)
858
+
859
+ return Transformer2DModelOutput(sample=output)
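
The MoE feed-forward above uses expert-choice routing: each expert selects its own top-C tokens rather than each token selecting experts. A minimal standalone sketch of the index bookkeeping, with hypothetical toy sizes mirroring the tensor shapes in `NucleusMoELayer.forward`:

```py
import math
import torch

# hypothetical toy sizes: batch 2, sequence 16, 4 experts, capacity_factor 2.0
bs, slen, num_experts, capacity_factor = 2, 16, 4, 2.0

scores = torch.rand(bs, slen, num_experts).softmax(dim=-1)          # per-token expert affinities
affinity = scores.transpose(1, 2)                                    # (B, E, S)
capacity = max(1, math.ceil(capacity_factor * slen / num_experts))   # top-C tokens per expert -> 8

top_indices = torch.topk(affinity, k=capacity, dim=-1).indices       # (B, E, C)

# Convert per-sample positions to global token ids, as in NucleusMoELayer.forward
batch_offsets = torch.arange(bs).view(bs, 1, 1) * slen
global_token_indices = (batch_offsets + top_indices).transpose(0, 1).reshape(-1)

# Each expert processes bs * capacity tokens; a token may be chosen by several experts
# (their gated outputs are later scatter-added onto the shared-expert output).
print(capacity, global_token_indices.shape)  # 8 torch.Size([64])
```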
pipeline_nucleusmoe.py ADDED
@@ -0,0 +1,717 @@
1
+ # Copyright 2025 Nucleus-Image Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable
17
+
18
+ import numpy as np
19
+ import torch
20
+ from transformers import Qwen3VLForConditionalGeneration, Qwen3VLProcessor
21
+
22
+ from diffusers.image_processor import VaeImageProcessor
23
+ from diffusers.models import AutoencoderKLQwenImage
24
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
25
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
26
+ from diffusers.utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
27
+ from diffusers.utils.torch_utils import randn_tensor
28
+
29
+ from .modeling_nucleusmoe import NucleusMoEImageTransformer2DModel
30
+ from .pipeline_output import NucleusMoEImagePipelineOutput
31
+
32
+ if is_torch_xla_available():
33
+ import torch_xla.core.xla_model as xm
34
+
35
+ XLA_AVAILABLE = True
36
+ else:
37
+ XLA_AVAILABLE = False
38
+
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+ DEFAULT_SYSTEM_PROMPT = (
43
+ "You are an assistant designed to generate photorealistic, ultra-high-quality images based on user prompts."
44
+ )
45
+
46
+ EXAMPLE_DOC_STRING = """
47
+ Examples:
48
+ ```py
49
+ >>> import torch
50
+ >>> from diffusers import NucleusMoEImagePipeline
51
+
52
+ >>> pipe = NucleusMoEImagePipeline.from_pretrained(
53
+ ... "NucleusAI/Nucleus-MoE-Image", torch_dtype=torch.bfloat16
54
+ ... )
55
+ >>> pipe.to("cuda")
56
+ >>> prompt = "A cat holding a sign that says hello world"
57
+ >>> image = pipe(prompt, num_inference_steps=50).images[0]
58
+ >>> image.save("nucleus_moe.png")
59
+ ```
60
+ """
61
+
62
+
63
+ def calculate_shift(
64
+ image_seq_len,
65
+ base_seq_len: int = 256,
66
+ max_seq_len: int = 4096,
67
+ base_shift: float = 0.5,
68
+ max_shift: float = 1.15,
69
+ ):
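+ # Linear interpolation of the flow-matching shift mu: base_shift at base_seq_len, max_shift at max_seq_len.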
70
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
71
+ b = base_shift - m * base_seq_len
72
+ mu = image_seq_len * m + b
73
+ return mu
74
+
75
+
76
+ def retrieve_timesteps(
77
+ scheduler,
78
+ num_inference_steps: int | None = None,
79
+ device: str | torch.device | None = None,
80
+ timesteps: list[int] | None = None,
81
+ sigmas: list[float] | None = None,
82
+ **kwargs,
83
+ ):
84
+ r"""
85
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
86
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
87
+
88
+ Args:
89
+ scheduler (`SchedulerMixin`):
90
+ The scheduler to get timesteps from.
91
+ num_inference_steps (`int`):
92
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
93
+ must be `None`.
94
+ device (`str` or `torch.device`, *optional*):
95
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
96
+ timesteps (`list[int]`, *optional*):
97
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
98
+ `num_inference_steps` and `sigmas` must be `None`.
99
+ sigmas (`list[float]`, *optional*):
100
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
101
+ `num_inference_steps` and `timesteps` must be `None`.
102
+
103
+ Returns:
104
+ `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and
105
+ the second element is the number of inference steps.
106
+ """
107
+ if timesteps is not None and sigmas is not None:
108
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
109
+ if timesteps is not None:
110
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
111
+ if not accepts_timesteps:
112
+ raise ValueError(
113
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
114
+ f" timestep schedules. Please check whether you are using the correct scheduler."
115
+ )
116
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
117
+ timesteps = scheduler.timesteps
118
+ num_inference_steps = len(timesteps)
119
+ elif sigmas is not None:
120
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
121
+ if not accept_sigmas:
122
+ raise ValueError(
123
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
124
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
125
+ )
126
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
127
+ timesteps = scheduler.timesteps
128
+ num_inference_steps = len(timesteps)
129
+ else:
130
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
131
+ timesteps = scheduler.timesteps
132
+ return timesteps, num_inference_steps
133
+
134
+
135
+ class NucleusMoEImagePipeline(DiffusionPipeline):
136
+ r"""
137
+ Pipeline for text-to-image generation using Nucleus MoE.
138
+
139
+ This pipeline uses a single-stream DiT with Mixture-of-Experts feed-forward layers,
140
+ cross-attention to a Qwen3-VL text encoder, and a flow-matching Euler discrete scheduler.
141
+
142
+ Args:
143
+ transformer ([`NucleusMoEImageTransformer2DModel`]):
144
+ Conditional single-stream DiT transformer to denoise the encoded image latents.
145
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
146
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
147
+ vae ([`AutoencoderKLQwenImage`]):
148
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
149
+ text_encoder ([`Qwen3VLForConditionalGeneration`]):
150
+ Text encoder for computing prompt embeddings.
151
+ processor ([`Qwen3VLProcessor`]):
152
+ Processor for tokenizing text inputs.
153
+ """
154
+
155
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
156
+ _optional_components = ["processor"]
157
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
158
+
159
+ @classmethod
160
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
161
+ if "processor" not in kwargs:
162
+ kwargs["processor"] = Qwen3VLProcessor.from_pretrained(
163
+ pretrained_model_name_or_path, subfolder="text_encoder"
164
+ )
165
+ return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
166
+
167
+ def __init__(
168
+ self,
169
+ transformer: NucleusMoEImageTransformer2DModel,
170
+ scheduler: FlowMatchEulerDiscreteScheduler,
171
+ vae: AutoencoderKLQwenImage,
172
+ text_encoder: Qwen3VLForConditionalGeneration,
173
+ processor: Qwen3VLProcessor | None = None,
174
+ ):
175
+ super().__init__()
176
+ if processor is None:
177
+ processor_path = (
178
+ getattr(text_encoder, "name_or_path", None)
179
+ or getattr(getattr(text_encoder, "config", None), "_name_or_path", None)
180
+ )
181
+ if processor_path is None:
182
+ raise ValueError(
183
+ "Could not infer a processor path from `text_encoder`; pass `processor=` explicitly."
184
+ )
185
+ processor = Qwen3VLProcessor.from_pretrained(processor_path)
186
+ self.register_modules(
187
+ transformer=transformer,
188
+ scheduler=scheduler,
189
+ vae=vae,
190
+ text_encoder=text_encoder,
191
+ processor=processor,
192
+ )
193
+ self.vae_scale_factor = (
194
+ 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
195
+ )
196
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
197
+ self.default_sample_size = 128
198
+ self.return_index = -8
199
+
200
+ # ------------------------------------------------------------------ #
201
+ # Text encoding (aligned with pipeline.py's chat-template approach) #
202
+ # ------------------------------------------------------------------ #
203
+
204
+ def _format_prompt(self, prompt: str, system_prompt: str | None = None) -> str:
205
+ if system_prompt is None:
206
+ system_prompt = DEFAULT_SYSTEM_PROMPT
207
+ messages = [
208
+ {"role": "system", "content": system_prompt},
209
+ {"role": "user", "content": [{"type": "text", "text": prompt}]},
210
+ ]
211
+ return self.processor.apply_chat_template(
212
+ messages, tokenize=False, add_generation_prompt=True
213
+ )
214
+
215
+ def encode_prompt(
216
+ self,
217
+ prompt: str | list[str] = None,
218
+ device: torch.device | None = None,
219
+ num_images_per_prompt: int = 1,
220
+ prompt_embeds: torch.Tensor | None = None,
221
+ prompt_embeds_mask: torch.Tensor | None = None,
222
+ max_sequence_length: int = 1024,
223
+ ):
224
+ r"""
225
+ Encode text prompt(s) into embeddings using the Qwen3-VL text encoder.
226
+
227
+ Args:
228
+ prompt (`str` or `list[str]`, *optional*):
229
+ The prompt or prompts to encode.
230
+ device (`torch.device`, *optional*):
231
+ Torch device for the resulting tensors.
232
+ num_images_per_prompt (`int`, defaults to 1):
233
+ Number of images to generate per prompt.
234
+ prompt_embeds (`torch.Tensor`, *optional*):
235
+ Pre-generated text embeddings. Skips encoding when provided.
236
+ prompt_embeds_mask (`torch.Tensor`, *optional*):
237
+ Attention mask for pre-generated embeddings.
238
+ max_sequence_length (`int`, defaults to 1024):
239
+ Maximum token length for the encoded prompt.
240
+ """
241
+ device = device or self._execution_device
242
+
243
+ if prompt_embeds is None:
244
+ prompt = [prompt] if isinstance(prompt, str) else prompt
245
+ formatted = [self._format_prompt(p) for p in prompt]
246
+
247
+ inputs = self.processor(
248
+ text=formatted,
249
+ padding="longest",
250
+ pad_to_multiple_of=8,
251
+ max_length=max_sequence_length,
252
+ truncation=True,
253
+ return_attention_mask=True,
254
+ return_tensors="pt",
255
+ ).to(device=device)
256
+
257
+ prompt_embeds_mask = inputs.attention_mask
258
+
259
+ outputs = self.text_encoder(
260
+ **inputs, use_cache=False, return_dict=True, output_hidden_states=True
261
+ )
262
+ prompt_embeds = outputs.hidden_states[self.return_index]
263
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
264
+ else:
265
+ prompt_embeds = prompt_embeds.to(device=device)
266
+ if prompt_embeds_mask is not None:
267
+ prompt_embeds_mask = prompt_embeds_mask.to(device=device)
268
+
269
+ if num_images_per_prompt > 1:
270
+ prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
271
+ if prompt_embeds_mask is not None:
272
+ prompt_embeds_mask = prompt_embeds_mask.repeat_interleave(
273
+ num_images_per_prompt, dim=0
274
+ )
275
+
276
+ if prompt_embeds_mask is not None and prompt_embeds_mask.all():
277
+ prompt_embeds_mask = None
278
+
279
+ return prompt_embeds, prompt_embeds_mask
280
+
281
+ # ------------------------------------------------------------------ #
282
+ # Input validation #
283
+ # ------------------------------------------------------------------ #
284
+
285
+ def check_inputs(
286
+ self,
287
+ prompt,
288
+ height,
289
+ width,
290
+ negative_prompt=None,
291
+ prompt_embeds=None,
292
+ negative_prompt_embeds=None,
293
+ prompt_embeds_mask=None,
294
+ negative_prompt_embeds_mask=None,
295
+ callback_on_step_end_tensor_inputs=None,
296
+ max_sequence_length=None,
297
+ ):
298
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
299
+ logger.warning(
300
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} "
301
+ f"but are {height} and {width}. Dimensions will be resized accordingly"
302
+ )
303
+
304
+ if callback_on_step_end_tensor_inputs is not None and not all(
305
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
306
+ ):
307
+ raise ValueError(
308
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, "
309
+ f"but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
310
+ )
311
+
312
+ if prompt is not None and prompt_embeds is not None:
313
+ raise ValueError(
314
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. "
315
+ "Please make sure to only forward one of the two."
316
+ )
317
+ elif prompt is None and prompt_embeds is None:
318
+ raise ValueError(
319
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both undefined."
320
+ )
321
+ elif prompt is not None and not isinstance(prompt, (str, list)):
322
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
323
+
324
+ if negative_prompt is not None and negative_prompt_embeds is not None:
325
+ raise ValueError(
326
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and "
327
+ f"`negative_prompt_embeds`: {negative_prompt_embeds}. "
328
+ "Please make sure to only forward one of the two."
329
+ )
330
+
331
+ if max_sequence_length is not None and max_sequence_length > 1024:
332
+ raise ValueError(
333
+ f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}"
334
+ )
335
+
336
+ # ------------------------------------------------------------------ #
337
+ # Latent helpers #
338
+ # ------------------------------------------------------------------ #
339
+
340
+ @staticmethod
341
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
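+ # Rearranges latents into a sequence of 2x2 spatial patches: output (B, (H/2)*(W/2), C*4), matching the transformer's patch_size=2.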
342
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
343
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
344
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
345
+ return latents
346
+
347
+ @staticmethod
348
+ def _unpack_latents(latents, height, width, vae_scale_factor):
349
+ batch_size, num_patches, channels = latents.shape
350
+ height = 2 * (int(height) // (vae_scale_factor * 2))
351
+ width = 2 * (int(width) // (vae_scale_factor * 2))
352
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
353
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
354
+ latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width)
355
+ return latents
356
+
357
+ def prepare_latents(
358
+ self,
359
+ batch_size,
360
+ num_channels_latents,
361
+ height,
362
+ width,
363
+ dtype,
364
+ device,
365
+ generator,
366
+ latents=None,
367
+ ):
368
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
369
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
370
+ shape = (batch_size, 1, num_channels_latents, height, width)
371
+
372
+ if latents is not None:
373
+ return latents.to(device=device, dtype=dtype)
374
+
375
+ if isinstance(generator, list) and len(generator) != batch_size:
376
+ raise ValueError(
377
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
378
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
379
+ )
380
+
381
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
382
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
383
+ return latents
384
+
385
+ # ------------------------------------------------------------------ #
386
+ # Convenience methods for VAE #
387
+ # ------------------------------------------------------------------ #
388
+
389
+ def enable_vae_slicing(self):
390
+ r"""Enable sliced VAE decoding for memory efficiency."""
391
+ depr_message = (
392
+ f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and will be "
393
+ "removed in a future version. Please use `pipe.vae.enable_slicing()`."
394
+ )
395
+ deprecate("enable_vae_slicing", "0.40.0", depr_message)
396
+ self.vae.enable_slicing()
397
+
398
+ def disable_vae_slicing(self):
399
+ r"""Disable sliced VAE decoding."""
400
+ depr_message = (
401
+ f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and will be "
402
+ "removed in a future version. Please use `pipe.vae.disable_slicing()`."
403
+ )
404
+ deprecate("disable_vae_slicing", "0.40.0", depr_message)
405
+ self.vae.disable_slicing()
406
+
407
+ def enable_vae_tiling(self):
408
+ r"""Enable tiled VAE decoding for memory efficiency."""
409
+ depr_message = (
410
+ f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and will be "
411
+ "removed in a future version. Please use `pipe.vae.enable_tiling()`."
412
+ )
413
+ deprecate("enable_vae_tiling", "0.40.0", depr_message)
414
+ self.vae.enable_tiling()
415
+
416
+ def disable_vae_tiling(self):
417
+ r"""Disable tiled VAE decoding."""
418
+ depr_message = (
419
+ f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and will be "
420
+ "removed in a future version. Please use `pipe.vae.disable_tiling()`."
421
+ )
422
+ deprecate("disable_vae_tiling", "0.40.0", depr_message)
423
+ self.vae.disable_tiling()
424
+
425
+ # ------------------------------------------------------------------ #
426
+ # Properties #
427
+ # ------------------------------------------------------------------ #
428
+
429
+ @property
430
+ def attention_kwargs(self):
431
+ return self._attention_kwargs
432
+
433
+ @property
434
+ def num_timesteps(self):
435
+ return self._num_timesteps
436
+
437
+ @property
438
+ def current_timestep(self):
439
+ return self._current_timestep
440
+
441
+ @property
442
+ def interrupt(self):
443
+ return self._interrupt
444
+
445
+ # ------------------------------------------------------------------ #
446
+ # Main call #
447
+ # ------------------------------------------------------------------ #
448
+
449
+ @torch.no_grad()
450
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
451
+ def __call__(
452
+ self,
453
+ prompt: str | list[str] = None,
454
+ negative_prompt: str | list[str] = None,
455
+ true_cfg_scale: float = 4.0,
456
+ height: int | None = None,
457
+ width: int | None = None,
458
+ num_inference_steps: int = 50,
459
+ sigmas: list[float] | None = None,
460
+ num_images_per_prompt: int = 1,
461
+ generator: torch.Generator | list[torch.Generator] | None = None,
462
+ latents: torch.Tensor | None = None,
463
+ prompt_embeds: torch.Tensor | None = None,
464
+ prompt_embeds_mask: torch.Tensor | None = None,
465
+ negative_prompt_embeds: torch.Tensor | None = None,
466
+ negative_prompt_embeds_mask: torch.Tensor | None = None,
467
+ output_type: str | None = "pil",
468
+ return_dict: bool = True,
469
+ attention_kwargs: dict[str, Any] | None = None,
470
+ callback_on_step_end: Callable[[int, int, dict], None] | None = None,
471
+ callback_on_step_end_tensor_inputs: list[str] = ["latents"],
472
+ max_sequence_length: int = 512,
473
+ ):
474
+ r"""
475
+ Function invoked when calling the pipeline for generation.
476
+
477
+ Args:
478
+ prompt (`str` or `list[str]`, *optional*):
479
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
480
+ negative_prompt (`str` or `list[str]`, *optional*):
481
+ The prompt or prompts not to guide the image generation. If not defined, an empty string is used
482
+ when `true_cfg_scale > 1`.
483
+ true_cfg_scale (`float`, *optional*, defaults to 4.0):
484
+ Classifier-free guidance scale. Values greater than 1 enable CFG. Higher values produce images
485
+ more closely linked to the text `prompt` at the expense of lower image quality.
486
+ height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
487
+ The height in pixels of the generated image.
488
+ width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
489
+ The width in pixels of the generated image.
490
+ num_inference_steps (`int`, *optional*, defaults to 50):
491
+ The number of denoising steps.
492
+ sigmas (`list[float]`, *optional*):
493
+ Custom sigmas for the denoising schedule. If not defined, a linear schedule is used.
494
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
495
+ The number of images to generate per prompt.
496
+ generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
497
+ One or a list of torch generators to make generation deterministic.
498
+ latents (`torch.Tensor`, *optional*):
499
+ Pre-generated noisy latents to be used as inputs for image generation.
500
+ prompt_embeds (`torch.Tensor`, *optional*):
501
+ Pre-generated text embeddings.
502
+ prompt_embeds_mask (`torch.Tensor`, *optional*):
503
+ Attention mask for pre-generated text embeddings.
504
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
505
+ Pre-generated negative text embeddings.
506
+ negative_prompt_embeds_mask (`torch.Tensor`, *optional*):
507
+ Attention mask for pre-generated negative text embeddings.
508
+ output_type (`str`, *optional*, defaults to `"pil"`):
509
+ The output format of the generated image. Choose between `"pil"`, `"np"`, or `"latent"`.
510
+ return_dict (`bool`, *optional*, defaults to `True`):
511
+ Whether or not to return a [`NucleusMoEImagePipelineOutput`] instead of a plain tuple.
512
+ attention_kwargs (`dict`, *optional*):
513
+ Kwargs passed to the attention processor.
514
+ callback_on_step_end (`Callable`, *optional*):
515
+ A function called at the end of each denoising step.
516
+ callback_on_step_end_tensor_inputs (`list`, *optional*):
517
+ Tensor inputs for the `callback_on_step_end` function.
518
+ max_sequence_length (`int`, defaults to 512):
519
+ Maximum sequence length for the text prompt.
520
+
521
+ Examples:
522
+
523
+ Returns:
524
+ [`NucleusMoEImagePipelineOutput`] or `tuple`:
525
+ [`NucleusMoEImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple` where the first
526
+ element is a list with the generated images.
527
+ """
528
+
529
+ height = height or self.default_sample_size * self.vae_scale_factor
530
+ width = width or self.default_sample_size * self.vae_scale_factor
531
+
532
+ # 1. Check inputs
533
+ self.check_inputs(
534
+ prompt,
535
+ height,
536
+ width,
537
+ negative_prompt=negative_prompt,
538
+ prompt_embeds=prompt_embeds,
539
+ negative_prompt_embeds=negative_prompt_embeds,
540
+ prompt_embeds_mask=prompt_embeds_mask,
541
+ negative_prompt_embeds_mask=negative_prompt_embeds_mask,
542
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
543
+ max_sequence_length=max_sequence_length,
544
+ )
545
+
546
+ self._attention_kwargs = attention_kwargs or {}
547
+ self._current_timestep = None
548
+ self._interrupt = False
549
+
550
+ # 2. Define call parameters
551
+ if prompt is not None and isinstance(prompt, str):
552
+ batch_size = 1
553
+ elif prompt is not None and isinstance(prompt, list):
554
+ batch_size = len(prompt)
555
+ else:
556
+ batch_size = prompt_embeds.shape[0]
557
+
558
+ device = self._execution_device
559
+
560
+ has_neg_prompt = negative_prompt is not None or (
561
+ negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
562
+ )
563
+ do_true_cfg = true_cfg_scale > 1
564
+
565
+ if do_true_cfg and not has_neg_prompt:
566
+ negative_prompt = [""] * batch_size
567
+
568
+ # 3. Encode prompts
569
+ prompt_embeds, prompt_embeds_mask = self.encode_prompt(
570
+ prompt=prompt,
571
+ prompt_embeds=prompt_embeds,
572
+ prompt_embeds_mask=prompt_embeds_mask,
573
+ device=device,
574
+ num_images_per_prompt=num_images_per_prompt,
575
+ max_sequence_length=max_sequence_length,
576
+ )
577
+ if do_true_cfg:
578
+ negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
579
+ prompt=negative_prompt,
580
+ prompt_embeds=negative_prompt_embeds,
581
+ prompt_embeds_mask=negative_prompt_embeds_mask,
582
+ device=device,
583
+ num_images_per_prompt=num_images_per_prompt,
584
+ max_sequence_length=max_sequence_length,
585
+ )
586
+
587
+ # 4. Prepare latent variables
588
+ num_channels_latents = self.transformer.config.in_channels // 4
589
+ latents = self.prepare_latents(
590
+ batch_size * num_images_per_prompt,
591
+ num_channels_latents,
592
+ height,
593
+ width,
594
+ prompt_embeds.dtype,
595
+ device,
596
+ generator,
597
+ latents,
598
+ )
599
+
600
+ latent_h = 2 * (int(height) // (self.vae_scale_factor * 2))
601
+ latent_w = 2 * (int(width) // (self.vae_scale_factor * 2))
602
+ img_shapes = [(1, latent_h // 2, latent_w // 2)] * (batch_size * num_images_per_prompt)
603
+
604
+ # 5. Prepare timesteps
605
+ sigmas = (
606
+ np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
607
+ )
608
+ image_seq_len = latents.shape[1]
609
+ mu = calculate_shift(
610
+ image_seq_len,
611
+ self.scheduler.config.get("base_image_seq_len", 256),
612
+ self.scheduler.config.get("max_image_seq_len", 4096),
613
+ self.scheduler.config.get("base_shift", 0.5),
614
+ self.scheduler.config.get("max_shift", 1.15),
615
+ )
616
+ timesteps, num_inference_steps = retrieve_timesteps(
617
+ self.scheduler,
618
+ num_inference_steps,
619
+ device,
620
+ sigmas=sigmas,
621
+ mu=mu,
622
+ )
623
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
624
+ self._num_timesteps = len(timesteps)
625
+
626
+ # 6. Denoising loop
627
+ self.scheduler.set_begin_index(0)
628
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
629
+ for i, t in enumerate(timesteps):
630
+ if self.interrupt:
631
+ continue
632
+
633
+ self._current_timestep = t
634
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
635
+
636
+ noise_pred = self.transformer(
637
+ hidden_states=latents,
638
+ timestep=timestep / 1000,
639
+ encoder_hidden_states=prompt_embeds,
640
+ encoder_hidden_states_mask=prompt_embeds_mask,
641
+ img_shapes=img_shapes,
642
+ attention_kwargs=self._attention_kwargs,
643
+ return_dict=False,
644
+ )[0]
645
+
646
+ if do_true_cfg:
647
+ neg_noise_pred = self.transformer(
648
+ hidden_states=latents,
649
+ timestep=timestep / 1000,
650
+ encoder_hidden_states=negative_prompt_embeds,
651
+ encoder_hidden_states_mask=negative_prompt_embeds_mask,
652
+ img_shapes=img_shapes,
653
+ attention_kwargs=self._attention_kwargs,
654
+ return_dict=False,
655
+ )[0]
656
+
657
+ comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
658
+ cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
659
+ noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
660
+ noise_pred = comb_pred * (cond_norm / noise_norm)
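+ # The rescaling above matches the guided prediction's per-token norm to that of the
+ # conditional prediction, keeping large `true_cfg_scale` values from inflating the latents.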
661
+
662
+ # Model predicts v = clean - noise; scheduler expects noise - clean
663
+ noise_pred = -noise_pred
664
+
665
+ latents_dtype = latents.dtype
666
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
667
+
668
+ if latents.dtype != latents_dtype:
669
+ if torch.backends.mps.is_available():
670
+ latents = latents.to(latents_dtype)
671
+
672
+ if callback_on_step_end is not None:
673
+ callback_kwargs = {}
674
+ for k in callback_on_step_end_tensor_inputs:
675
+ callback_kwargs[k] = locals()[k]
676
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
677
+ latents = callback_outputs.pop("latents", latents)
678
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
679
+
680
+ if i == len(timesteps) - 1 or (
681
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
682
+ ):
683
+ progress_bar.update()
684
+
685
+ if XLA_AVAILABLE:
686
+ xm.mark_step()
687
+
688
+ self._current_timestep = None
689
+
690
+ # 7. Decode latents
691
+ if output_type == "latent":
692
+ image = latents
693
+ else:
694
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
695
+ latents = latents.to(self.vae.dtype)
696
+ latents_mean = (
697
+ torch.tensor(self.vae.config.latents_mean)
698
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
699
+ .to(latents.device, latents.dtype)
700
+ )
701
+ latents_std = (
702
+ 1.0
703
+ / torch.tensor(self.vae.config.latents_std)
704
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
705
+ .to(latents.device, latents.dtype)
706
+ )
707
+ latents = latents / latents_std + latents_mean
708
+ image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
709
+ image = self.image_processor.postprocess(image, output_type=output_type)
710
+
711
+ # Offload all models
712
+ self.maybe_free_model_hooks()
713
+
714
+ if not return_dict:
715
+ return (image,)
716
+
717
+ return NucleusMoEImagePipelineOutput(images=image)
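For orientation, a minimal usage sketch of the pipeline defined above. The repository id below is a placeholder, and loading assumes a diffusers version that can resolve the custom pipeline and transformer classes in this repo via `trust_remote_code=True`:

```python
import torch
from diffusers import DiffusionPipeline

# "your-org/nucleus-moe-image" is a hypothetical repo id -- substitute the actual repository.
pipe = DiffusionPipeline.from_pretrained(
    "your-org/nucleus-moe-image",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to("cuda")

image = pipe(
    prompt="A watercolor lighthouse at dawn, soft morning light",
    true_cfg_scale=4.0,          # values > 1 enable true classifier-free guidance
    num_inference_steps=50,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("nucleus_moe_sample.png")
```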
pipeline_output.py ADDED
@@ -0,0 +1,20 @@
1
+ from dataclasses import dataclass
2
+
3
+ import numpy as np
4
+ import PIL.Image
5
+
6
+ from diffusers.utils import BaseOutput
7
+
8
+
9
+ @dataclass
10
+ class NucleusMoEImagePipelineOutput(BaseOutput):
11
+ """
12
+ Output class for Nucleus MoE Image pipelines.
13
+
14
+ Args:
15
+ images (`list[PIL.Image.Image]` or `np.ndarray`):
16
+ List of denoised PIL images of length `batch_size`, or a NumPy array of shape `(batch_size, height, width,
17
+ num_channels)`, containing the denoised images produced by the diffusion pipeline.
18
+ """
19
+
20
+ images: list[PIL.Image.Image] | np.ndarray
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
3
+ "_diffusers_version": "0.36.0",
4
+ "base_image_seq_len": 256,
5
+ "base_shift": 0.5,
6
+ "invert_sigmas": false,
7
+ "max_image_seq_len": 4096,
8
+ "max_shift": 1.15,
9
+ "num_train_timesteps": 1000,
10
+ "shift": 1.0,
11
+ "shift_terminal": null,
12
+ "stochastic_sampling": false,
13
+ "time_shift_type": "exponential",
14
+ "use_beta_sigmas": false,
15
+ "use_dynamic_shifting": false,
16
+ "use_exponential_sigmas": false,
17
+ "use_karras_sigmas": false
18
+ }
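The denoising loop above derives its flow-match shift `mu` from these fields and the packed image sequence length. A hedged sketch of that interpolation (mirroring the `calculate_shift` helper used by diffusers' flow-matching pipelines, with the defaults taken from this config):

```python
def calculate_shift(image_seq_len, base_seq_len=256, max_seq_len=4096,
                    base_shift=0.5, max_shift=1.15):
    # Linearly interpolate the shift as the packed sequence length grows from
    # base_image_seq_len to max_image_seq_len.
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    return image_seq_len * m + b

# Assuming the usual 8x VAE downsampling plus 2x2 latent packing, a 1024x1024 image
# gives (1024 // 16) ** 2 = 4096 packed tokens, which lands at the maximum shift:
print(calculate_shift(4096))  # ~1.15
```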
text_encoder/README.md ADDED
@@ -0,0 +1,192 @@
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: image-text-to-text
4
+ library_name: transformers
5
+ ---
6
+ <a href="https://chat.qwenlm.ai/" target="_blank" style="margin: 2px;">
7
+ <img alt="Chat" src="https://img.shields.io/badge/%F0%9F%92%9C%EF%B8%8F%20Qwen%20Chat%20-536af5" style="display: inline-block; vertical-align: middle;"/>
8
+ </a>
9
+
10
+
11
+ # Qwen3-VL-8B-Instruct
12
+
13
+
14
+ Meet Qwen3-VL — the most powerful vision-language model in the Qwen series to date.
15
+
16
+ This generation delivers comprehensive upgrades across the board: superior text understanding & generation, deeper visual perception & reasoning, extended context length, enhanced spatial and video dynamics comprehension, and stronger agent interaction capabilities.
17
+
18
+ Available in Dense and MoE architectures that scale from edge to cloud, with Instruct and reasoning‑enhanced Thinking editions for flexible, on‑demand deployment.
19
+
20
+
21
+ #### Key Enhancements:
22
+
23
+ * **Visual Agent**: Operates PC/mobile GUIs—recognizes elements, understands functions, invokes tools, completes tasks.
24
+
25
+ * **Visual Coding Boost**: Generates Draw.io/HTML/CSS/JS from images/videos.
26
+
27
+ * **Advanced Spatial Perception**: Judges object positions, viewpoints, and occlusions; provides stronger 2D grounding and enables 3D grounding for spatial reasoning and embodied AI.
28
+
29
+ * **Long Context & Video Understanding**: Native 256K context, expandable to 1M; handles books and hours-long video with full recall and second-level indexing.
30
+
31
+ * **Enhanced Multimodal Reasoning**: Excels in STEM/Math—causal analysis and logical, evidence-based answers.
32
+
33
+ * **Upgraded Visual Recognition**: Broader, higher-quality pretraining lets the model “recognize everything”: celebrities, anime, products, landmarks, flora/fauna, and more.
34
+
35
+ * **Expanded OCR**: Supports 32 languages (up from 19); robust in low light, blur, and tilt; better with rare/ancient characters and jargon; improved long-document structure parsing.
36
+
37
+ * **Text Understanding on par with pure LLMs**: Seamless text–vision fusion for lossless, unified comprehension.
38
+
39
+
40
+ #### Model Architecture Updates:
41
+
42
+ <p align="center">
43
+ <img src="https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/qwen3vl_arc.jpg" width="80%"/>
44
+ </p>
45
+
46
+
47
+ 1. **Interleaved-MRoPE**: Full‑frequency allocation over time, width, and height via robust positional embeddings, enhancing long‑horizon video reasoning.
48
+
49
+ 2. **DeepStack**: Fuses multi‑level ViT features to capture fine‑grained details and sharpen image–text alignment.
50
+
51
+ 3. **Text–Timestamp Alignment**: Moves beyond T‑RoPE to precise, timestamp‑grounded event localization for stronger video temporal modeling.
52
+
53
+ This is the weight repository for Qwen3-VL-8B-Instruct.
54
+
55
+
56
+ ---
57
+
58
+ ## Model Performance
59
+
60
+ **Multimodal performance**
61
+
62
+ ![](https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/qwen3vl_4b_8b_vl_instruct.jpg)
63
+
64
+ **Pure text performance**
65
+ ![](https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/qwen3vl_4b_8b_text_instruct.jpg)
66
+
67
+ ## Quickstart
68
+
69
+ Below, we provide simple examples to show how to use Qwen3-VL with 🤖 ModelScope and 🤗 Transformers.
70
+
71
+ The code for Qwen3-VL is available in the latest Hugging Face `transformers`, and we advise you to build from source with the command:
72
+ ```
73
+ pip install git+https://github.com/huggingface/transformers
74
+ # pip install transformers==4.57.0 # currently, V4.57.0 is not released
75
+ ```
76
+
77
+ ### Using 🤗 Transformers to Chat
78
+
79
+ Here is a code snippet showing how to use the chat model with `transformers`:
80
+
81
+ ```python
82
+ from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
83
+
84
+ # default: Load the model on the available device(s)
85
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
86
+ "Qwen/Qwen3-VL-8B-Instruct", dtype="auto", device_map="auto"
87
+ )
88
+
89
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
90
+ # model = Qwen3VLForConditionalGeneration.from_pretrained(
91
+ # "Qwen/Qwen3-VL-8B-Instruct",
92
+ # dtype=torch.bfloat16,
93
+ # attn_implementation="flash_attention_2",
94
+ # device_map="auto",
95
+ # )
96
+
97
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")
98
+
99
+ messages = [
100
+ {
101
+ "role": "user",
102
+ "content": [
103
+ {
104
+ "type": "image",
105
+ "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
106
+ },
107
+ {"type": "text", "text": "Describe this image."},
108
+ ],
109
+ }
110
+ ]
111
+
112
+ # Preparation for inference
113
+ inputs = processor.apply_chat_template(
114
+ messages,
115
+ tokenize=True,
116
+ add_generation_prompt=True,
117
+ return_dict=True,
118
+ return_tensors="pt"
119
+ )
120
+ inputs = inputs.to(model.device)
121
+
122
+ # Inference: Generation of the output
123
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
124
+ generated_ids_trimmed = [
125
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
126
+ ]
127
+ output_text = processor.batch_decode(
128
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
129
+ )
130
+ print(output_text)
131
+ ```
132
+
133
+ ### Generation Hyperparameters
134
+ #### VL
135
+ ```bash
136
+ export greedy='false'
137
+ export top_p=0.8
138
+ export top_k=20
139
+ export temperature=0.7
140
+ export repetition_penalty=1.0
141
+ export presence_penalty=1.5
142
+ export out_seq_length=16384
143
+ ```
144
+
145
+ #### Text
146
+ ```bash
147
+ export greedy='false'
148
+ export top_p=1.0
149
+ export top_k=40
150
+ export repetition_penalty=1.0
151
+ export presence_penalty=2.0
152
+ export temperature=1.0
153
+ export out_seq_length=32768
154
+ ```
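If you sample with plain `transformers` rather than a serving stack, most of these settings map directly onto `generate()` arguments. A hedged sketch using the `model` and `inputs` from the quickstart above (note that `presence_penalty` has no direct `generate()` equivalent, and `out_seq_length` corresponds to `max_new_tokens`):

```python
# VL settings mapped onto transformers' generate(); presence_penalty is an
# OpenAI/vLLM-style option that generate() does not expose directly.
generated_ids = model.generate(
    **inputs,
    do_sample=True,        # greedy='false'
    top_p=0.8,
    top_k=20,
    temperature=0.7,
    repetition_penalty=1.0,
    max_new_tokens=16384,  # out_seq_length
)
```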
155
+
156
+
157
+ ## Citation
158
+
159
+ If you find our work helpful, feel free to cite us.
160
+
161
+ ```
162
+ @misc{qwen3technicalreport,
163
+ title={Qwen3 Technical Report},
164
+ author={Qwen Team},
165
+ year={2025},
166
+ eprint={2505.09388},
167
+ archivePrefix={arXiv},
168
+ primaryClass={cs.CL},
169
+ url={https://arxiv.org/abs/2505.09388},
170
+ }
171
+
172
+ @article{Qwen2.5-VL,
173
+ title={Qwen2.5-VL Technical Report},
174
+ author={Bai, Shuai and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Song, Sibo and Dang, Kai and Wang, Peng and Wang, Shijie and Tang, Jun and Zhong, Humen and Zhu, Yuanzhi and Yang, Mingkun and Li, Zhaohai and Wan, Jianqiang and Wang, Pengfei and Ding, Wei and Fu, Zheren and Xu, Yiheng and Ye, Jiabo and Zhang, Xi and Xie, Tianbao and Cheng, Zesen and Zhang, Hang and Yang, Zhibo and Xu, Haiyang and Lin, Junyang},
175
+ journal={arXiv preprint arXiv:2502.13923},
176
+ year={2025}
177
+ }
178
+
179
+ @article{Qwen2VL,
180
+ title={Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution},
181
+ author={Wang, Peng and Bai, Shuai and Tan, Sinan and Wang, Shijie and Fan, Zhihao and Bai, Jinze and Chen, Keqin and Liu, Xuejing and Wang, Jialin and Ge, Wenbin and Fan, Yang and Dang, Kai and Du, Mengfei and Ren, Xuancheng and Men, Rui and Liu, Dayiheng and Zhou, Chang and Zhou, Jingren and Lin, Junyang},
182
+ journal={arXiv preprint arXiv:2409.12191},
183
+ year={2024}
184
+ }
185
+
186
+ @article{Qwen-VL,
187
+ title={Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond},
188
+ author={Bai, Jinze and Bai, Shuai and Yang, Shusheng and Wang, Shijie and Tan, Sinan and Wang, Peng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
189
+ journal={arXiv preprint arXiv:2308.12966},
190
+ year={2023}
191
+ }
192
+ ```
text_encoder/chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if 
add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n"
3
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,62 @@
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLForConditionalGeneration"
4
+ ],
5
+ "image_token_id": 151655,
6
+ "model_type": "qwen3_vl",
7
+ "text_config": {
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 151643,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 151645,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 4096,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 12288,
18
+ "max_position_embeddings": 262144,
19
+ "model_type": "qwen3_vl_text",
20
+ "num_attention_heads": 32,
21
+ "num_hidden_layers": 36,
22
+ "num_key_value_heads": 8,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": {
25
+ "mrope_interleaved": true,
26
+ "mrope_section": [
27
+ 24,
28
+ 20,
29
+ 20
30
+ ],
31
+ "rope_type": "default"
32
+ },
33
+ "rope_theta": 5000000,
34
+ "use_cache": true,
35
+ "vocab_size": 151936
36
+ },
37
+ "tie_word_embeddings": false,
38
+ "transformers_version": "4.57.0.dev0",
39
+ "video_token_id": 151656,
40
+ "vision_config": {
41
+ "deepstack_visual_indexes": [
42
+ 8,
43
+ 16,
44
+ 24
45
+ ],
46
+ "depth": 27,
47
+ "hidden_act": "gelu_pytorch_tanh",
48
+ "hidden_size": 1152,
49
+ "in_channels": 3,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 4304,
52
+ "model_type": "qwen3_vl",
53
+ "num_heads": 16,
54
+ "num_position_embeddings": 2304,
55
+ "out_hidden_size": 4096,
56
+ "patch_size": 16,
57
+ "spatial_merge_size": 2,
58
+ "temporal_patch_size": 2
59
+ },
60
+ "vision_end_token_id": 151653,
61
+ "vision_start_token_id": 151652
62
+ }
text_encoder/generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "pad_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "top_k": 20,
10
+ "top_p": 0.8,
11
+ "repetition_penalty": 1.0,
12
+ "temperature": 0.7,
13
+ "transformers_version": "4.56.0"
14
+ }
text_encoder/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
text_encoder/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5d0aef0eb170fc7453a296c43c0849a56f510555d3588e4fd662bb35490aefa
3
+ size 4902275944
text_encoder/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8be88fb5501e4d5719a6d4cc212e6a13480330e74f3e8c77daa1a68f199106b5
3
+ size 4915962496
text_encoder/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83de00eafe6e0d57ccd009dbcf71c9974d74df2f016c27afb7e95aafd16b2192
3
+ size 4999831048
text_encoder/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a88b98e9f96270973f567e6a2c103ede6ccdf915ca3075e21c755604d0377a5
3
+ size 2716270024
text_encoder/model.safetensors.index.json ADDED
@@ -0,0 +1,757 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 17534247392
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "model.language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
14
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
17
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
20
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
24
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
25
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
28
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.language_model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
31
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
35
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
36
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
39
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.language_model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
42
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
47
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
50
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
51
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
53
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
58
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
61
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.language_model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
64
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
68
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
69
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
72
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.language_model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
75
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
79
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
80
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
83
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.language_model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
87
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
90
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
91
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
92
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
94
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
101
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
102
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
105
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.language_model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
108
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
112
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
113
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
114
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
116
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.language_model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
119
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
122
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
123
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
124
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
127
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.language_model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
130
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
134
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
135
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
138
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
141
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
142
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
143
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
144
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
145
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
146
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
147
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
148
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
149
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
150
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
151
+ "model.language_model.layers.20.input_layernorm.weight": "model-00002-of-00004.safetensors",
152
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
153
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
155
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
156
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
157
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
158
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
160
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
161
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
162
+ "model.language_model.layers.21.input_layernorm.weight": "model-00002-of-00004.safetensors",
163
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
164
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
165
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
167
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
168
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
169
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
170
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
171
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
172
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
173
+ "model.language_model.layers.22.input_layernorm.weight": "model-00002-of-00004.safetensors",
174
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
178
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
179
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
180
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
181
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
182
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
183
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.language_model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
185
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
186
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
190
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
193
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
196
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
200
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
201
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
204
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.language_model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
207
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
211
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
212
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
215
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.language_model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
219
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
222
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
223
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
224
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
226
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.language_model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
233
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
234
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
237
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.language_model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
240
+ "model.language_model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.language_model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.language_model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.language_model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
244
+ "model.language_model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
245
+ "model.language_model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
246
+ "model.language_model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.language_model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
248
+ "model.language_model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.language_model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.language_model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
251
+ "model.language_model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.language_model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.language_model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.language_model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
255
+ "model.language_model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
256
+ "model.language_model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.language_model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.language_model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
259
+ "model.language_model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.language_model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
267
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
270
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.language_model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
273
+ "model.language_model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
274
+ "model.language_model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.language_model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.language_model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
277
+ "model.language_model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
278
+ "model.language_model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.language_model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.language_model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
281
+ "model.language_model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
282
+ "model.language_model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.language_model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
284
+ "model.language_model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
285
+ "model.language_model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.language_model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.language_model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
288
+ "model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
289
+ "model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
290
+ "model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
292
+ "model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.language_model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
295
+ "model.language_model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.language_model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
297
+ "model.language_model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.language_model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
299
+ "model.language_model.layers.32.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
300
+ "model.language_model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
301
+ "model.language_model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.language_model.layers.32.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
303
+ "model.language_model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.language_model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.language_model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
306
+ "model.language_model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.language_model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.language_model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.language_model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
310
+ "model.language_model.layers.33.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
311
+ "model.language_model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.language_model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.language_model.layers.33.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
314
+ "model.language_model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
315
+ "model.language_model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.language_model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
317
+ "model.language_model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
318
+ "model.language_model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
319
+ "model.language_model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
320
+ "model.language_model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
321
+ "model.language_model.layers.34.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
322
+ "model.language_model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
323
+ "model.language_model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.language_model.layers.34.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
325
+ "model.language_model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.language_model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.language_model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
328
+ "model.language_model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
329
+ "model.language_model.layers.35.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.language_model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
331
+ "model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
332
+ "model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
333
+ "model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
334
+ "model.language_model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
335
+ "model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
336
+ "model.language_model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
337
+ "model.language_model.layers.35.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
338
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
339
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
340
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
341
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
342
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
343
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
344
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
345
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
347
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
348
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
349
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
350
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
351
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
353
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
354
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
355
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
356
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
358
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
359
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
361
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
362
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
363
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
364
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
365
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
366
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
367
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
368
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
369
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
370
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
371
+ "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
372
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
373
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
374
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
375
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
376
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
377
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
378
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
379
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
380
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
381
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
382
+ "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
383
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
384
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
385
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
386
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
387
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
388
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
389
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
390
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
391
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
392
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
393
+ "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00004.safetensors",
394
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
395
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
396
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
397
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
398
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
399
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
400
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
401
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
402
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
403
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
404
+ "model.language_model.norm.weight": "model-00004-of-00004.safetensors",
405
+ "model.visual.blocks.0.attn.proj.bias": "model-00004-of-00004.safetensors",
406
+ "model.visual.blocks.0.attn.proj.weight": "model-00004-of-00004.safetensors",
407
+ "model.visual.blocks.0.attn.qkv.bias": "model-00004-of-00004.safetensors",
408
+ "model.visual.blocks.0.attn.qkv.weight": "model-00004-of-00004.safetensors",
409
+ "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
410
+ "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
411
+ "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
412
+ "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
413
+ "model.visual.blocks.0.norm1.bias": "model-00004-of-00004.safetensors",
414
+ "model.visual.blocks.0.norm1.weight": "model-00004-of-00004.safetensors",
415
+ "model.visual.blocks.0.norm2.bias": "model-00004-of-00004.safetensors",
416
+ "model.visual.blocks.0.norm2.weight": "model-00004-of-00004.safetensors",
417
+ "model.visual.blocks.1.attn.proj.bias": "model-00004-of-00004.safetensors",
418
+ "model.visual.blocks.1.attn.proj.weight": "model-00004-of-00004.safetensors",
419
+ "model.visual.blocks.1.attn.qkv.bias": "model-00004-of-00004.safetensors",
420
+ "model.visual.blocks.1.attn.qkv.weight": "model-00004-of-00004.safetensors",
421
+ "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
422
+ "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
423
+ "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
424
+ "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
425
+ "model.visual.blocks.1.norm1.bias": "model-00004-of-00004.safetensors",
426
+ "model.visual.blocks.1.norm1.weight": "model-00004-of-00004.safetensors",
427
+ "model.visual.blocks.1.norm2.bias": "model-00004-of-00004.safetensors",
428
+ "model.visual.blocks.1.norm2.weight": "model-00004-of-00004.safetensors",
429
+ "model.visual.blocks.10.attn.proj.bias": "model-00004-of-00004.safetensors",
430
+ "model.visual.blocks.10.attn.proj.weight": "model-00004-of-00004.safetensors",
431
+ "model.visual.blocks.10.attn.qkv.bias": "model-00004-of-00004.safetensors",
432
+ "model.visual.blocks.10.attn.qkv.weight": "model-00004-of-00004.safetensors",
433
+ "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
434
+ "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
435
+ "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
436
+ "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
437
+ "model.visual.blocks.10.norm1.bias": "model-00004-of-00004.safetensors",
438
+ "model.visual.blocks.10.norm1.weight": "model-00004-of-00004.safetensors",
439
+ "model.visual.blocks.10.norm2.bias": "model-00004-of-00004.safetensors",
440
+ "model.visual.blocks.10.norm2.weight": "model-00004-of-00004.safetensors",
441
+ "model.visual.blocks.11.attn.proj.bias": "model-00004-of-00004.safetensors",
442
+ "model.visual.blocks.11.attn.proj.weight": "model-00004-of-00004.safetensors",
443
+ "model.visual.blocks.11.attn.qkv.bias": "model-00004-of-00004.safetensors",
444
+ "model.visual.blocks.11.attn.qkv.weight": "model-00004-of-00004.safetensors",
445
+ "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
446
+ "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
447
+ "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
448
+ "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
449
+ "model.visual.blocks.11.norm1.bias": "model-00004-of-00004.safetensors",
450
+ "model.visual.blocks.11.norm1.weight": "model-00004-of-00004.safetensors",
451
+ "model.visual.blocks.11.norm2.bias": "model-00004-of-00004.safetensors",
452
+ "model.visual.blocks.11.norm2.weight": "model-00004-of-00004.safetensors",
453
+ "model.visual.blocks.12.attn.proj.bias": "model-00004-of-00004.safetensors",
454
+ "model.visual.blocks.12.attn.proj.weight": "model-00004-of-00004.safetensors",
455
+ "model.visual.blocks.12.attn.qkv.bias": "model-00004-of-00004.safetensors",
456
+ "model.visual.blocks.12.attn.qkv.weight": "model-00004-of-00004.safetensors",
457
+ "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
458
+ "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
459
+ "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
460
+ "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
461
+ "model.visual.blocks.12.norm1.bias": "model-00004-of-00004.safetensors",
462
+ "model.visual.blocks.12.norm1.weight": "model-00004-of-00004.safetensors",
463
+ "model.visual.blocks.12.norm2.bias": "model-00004-of-00004.safetensors",
464
+ "model.visual.blocks.12.norm2.weight": "model-00004-of-00004.safetensors",
465
+ "model.visual.blocks.13.attn.proj.bias": "model-00004-of-00004.safetensors",
466
+ "model.visual.blocks.13.attn.proj.weight": "model-00004-of-00004.safetensors",
467
+ "model.visual.blocks.13.attn.qkv.bias": "model-00004-of-00004.safetensors",
468
+ "model.visual.blocks.13.attn.qkv.weight": "model-00004-of-00004.safetensors",
469
+ "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
470
+ "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
471
+ "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
472
+ "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
473
+ "model.visual.blocks.13.norm1.bias": "model-00004-of-00004.safetensors",
474
+ "model.visual.blocks.13.norm1.weight": "model-00004-of-00004.safetensors",
475
+ "model.visual.blocks.13.norm2.bias": "model-00004-of-00004.safetensors",
476
+ "model.visual.blocks.13.norm2.weight": "model-00004-of-00004.safetensors",
477
+ "model.visual.blocks.14.attn.proj.bias": "model-00004-of-00004.safetensors",
478
+ "model.visual.blocks.14.attn.proj.weight": "model-00004-of-00004.safetensors",
479
+ "model.visual.blocks.14.attn.qkv.bias": "model-00004-of-00004.safetensors",
480
+ "model.visual.blocks.14.attn.qkv.weight": "model-00004-of-00004.safetensors",
481
+ "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
482
+ "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
483
+ "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
484
+ "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
485
+ "model.visual.blocks.14.norm1.bias": "model-00004-of-00004.safetensors",
486
+ "model.visual.blocks.14.norm1.weight": "model-00004-of-00004.safetensors",
487
+ "model.visual.blocks.14.norm2.bias": "model-00004-of-00004.safetensors",
488
+ "model.visual.blocks.14.norm2.weight": "model-00004-of-00004.safetensors",
489
+ "model.visual.blocks.15.attn.proj.bias": "model-00004-of-00004.safetensors",
490
+ "model.visual.blocks.15.attn.proj.weight": "model-00004-of-00004.safetensors",
491
+ "model.visual.blocks.15.attn.qkv.bias": "model-00004-of-00004.safetensors",
492
+ "model.visual.blocks.15.attn.qkv.weight": "model-00004-of-00004.safetensors",
493
+ "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
494
+ "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
495
+ "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
496
+ "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
497
+ "model.visual.blocks.15.norm1.bias": "model-00004-of-00004.safetensors",
498
+ "model.visual.blocks.15.norm1.weight": "model-00004-of-00004.safetensors",
499
+ "model.visual.blocks.15.norm2.bias": "model-00004-of-00004.safetensors",
500
+ "model.visual.blocks.15.norm2.weight": "model-00004-of-00004.safetensors",
501
+ "model.visual.blocks.16.attn.proj.bias": "model-00004-of-00004.safetensors",
502
+ "model.visual.blocks.16.attn.proj.weight": "model-00004-of-00004.safetensors",
503
+ "model.visual.blocks.16.attn.qkv.bias": "model-00004-of-00004.safetensors",
504
+ "model.visual.blocks.16.attn.qkv.weight": "model-00004-of-00004.safetensors",
505
+ "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
506
+ "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
507
+ "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
508
+ "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
509
+ "model.visual.blocks.16.norm1.bias": "model-00004-of-00004.safetensors",
510
+ "model.visual.blocks.16.norm1.weight": "model-00004-of-00004.safetensors",
511
+ "model.visual.blocks.16.norm2.bias": "model-00004-of-00004.safetensors",
512
+ "model.visual.blocks.16.norm2.weight": "model-00004-of-00004.safetensors",
513
+ "model.visual.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors",
514
+ "model.visual.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
515
+ "model.visual.blocks.17.attn.qkv.bias": "model-00004-of-00004.safetensors",
516
+ "model.visual.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors",
517
+ "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
518
+ "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
519
+ "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
520
+ "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
521
+ "model.visual.blocks.17.norm1.bias": "model-00004-of-00004.safetensors",
522
+ "model.visual.blocks.17.norm1.weight": "model-00004-of-00004.safetensors",
523
+ "model.visual.blocks.17.norm2.bias": "model-00004-of-00004.safetensors",
524
+ "model.visual.blocks.17.norm2.weight": "model-00004-of-00004.safetensors",
525
+ "model.visual.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors",
526
+ "model.visual.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors",
527
+ "model.visual.blocks.18.attn.qkv.bias": "model-00004-of-00004.safetensors",
528
+ "model.visual.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors",
529
+ "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
530
+ "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
531
+ "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
532
+ "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
533
+ "model.visual.blocks.18.norm1.bias": "model-00004-of-00004.safetensors",
534
+ "model.visual.blocks.18.norm1.weight": "model-00004-of-00004.safetensors",
535
+ "model.visual.blocks.18.norm2.bias": "model-00004-of-00004.safetensors",
536
+ "model.visual.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
537
+ "model.visual.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
538
+ "model.visual.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors",
539
+ "model.visual.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
540
+ "model.visual.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
541
+ "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
542
+ "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
543
+ "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
544
+ "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
545
+ "model.visual.blocks.19.norm1.bias": "model-00004-of-00004.safetensors",
546
+ "model.visual.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
547
+ "model.visual.blocks.19.norm2.bias": "model-00004-of-00004.safetensors",
548
+ "model.visual.blocks.19.norm2.weight": "model-00004-of-00004.safetensors",
549
+ "model.visual.blocks.2.attn.proj.bias": "model-00004-of-00004.safetensors",
550
+ "model.visual.blocks.2.attn.proj.weight": "model-00004-of-00004.safetensors",
551
+ "model.visual.blocks.2.attn.qkv.bias": "model-00004-of-00004.safetensors",
552
+ "model.visual.blocks.2.attn.qkv.weight": "model-00004-of-00004.safetensors",
553
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
554
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
555
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
556
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
557
+ "model.visual.blocks.2.norm1.bias": "model-00004-of-00004.safetensors",
558
+ "model.visual.blocks.2.norm1.weight": "model-00004-of-00004.safetensors",
559
+ "model.visual.blocks.2.norm2.bias": "model-00004-of-00004.safetensors",
560
+ "model.visual.blocks.2.norm2.weight": "model-00004-of-00004.safetensors",
561
+ "model.visual.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors",
562
+ "model.visual.blocks.20.attn.proj.weight": "model-00004-of-00004.safetensors",
563
+ "model.visual.blocks.20.attn.qkv.bias": "model-00004-of-00004.safetensors",
564
+ "model.visual.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors",
565
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
566
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
567
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
568
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
569
+ "model.visual.blocks.20.norm1.bias": "model-00004-of-00004.safetensors",
570
+ "model.visual.blocks.20.norm1.weight": "model-00004-of-00004.safetensors",
571
+ "model.visual.blocks.20.norm2.bias": "model-00004-of-00004.safetensors",
572
+ "model.visual.blocks.20.norm2.weight": "model-00004-of-00004.safetensors",
573
+ "model.visual.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors",
574
+ "model.visual.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors",
575
+ "model.visual.blocks.21.attn.qkv.bias": "model-00004-of-00004.safetensors",
576
+ "model.visual.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors",
577
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
578
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
579
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
580
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
581
+ "model.visual.blocks.21.norm1.bias": "model-00004-of-00004.safetensors",
582
+ "model.visual.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
583
+ "model.visual.blocks.21.norm2.bias": "model-00004-of-00004.safetensors",
584
+ "model.visual.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
585
+ "model.visual.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
586
+ "model.visual.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors",
587
+ "model.visual.blocks.22.attn.qkv.bias": "model-00004-of-00004.safetensors",
588
+ "model.visual.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors",
589
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
590
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
591
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
592
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
593
+ "model.visual.blocks.22.norm1.bias": "model-00004-of-00004.safetensors",
594
+ "model.visual.blocks.22.norm1.weight": "model-00004-of-00004.safetensors",
595
+ "model.visual.blocks.22.norm2.bias": "model-00004-of-00004.safetensors",
596
+ "model.visual.blocks.22.norm2.weight": "model-00004-of-00004.safetensors",
597
+ "model.visual.blocks.23.attn.proj.bias": "model-00004-of-00004.safetensors",
598
+ "model.visual.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
599
+ "model.visual.blocks.23.attn.qkv.bias": "model-00004-of-00004.safetensors",
600
+ "model.visual.blocks.23.attn.qkv.weight": "model-00004-of-00004.safetensors",
601
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
602
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
603
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
604
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
605
+ "model.visual.blocks.23.norm1.bias": "model-00004-of-00004.safetensors",
606
+ "model.visual.blocks.23.norm1.weight": "model-00004-of-00004.safetensors",
607
+ "model.visual.blocks.23.norm2.bias": "model-00004-of-00004.safetensors",
608
+ "model.visual.blocks.23.norm2.weight": "model-00004-of-00004.safetensors",
609
+ "model.visual.blocks.24.attn.proj.bias": "model-00004-of-00004.safetensors",
610
+ "model.visual.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
611
+ "model.visual.blocks.24.attn.qkv.bias": "model-00004-of-00004.safetensors",
612
+ "model.visual.blocks.24.attn.qkv.weight": "model-00004-of-00004.safetensors",
613
+ "model.visual.blocks.24.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
614
+ "model.visual.blocks.24.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
615
+ "model.visual.blocks.24.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
616
+ "model.visual.blocks.24.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
617
+ "model.visual.blocks.24.norm1.bias": "model-00004-of-00004.safetensors",
618
+ "model.visual.blocks.24.norm1.weight": "model-00004-of-00004.safetensors",
619
+ "model.visual.blocks.24.norm2.bias": "model-00004-of-00004.safetensors",
620
+ "model.visual.blocks.24.norm2.weight": "model-00004-of-00004.safetensors",
621
+ "model.visual.blocks.25.attn.proj.bias": "model-00004-of-00004.safetensors",
622
+ "model.visual.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
623
+ "model.visual.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
624
+ "model.visual.blocks.25.attn.qkv.weight": "model-00004-of-00004.safetensors",
625
+ "model.visual.blocks.25.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
626
+ "model.visual.blocks.25.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
627
+ "model.visual.blocks.25.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
628
+ "model.visual.blocks.25.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
629
+ "model.visual.blocks.25.norm1.bias": "model-00004-of-00004.safetensors",
630
+ "model.visual.blocks.25.norm1.weight": "model-00004-of-00004.safetensors",
631
+ "model.visual.blocks.25.norm2.bias": "model-00004-of-00004.safetensors",
632
+ "model.visual.blocks.25.norm2.weight": "model-00004-of-00004.safetensors",
633
+ "model.visual.blocks.26.attn.proj.bias": "model-00004-of-00004.safetensors",
634
+ "model.visual.blocks.26.attn.proj.weight": "model-00004-of-00004.safetensors",
635
+ "model.visual.blocks.26.attn.qkv.bias": "model-00004-of-00004.safetensors",
636
+ "model.visual.blocks.26.attn.qkv.weight": "model-00004-of-00004.safetensors",
637
+ "model.visual.blocks.26.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
638
+ "model.visual.blocks.26.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
639
+ "model.visual.blocks.26.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
640
+ "model.visual.blocks.26.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
641
+ "model.visual.blocks.26.norm1.bias": "model-00004-of-00004.safetensors",
642
+ "model.visual.blocks.26.norm1.weight": "model-00004-of-00004.safetensors",
643
+ "model.visual.blocks.26.norm2.bias": "model-00004-of-00004.safetensors",
644
+ "model.visual.blocks.26.norm2.weight": "model-00004-of-00004.safetensors",
645
+ "model.visual.blocks.3.attn.proj.bias": "model-00004-of-00004.safetensors",
646
+ "model.visual.blocks.3.attn.proj.weight": "model-00004-of-00004.safetensors",
647
+ "model.visual.blocks.3.attn.qkv.bias": "model-00004-of-00004.safetensors",
648
+ "model.visual.blocks.3.attn.qkv.weight": "model-00004-of-00004.safetensors",
649
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
650
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
651
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
652
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
653
+ "model.visual.blocks.3.norm1.bias": "model-00004-of-00004.safetensors",
654
+ "model.visual.blocks.3.norm1.weight": "model-00004-of-00004.safetensors",
655
+ "model.visual.blocks.3.norm2.bias": "model-00004-of-00004.safetensors",
656
+ "model.visual.blocks.3.norm2.weight": "model-00004-of-00004.safetensors",
657
+ "model.visual.blocks.4.attn.proj.bias": "model-00004-of-00004.safetensors",
658
+ "model.visual.blocks.4.attn.proj.weight": "model-00004-of-00004.safetensors",
659
+ "model.visual.blocks.4.attn.qkv.bias": "model-00004-of-00004.safetensors",
660
+ "model.visual.blocks.4.attn.qkv.weight": "model-00004-of-00004.safetensors",
661
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
662
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
663
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
664
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
665
+ "model.visual.blocks.4.norm1.bias": "model-00004-of-00004.safetensors",
666
+ "model.visual.blocks.4.norm1.weight": "model-00004-of-00004.safetensors",
667
+ "model.visual.blocks.4.norm2.bias": "model-00004-of-00004.safetensors",
668
+ "model.visual.blocks.4.norm2.weight": "model-00004-of-00004.safetensors",
669
+ "model.visual.blocks.5.attn.proj.bias": "model-00004-of-00004.safetensors",
670
+ "model.visual.blocks.5.attn.proj.weight": "model-00004-of-00004.safetensors",
671
+ "model.visual.blocks.5.attn.qkv.bias": "model-00004-of-00004.safetensors",
672
+ "model.visual.blocks.5.attn.qkv.weight": "model-00004-of-00004.safetensors",
673
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
674
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
675
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
676
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
677
+ "model.visual.blocks.5.norm1.bias": "model-00004-of-00004.safetensors",
678
+ "model.visual.blocks.5.norm1.weight": "model-00004-of-00004.safetensors",
679
+ "model.visual.blocks.5.norm2.bias": "model-00004-of-00004.safetensors",
680
+ "model.visual.blocks.5.norm2.weight": "model-00004-of-00004.safetensors",
681
+ "model.visual.blocks.6.attn.proj.bias": "model-00004-of-00004.safetensors",
682
+ "model.visual.blocks.6.attn.proj.weight": "model-00004-of-00004.safetensors",
683
+ "model.visual.blocks.6.attn.qkv.bias": "model-00004-of-00004.safetensors",
684
+ "model.visual.blocks.6.attn.qkv.weight": "model-00004-of-00004.safetensors",
685
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
686
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
687
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
688
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
689
+ "model.visual.blocks.6.norm1.bias": "model-00004-of-00004.safetensors",
690
+ "model.visual.blocks.6.norm1.weight": "model-00004-of-00004.safetensors",
691
+ "model.visual.blocks.6.norm2.bias": "model-00004-of-00004.safetensors",
692
+ "model.visual.blocks.6.norm2.weight": "model-00004-of-00004.safetensors",
693
+ "model.visual.blocks.7.attn.proj.bias": "model-00004-of-00004.safetensors",
694
+ "model.visual.blocks.7.attn.proj.weight": "model-00004-of-00004.safetensors",
695
+ "model.visual.blocks.7.attn.qkv.bias": "model-00004-of-00004.safetensors",
696
+ "model.visual.blocks.7.attn.qkv.weight": "model-00004-of-00004.safetensors",
697
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
698
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
699
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
700
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
701
+ "model.visual.blocks.7.norm1.bias": "model-00004-of-00004.safetensors",
702
+ "model.visual.blocks.7.norm1.weight": "model-00004-of-00004.safetensors",
703
+ "model.visual.blocks.7.norm2.bias": "model-00004-of-00004.safetensors",
704
+ "model.visual.blocks.7.norm2.weight": "model-00004-of-00004.safetensors",
705
+ "model.visual.blocks.8.attn.proj.bias": "model-00004-of-00004.safetensors",
706
+ "model.visual.blocks.8.attn.proj.weight": "model-00004-of-00004.safetensors",
707
+ "model.visual.blocks.8.attn.qkv.bias": "model-00004-of-00004.safetensors",
708
+ "model.visual.blocks.8.attn.qkv.weight": "model-00004-of-00004.safetensors",
709
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
710
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
711
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
712
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
713
+ "model.visual.blocks.8.norm1.bias": "model-00004-of-00004.safetensors",
714
+ "model.visual.blocks.8.norm1.weight": "model-00004-of-00004.safetensors",
715
+ "model.visual.blocks.8.norm2.bias": "model-00004-of-00004.safetensors",
716
+ "model.visual.blocks.8.norm2.weight": "model-00004-of-00004.safetensors",
717
+ "model.visual.blocks.9.attn.proj.bias": "model-00004-of-00004.safetensors",
718
+ "model.visual.blocks.9.attn.proj.weight": "model-00004-of-00004.safetensors",
719
+ "model.visual.blocks.9.attn.qkv.bias": "model-00004-of-00004.safetensors",
720
+ "model.visual.blocks.9.attn.qkv.weight": "model-00004-of-00004.safetensors",
721
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00004-of-00004.safetensors",
722
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00004-of-00004.safetensors",
723
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00004-of-00004.safetensors",
724
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00004-of-00004.safetensors",
725
+ "model.visual.blocks.9.norm1.bias": "model-00004-of-00004.safetensors",
726
+ "model.visual.blocks.9.norm1.weight": "model-00004-of-00004.safetensors",
727
+ "model.visual.blocks.9.norm2.bias": "model-00004-of-00004.safetensors",
728
+ "model.visual.blocks.9.norm2.weight": "model-00004-of-00004.safetensors",
729
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00004-of-00004.safetensors",
730
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00004-of-00004.safetensors",
731
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00004-of-00004.safetensors",
732
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00004-of-00004.safetensors",
733
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00004-of-00004.safetensors",
734
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00004-of-00004.safetensors",
735
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00004-of-00004.safetensors",
736
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00004-of-00004.safetensors",
737
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00004-of-00004.safetensors",
738
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00004-of-00004.safetensors",
739
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00004-of-00004.safetensors",
740
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00004-of-00004.safetensors",
741
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00004-of-00004.safetensors",
742
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00004-of-00004.safetensors",
743
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00004-of-00004.safetensors",
744
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00004-of-00004.safetensors",
745
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00004-of-00004.safetensors",
746
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00004-of-00004.safetensors",
747
+ "model.visual.merger.linear_fc1.bias": "model-00004-of-00004.safetensors",
748
+ "model.visual.merger.linear_fc1.weight": "model-00004-of-00004.safetensors",
749
+ "model.visual.merger.linear_fc2.bias": "model-00004-of-00004.safetensors",
750
+ "model.visual.merger.linear_fc2.weight": "model-00004-of-00004.safetensors",
751
+ "model.visual.merger.norm.bias": "model-00004-of-00004.safetensors",
752
+ "model.visual.merger.norm.weight": "model-00004-of-00004.safetensors",
753
+ "model.visual.patch_embed.proj.bias": "model-00004-of-00004.safetensors",
754
+ "model.visual.patch_embed.proj.weight": "model-00004-of-00004.safetensors",
755
+ "model.visual.pos_embed.weight": "model-00004-of-00004.safetensors"
756
+ }
757
+ }
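
The `weight_map` above is the standard sharded-safetensors index: every parameter name points at the shard file that stores it, so a loader only has to open the shards it actually needs. Below is a minimal sketch of resolving one entry; the helper name and the local `ckpt_dir` layout are illustrative, assuming the four text_encoder shards listed in this commit have been downloaded.

import json
from pathlib import Path

from safetensors import safe_open

def load_single_weight(ckpt_dir: str, param_name: str):
    # The index maps "<param name>" -> "<shard file>" exactly as listed above.
    index = json.loads((Path(ckpt_dir) / "model.safetensors.index.json").read_text())
    shard = index["weight_map"][param_name]  # e.g. "model-00004-of-00004.safetensors"
    with safe_open(str(Path(ckpt_dir) / shard), framework="pt", device="cpu") as f:
        return f.get_tensor(param_name)

# tensor = load_single_weight("text_encoder", "model.language_model.norm.weight")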
text_encoder/preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "size": {
+     "longest_edge": 16777216,
+     "shortest_edge": 65536
+   },
+   "patch_size": 16,
+   "temporal_patch_size": 2,
+   "merge_size": 2,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "Qwen3VLProcessor",
+   "image_processor_type": "Qwen2VLImageProcessorFast"
+ }
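
For reference, the size bounds above are pixel-area budgets (65536 is 256 x 256, 16777216 is 4096 x 4096), and patch_size 16 with merge_size 2 means each 32 x 32 pixel region ends up as one visual token. The sketch below assumes the standard transformers auto classes resolve Qwen2VLImageProcessorFast from this config; the repo id is a placeholder, not a confirmed name.

from PIL import Image
from transformers import AutoImageProcessor

# "<repo-id>" stands in for wherever this folder is hosted; subfolder matches the layout above.
processor = AutoImageProcessor.from_pretrained("<repo-id>", subfolder="text_encoder")
image = Image.open("example.png").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
# pixel_values holds the flattened patches; image_grid_thw gives the (t, h, w) patch grid.
print(inputs["pixel_values"].shape, inputs["image_grid_thw"])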
text_encoder/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
text_encoder/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].content is string %}\n {{- messages[0].content }}\n {%- else %}\n {%- for content in messages[0].content %}\n {%- if 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- for message in messages %}\n {%- if message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content_item in message.content %}\n {%- if 'text' in content_item %}\n {{- content_item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and message.content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- if 
add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif content.type == 'video' or 'video' in content %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
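
The tokenizer_config above is a stock Qwen2-style BPE tokenizer whose chat template wraps each turn in <|im_start|>/<|im_end|> and inserts <|vision_start|><|image_pad|><|vision_end|> for images. A short sketch of building a prompt with it, again with a placeholder repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("<repo-id>", subfolder="text_encoder")
messages = [
    {"role": "system", "content": "Describe the scene to render."},
    {"role": "user", "content": "A red fox in fresh snow at golden hour."},
]
# add_generation_prompt appends the trailing "<|im_start|>assistant\n" expected by the template.
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
ids = tok(prompt, return_tensors="pt").input_ids  # model_max_length allows up to 262144 tokens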
text_encoder/video_preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "size": {
+     "longest_edge": 25165824,
+     "shortest_edge": 4096
+   },
+   "patch_size": 16,
+   "temporal_patch_size": 2,
+   "merge_size": 2,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "Qwen3VLProcessor",
+   "video_processor_type": "Qwen3VLVideoProcessor"
+ }
text_encoder/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "_class_name": [
+     "modeling_nucleusmoe",
+     "NucleusMoEImageTransformer2DModel"
+   ],
+   "_diffusers_version": "0.36.0",
+   "patch_size": 2,
+   "in_channels": 64,
+   "out_channels": 16,
+   "num_layers": 32,
+   "attention_head_dim": 128,
+   "num_attention_heads": 16,
+   "num_key_value_heads": 4,
+   "joint_attention_dim": 4096,
+   "axes_dims_rope": [
+     16,
+     56,
+     56
+   ],
+   "mlp_ratio": 4.0,
+   "moe_enabled": true,
+   "dense_moe_strategy": "leave_first_three_blocks_dense",
+   "num_experts": 64,
+   "moe_intermediate_dim": 1344,
+   "capacity_factors": [
+     0.0,
+     0.0,
+     0.0,
+     4.0,
+     4.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0,
+     2.0
+   ],
+   "use_sigmoid": false,
+   "route_scale": 2.5
+ }
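
Reading the MoE fields together: num_layers is 32 and capacity_factors has one entry per block, so under "leave_first_three_blocks_dense" the first three blocks (factor 0.0) stay dense, blocks 3 and 4 get a 4.0 capacity factor, and the remaining blocks 2.0. The actual routing lives in modeling_nucleusmoe.py; the sketch below is only one plausible reading of use_sigmoid and route_scale, and the top-k value is a guess that this config does not state.

import torch

num_experts, route_scale, top_k = 64, 2.5, 2  # top_k assumed for illustration only

def route(hidden, gate_weight):
    # hidden: [num_tokens, dim], gate_weight: [num_experts, dim]
    logits = hidden @ gate_weight.T
    probs = torch.softmax(logits, dim=-1)        # "use_sigmoid": false -> softmax gating
    weights, expert_ids = probs.topk(top_k, dim=-1)
    return weights * route_scale, expert_ids     # "route_scale": 2.5 rescales the gate weights

# Per-block expert capacity would then be roughly
# capacity = capacity_factor * num_tokens * top_k / num_experts for the MoE blocks.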
transformer/diffusion_pytorch_model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:239b546d425bdeedc664ab9052ba33e33da744d423d2462261b0a3d82ca7c88b
+ size 4991757800
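
Each of these three-line stubs is a Git LFS pointer rather than the tensor data itself: the oid is the SHA-256 of the real shard and size is its byte count. A small sketch for checking a downloaded shard against its pointer, with the values copied from the entry above:

import hashlib
from pathlib import Path

def verify_shard(path: str, expected_oid: str, expected_size: int) -> bool:
    p = Path(path)
    if p.stat().st_size != expected_size:
        return False
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # stream in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest() == expected_oid

# verify_shard(
#     "transformer/diffusion_pytorch_model-00001-of-00007.safetensors",
#     "239b546d425bdeedc664ab9052ba33e33da744d423d2462261b0a3d82ca7c88b",
#     4991757800,
# )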
transformer/diffusion_pytorch_model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:25a9024259d23108fb09b834849c469469bdac1f09e15f1be49f55276cb8ae27
+ size 4999012736
transformer/diffusion_pytorch_model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4a6c2458a82bdbfdd626017a1fb4d8a6d3c120f72902d7f4d248bdb5f56cc47
3
+ size 5000040248
transformer/diffusion_pytorch_model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfda61b257bd60a6ef9b48ae611127296ed79262e1aee4cdead7566e1ab10fbc
3
+ size 4994535096
transformer/diffusion_pytorch_model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e88a38ae2ecdfe6ad7c58294c75732661f5f55bc94e71567c167befde8ecd07
3
+ size 4999013192
transformer/diffusion_pytorch_model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51935971585478fb2cf1564ebcaabca9affbaa806f68c0b7667262d2036f663a
3
+ size 5000040248
transformer/diffusion_pytorch_model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87f3dab9083547c2acecb71376cdaf229227f5011ccb872ac171c07227a922c0
3
+ size 3861789552
transformer/diffusion_pytorch_model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239b546d425bdeedc664ab9052ba33e33da744d423d2462261b0a3d82ca7c88b
3
+ size 4991757800
transformer/model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a9024259d23108fb09b834849c469469bdac1f09e15f1be49f55276cb8ae27
3
+ size 4999012736
transformer/model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4a6c2458a82bdbfdd626017a1fb4d8a6d3c120f72902d7f4d248bdb5f56cc47
3
+ size 5000040248
transformer/model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfda61b257bd60a6ef9b48ae611127296ed79262e1aee4cdead7566e1ab10fbc
3
+ size 4994535096
transformer/model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e88a38ae2ecdfe6ad7c58294c75732661f5f55bc94e71567c167befde8ecd07
3
+ size 4999013192
transformer/model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51935971585478fb2cf1564ebcaabca9affbaa806f68c0b7667262d2036f663a
3
+ size 5000040248
transformer/model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb62a72407928ebc1eda73210be1ec448464714cf439951d23b49b9b59b65c27
3
+ size 3861520360
transformer/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/modeling_nucleusmoe.py ADDED
@@ -0,0 +1,859 @@
1
+ # Copyright 2025 Nucleus-Image Team, The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import functools
16
+ import math
17
+ from typing import Any, List
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
26
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
27
+ from diffusers.models.attention import AttentionMixin, FeedForward
28
+ from diffusers.models.attention_dispatch import dispatch_attention_fn
29
+ from diffusers.models.attention_processor import Attention
30
+ from diffusers.models.cache_utils import CacheMixin
31
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
32
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
33
+ from diffusers.models.modeling_utils import ModelMixin
34
+ from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
35
+
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+
40
+ def get_timestep_embedding(
41
+ timesteps: torch.Tensor,
42
+ embedding_dim: int,
43
+ flip_sin_to_cos: bool = False,
44
+ downscale_freq_shift: float = 1,
45
+ scale: float = 1,
46
+ max_period: int = 10000,
47
+ ) -> torch.Tensor:
48
+ """
49
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
50
+
51
+ Args:
52
+ timesteps (torch.Tensor):
53
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
54
+ embedding_dim (int):
55
+ the dimension of the output.
56
+ flip_sin_to_cos (bool):
57
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
58
+ downscale_freq_shift (float):
59
+ Controls the delta between frequencies between dimensions
60
+ scale (float):
61
+ Scaling factor applied to the embeddings.
62
+ max_period (int):
63
+ Controls the maximum frequency of the embeddings
64
+ Returns:
65
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
66
+ """
67
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
68
+
69
+ half_dim = embedding_dim // 2
70
+ exponent = -math.log(max_period) * torch.arange(
71
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
72
+ )
73
+ exponent = exponent / (half_dim - downscale_freq_shift)
74
+
75
+ emb = torch.exp(exponent).to(timesteps.dtype)
76
+ emb = timesteps[:, None].float() * emb[None, :]
77
+
78
+ # scale embeddings
79
+ emb = scale * emb
80
+
81
+ # concat sine and cosine embeddings
82
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
83
+
84
+ # flip sine and cosine embeddings
85
+ if flip_sin_to_cos:
86
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
87
+
88
+ # zero pad
89
+ if embedding_dim % 2 == 1:
90
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
91
+ return emb
92
+
93
+
94
+ def apply_rotary_emb_nucleus(
95
+ x: torch.Tensor,
96
+ freqs_cis: torch.Tensor | tuple[torch.Tensor],
97
+ use_real: bool = True,
98
+ use_real_unbind_dim: int = -1,
99
+ ) -> tuple[torch.Tensor, torch.Tensor]:
100
+ """
101
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
102
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
103
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
104
+ tensors contain rotary embeddings and are returned as real tensors.
105
+
106
+ Args:
107
+ x (`torch.Tensor`):
108
+ Query or key tensor of shape [B, S, H, D] to which the rotary embeddings are applied.
109
+ freqs_cis (`tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
110
+
111
+ Returns:
112
+ tuple[torch.Tensor, torch.Tensor]: tuple of modified query tensor and key tensor with rotary embeddings.
113
+ """
114
+ if use_real:
115
+ cos, sin = freqs_cis # [S, D]
116
+ cos = cos[None, None]
117
+ sin = sin[None, None]
118
+ cos, sin = cos.to(x.device), sin.to(x.device)
119
+
120
+ if use_real_unbind_dim == -1:
121
+ # Used for flux, cogvideox, hunyuan-dit
122
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, S, H, D//2]
123
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
124
+ elif use_real_unbind_dim == -2:
125
+ # Used for Stable Audio, OmniGen, CogView4 and Cosmos
126
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, S, H, D//2]
127
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
128
+ else:
129
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
130
+
131
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
132
+
133
+ return out
134
+ else:
135
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
136
+ freqs_cis = freqs_cis.unsqueeze(1)
137
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
138
+
139
+ return x_out.type_as(x)
140
+
141
+
142
+ def compute_text_seq_len_from_mask(
143
+ encoder_hidden_states: torch.Tensor, encoder_hidden_states_mask: torch.Tensor | None
144
+ ) -> tuple[int, torch.Tensor | None, torch.Tensor | None]:
145
+ """
146
+ Compute text sequence length without assuming contiguous masks. Returns length for RoPE and a normalized bool mask.
147
+ """
148
+ batch_size, text_seq_len = encoder_hidden_states.shape[:2]
149
+ if encoder_hidden_states_mask is None:
150
+ return text_seq_len, None, None
151
+
152
+ if encoder_hidden_states_mask.shape[:2] != (batch_size, text_seq_len):
153
+ raise ValueError(
154
+ f"`encoder_hidden_states_mask` shape {encoder_hidden_states_mask.shape} must match "
155
+ f"(batch_size, text_seq_len)=({batch_size}, {text_seq_len})."
156
+ )
157
+
158
+ if encoder_hidden_states_mask.dtype != torch.bool:
159
+ encoder_hidden_states_mask = encoder_hidden_states_mask.to(torch.bool)
160
+
161
+ position_ids = torch.arange(text_seq_len, device=encoder_hidden_states.device, dtype=torch.long)
162
+ active_positions = torch.where(encoder_hidden_states_mask, position_ids, position_ids.new_zeros(()))
163
+ has_active = encoder_hidden_states_mask.any(dim=1)
164
+ per_sample_len = torch.where(
165
+ has_active,
166
+ active_positions.max(dim=1).values + 1,
167
+ torch.as_tensor(text_seq_len, device=encoder_hidden_states.device),
168
+ )
169
+ return text_seq_len, per_sample_len, encoder_hidden_states_mask
170
+
171
+
172
+ class NucleusTimestepProjEmbeddings(nn.Module):
173
+ def __init__(self, embedding_dim, use_additional_t_cond=False):
174
+ super().__init__()
175
+
176
+ self.time_proj = Timesteps(num_channels=embedding_dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
177
+ self.timestep_embedder = TimestepEmbedding(
178
+ in_channels=embedding_dim, time_embed_dim=4 * embedding_dim, out_dim=embedding_dim
179
+ )
180
+ self.norm = RMSNorm(embedding_dim, eps=1e-6)
181
+ self.use_additional_t_cond = use_additional_t_cond
182
+ if use_additional_t_cond:
183
+ self.addition_t_embedding = nn.Embedding(2, embedding_dim)
184
+
185
+ def forward(self, timestep, hidden_states, addition_t_cond=None):
186
+ timesteps_proj = self.time_proj(timestep)
187
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype)) # (N, D)
188
+
189
+ conditioning = timesteps_emb
190
+ if self.use_additional_t_cond:
191
+ if addition_t_cond is None:
192
+ raise ValueError("When additional_t_cond is True, addition_t_cond must be provided.")
193
+ addition_t_emb = self.addition_t_embedding(addition_t_cond)
194
+ addition_t_emb = addition_t_emb.to(dtype=hidden_states.dtype)
195
+ conditioning = conditioning + addition_t_emb
196
+
197
+ return self.norm(conditioning)
198
+
199
+
200
+ class NucleusEmbedRope(nn.Module):
201
+ def __init__(self, theta: int, axes_dim: list[int], scale_rope=False):
202
+ super().__init__()
203
+ self.theta = theta
204
+ self.axes_dim = axes_dim
205
+ pos_index = torch.arange(4096)
206
+ neg_index = torch.arange(4096).flip(0) * -1 - 1
207
+ self.pos_freqs = torch.cat(
208
+ [
209
+ self.rope_params(pos_index, self.axes_dim[0], self.theta),
210
+ self.rope_params(pos_index, self.axes_dim[1], self.theta),
211
+ self.rope_params(pos_index, self.axes_dim[2], self.theta),
212
+ ],
213
+ dim=1,
214
+ )
215
+ self.neg_freqs = torch.cat(
216
+ [
217
+ self.rope_params(neg_index, self.axes_dim[0], self.theta),
218
+ self.rope_params(neg_index, self.axes_dim[1], self.theta),
219
+ self.rope_params(neg_index, self.axes_dim[2], self.theta),
220
+ ],
221
+ dim=1,
222
+ )
223
+
224
+ # Do not use register_buffer here: it would cause the complex-valued frequency tensors to lose their imaginary part.
225
+ self.scale_rope = scale_rope
226
+
227
+ def rope_params(self, index, dim, theta=10000):
228
+ """
229
+ Args:
230
+ index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
231
+ """
232
+ assert dim % 2 == 0
233
+ freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
234
+ freqs = torch.polar(torch.ones_like(freqs), freqs)
235
+ return freqs
236
+
237
+ def forward(
238
+ self,
239
+ video_fhw: tuple[int, int, int] | list[tuple[int, int, int]],
240
+ txt_seq_lens: list[int] | None = None,
241
+ device: torch.device = None,
242
+ max_txt_seq_len: int | torch.Tensor | None = None,
243
+ ) -> tuple[torch.Tensor, torch.Tensor]:
244
+ """
245
+ Args:
246
+ video_fhw (`tuple[int, int, int]` or `list[tuple[int, int, int]]`):
247
+ A single (frame, height, width) tuple, or a list of such tuples, describing the latent video/image shape(s).
248
+ txt_seq_lens (`list[int]`, *optional*, **Deprecated**):
249
+ Deprecated parameter. Use `max_txt_seq_len` instead. If provided, the maximum value will be used.
250
+ device: (`torch.device`, *optional*):
251
+ The device on which to perform the RoPE computation.
252
+ max_txt_seq_len (`int` or `torch.Tensor`, *optional*):
253
+ The maximum text sequence length for RoPE computation. This should match the encoder hidden states
254
+ sequence length. Can be either an int or a scalar tensor (for torch.compile compatibility).
255
+ """
256
+ # Handle deprecated txt_seq_lens parameter
257
+ if txt_seq_lens is not None:
258
+ deprecate(
259
+ "txt_seq_lens",
260
+ "0.39.0",
261
+ "Passing `txt_seq_lens` is deprecated and will be removed in version 0.39.0. "
262
+ "Please use `max_txt_seq_len` instead. "
263
+ "The new parameter accepts a single int or tensor value representing the maximum text sequence length.",
264
+ standard_warn=False,
265
+ )
266
+ if max_txt_seq_len is None:
267
+ # Use max of txt_seq_lens for backward compatibility
268
+ max_txt_seq_len = max(txt_seq_lens) if isinstance(txt_seq_lens, list) else txt_seq_lens
269
+
270
+ if max_txt_seq_len is None:
271
+ raise ValueError("Either `max_txt_seq_len` or `txt_seq_lens` (deprecated) must be provided.")
272
+
273
+ # Validate batch inference with variable-sized images
274
+ if isinstance(video_fhw, list) and len(video_fhw) > 1:
275
+ # Check if all instances have the same size
276
+ first_fhw = video_fhw[0]
277
+ if not all(fhw == first_fhw for fhw in video_fhw):
278
+ logger.warning(
279
+ "Batch inference with variable-sized images is not currently supported in NucleusEmbedRope. "
280
+ "All images in the batch should have the same dimensions (frame, height, width). "
281
+ f"Detected sizes: {video_fhw}. Using the first image's dimensions {first_fhw} "
282
+ "for RoPE computation, which may lead to incorrect results for other images in the batch."
283
+ )
284
+
285
+ if isinstance(video_fhw, list):
286
+ video_fhw = video_fhw[0]
287
+ if not isinstance(video_fhw, list):
288
+ video_fhw = [video_fhw]
289
+
290
+ vid_freqs = []
291
+ max_vid_index = 0
292
+ for idx, fhw in enumerate(video_fhw):
293
+ frame, height, width = fhw
294
+ # RoPE frequencies are cached via a lru_cache decorator on _compute_video_freqs
295
+ video_freq = self._compute_video_freqs(frame, height, width, idx, device)
296
+ vid_freqs.append(video_freq)
297
+
298
+ if self.scale_rope:
299
+ max_vid_index = max(height // 2, width // 2, max_vid_index)
300
+ else:
301
+ max_vid_index = max(height, width, max_vid_index)
302
+
303
+ max_txt_seq_len_int = int(max_txt_seq_len)
304
+ # Create device-specific copy for text freqs without modifying self.pos_freqs
305
+ txt_freqs = self.pos_freqs.to(device)[max_vid_index : max_vid_index + max_txt_seq_len_int, ...]
306
+ vid_freqs = torch.cat(vid_freqs, dim=0)
307
+
308
+ return vid_freqs, txt_freqs
309
+
310
+ @functools.lru_cache(maxsize=128)
311
+ def _compute_video_freqs(
312
+ self, frame: int, height: int, width: int, idx: int = 0, device: torch.device = None
313
+ ) -> torch.Tensor:
314
+ seq_lens = frame * height * width
315
+ pos_freqs = self.pos_freqs.to(device) if device is not None else self.pos_freqs
316
+ neg_freqs = self.neg_freqs.to(device) if device is not None else self.neg_freqs
317
+
318
+ freqs_pos = pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
319
+ freqs_neg = neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
320
+
321
+ freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
322
+ if self.scale_rope:
323
+ freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
324
+ freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
325
+ freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
326
+ freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
327
+ else:
328
+ freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
329
+ freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
330
+
331
+ freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
332
+ return freqs.clone().contiguous()
333
+
334
+
335
+ class NucleusMoEAttnProcessor2_0:
336
+ """
337
+ Attention processor for the Nucleus MoE architecture. Image queries attend to concatenated image+text keys/values
338
+ (cross-attention style, no text query). Supports grouped-query attention (GQA) when num_key_value_heads is set on
339
+ the Attention module.
340
+ """
341
+
342
+ _attention_backend = None
343
+ _parallel_config = None
344
+
345
+ def __init__(self):
346
+ if not hasattr(F, "scaled_dot_product_attention"):
347
+ raise ImportError(
348
+ "NucleusMoEAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
349
+ )
350
+
351
+ def __call__(
352
+ self,
353
+ attn: Attention,
354
+ hidden_states: torch.FloatTensor,
355
+ encoder_hidden_states: torch.FloatTensor = None,
356
+ attention_mask: torch.FloatTensor | None = None,
357
+ image_rotary_emb: torch.Tensor | None = None,
358
+ ) -> torch.FloatTensor:
359
+ head_dim = attn.inner_dim // attn.heads
360
+ num_kv_heads = attn.inner_kv_dim // head_dim
361
+ num_kv_groups = attn.heads // num_kv_heads
362
+
363
+ img_query = attn.to_q(hidden_states).unflatten(-1, (attn.heads, -1))
364
+ img_key = attn.to_k(hidden_states).unflatten(-1, (num_kv_heads, -1))
365
+ img_value = attn.to_v(hidden_states).unflatten(-1, (num_kv_heads, -1))
366
+
367
+ if attn.norm_q is not None:
368
+ img_query = attn.norm_q(img_query)
369
+ if attn.norm_k is not None:
370
+ img_key = attn.norm_k(img_key)
371
+
372
+ if image_rotary_emb is not None:
373
+ img_freqs, txt_freqs = image_rotary_emb
374
+ img_query = apply_rotary_emb_nucleus(img_query, img_freqs, use_real=False)
375
+ img_key = apply_rotary_emb_nucleus(img_key, img_freqs, use_real=False)
376
+
377
+ if encoder_hidden_states is not None:
378
+ txt_key = attn.add_k_proj(encoder_hidden_states).unflatten(-1, (num_kv_heads, -1))
379
+ txt_value = attn.add_v_proj(encoder_hidden_states).unflatten(-1, (num_kv_heads, -1))
380
+
381
+ if attn.norm_added_k is not None:
382
+ txt_key = attn.norm_added_k(txt_key)
383
+
384
+ if image_rotary_emb is not None:
385
+ txt_key = apply_rotary_emb_nucleus(txt_key, txt_freqs, use_real=False)
386
+
387
+ joint_key = torch.cat([img_key, txt_key], dim=1)
388
+ joint_value = torch.cat([img_value, txt_value], dim=1)
389
+ else:
390
+ joint_key = img_key
391
+ joint_value = img_value
392
+
393
+ if num_kv_groups > 1:
394
+ joint_key = joint_key.repeat_interleave(num_kv_groups, dim=2)
395
+ joint_value = joint_value.repeat_interleave(num_kv_groups, dim=2)
396
+
397
+ hidden_states = dispatch_attention_fn(
398
+ img_query,
399
+ joint_key,
400
+ joint_value,
401
+ attn_mask=attention_mask,
402
+ dropout_p=0.0,
403
+ is_causal=False,
404
+ backend=self._attention_backend,
405
+ parallel_config=self._parallel_config,
406
+ )
407
+
408
+ hidden_states = hidden_states.flatten(2, 3)
409
+ hidden_states = hidden_states.to(img_query.dtype)
410
+
411
+ hidden_states = attn.to_out[0](hidden_states)
412
+ if len(attn.to_out) > 1:
413
+ hidden_states = attn.to_out[1](hidden_states)
414
+
415
+ return hidden_states
416
+
417
+
418
+ def _is_moe_layer(strategy: str, layer_idx: int, num_layers: int) -> bool:
419
+ if strategy == "leave_first_three_and_last_block_dense":
420
+ return layer_idx >= 3 and layer_idx < num_layers - 1
421
+ elif strategy == "leave_first_three_blocks_dense":
422
+ return layer_idx >= 3
423
+ elif strategy == "leave_first_block_dense":
424
+ return layer_idx >= 1
425
+ elif strategy == "all_moe":
426
+ return True
427
+ elif strategy == "all_dense":
428
+ return False
429
+ return True
430
+
431
+
432
+ class NucleusMoELayer(nn.Module):
433
+ """
434
+ Mixture-of-Experts layer with expert-choice routing and a shared expert.
435
+
436
+ Each expert is a separate ``FeedForward`` module stored in an ``nn.ModuleList``.
437
+ The router concatenates a timestep embedding with the (unmodulated) hidden state
438
+ to produce per-token affinity scores, then selects the top-C tokens per expert
439
+ (expert-choice routing). A shared expert processes all tokens in parallel and its
440
+ output is combined with the routed expert outputs via scatter-add.
441
+ """
442
+
443
+ def __init__(
444
+ self,
445
+ hidden_size: int,
446
+ moe_intermediate_dim: int,
447
+ num_experts: int,
448
+ capacity_factor: float,
449
+ use_sigmoid: bool,
450
+ route_scale: float,
451
+ ):
452
+ super().__init__()
453
+ self.num_experts = num_experts
454
+ self.capacity_factor = capacity_factor
455
+ self.use_sigmoid = use_sigmoid
456
+ self.route_scale = route_scale
457
+
458
+ self.gate = nn.Linear(hidden_size * 2, num_experts, bias=False)
459
+ self.experts = nn.ModuleList(
460
+ [
461
+ FeedForward(
462
+ dim=hidden_size, dim_out=hidden_size,
463
+ inner_dim=moe_intermediate_dim, activation_fn="swiglu", bias=False,
464
+ )
465
+ for _ in range(num_experts)
466
+ ]
467
+ )
468
+ self.shared_expert = FeedForward(
469
+ dim=hidden_size, dim_out=hidden_size,
470
+ inner_dim=moe_intermediate_dim, activation_fn="swiglu", bias=False,
471
+ )
472
+
473
+ def forward(
474
+ self,
475
+ hidden_states: torch.Tensor,
476
+ hidden_states_unmodulated: torch.Tensor,
477
+ timestep: torch.Tensor | None = None,
478
+ ) -> torch.Tensor:
479
+ bs, slen, dim = hidden_states.shape
480
+
481
+ if timestep is not None:
482
+ timestep_expanded = timestep.unsqueeze(1).expand(-1, slen, -1)
483
+ router_input = torch.cat([timestep_expanded, hidden_states_unmodulated], dim=-1)
484
+ else:
485
+ router_input = hidden_states_unmodulated
486
+
487
+ logits = self.gate(router_input)
488
+
489
+ if self.use_sigmoid:
490
+ scores = torch.sigmoid(logits.float()).to(logits.dtype)
491
+ else:
492
+ scores = F.softmax(logits.float(), dim=-1).to(logits.dtype)
493
+
494
+ affinity = scores.transpose(1, 2) # (B, E, S)
495
+ capacity = max(1, math.ceil(self.capacity_factor * slen / self.num_experts))
496
+
497
+ topk = torch.topk(affinity, k=capacity, dim=-1)
498
+ top_indices = topk.indices # (B, E, C)
499
+ gating = affinity.gather(dim=-1, index=top_indices) # (B, E, C)
500
+
501
+ batch_offsets = torch.arange(bs, device=hidden_states.device, dtype=torch.long).view(bs, 1, 1) * slen
502
+ global_token_indices = (batch_offsets + top_indices).transpose(0, 1).reshape(self.num_experts, -1).reshape(-1)
503
+ gating_flat = gating.transpose(0, 1).reshape(self.num_experts, -1).reshape(-1)
504
+
505
+ token_score_sums = torch.zeros(bs * slen, device=hidden_states.device, dtype=gating_flat.dtype)
506
+ token_score_sums.scatter_add_(0, global_token_indices, gating_flat)
507
+ gating_flat = gating_flat / (token_score_sums[global_token_indices] + 1e-12)
508
+ gating_flat = gating_flat * self.route_scale
509
+
510
+ x_flat = hidden_states.reshape(bs * slen, dim)
511
+ routed_input = x_flat[global_token_indices]
512
+
513
+ tokens_per_expert = bs * capacity
514
+ routed_output_parts = []
515
+ for i, expert in enumerate(self.experts):
516
+ start = i * tokens_per_expert
517
+ end = start + tokens_per_expert
518
+ expert_out = expert(routed_input[start:end])
519
+ routed_output_parts.append(expert_out)
520
+
521
+ routed_output = torch.cat(routed_output_parts, dim=0)
522
+ routed_output = (routed_output.float() * gating_flat.unsqueeze(-1)).to(hidden_states.dtype)
523
+
524
+ out = self.shared_expert(hidden_states).reshape(bs * slen, dim)
525
+
526
+ scatter_idx = global_token_indices.reshape(-1, 1).expand(-1, dim)
527
+ out = out.scatter_add(dim=0, index=scatter_idx, src=routed_output)
528
+ out = out.reshape(bs, slen, dim)
529
+
530
+ return out
531
+
532
+
533
+ @maybe_allow_in_graph
534
+ class NucleusMoEImageTransformerBlock(nn.Module):
535
+ """
536
+ Single-stream DiT block with optional Mixture-of-Experts MLP, matching the DiTBlock
537
+ architecture from model_v2. Only the image stream receives adaptive modulation;
538
+ the text context is projected per-block and used as cross-attention keys/values.
539
+ """
540
+
541
+ def __init__(
542
+ self,
543
+ dim: int,
544
+ num_attention_heads: int,
545
+ attention_head_dim: int,
546
+ num_key_value_heads: int | None = None,
547
+ joint_attention_dim: int = 3584,
548
+ qk_norm: str = "rms_norm",
549
+ eps: float = 1e-6,
550
+ mlp_ratio: float = 4.0,
551
+ moe_enabled: bool = False,
552
+ num_experts: int = 128,
553
+ moe_intermediate_dim: int = 1344,
554
+ capacity_factor: float = 8.0,
555
+ use_sigmoid: bool = False,
556
+ route_scale: float = 2.5,
557
+ ):
558
+ super().__init__()
559
+ self.dim = dim
560
+ self.moe_enabled = moe_enabled
561
+
562
+ self.img_mod = nn.Sequential(
563
+ nn.SiLU(),
564
+ nn.Linear(dim, 4 * dim, bias=True),
565
+ )
566
+
567
+ self.encoder_proj = nn.Linear(joint_attention_dim, dim)
568
+
569
+ self.pre_attn_norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
570
+ self.attn = Attention(
571
+ query_dim=dim,
572
+ heads=num_attention_heads,
573
+ kv_heads=num_key_value_heads,
574
+ dim_head=attention_head_dim,
575
+ added_kv_proj_dim=dim,
576
+ added_proj_bias=False,
577
+ out_dim=dim,
578
+ out_bias=False,
579
+ bias=False,
580
+ processor=NucleusMoEAttnProcessor2_0(),
581
+ qk_norm=qk_norm,
582
+ eps=eps,
583
+ context_pre_only=None,
584
+ )
585
+
586
+ self.pre_mlp_norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
587
+
588
+ if moe_enabled:
589
+ self.img_mlp = NucleusMoELayer(
590
+ hidden_size=dim,
591
+ moe_intermediate_dim=moe_intermediate_dim,
592
+ num_experts=num_experts,
593
+ capacity_factor=capacity_factor,
594
+ use_sigmoid=use_sigmoid,
595
+ route_scale=route_scale,
596
+ )
597
+ else:
598
+ mlp_inner_dim = int(dim * mlp_ratio * 2 / 3) // 128 * 128
599
+ self.img_mlp = FeedForward(
600
+ dim=dim, dim_out=dim, inner_dim=mlp_inner_dim,
601
+ activation_fn="swiglu", bias=False,
602
+ )
603
+
604
+ def forward(
605
+ self,
606
+ hidden_states: torch.Tensor,
607
+ encoder_hidden_states: torch.Tensor,
608
+ temb: torch.Tensor,
609
+ image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
610
+ attention_kwargs: dict[str, Any] | None = None,
611
+ ) -> torch.Tensor:
612
+ scale1, gate1, scale2, gate2 = self.img_mod(temb).unsqueeze(1).chunk(4, dim=-1)
613
+ scale1, scale2 = 1 + scale1, 1 + scale2
614
+
615
+ gate1 = gate1.clamp(min=-2.0, max=2.0)
616
+ gate2 = gate2.clamp(min=-2.0, max=2.0)
617
+
618
+ context = self.encoder_proj(encoder_hidden_states)
619
+
620
+ img_normed = self.pre_attn_norm(hidden_states)
621
+ img_modulated = img_normed * scale1
622
+
623
+ attention_kwargs = attention_kwargs or {}
624
+ img_attn_output = self.attn(
625
+ hidden_states=img_modulated,
626
+ encoder_hidden_states=context,
627
+ image_rotary_emb=image_rotary_emb,
628
+ **attention_kwargs,
629
+ )
630
+
631
+ hidden_states = hidden_states + gate1.tanh() * img_attn_output
632
+
633
+ img_normed2 = self.pre_mlp_norm(hidden_states)
634
+ img_modulated2 = img_normed2 * scale2
635
+
636
+ if self.moe_enabled:
637
+ img_mlp_output = self.img_mlp(img_modulated2, img_normed2, timestep=temb)
638
+ else:
639
+ img_mlp_output = self.img_mlp(img_modulated2)
640
+
641
+ hidden_states = hidden_states + gate2.tanh() * img_mlp_output
642
+
643
+ if hidden_states.dtype == torch.float16:
644
+ hidden_states = hidden_states.clip(-65504, 65504)
645
+
646
+ return hidden_states
647
+
648
+
649
+ class NucleusMoEImageTransformer2DModel(
650
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
651
+ ):
652
+ """
653
+ Nucleus MoE Transformer for image generation. Single-stream DiT with
654
+ cross-attention to text and optional Mixture-of-Experts feed-forward layers.
655
+
656
+ Args:
657
+ patch_size (`int`, defaults to `2`):
658
+ Patch size to turn the input data into small patches.
659
+ in_channels (`int`, defaults to `64`):
660
+ The number of channels in the input.
661
+ out_channels (`int`, *optional*, defaults to `None`):
662
+ The number of channels in the output. If not specified, it defaults to `in_channels`.
663
+ num_layers (`int`, defaults to `24`):
664
+ The number of transformer blocks.
665
+ attention_head_dim (`int`, defaults to `128`):
666
+ The number of dimensions to use for each attention head.
667
+ num_attention_heads (`int`, defaults to `16`):
668
+ The number of attention heads to use.
669
+ num_key_value_heads (`int`, *optional*):
670
+ The number of key/value heads for grouped-query attention. Defaults to `num_attention_heads`.
671
+ joint_attention_dim (`int`, defaults to `3584`):
672
+ The embedding dimension of the encoder hidden states (text).
673
+ axes_dims_rope (`tuple[int]`, defaults to `(16, 56, 56)`):
674
+ The dimensions to use for the rotary positional embeddings.
675
+ use_layer3d_rope (`bool`, defaults to `False`):
676
+ Whether to use the Layer3D variant of RoPE.
677
+ mlp_ratio (`float`, defaults to `4.0`):
678
+ Multiplier for the MLP hidden dimension in dense (non-MoE) blocks.
679
+ moe_enabled (`bool`, defaults to `True`):
680
+ Whether to use Mixture-of-Experts layers.
681
+ dense_moe_strategy (`str`, defaults to ``"leave_first_three_and_last_block_dense"``):
682
+ Strategy for choosing which layers are MoE vs dense.
683
+ num_experts (`int`, defaults to `128`):
684
+ Number of experts per MoE layer.
685
+ moe_intermediate_dim (`int`, defaults to `1344`):
686
+ Hidden dimension inside each expert.
687
+ capacity_factor (`float`, defaults to `8.0`):
688
+ Expert-choice capacity factor.
689
+ use_sigmoid (`bool`, defaults to `False`):
690
+ Use sigmoid instead of softmax for routing scores.
691
+ route_scale (`float`, defaults to `2.5`):
692
+ Scaling factor applied to routing weights.
693
+ """
694
+
695
+ _supports_gradient_checkpointing = True
696
+ _no_split_modules = ["NucleusMoEImageTransformerBlock"]
697
+ _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
698
+ _repeated_blocks = ["NucleusMoEImageTransformerBlock"]
699
+
700
+ @register_to_config
701
+ def __init__(
702
+ self,
703
+ patch_size: int = 2,
704
+ in_channels: int = 64,
705
+ out_channels: int | None = None,
706
+ num_layers: int = 24,
707
+ attention_head_dim: int = 128,
708
+ num_attention_heads: int = 16,
709
+ num_key_value_heads: int | None = None,
710
+ joint_attention_dim: int = 3584,
711
+ axes_dims_rope: tuple[int, int, int] = (16, 56, 56),
712
+ mlp_ratio: float = 4.0,
713
+ moe_enabled: bool = True,
714
+ dense_moe_strategy: str = "leave_first_three_and_last_block_dense",
715
+ num_experts: int = 128,
716
+ moe_intermediate_dim: int = 1344,
717
+ capacity_factors: List[float] = [8.0] * 24,
718
+ use_sigmoid: bool = False,
719
+ route_scale: float = 2.5,
720
+ ):
721
+ super().__init__()
722
+ self.out_channels = out_channels or in_channels
723
+ self.inner_dim = num_attention_heads * attention_head_dim
724
+
725
+ self.pos_embed = NucleusEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
726
+
727
+ self.time_text_embed = NucleusTimestepProjEmbeddings(embedding_dim=self.inner_dim)
728
+
729
+ self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
730
+ self.img_in = nn.Linear(in_channels, self.inner_dim)
731
+
732
+ self.transformer_blocks = nn.ModuleList(
733
+ [
734
+ NucleusMoEImageTransformerBlock(
735
+ dim=self.inner_dim,
736
+ num_attention_heads=num_attention_heads,
737
+ attention_head_dim=attention_head_dim,
738
+ num_key_value_heads=num_key_value_heads,
739
+ joint_attention_dim=joint_attention_dim,
740
+ mlp_ratio=mlp_ratio,
741
+ moe_enabled=moe_enabled and _is_moe_layer(dense_moe_strategy, idx, num_layers),
742
+ num_experts=num_experts,
743
+ moe_intermediate_dim=moe_intermediate_dim,
744
+ capacity_factor=capacity_factors[idx],
745
+ use_sigmoid=use_sigmoid,
746
+ route_scale=route_scale,
747
+ )
748
+ for idx in range(num_layers)
749
+ ]
750
+ )
751
+
752
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
753
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)
754
+
755
+ self.gradient_checkpointing = False
756
+
757
+ def forward(
758
+ self,
759
+ hidden_states: torch.Tensor,
760
+ img_shapes: list[tuple[int, int, int]] | None = None,
761
+ encoder_hidden_states: torch.Tensor = None,
762
+ encoder_hidden_states_mask: torch.Tensor = None,
763
+ timestep: torch.LongTensor = None,
764
+ txt_seq_lens: list[int] | None = None,
765
+ attention_kwargs: dict[str, Any] | None = None,
766
+ return_dict: bool = True,
767
+ ) -> torch.Tensor | Transformer2DModelOutput:
768
+ """
769
+ The [`NucleusMoEImageTransformer2DModel`] forward method.
770
+
771
+ Args:
772
+ hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
773
+ Input `hidden_states`.
774
+ img_shapes (`list[tuple[int, int, int]]`, *optional*):
775
+ Image shapes ``(frame, height, width)`` for RoPE computation.
776
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
777
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
778
+ encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`, *optional*):
779
+ Boolean mask for the encoder hidden states.
780
+ timestep (`torch.LongTensor`):
781
+ Used to indicate denoising step.
782
+ txt_seq_lens (`list[int]`, *optional*, **Deprecated**):
783
+ Deprecated. Use ``encoder_hidden_states_mask`` instead.
784
+ attention_kwargs (`dict`, *optional*):
785
+ Extra kwargs forwarded to the attention processor.
786
+ return_dict (`bool`, *optional*, defaults to `True`):
787
+ Whether to return a [`~models.transformer_2d.Transformer2DModelOutput`].
788
+
789
+ Returns:
790
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
791
+ `tuple` where the first element is the sample tensor.
792
+ """
793
+ if txt_seq_lens is not None:
794
+ deprecate(
795
+ "txt_seq_lens",
796
+ "0.39.0",
797
+ "Passing `txt_seq_lens` is deprecated and will be removed in version 0.39.0. "
798
+ "Please use `encoder_hidden_states_mask` instead.",
799
+ standard_warn=False,
800
+ )
801
+
802
+ if attention_kwargs is not None:
803
+ attention_kwargs = attention_kwargs.copy()
804
+ lora_scale = attention_kwargs.pop("scale", 1.0)
805
+ else:
806
+ lora_scale = 1.0
807
+
808
+ if USE_PEFT_BACKEND:
809
+ scale_lora_layers(self, lora_scale)
810
+
811
+ hidden_states = self.img_in(hidden_states)
812
+ timestep = timestep.to(hidden_states.dtype)
813
+
814
+ encoder_hidden_states = self.txt_norm(encoder_hidden_states)
815
+
816
+ text_seq_len, _, encoder_hidden_states_mask = compute_text_seq_len_from_mask(
817
+ encoder_hidden_states, encoder_hidden_states_mask
818
+ )
819
+
820
+ temb = self.time_text_embed(timestep, hidden_states)
821
+
822
+ image_rotary_emb = self.pos_embed(img_shapes, max_txt_seq_len=text_seq_len, device=hidden_states.device)
823
+
824
+ block_attention_kwargs = attention_kwargs.copy() if attention_kwargs is not None else {}
825
+ if encoder_hidden_states_mask is not None:
826
+ batch_size, image_seq_len = hidden_states.shape[:2]
827
+ image_mask = torch.ones((batch_size, image_seq_len), dtype=torch.bool, device=hidden_states.device)
828
+ joint_attention_mask = torch.cat([image_mask, encoder_hidden_states_mask], dim=1)
829
+ block_attention_kwargs["attention_mask"] = joint_attention_mask
830
+
831
+ for index_block, block in enumerate(self.transformer_blocks):
832
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
833
+ hidden_states = self._gradient_checkpointing_func(
834
+ block,
835
+ hidden_states,
836
+ encoder_hidden_states,
837
+ temb,
838
+ image_rotary_emb,
839
+ block_attention_kwargs,
840
+ )
841
+ else:
842
+ hidden_states = block(
843
+ hidden_states=hidden_states,
844
+ encoder_hidden_states=encoder_hidden_states,
845
+ temb=temb,
846
+ image_rotary_emb=image_rotary_emb,
847
+ attention_kwargs=block_attention_kwargs,
848
+ )
849
+
850
+ hidden_states = self.norm_out(hidden_states, temb)
851
+ output = self.proj_out(hidden_states)
852
+
853
+ if USE_PEFT_BACKEND:
854
+ unscale_lora_layers(self, lora_scale)
855
+
856
+ if not return_dict:
857
+ return (output,)
858
+
859
+ return Transformer2DModelOutput(sample=output)
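The attention processor in this file implements grouped-query attention with image-only queries: 16 query heads but only 4 key/value heads (the config defaults), and text keys/values concatenated onto the image keys/values before a single scaled-dot-product call. A shape-level illustration of that pattern in plain PyTorch; it is a sketch of the idea, not the repository's code path:

import torch
import torch.nn.functional as F

batch, img_len, txt_len = 1, 4096, 256
heads, kv_heads, head_dim = 16, 4, 128

q = torch.randn(batch, img_len, heads, head_dim)        # image queries only
img_k = torch.randn(batch, img_len, kv_heads, head_dim)
img_v = torch.randn(batch, img_len, kv_heads, head_dim)
txt_k = torch.randn(batch, txt_len, kv_heads, head_dim)
txt_v = torch.randn(batch, txt_len, kv_heads, head_dim)

# Concatenate image and text along the sequence axis, then expand the 4 K/V heads to the 16 query heads.
k = torch.cat([img_k, txt_k], dim=1).repeat_interleave(heads // kv_heads, dim=2)
v = torch.cat([img_v, txt_v], dim=1).repeat_interleave(heads // kv_heads, dim=2)

# scaled_dot_product_attention expects (batch, heads, seq, head_dim).
out = F.scaled_dot_product_attention(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2))
print(out.shape)  # torch.Size([1, 16, 4096, 128]): image tokens updated with image+text context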
vae/config.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "_class_name": "AutoencoderKLQwenImage",
3
+ "_diffusers_version": "0.36.0.dev0",
4
+ "attn_scales": [],
5
+ "base_dim": 96,
6
+ "dim_mult": [
7
+ 1,
8
+ 2,
9
+ 4,
10
+ 4
11
+ ],
12
+ "dropout": 0.0,
13
+ "latents_mean": [
14
+ -0.7571,
15
+ -0.7089,
16
+ -0.9113,
17
+ 0.1075,
18
+ -0.1745,
19
+ 0.9653,
20
+ -0.1517,
21
+ 1.5508,
22
+ 0.4134,
23
+ -0.0715,
24
+ 0.5517,
25
+ -0.3632,
26
+ -0.1922,
27
+ -0.9497,
28
+ 0.2503,
29
+ -0.2921
30
+ ],
31
+ "latents_std": [
32
+ 2.8184,
33
+ 1.4541,
34
+ 2.3275,
35
+ 2.6558,
36
+ 1.2196,
37
+ 1.7708,
38
+ 2.6052,
39
+ 2.0743,
40
+ 3.2687,
41
+ 2.1526,
42
+ 2.8652,
43
+ 1.5579,
44
+ 1.6382,
45
+ 1.1253,
46
+ 2.8251,
47
+ 1.916
48
+ ],
49
+ "num_res_blocks": 2,
50
+ "temperal_downsample": [
51
+ false,
52
+ true,
53
+ true
54
+ ],
55
+ "z_dim": 16
56
+ }
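The 16-channel latent space ships with per-channel latents_mean / latents_std statistics. Pipelines built around AutoencoderKLQwenImage conventionally normalize encoder latents with these values before the transformer and invert the normalization before decoding; a hedged sketch of that convention (the repo id is a placeholder):

import torch
from diffusers import AutoencoderKLQwenImage

vae = AutoencoderKLQwenImage.from_pretrained("your-org/nucleus-moe-image", subfolder="vae")

mean = torch.tensor(vae.config.latents_mean).view(1, vae.config.z_dim, 1, 1, 1)
std = torch.tensor(vae.config.latents_std).view(1, vae.config.z_dim, 1, 1, 1)

def normalize(latents):      # after vae.encode(...)
    return (latents - mean) / std

def denormalize(latents):    # before vae.decode(...)
    return latents * std + mean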
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c8bc8b758c649abef9ea407b95408389a3b2f610d0d10fcb054fe171d0a8344
3
+ size 253806966
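Each *.safetensors entry in this commit is stored as a Git LFS pointer (spec version, sha256 oid, byte size) rather than the weights themselves. A small sketch for checking a locally downloaded file against its pointer; the local path is an assumption:

import hashlib
import os

def sha256_of(path, chunk_size=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Values copied from the pointer above.
expected_oid = "0c8bc8b758c649abef9ea407b95408389a3b2f610d0d10fcb054fe171d0a8344"
expected_size = 253806966

path = "vae/diffusion_pytorch_model.safetensors"
assert os.path.getsize(path) == expected_size
assert sha256_of(path) == expected_oid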