Upload modeling_molmo.py with huggingface_hub
modeling_molmo.py  CHANGED  (+20, -273)
@@ -123,7 +123,7 @@ class RotaryEmbedding(nn.Module):
         inv_freq = 1.0 / (self.config.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
         seq = torch.arange(seq_len, device=device, dtype=torch.float)
         freqs = torch.einsum("i , j -> i j", seq, inv_freq)
-        if self.config.rope_impl == "
+        if self.config.rope_impl == "interleave":
             positions = freqs.repeat_interleave(2, dim=-1)
         else:
             positions = torch.cat((freqs, freqs), dim=-1)
@@ -146,7 +146,7 @@ class RotaryEmbedding(nn.Module):
         return x.view(B, nh, T, hs)

     def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
-        if self.config.rope_impl == "
+        if self.config.rope_impl == "interleave":
             return ((t * pos_cos) + (self.rotate_every_two(t) * pos_sin)).to(t.dtype)
         else:
             return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
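For context on the two rope_impl branches above: both layouts apply the same rotation and only differ in how channels are paired ("interleave" rotates adjacent channel pairs, the other variant rotates the first and second halves of the head dimension). A minimal standalone sketch in plain PyTorch; the *_demo names are illustrative and not part of this file:

import torch

def rotate_every_two_demo(x: torch.Tensor) -> torch.Tensor:
    # interleaved layout: out[..., 2i] = -x[..., 2i+1], out[..., 2i+1] = x[..., 2i]
    x1, x2 = x[..., ::2], x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

def rotate_half_demo(x: torch.Tensor) -> torch.Tensor:
    # half layout: swap the two halves of the last dimension with a sign flip
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope_demo(t: torch.Tensor, pos_sin: torch.Tensor, pos_cos: torch.Tensor, interleave: bool) -> torch.Tensor:
    # mirrors apply_rotary_pos_emb above; pos_sin/pos_cos must be laid out to match
    # (repeat_interleave for the interleaved case, cat((freqs, freqs)) otherwise)
    rotated = rotate_every_two_demo(t) if interleave else rotate_half_demo(t)
    return (t * pos_cos) + (rotated * pos_sin)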
@@ -205,7 +205,7 @@ class MolmoBlock(nn.Module):
         self._activation_checkpoint_fn = None

         # Dropout.
-        self.dropout = Dropout(config.residual_dropout
+        self.dropout = Dropout(config.residual_dropout)

         # Layer norms.
         self.k_norm: Optional[LayerNormBase] = None
@@ -298,7 +298,6 @@ class MolmoBlock(nn.Module):
         k: torch.Tensor,
         v: torch.Tensor,
         attn_mask: Optional[torch.Tensor] = None,
-        drop_mask: Optional[torch.Tensor] = None,
         dropout_p: float = 0.0,
         response_dropout_p: float = 0.0,
         is_causal: bool = False,
@@ -341,7 +340,6 @@ class MolmoBlock(nn.Module):
         v: torch.Tensor,
         attention_bias: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        drop_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
@@ -394,7 +392,6 @@ class MolmoBlock(nn.Module):
             k,
             v,
             attn_mask=attention_bias,
-            drop_mask=drop_mask,
             dropout_p=0.0 if not self.training else self.config.attention_dropout,
             response_dropout_p=0.0 if not self.training else self.config.response_attention_dropout,
             is_causal=attention_bias is None,
@@ -411,7 +408,6 @@ class MolmoBlock(nn.Module):
         x: torch.Tensor,
         attention_bias: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        drop_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
@@ -419,183 +415,7 @@ class MolmoBlock(nn.Module):

     @classmethod
     def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
-
-            return MolmoSequentialBlock(layer_id, config, cache)
-        elif config.block_type == "llama":
-            return OLMoLlamaBlock(layer_id, config, cache)
-        else:
-            raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
+        return MolmoSequentialBlock(layer_id, config, cache)
-
-
-class OLMoLlamaBlock(MolmoBlock):
-    """
-    This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
-    (plus another skip connection). This block is similar to `MolmoSequentialBlock`
-    but some operations have slightly different implementations to imitate the
-    behavior of Llama.
-    """
-
-    def __init__(self, layer_id: int, config: MolmoConfig, cache: BufferCache):
-        super().__init__(layer_id, config, cache)
-        # Layer norms.
-        self.attn_norm = LayerNorm.build(config)
-        self.ff_norm = LayerNorm.build(config)
-        self.__cache = cache
-
-        # Attention input projection. Projects x -> (q, k, v)
-        q_proj_out_dim = config.d_model
-        k_proj_out_dim = config.effective_n_kv_heads * (config.d_model // config.n_heads)
-        v_proj_out_dim = config.effective_n_kv_heads * (config.d_model // config.n_heads)
-
-        self.q_proj = nn.Linear(
-            config.d_model, q_proj_out_dim, bias=config.qkv_bias, device=config.init_device
-        )
-        self.k_proj = nn.Linear(
-            config.d_model, k_proj_out_dim, bias=config.qkv_bias, device=config.init_device
-        )
-        self.v_proj = nn.Linear(
-            config.d_model, v_proj_out_dim, bias=config.qkv_bias, device=config.init_device
-        )
-
-        # Feed-forward input projection.
-        self.ff_proj1 = nn.Linear(
-            config.d_model, self.hidden_size // 2, bias=False, device=config.init_device
-        )
-        self.ff_proj2 = nn.Linear(
-            config.d_model, self.hidden_size // 2, bias=False, device=config.init_device
-        )
-        if self.config.norm_after:
-            raise NotImplementedError()
-
-    def reset_parameters(self):
-        super().reset_parameters()
-        self.attn_norm.reset_parameters()
-        self.ff_norm.reset_parameters()
-        # NOTE: the standard deviation for these weights does not depend on the layer.
-        init_weights(self.config, self.q_proj, d=self.config.d_model, layer_id=None)
-        init_weights(self.config, self.k_proj, d=self.config.d_model, layer_id=None)
-        init_weights(self.config, self.v_proj, d=self.config.d_model, layer_id=None)
-        init_weights(self.config, self.ff_proj1, d=self.config.d_model, layer_id=None)
-        init_weights(self.config, self.ff_proj2, d=self.config.d_model, layer_id=None)
-
-    def _scaled_dot_product_attention(
-        self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        attn_mask: Optional[torch.Tensor] = None,
-        drop_mask: Optional[torch.Tensor] = None,
-        dropout_p: float = 0.0,
-        response_dropout_p: float = 0.0,
-        is_causal: bool = False,
-    ) -> torch.Tensor:
-        # For GQA
-        assert k.size(1) == v.size(1)
-        num_kv_heads = k.size(1)
-        num_q_heads = q.size(1)
-        if num_q_heads != num_kv_heads:
-            assert num_q_heads % num_kv_heads == 0
-            k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
-            v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
-
-        og_dtype = q.dtype
-        k = k.to(q.device)
-        v = v.to(q.device)
-        if attn_mask is not None:
-            attn_mask = attn_mask.to(q.device)
-
-        assert response_dropout_p == 0.0, "Response dropout is not supported in Llama."
-
-        if self.config.float32_attention:
-            q, k = q.to(torch.float), k.to(torch.float)
-
-        if self.config.attention_type == "direct":
-            attn_weights = torch.matmul(q, k.transpose(-2, -1)) / (q.shape[-1] ** 0.5)
-
-            if is_causal:
-                assert attn_mask is None
-
-                query_len, key_len = q.shape[-2], k.shape[-2]  # could be different if layer_past not None
-                attn_bias = get_causal_attention_bias(self.__cache, key_len, q.device)[:, :, :query_len, :key_len]
-            elif attn_mask is not None:
-                attn_bias = attn_mask
-            else:
-                attn_bias = torch.zeros_like(attn_weights)
-
-            attn_weights += attn_bias
-
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
-            attn_weights = nn.functional.dropout(attn_weights, p=dropout_p, training=self.training).to(v.dtype)
-
-            att = torch.matmul(attn_weights, v)
-        elif self.config.attention_type == "sdpa":
-            att = F.scaled_dot_product_attention(
-                q,
-                k,
-                v,
-                attn_mask=attn_mask,
-                dropout_p=dropout_p,
-                is_causal=is_causal,
-            )
-        else:
-            raise NotImplementedError(self.config.attention_type)
-        att = att.to(og_dtype)
-        return att
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        attention_bias: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        drop_mask: Optional[torch.Tensor] = None,
-        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        use_cache: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
-        # Get query, key, value projections.
-        # shape:
-        #  - for regular attn q, k, v: (batch_size, seq_len, d_model)
-        #  - for multi-query attn q: (batch_size, seq_len, d_model)
-        #                      k, v: (batch_size, seq_len, d_model // n_heads)
-        x_normed = self.attn_norm(x)
-        q = self.q_proj(x_normed)
-        k = self.k_proj(x_normed)
-        v = self.v_proj(x_normed)
-
-        if self.config.clip_qkv is not None:
-            q.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
-            k.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
-            v.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
-
-        # Get attention scores.
-        if self._activation_checkpoint_fn is not None:
-            att, cache = self._activation_checkpoint_fn(  # type: ignore
-                self.attention, q, k, v, attention_bias, position_ids=position_ids, drop_mask=drop_mask, layer_past=layer_past, use_cache=use_cache
-            )
-        else:
-            att, cache = self.attention(q, k, v, attention_bias, position_ids=position_ids, drop_mask=drop_mask, layer_past=layer_past, use_cache=use_cache)
-
-        # Add attention scores.
-        # shape: (B, T, C)
-        x = x + self.dropout(att, drop_mask=drop_mask)
-
-        # Add feed-forward projection.
-        # shape: (batch_size, seq_len, d_model)
-        og_x = x
-        if self._activation_checkpoint_fn is not None:
-            x = self._activation_checkpoint_fn(self.ff_norm, x)  # type: ignore
-        else:
-            x = self.ff_norm(x)
-        x1 = self.ff_proj1(x)
-        x2 = self.ff_proj2(x)
-        if self._activation_checkpoint_fn is not None:
-            x = self._activation_checkpoint_fn(self.act, x1, x2)  # type: ignore
-        else:
-            x = self.act(x1, x2)
-        x = self.ff_out(x)
-        x = self.dropout(x, drop_mask=drop_mask)
-        x = og_x + x
-
-        return x, cache


 class MolmoSequentialBlock(MolmoBlock):
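The removed OLMoLlamaBlock handled grouped-query attention by expanding K and V to the query head count with repeat_interleave and then running ordinary scaled-dot-product attention. A standalone sketch of that pattern (assumes PyTorch 2.x for F.scaled_dot_product_attention; gqa_sdpa_demo is an illustrative name, not from the file):

import torch
import torch.nn.functional as F

def gqa_sdpa_demo(q, k, v, is_causal=True):
    # q: (B, n_q_heads, T, head_dim); k, v: (B, n_kv_heads, T, head_dim)
    num_q_heads, num_kv_heads = q.size(1), k.size(1)
    if num_q_heads != num_kv_heads:
        assert num_q_heads % num_kv_heads == 0
        # every group of query heads shares one key/value head
        k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
        v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
    return F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)

q = torch.randn(2, 8, 5, 16)   # 8 query heads
k = torch.randn(2, 2, 5, 16)   # 2 key/value heads
v = torch.randn(2, 2, 5, 16)
print(gqa_sdpa_demo(q, k, v).shape)   # torch.Size([2, 8, 5, 16])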
@@ -644,7 +464,6 @@ class MolmoSequentialBlock(MolmoBlock):
         x: torch.Tensor,
         attention_bias: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        drop_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
@@ -673,10 +492,10 @@ class MolmoSequentialBlock(MolmoBlock):
         # Get attention scores.
         if self._activation_checkpoint_fn is not None:
             att, cache = self._activation_checkpoint_fn(  # type: ignore
-                self.attention, q, k, v, attention_bias, position_ids=position_ids,
+                self.attention, q, k, v, attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache
             )
         else:
-            att, cache = self.attention(q, k, v, attention_bias, position_ids=position_ids,
+            att, cache = self.attention(q, k, v, attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)

         if self.config.norm_after:
             if self._activation_checkpoint_fn is not None:
@@ -686,7 +505,7 @@ class MolmoSequentialBlock(MolmoBlock):

         # Add attention scores.
         # shape: (B, T, C)
-        x = x + self.dropout(att
+        x = x + self.dropout(att)

         # Add feed-forward projection.
         # shape: (batch_size, seq_len, d_model)
@@ -711,7 +530,7 @@ class MolmoSequentialBlock(MolmoBlock):
         else:
             x = self.ff_norm(x)

-        x = self.dropout(x
+        x = self.dropout(x)
         x = og_x + x

         return x, cache
@@ -757,27 +576,14 @@ class Dropout(nn.Dropout):
         self.mask_p = mask_p
         self.broadcast_dims = broadcast_dims

-    def forward(self, input: torch.Tensor
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
         """
         :param input: A tensor of shape `(batch_size, seq_len, embed_dim)`
-        :param drop_mask: A tensor of shape `(batch_size, seq_len)` with values of zero or one.
         """
         if self.p == 0.0 and (self.mask_p is None or self.mask_p == 0.0):
             return input
         else:
-            if self.
-                assert drop_mask is not None
-                drop_mask = drop_mask.to(input.dtype)
-                keep_prob = 1.0 - self.p
-                keep_prob2 = 1.0 - self.mask_p
-                keep_prob = drop_mask * keep_prob2 + (1 - drop_mask) * keep_prob
-                keep_prob = keep_prob.unsqueeze(-1)
-                dropout_shape = list(input.shape)
-                keep_prob = keep_prob.broadcast_to(dropout_shape)
-                multiplier = input.new_empty(dropout_shape).bernoulli_(keep_prob)
-                multiplier.div_(keep_prob)
-                return input * multiplier
-            elif self.p > 0. and len(self.broadcast_dims) > 0 and self.training:
+            if self.p > 0. and len(self.broadcast_dims) > 0 and self.training:
                 keep_prob = 1.0 - self.p
                 dropout_shape = list(input.shape)
                 for dim in self.broadcast_dims:
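The branch deleted from Dropout.forward implemented per-token "response" dropout: positions flagged by drop_mask were dropped with probability mask_p, all other positions with probability p, with the usual inverted-dropout rescaling so expectations are preserved. A minimal sketch of that math in plain PyTorch (the function name is illustrative, not from the file):

import torch

def masked_dropout_demo(x: torch.Tensor, drop_mask: torch.Tensor, p: float, mask_p: float) -> torch.Tensor:
    # x: (B, T, D); drop_mask: (B, T), 1.0 where mask_p should apply instead of p
    drop_mask = drop_mask.to(x.dtype)
    keep_prob = drop_mask * (1.0 - mask_p) + (1.0 - drop_mask) * (1.0 - p)   # (B, T)
    keep_prob = keep_prob.unsqueeze(-1).broadcast_to(x.shape)                # (B, T, D)
    multiplier = x.new_empty(x.shape).bernoulli_(keep_prob) / keep_prob      # inverted dropout
    return x * multiplier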
@@ -792,7 +598,6 @@ class Dropout(nn.Dropout):

 @dataclass
 class VisionBackboneConfig:
-    image_model_type: str = "openai"
     image_default_input_size: Tuple[int, int] = (336, 336)
     image_patch_size: int = 14
     image_pos_patch_size: int = 14
@@ -832,17 +637,12 @@ class FullMolmoConfig:
     mlp_ratio: int = 4
     mlp_hidden_size: Optional[int] = None
     activation_type: str = "swiglu"
-    block_type: str = "sequential"
     block_group_size: int = 1
-
-    alibi_bias_max: float = 8.0
-    rope: bool = False
+    rope: bool = True
     rope_full_precision: bool = True
     rope_theta: float = 10000.
-    rope_impl: str = "
+    rope_impl: str = "interleave"
     vision_backbone: Optional[VisionBackboneConfig] = None
-    vit_load_path: Optional[str] = None
-    llm_load_path: Optional[str] = None
     attention_type: str = "sdpa"
     float32_attention: bool = True
     attention_dropout: float = 0.1
@@ -850,7 +650,6 @@ class FullMolmoConfig:
     multi_query_attention: Optional[bool] = None
     attention_layer_norm: bool = False
     residual_dropout: float = 0.1
-    response_residual_dropout: float = 0.0
     embedding_dropout: float = 0.1
     layer_norm_type: str = "default"
     layer_norm_with_affine: bool = True
@@ -872,10 +671,6 @@ class FullMolmoConfig:
     init_cutoff_factor: Optional[float] = None
     norm_after: bool = False
     precision: Optional[str] = None
-    max_crops: int = 12
-    crop_mode: str = "patchify-v2-and-resize-c2"
-    do_random_scale: bool = True
-    use_col_tokens: bool = True
     image_padding_embed: Optional[str] = None
     vit_layers: Tuple = (-1,)
     image_pooling_h: int = 2
@@ -883,12 +678,9 @@ class FullMolmoConfig:
     image_pooling_2d: str = "attention"
     image_projector: str = "mlp"
     image_feature_dropout: float = 0.0
-    use_cls_feature: bool = False
     initializer_range: float = 0.02
-    pad_tokenizer: bool = False
     normalize_input_embeds: bool = False
     use_position_ids: bool = True
-    query_pre_attn_scalar: int = 224

     @property
     def effective_n_kv_heads(self) -> int:
@@ -1112,7 +904,7 @@ class VisionTransformer(nn.Module):
         if patch_num is None:
             patch_num = self.config.vision_backbone.image_num_patch
         B, N, D = x.shape
-
+
         x = self.patch_embedding(x)

         # class embeddings and positional embeddings
@@ -1526,15 +1318,6 @@ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):

         self.num_prefix_tokens = self.image_vit.num_prefix_tokens
         assert self.num_prefix_tokens in {0, 1}, "Only 0 or 1 prefix tokens are supported"
-        if config.use_cls_feature:
-            assert self.num_prefix_tokens > 0, "The model does not have a CLS token"
-            nlayers = 1 if config.vit_layers is None else len(config.vit_layers)
-            self.cls_projector = nn.Linear(
-                nlayers * v_cfg.image_emb_dim,
-                self.input_dim,
-                bias=False,
-                device=config.init_device,
-            )

         self.pad_embed = None
         if config.image_padding_embed:
@@ -1551,8 +1334,6 @@ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):
     def reset_parameters(self):
         super().reset_parameters()
         self.image_vit.reset_parameters()
-        if self.config.use_cls_feature:
-            nn.init.xavier_uniform_(self.cls_projector.weight)

     def encode_image(self, images: torch.Tensor) -> torch.Tensor:
         """
@@ -1562,7 +1343,7 @@ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):
         v_cfg = self.config.vision_backbone
         B, T, N, D = images.shape

-        mask = torch.all(images.view(B * T, N, D)
+        mask = ~torch.all(images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)

         # Output all hidden states
         # n_layers x (batch_num_crops, (1+)n_tokens, image_emb_dim)
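The rewritten mask line above marks an image crop as padding when every value in it equals -1. A small self-contained illustration of that convention (shapes and values are made up; the torch.all call is the same one used in the new line):

import torch

B, T, N, D = 2, 3, 4, 6                  # batch, crops per image, patches per crop, patch dim
images = torch.randn(B, T, N, D)
images[0, 2] = -1                        # pretend the last crop of sample 0 is padding

flat = images.view(B * T, N, D)
mask = ~torch.all(flat == -1, dim=(1, 2), keepdim=True)   # (B*T, 1, 1), False for padded crops
print(mask.squeeze())                    # tensor([ True,  True, False,  True,  True,  True])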
@@ -1658,9 +1439,6 @@ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):
         else:
             image_features = self.image_projector(image_features)

-        if self.config.use_cls_feature:
-            raise NotImplementedError()
-
         # image_features: (batch_size, num_image, num_patch, d_model)
         # cls_embed: (batch_size, num_image, d_model)
         return image_features, cls_embed
@@ -1944,7 +1722,7 @@ class Molmo(nn.Module):
         else:
             self.transformer.update({"blocks": nn.ModuleList(blocks)})

-        if not
+        if not self.config.rope:
             self.transformer.update(
                 {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
             )
@@ -2105,23 +1883,7 @@ class Molmo(nn.Module):

             x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]

-
-                x = torch.cat([x[:, :1], cls_embed, x[:, 1:-num_image]], dim=1)
-
-                valid_images = torch.any(
-                    (image_input_idx >= 0).view(batch_size, num_image, num_patch), dim=-1
-                )
-                valid_images = valid_images.to(attention_mask.dtype)
-                attention_mask = torch.cat(
-                    [attention_mask[:, :1], valid_images, attention_mask[:, 1:-num_image]],
-                    dim=1,
-                )
-                position_ids = torch.clamp(
-                    torch.cumsum(attention_mask, dim=-1) - 1,
-                    min=0,
-                ).broadcast_to((batch_size, attention_mask.shape[-1]))
-
-        if not (self.config.alibi or self.config.rope):
+        if not self.config.rope:
             # Get positional embeddings.
             # shape: (1, seq_len)
             pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
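The removed block above rebuilt position_ids from the attention mask with a clamped cumulative sum, so masked-out positions do not advance the position counter. A tiny illustration with made-up values:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0, 1, 1]], dtype=torch.float)
position_ids = torch.clamp(torch.cumsum(attention_mask, dim=-1) - 1, min=0).long()
print(position_ids)   # tensor([[0, 1, 2, 2, 2, 3, 4]])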
@@ -2151,17 +1913,12 @@ class Molmo(nn.Module):
         if (
             attention_bias is not None
             or attention_mask is not None
-            or self.config.alibi
             # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
             # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
             # scores correctly.
             or past_key_values is not None
         ):
-            if attention_bias is None
-                attention_bias = get_causal_attention_bias(
-                    self.__cache, past_length + seq_len, x.device
-                ) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
-            elif attention_bias is None:
+            if attention_bias is None:
                 attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
             elif attention_bias.dtype in (torch.int8, torch.bool):
                 attention_bias = attention_bias.to(dtype=torch.float)
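Context for the NOTE kept above: when a key/value cache is in use, the code materializes an additive causal bias over all past_length + seq_len keys and hands it to SDPA, instead of relying on is_causal. A standalone sketch of that shape logic (this is not the file's get_causal_attention_bias, just an illustration):

import torch

def causal_bias_demo(total_len: int, device=None) -> torch.Tensor:
    # 0 where attention is allowed, a large negative value strictly above the diagonal
    bias = torch.triu(torch.ones(total_len, total_len, device=device), diagonal=1)
    bias = bias.masked_fill(bias == 1, torch.finfo(torch.float).min)
    return bias.view(1, 1, total_len, total_len)

past_length, q_len = 3, 2                         # 3 cached tokens, 2 new tokens
bias = causal_bias_demo(past_length + q_len)[:, :, past_length:, :]
print(bias.shape)                                 # torch.Size([1, 1, 2, 5])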
@@ -2196,7 +1953,7 @@ class Molmo(nn.Module):
                 all_hidden_states.append(x)

             layer_past = None if past_key_values is None else past_key_values[block_idx]
-            x, cache = block(x, attention_bias=attention_bias, position_ids=position_ids,
+            x, cache = block(x, attention_bias=attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)

             if attn_key_values is not None:
                 assert cache is not None
@@ -2215,19 +1972,12 @@ class Molmo(nn.Module):
                     ]
                 )
                 x, cache = block_group(
-                    x, attention_bias=attention_bias, position_ids=position_ids,
+                    x, attention_bias=attention_bias, position_ids=position_ids, layers_past=layers_past, use_cache=use_cache
                 )
                 if attn_key_values is not None:
                     assert cache is not None
                     attn_key_values.extend(cache)

-        if images is not None and self.config.use_cls_feature:
-            assert num_image is not None
-            x = torch.cat(
-                [x[:, :1], x[:, num_image+1:], torch.zeros_like(x[:, :num_image])],
-                dim=1,
-            )
-
         if last_logits_only:
             # shape: (batch_size, 1, d_model)
             if append_last_valid_logits is not None:
@@ -2271,9 +2021,9 @@ class MolmoForCausalLM(PreTrainedModel):

         if not model:
             full_config = FullMolmoConfig(
-                attention_layer_norm=config.attention_layer_norm,
                 image_padding_embed="pad_and_partial_pad",
                 image_pooling_2d="attention-meanq",
+                attention_layer_norm=config.attention_layer_norm,
                 rope_impl="llama",
                 vocab_size=config.vocab_size,
                 max_sequence_length=config.max_position_embeddings,
@@ -2282,7 +2032,6 @@ class MolmoForCausalLM(PreTrainedModel):
                 embedding_size=config.embedding_size,
                 attention_type="sdpa",
                 embedding_dropout=0,
-                response_residual_dropout=0,
                 attention_dropout=0,
                 residual_dropout=0,
                 rope=True,
@@ -2297,10 +2046,8 @@ class MolmoForCausalLM(PreTrainedModel):
                 rope_theta=config.rope_theta,
                 layer_norm_eps=config.layer_norm_eps,
                 layer_norm_type=config.layer_norm_type,
-                pad_tokenizer=True,
                 vit_layers=[-2, -9],
                 vision_backbone=VisionBackboneConfig(
-                    image_model_type="openai",
                     image_default_input_size=(336, 336),
                     image_patch_size=14,
                     image_pos_patch_size=14,