Commit 26fc2dc
jon-tow committed
Parent: ef92c21

feat: use latest modeling code

Files changed (2):
  1. configuration_stablelm.py +3 -1
  2. modeling_stablelm.py +109 -13
configuration_stablelm.py CHANGED

@@ -45,7 +45,7 @@ class StableLmConfig(PretrainedConfig):
        intermediate_size (`int`, *optional*, defaults to 6912):
            Dimension of the MLP representations.
        hidden_size (`int`, *optional*, defaults to 2560):
-           Dimension of the decoder layers and the pooler layer.
+           Number of hidden layers in the Transformer decoder.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
@@ -134,12 +134,14 @@ class StableLmConfig(PretrainedConfig):
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
+
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
+
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_cache = use_cache
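For orientation, here is a minimal usage sketch that is not part of this commit: it constructs a configuration with the defaults documented in the hunk above, and it assumes a transformers release that exposes `StableLmConfig` (otherwise the class in this repository's configuration_stablelm.py can be loaded via trust_remote_code).

    # Hedged sketch (assumes `StableLmConfig` is importable from the installed transformers).
    from transformers import StableLmConfig

    config = StableLmConfig(
        hidden_size=2560,        # dimension of the decoder layers
        intermediate_size=6912,  # dimension of the MLP representations
        num_hidden_layers=32,
        num_attention_heads=32,
    )
    print(config.hidden_size, config.num_attention_heads)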
modeling_stablelm.py CHANGED

@@ -103,7 +103,7 @@ class StableLmRotaryEmbedding(nn.Module):
        )


-# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->StableLm
+# Copied from transformers.models.falcon.modeling_falcon.FalconLinearScalingRotaryEmbedding with Falcon->StableLm
 class StableLmLinearScalingRotaryEmbedding(StableLmRotaryEmbedding):
     """StableLmRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

@@ -123,7 +123,7 @@ class StableLmLinearScalingRotaryEmbedding(StableLmRotaryEmbedding):
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)


-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->StableLm
+# Copied from transformers.models.falcon.modeling_falcon.FalconDynamicNTKScalingRotaryEmbedding with Falcon->StableLm
 class StableLmDynamicNTKScalingRotaryEmbedding(StableLmRotaryEmbedding):
     """StableLmRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

@@ -374,6 +374,102 @@ class StableLmAttention(nn.Module):
        return attn_output, attn_weights, past_key_value


+class StableLmSdpaAttention(StableLmAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "StableLmModel is using StableLmSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+        # Partial rotary embedding
+        query_rot, query_pass = (
+            query_states[..., : self.rotary_emb.dim],
+            query_states[..., self.rotary_emb.dim :],
+        )
+        key_rot, key_pass = (
+            key_states[..., : self.rotary_emb.dim],
+            key_states[..., self.rotary_emb.dim :],
+        )
+        # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
+        query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
+
+        # [batch_size, seq_length, num_heads, head_dim]
+        query_states = torch.cat((query_rot, query_pass), dim=-1)
+        key_states = torch.cat((key_rot, key_pass), dim=-1)
+
+        if past_key_value is not None:
+            # Specific to RoPE models with partial rotation
+            cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout.p if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
 class StableLmFlashAttention2(StableLmAttention):
     """
     StableLM flash attention module. This module inherits from `StableLmAttention` as the weights of the module stays
@@ -574,6 +670,7 @@ class StableLmFlashAttention2(StableLmAttention):

 ATTENTION_CLASSES = {
     "eager": StableLmAttention,
+    "sdpa": StableLmSdpaAttention,
     "flash_attention_2": StableLmFlashAttention2,
 }

@@ -669,7 +766,7 @@ STABLELM_START_DOCSTRING = r"""


 @add_start_docstrings(
-    "The bare StableLM Model outputting raw hidden-states without any specific head on top.",
+    "The bare StableLm Model outputting raw hidden-states without any specific head on top.",
     STABLELM_START_DOCSTRING,
 )
 class StableLmPreTrainedModel(PreTrainedModel):
@@ -680,6 +777,7 @@ class StableLmPreTrainedModel(PreTrainedModel):
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_cache_class = True
+    _supports_sdpa = True

     def _init_weights(self, module):
        std = self.config.initializer_range
@@ -764,7 +862,7 @@ STABLELM_INPUTS_DOCSTRING = r"""


 @add_start_docstrings(
-    "The bare StableLM Model outputting raw hidden-states without any specific head on top.",
+    "The bare StableLm Model outputting raw hidden-states without any specific head on top.",
     STABLELM_START_DOCSTRING,
 )
 class StableLmModel(StableLmPreTrainedModel):
@@ -858,6 +956,11 @@ class StableLmModel(StableLmPreTrainedModel):
        if self._attn_implementation == "flash_attention_2":
            # 2d mask is passed through the layers
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        # for output_attentions case used fallback to eager attention realization
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
@@ -999,7 +1102,7 @@ class StableLmForCausalLM(StableLmPreTrainedModel):
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        'The weather is always wonderful in the San Juan Islands, and whether you're a vacationer or a resident, here are some ideas for fun!'
+        'The weather is always wonderful in the summer in the city of San Diego. The city is located on the coast of the Pacific Ocean and is surrounded by'
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1048,7 +1151,6 @@ class StableLmForCausalLM(StableLmPreTrainedModel):
            attentions=outputs.attentions,
        )

-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
     def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
@@ -1089,12 +1191,6 @@ class StableLmForCausalLM(StableLmPreTrainedModel):
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

-        if past_key_value := getattr(self.model.layers[0].self_attn, "past_key_value", None):
-            # generation with static cache
-            seen_tokens = past_key_value.get_seq_length()
-            input_ids = input_ids[:, seen_tokens:]
-            position_ids = position_ids[:, seen_tokens:]
-
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
@@ -1123,7 +1219,7 @@ class StableLmForCausalLM(StableLmPreTrainedModel):

 @add_start_docstrings(
     """
-    The StableLM transformer with a sequence classification head on top (linear layer).
+    The StableLm transformer with a sequence classification head on top (linear layer).

     [`StableLmForSequenceClassification`] uses the last token in order to do the classification, as other causal
     models (e.g. GPT-2) do.
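Taken together, the modeling changes register a third attention backend ("sdpa") alongside "eager" and "flash_attention_2" in ATTENTION_CLASSES and mark the model with `_supports_sdpa = True`. A hedged loading sketch follows; the repository id stabilityai/stablelm-2-1_6b is an assumption (substitute the repo this commit belongs to), and trust_remote_code=True only matters when running the modeling_stablelm.py shipped in the repo rather than a native transformers port.

    # Hedged sketch: selecting the SDPA attention path enabled by this commit.
    # The model id below is an assumption, not taken from the commit.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "stabilityai/stablelm-2-1_6b"  # assumed repo id
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",  # or "eager" / "flash_attention_2", per ATTENTION_CLASSES
        trust_remote_code=True,
    )

    inputs = tokenizer("The weather is always wonderful", return_tensors="pt")
    generate_ids = model.generate(inputs.input_ids, max_length=30)
    print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0])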