jbochi committed
Commit 9244048
1 Parent(s): 06e4cba

Fix activation and use rotary embeddings

Files changed (2)
  1. config.json +4 -2
  2. decoder_only_t5/modeling.py +425 -32
config.json CHANGED
@@ -9,7 +9,7 @@
   "decoder_start_token_id": 0,
   "pad_token_id": 1,
   "eos_token_id": 3,
-  "feed_forward_proj": "gated-gelu",
+  "feed_forward_proj": "gated-swish",
   "initializer_factor": 1.0,
   "is_encoder_decoder": false,
   "is_decoder_only": true,
@@ -29,5 +29,7 @@
   "vocab_size": 256512,
   "parallel_layers": true,
   "has_relative_attention_bias": false,
-  "multi_query_attention": true
+  "multi_query_attention": true,
+  "use_rotary_embedding": true,
+  "rotary_embedding_max_timescale": 1000
 }
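
These keys feed directly into the modeling changes below: `use_rotary_embedding` switches the attention layers to RoPE, `rotary_embedding_max_timescale` is passed as the rotary `base`, and the corrected `feed_forward_proj` selects a gated swish (SiLU) feed-forward in place of gated GELU. A minimal sketch of how the timescale sets the per-dimension rotation frequencies (the head dimension below is an illustrative assumption, not a value from this config):

```python
import torch

def rope_inv_freq(dim: int, base: float) -> torch.Tensor:
    # Same formula the rotary embedding in decoder_only_t5/modeling.py uses:
    # inv_freq[i] = 1 / base ** (2i / dim)
    return 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))

head_dim = 128  # assumed head size, for illustration only
print(rope_inv_freq(head_dim, base=1000)[:4])   # base = rotary_embedding_max_timescale
print(rope_inv_freq(head_dim, base=10000)[:4])  # LLaMA-style default, for comparison
```

A smaller base shortens the longest rotation wavelength, which is on the order of 2π·base positions.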
decoder_only_t5/modeling.py CHANGED
@@ -36,6 +36,84 @@ class DecoderOnlyT5LayerFF(modeling_t5.T5LayerFF):
         self.dropout = nn.Dropout(config.dropout_rate)


+# LlamaRotaryEmbedding
+class T5DecoderOnlyRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (
+            self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings,
+            device=self.inv_freq.device,
+            dtype=torch.get_default_dtype(),
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(
+            self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
+        )
+
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
 # https://github.com/huggingface/transformers/blob/7ee995fd9c692761c4601ddbffa2ac2ec9f27b0b/src/transformers/models/llama/modeling_llama.py#L263
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """
@@ -72,9 +150,16 @@ class DecoderOnlyT5Attention(modeling_t5.T5Attention):
         self.dropout = config.dropout_rate
         self.inner_dim = self.n_heads * self.key_value_proj_dim
         self.kv_inner_dim = self.n_kv_heads * self.key_value_proj_dim
+        if config.use_rotary_embedding:
+            self.rotary_embedding = T5DecoderOnlyRotaryEmbedding(
+                self.key_value_proj_dim,
+                max_position_embeddings=config.relative_attention_max_distance,
+                base=config.rotary_embedding_max_timescale,
+            )
+        else:
+            self.rotary_embedding = None

         # Mesh TensorFlow initialization to avoid scaling before softmax
-
         self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
         self.k = nn.Linear(self.d_model, self.kv_inner_dim, bias=False)
         self.v = nn.Linear(self.d_model, self.kv_inner_dim, bias=False)
@@ -93,6 +178,7 @@ class DecoderOnlyT5Attention(modeling_t5.T5Attention):
         mask=None,
         key_value_states=None,
         position_bias=None,
+        position_ids=None,
         past_key_value=None,
         layer_head_mask=None,
         query_length=None,
@@ -144,21 +230,25 @@ class DecoderOnlyT5Attention(modeling_t5.T5Attention):
                 # cross-attn
                 # (batch_size, n_kv_heads, seq_length, dim_per_head)
                 hidden_states = shape(proj_layer(key_value_states), self.n_kv_heads)
+            return hidden_states

-            if past_key_value is not None:
-                if key_value_states is None:
-                    # self-attn
-                    # (batch_size, n_kv_heads, key_length, dim_per_head)
-                    hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
-                elif past_key_value.shape[2] != key_value_states.shape[1]:
-                    # checking that the `sequence_length` of the `past_key_value` is the same as
-                    # the provided `key_value_states` to support prefix tuning
-                    # cross-attn
-                    # (batch_size, n_kv_heads, seq_length, dim_per_head)
-                    hidden_states = shape(proj_layer(key_value_states), self.n_kv_heads)
-                else:
-                    # cross-attn
-                    hidden_states = past_key_value
+        def concat_past_key_value(hidden_states, past_key_value, key_value_states):
+            if key_value_states is None:
+                # self-attn
+                # (batch_size, n_kv_heads, key_length, dim_per_head)
+                hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
+            elif past_key_value.shape[2] != key_value_states.shape[1]:
+                # checking that the `sequence_length` of the `past_key_value` is the same as
+                # the provided `key_value_states` to support prefix tuning
+                # cross-attn
+                # (batch_size, n_kv_heads, seq_length, dim_per_head)
+                raise NotImplementedError(
+                    "cross attention with RoPE and past KV is not implemented"
+                )
+                # hidden_states = shape(proj_layer(key_value_states), self.n_kv_heads)
+            else:
+                # cross-attn
+                hidden_states = past_key_value
             return hidden_states

         # get query states
@@ -167,24 +257,35 @@ class DecoderOnlyT5Attention(modeling_t5.T5Attention):
         ) # (batch_size, n_heads, seq_length, dim_per_head)

         # get key/value states
-        key_states = repeat_kv(
-            project(
-                hidden_states,
-                self.k,
-                key_value_states,
-                past_key_value[0] if past_key_value is not None else None,
-            ),
-            self.n_kv_groups,
-        )
-        value_states = repeat_kv(
-            project(
-                hidden_states,
-                self.v,
-                key_value_states,
-                past_key_value[1] if past_key_value is not None else None,
-            ),
-            self.n_kv_groups,
-        )
+        key_states = project(hidden_states, self.k, key_value_states, past_key_value)
+        value_states = project(hidden_states, self.v, key_value_states, past_key_value)
+
+        # RoPE
+        if self.rotary_embedding is not None:
+            kv_seq_len = key_states.shape[-2]
+            if past_key_value:
+                kv_seq_len += past_key_value[0].shape[-2]
+            cos, sin = self.rotary_embedding(query_states, seq_len=kv_seq_len)
+            query_states, key_states = apply_rotary_pos_emb(
+                query_states, key_states, cos, sin, position_ids
+            )
+
+        # concat past
+        if past_key_value is not None:
+            key_states = concat_past_key_value(
+                key_states,
+                past_key_value[0],
+                key_value_states,
+            )
+            value_states = concat_past_key_value(
+                value_states,
+                past_key_value[1],
+                key_value_states,
+            )
+
+        # MultiQueryDotProductAttention
+        key_states = repeat_kv(key_states, self.n_kv_groups)
+        value_states = repeat_kv(value_states, self.n_kv_groups)

         # compute scores
         scores = torch.matmul(
@@ -266,6 +367,7 @@ class DecoderOnlyT5LayerSelfAttention(modeling_t5.T5LayerSelfAttention):
         hidden_states,
         attention_mask=None,
         position_bias=None,
+        position_ids=None,
         layer_head_mask=None,
         past_key_value=None,
         use_cache=False,
@@ -279,6 +381,7 @@ class DecoderOnlyT5LayerSelfAttention(modeling_t5.T5LayerSelfAttention):
             x,
             mask=attention_mask,
             position_bias=position_bias,
+            position_ids=position_ids,
             layer_head_mask=layer_head_mask,
             past_key_value=past_key_value,
             use_cache=use_cache,
@@ -320,6 +423,7 @@ class DecoderOnlyT5Block(modeling_t5.T5Block):
         hidden_states,
         attention_mask=None,
         position_bias=None,
+        position_ids=None,
         encoder_hidden_states=None,
         encoder_attention_mask=None,
         encoder_decoder_position_bias=None,
@@ -361,6 +465,7 @@ class DecoderOnlyT5Block(modeling_t5.T5Block):
             x,
             attention_mask=attention_mask,
             position_bias=position_bias,
+            position_ids=position_ids,
             layer_head_mask=layer_head_mask,
             past_key_value=self_attn_past_key_value,
             use_cache=use_cache,
@@ -398,6 +503,7 @@ class DecoderOnlyT5Block(modeling_t5.T5Block):
                 key_value_states=encoder_hidden_states,
                 attention_mask=encoder_attention_mask,
                 position_bias=encoder_decoder_position_bias,
+                # position_ids ?
                 layer_head_mask=cross_attn_layer_head_mask,
                 past_key_value=cross_attn_past_key_value,
                 query_length=query_length,
@@ -486,6 +592,284 @@ class DecoderOnlyT5Stack(modeling_t5.T5Stack):
         self.device_map = None
         self.gradient_checkpointing = False

+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        inputs_embeds=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        # Model parallel
+        if self.model_parallel:
+            torch.cuda.set_device(self.first_device)
+            self.embed_tokens = self.embed_tokens.to(self.first_device)
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        if input_ids is not None and inputs_embeds is not None:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds"
+            )
+
+        if position_ids is None:
+            seq_length = input_ids.shape[1]
+            past_key_values_length = (
+                0 if past_key_values is None else past_key_values[0][0].shape[2]
+            )
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            if self.embed_tokens is None:
+                raise ValueError(
+                    "You have to initialize the model with valid token embeddings"
+                )
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        batch_size, seq_length = input_shape
+
+        # required mask seq length can be calculated via length of past
+        mask_seq_length = (
+            past_key_values[0][0].shape[2] + seq_length
+            if past_key_values is not None
+            else seq_length
+        )
+
+        if use_cache is True:
+            if not self.is_decoder:
+                raise ValueError(
+                    f"`use_cache` can only be set to `True` if {self} is used as a decoder"
+                )
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                batch_size, mask_seq_length, device=inputs_embeds.device
+            )
+        if (
+            self.is_decoder
+            and encoder_attention_mask is None
+            and encoder_hidden_states is not None
+        ):
+            encoder_seq_length = encoder_hidden_states.shape[1]
+            encoder_attention_mask = torch.ones(
+                batch_size,
+                encoder_seq_length,
+                device=inputs_embeds.device,
+                dtype=torch.long,
+            )
+
+        # initialize past_key_values with `None` if past does not exist
+        if past_key_values is None:
+            past_key_values = [None] * len(self.block)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask = self.get_extended_attention_mask(
+            attention_mask, input_shape
+        )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.is_decoder and encoder_hidden_states is not None:
+            (
+                encoder_batch_size,
+                encoder_sequence_length,
+                _,
+            ) = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(
+                    encoder_hidden_shape, device=inputs_embeds.device
+                )
+            encoder_extended_attention_mask = self.invert_attention_mask(
+                encoder_attention_mask
+            )
+        else:
+            encoder_extended_attention_mask = None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+        cross_attn_head_mask = self.get_head_mask(
+            cross_attn_head_mask, self.config.num_layers
+        )
+        present_key_value_states = () if use_cache else None
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
+        position_bias = None
+        encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(inputs_embeds)
+
+        for i, (layer_module, past_key_value) in enumerate(
+            zip(self.block, past_key_values)
+        ):
+            layer_head_mask = head_mask[i]
+            cross_attn_layer_head_mask = cross_attn_head_mask[i]
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if position_bias is not None:
+                    position_bias = position_bias.to(hidden_states.device)
+                if encoder_hidden_states is not None:
+                    encoder_hidden_states = encoder_hidden_states.to(
+                        hidden_states.device
+                    )
+                if encoder_extended_attention_mask is not None:
+                    encoder_extended_attention_mask = (
+                        encoder_extended_attention_mask.to(hidden_states.device)
+                    )
+                if encoder_decoder_position_bias is not None:
+                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(
+                        hidden_states.device
+                    )
+                if layer_head_mask is not None:
+                    layer_head_mask = layer_head_mask.to(hidden_states.device)
+                if cross_attn_layer_head_mask is not None:
+                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(
+                        hidden_states.device
+                    )
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer_module.forward,
+                    hidden_states,
+                    extended_attention_mask,
+                    position_bias,
+                    encoder_hidden_states,
+                    encoder_extended_attention_mask,
+                    encoder_decoder_position_bias,
+                    layer_head_mask,
+                    cross_attn_layer_head_mask,
+                    None,  # past_key_value is always None with gradient checkpointing
+                    use_cache,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask=extended_attention_mask,
+                    position_bias=position_bias,
+                    position_ids=position_ids,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_extended_attention_mask,
+                    encoder_decoder_position_bias=encoder_decoder_position_bias,
+                    layer_head_mask=layer_head_mask,
+                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
+                    past_key_value=past_key_value,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                )
+
+            # layer_outputs is a tuple with:
+            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
+            if use_cache is False:
+                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
+
+            hidden_states, present_key_value_state = layer_outputs[:2]
+
+            # We share the position biases between the layers - the first layer store them
+            # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
+            # (cross-attention position bias), (cross-attention weights)
+            position_bias = layer_outputs[2]
+            if self.is_decoder and encoder_hidden_states is not None:
+                encoder_decoder_position_bias = layer_outputs[
+                    4 if output_attentions else 3
+                ]
+            # append next layer key value states
+            if use_cache:
+                present_key_value_states = present_key_value_states + (
+                    present_key_value_state,
+                )
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[3],)
+                if self.is_decoder:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    present_key_value_states,
+                    all_hidden_states,
+                    all_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return modeling_t5.BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+

 class DecoderOnlyT5Model(modeling_t5.T5ForConditionalGeneration):
     def __init__(self, config: DecoderOnlyT5Config):
@@ -513,6 +897,14 @@ class DecoderOnlyT5Model(modeling_t5.T5ForConditionalGeneration):
         self.model_parallel = False
         self.device_map = None

+    def _tie_weights(self):
+        if not self.config.tie_word_embeddings:
+            return
+        if self.encoder:
+            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
+        if self.decoder:
+            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
+
     @add_start_docstrings_to_model_forward(modeling_t5.T5_INPUTS_DOCSTRING)
     @replace_return_docstrings(
         output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
@@ -520,8 +912,8 @@ class DecoderOnlyT5Model(modeling_t5.T5ForConditionalGeneration):
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
@@ -560,6 +952,7 @@ class DecoderOnlyT5Model(modeling_t5.T5ForConditionalGeneration):
         # Decode
         outputs = self.decoder(
             input_ids=input_ids,
+            position_ids=position_ids,
             attention_mask=attention_mask,
             inputs_embeds=inputs_embeds,
             past_key_values=past_key_values,
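
For reference, a self-contained sketch of the rotary application this commit introduces. The two helpers mirror `rotate_half` and `apply_rotary_pos_emb` added above; the tensor sizes and the base value are illustrative assumptions:

```python
import torch

def rotate_half(x):
    # Split the head dimension in two and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    # Gather the angles for the requested positions and broadcast over heads.
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

batch, heads, head_dim = 1, 4, 64  # assumed sizes
past_len, new_len = 7, 1           # decoding one token with a KV cache of length 7
base = 1000                        # rotary_embedding_max_timescale

# Build the cos/sin cache the same way T5DecoderOnlyRotaryEmbedding does.
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
t = torch.arange(past_len + new_len).float()
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
cos, sin = emb.cos(), emb.sin()

# Only the new token is projected; its position id is offset by the cache length,
# as in the position_ids computation added to DecoderOnlyT5Stack.forward.
q = torch.randn(batch, heads, new_len, head_dim)
k = torch.randn(batch, heads, new_len, head_dim)
position_ids = torch.arange(past_len, past_len + new_len).unsqueeze(0)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 4, 1, 64]) twice
```

Since config.json also sets "has_relative_attention_bias": false, this rotation is what carries position information into attention; in the attention code above, keys are rotated before being concatenated onto the cache, so cached entries never need to be re-rotated.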