jaygala24 committed
Commit
7f83575
1 Parent(s): ffa5726

Upload modeling_indictrans.py with huggingface_hub

Files changed (1)
  1. modeling_indictrans.py +271 -83
modeling_indictrans.py CHANGED
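Since this commit publishes modeling_indictrans.py to the model repository, downstream users would normally pull the class in as Hub "remote code" rather than importing it from the transformers package. A minimal loading sketch, assuming a released IndicTrans2 checkpoint hosts this file (the repository id below is a placeholder, not taken from this commit):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Placeholder repo id; substitute the checkpoint that ships this modeling file.
model_name = "ai4bharat/indictrans2-en-indic-1B"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)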
@@ -34,7 +34,7 @@ from transformers.modeling_outputs import (
34
  from transformers.utils import logging
35
  from transformers.modeling_utils import PreTrainedModel
36
 
37
- from .configuration_indictrans import IndicTransConfig
38
 
39
 
40
  logger = logging.get_logger(__name__)
@@ -45,7 +45,9 @@ INDICTRANS_PRETRAINED_MODEL_ARCHIVE_LIST = [""]
45
 
46
 
47
  # Copied from transformers.models.bart.modeling_bart.shift_tokens_right
48
- def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
 
 
49
  """
50
  Shift input ids one token to the right.
51
  """
@@ -63,7 +65,10 @@ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start
63
 
64
  # Copied from transformers.models.bart.modeling_bart._make_causal_mask
65
  def _make_causal_mask(
66
- input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 
 
 
67
  ):
68
  """
69
  Make causal mask used for bi-directional self-attention.
@@ -75,8 +80,18 @@ def _make_causal_mask(
75
  mask = mask.to(dtype)
76
 
77
  if past_key_values_length > 0:
78
- mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
79
- return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
80
 
81
 
82
  # Copied from transformers.models.bart.modeling_bart._expand_mask
@@ -91,17 +106,23 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
91
 
92
  inverted_mask = 1.0 - expanded_mask
93
 
94
- return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
95
 
96
 
97
- def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
 
 
98
  """
99
  Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
100
  are ignored. This is modified from fairseq's `utils.make_positions`.
101
  """
102
  # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
103
  mask = input_ids.ne(padding_idx).int()
104
- incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
 
 
105
  return incremental_indices.long() + padding_idx
106
 
107
 
@@ -109,23 +130,31 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l
109
  class IndicTransSinusoidalPositionalEmbedding(nn.Module):
110
  """This module produces sinusoidal positional embeddings of any length."""
111
 
112
- def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
 
 
113
  super().__init__()
114
  self.offset = 2
115
  self.embedding_dim = embedding_dim
116
  self.padding_idx = padding_idx
117
  self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
118
 
119
- def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
 
 
120
  emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
121
  if hasattr(self, "weights"):
122
  # in forward put the weights on the correct dtype and device of the param
123
- emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
 
 
124
 
125
  self.register_buffer("weights", emb_weights, persistent=False)
126
 
127
  @staticmethod
128
- def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
 
 
129
  """
130
  Build sinusoidal embeddings.
131
 
@@ -135,8 +164,12 @@ class IndicTransSinusoidalPositionalEmbedding(nn.Module):
135
  half_dim = embedding_dim // 2
136
  emb = math.log(10000) / (half_dim - 1)
137
  emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
138
- emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
139
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
 
 
 
 
140
  if embedding_dim % 2 == 1:
141
  # zero pad
142
  emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
@@ -147,26 +180,39 @@ class IndicTransSinusoidalPositionalEmbedding(nn.Module):
147
 
148
  @torch.no_grad()
149
  def forward(
150
- self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
 
 
 
151
  ):
152
  if input_ids is not None:
153
  bsz, seq_len = input_ids.size()
154
  # Create the position ids from the input token ids. Any padded tokens remain padded.
155
- position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
156
- input_ids.device
157
- )
158
  else:
159
  bsz, seq_len = inputs_embeds.size()[:-1]
160
- position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
 
 
161
 
162
  # expand embeddings if needed
163
  max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
164
  if max_pos > self.weights.size(0):
165
- self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
 
 
166
 
167
- return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
 
 
 
 
168
 
169
- def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
 
 
170
  """
171
  We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
172
 
@@ -179,9 +225,15 @@ class IndicTransSinusoidalPositionalEmbedding(nn.Module):
179
  sequence_length = input_shape[1]
180
 
181
  position_ids = torch.arange(
182
- self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
183
  )
184
- return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
185
 
186
 
187
  # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->IndicTrans
@@ -216,7 +268,11 @@ class IndicTransAttention(nn.Module):
216
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
217
 
218
  def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
219
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
 
 
 
220
 
221
  def forward(
222
  self,
@@ -293,7 +349,10 @@ class IndicTransAttention(nn.Module):
293
  raise ValueError(
294
  f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
295
  )
296
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
 
 
 
297
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
298
 
299
  attn_weights = F.softmax(attn_weights, dim=-1)
@@ -304,7 +363,9 @@ class IndicTransAttention(nn.Module):
304
  f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
305
  f" {layer_head_mask.size()}"
306
  )
307
- attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
 
 
308
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
309
 
310
  if output_attentions:
@@ -312,8 +373,12 @@ class IndicTransAttention(nn.Module):
312
  # make sure that attn_weights keeps its gradient.
313
  # In order to do so, attn_weights have to be reshaped
314
  # twice and have to be reused in the following
315
- attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
316
- attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
 
 
 
 
317
  else:
318
  attn_weights_reshaped = None
319
 
@@ -394,7 +459,9 @@ class IndicTransEncoderLayer(nn.Module):
394
  if self.normalize_before:
395
  hidden_states = self.final_layer_norm(hidden_states)
396
  hidden_states = self.activation_fn(self.fc1(hidden_states))
397
- hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
 
 
398
  hidden_states = self.fc2(hidden_states)
399
  hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
400
  hidden_states = residual + hidden_states
@@ -405,7 +472,9 @@ class IndicTransEncoderLayer(nn.Module):
405
  torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
406
  ):
407
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
408
- hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
 
 
409
 
410
  outputs = (hidden_states,)
411
 
@@ -480,7 +549,9 @@ class IndicTransDecoderLayer(nn.Module):
480
 
481
  # Self Attention
482
  # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
483
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
 
 
484
  # add present self-attn cache to positions 1,2 of present_key_value tuple
485
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
486
  hidden_states=hidden_states,
@@ -503,8 +574,14 @@ class IndicTransDecoderLayer(nn.Module):
503
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
504
 
505
  # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
506
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
507
- hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
508
  hidden_states=hidden_states,
509
  key_value_states=encoder_hidden_states,
510
  attention_mask=encoder_attention_mask,
@@ -512,7 +589,9 @@ class IndicTransDecoderLayer(nn.Module):
512
  past_key_value=cross_attn_past_key_value,
513
  output_attentions=output_attentions,
514
  )
515
- hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
 
 
516
  hidden_states = residual + hidden_states
517
  if not self.normalize_before:
518
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
@@ -525,7 +604,9 @@ class IndicTransDecoderLayer(nn.Module):
525
  if self.normalize_before:
526
  hidden_states = self.final_layer_norm(hidden_states)
527
  hidden_states = self.activation_fn(self.fc1(hidden_states))
528
- hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
 
 
529
  hidden_states = self.fc2(hidden_states)
530
  hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
531
  hidden_states = residual + hidden_states
@@ -577,7 +658,9 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
577
  embed_tokens (nn.Embedding): output embedding
578
  """
579
 
580
- def __init__(self, config: IndicTransConfig, embed_tokens: Optional[nn.Embedding] = None):
 
 
581
  super().__init__(config)
582
 
583
  self.dropout = config.dropout
@@ -588,7 +671,9 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
588
  self.max_source_positions = config.max_source_positions
589
  self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
590
 
591
- self.embed_tokens = nn.Embedding(config.encoder_vocab_size, embed_dim, self.padding_idx)
 
 
592
 
593
  if embed_tokens is not None:
594
  self.embed_tokens.weight = embed_tokens.weight
@@ -598,9 +683,15 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
598
  embed_dim,
599
  self.padding_idx,
600
  )
601
- self.layers = nn.ModuleList([IndicTransEncoderLayer(config) for _ in range(config.encoder_layers)])
602
- self.layer_norm = nn.LayerNorm(embed_dim) if config.encoder_normalize_before else None
603
- self.layernorm_embedding = nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
604
 
605
  self.gradient_checkpointing = False
606
  # Initialize weights and apply final processing
@@ -652,15 +743,25 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
652
  return_dict (`bool`, *optional*):
653
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
654
  """
655
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
656
  output_hidden_states = (
657
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
 
 
 
658
  )
659
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
660
 
661
  # retrieve input_ids and inputs_embeds
662
  if input_ids is not None and inputs_embeds is not None:
663
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
 
 
664
  elif input_ids is not None:
665
  self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
666
  input_shape = input_ids.size()
@@ -705,7 +806,11 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
705
  # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
706
  dropout_probability = torch.rand([])
707
 
708
- skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False
 
 
 
 
709
  if not skip_the_layer or deepspeed_zero3_is_enabled:
710
  # under deepspeed zero3 all gpus must run in sync
711
 
@@ -727,7 +832,9 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
727
  layer_outputs = encoder_layer(
728
  hidden_states,
729
  attention_mask,
730
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
 
 
731
  output_attentions=output_attentions,
732
  )
733
 
@@ -746,9 +853,15 @@ class IndicTransEncoder(IndicTransPreTrainedModel):
746
  encoder_states = encoder_states + (hidden_states,)
747
 
748
  if not return_dict:
749
- return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
 
 
 
 
750
  return BaseModelOutput(
751
- last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
 
 
752
  )
753
 
754
 
@@ -762,7 +875,9 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
762
  embed_tokens (nn.Embedding): output embedding
763
  """
764
 
765
- def __init__(self, config: IndicTransConfig, embed_tokens: Optional[nn.Embedding] = None):
 
 
766
  super().__init__(config)
767
  self.dropout = config.dropout
768
  self.layerdrop = config.decoder_layerdrop
@@ -772,7 +887,9 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
772
  self.max_target_positions = config.max_target_positions
773
  self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
774
 
775
- self.embed_tokens = nn.Embedding(config.decoder_vocab_size, embed_dim, self.padding_idx)
 
 
776
 
777
  if embed_tokens is not None:
778
  self.embed_tokens.weight = embed_tokens.weight
@@ -782,9 +899,15 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
782
  embed_dim,
783
  self.padding_idx,
784
  )
785
- self.layers = nn.ModuleList([IndicTransDecoderLayer(config) for _ in range(config.decoder_layers)])
786
- self.layer_norm = nn.LayerNorm(embed_dim) if config.decoder_normalize_before else None
787
- self.layernorm_embedding = nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
788
 
789
  self.gradient_checkpointing = False
790
  # Initialize weights and apply final processing
@@ -870,26 +993,40 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
870
  return_dict (`bool`, *optional*):
871
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
872
  """
873
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
874
  output_hidden_states = (
875
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
876
  )
877
  use_cache = use_cache if use_cache is not None else self.config.use_cache
878
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
879
 
880
  # retrieve input_ids and inputs_embeds
881
  if input_ids is not None and inputs_embeds is not None:
882
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
 
 
883
  elif input_ids is not None:
884
  input_shape = input_ids.size()
885
  input_ids = input_ids.view(-1, input_shape[-1])
886
  elif inputs_embeds is not None:
887
  input_shape = inputs_embeds.size()[:-1]
888
  else:
889
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
 
890
 
891
  # past_key_values_length
892
- past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
 
 
893
 
894
  if inputs_embeds is None:
895
  inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
@@ -914,10 +1051,14 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
914
  # expand encoder attention mask
915
  if encoder_hidden_states is not None and encoder_attention_mask is not None:
916
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
917
- encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
 
 
918
 
919
  # embed positions
920
- positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
 
 
921
  positions = positions.to(inputs_embeds.device)
922
 
923
  hidden_states = inputs_embeds + positions
@@ -929,7 +1070,8 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
929
  if self.gradient_checkpointing and self.training:
930
  if use_cache:
931
  logger.warning_once(
932
- "`use_cache=True` is incompatible with gradient checkpointing. Setting" " `use_cache=False`..."
 
933
  )
934
  use_cache = False
935
 
@@ -940,7 +1082,9 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
940
  next_decoder_cache = () if use_cache else None
941
 
942
  # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
943
- for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
 
 
944
  if attn_mask is not None:
945
  if attn_mask.size()[0] != len(self.layers):
946
  raise ValueError(
@@ -956,11 +1100,17 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
956
  # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
957
  dropout_probability = torch.rand([])
958
 
959
- skip_the_layer = True if self.training and (dropout_probability < self.layerdrop) else False
 
 
 
 
960
  if not skip_the_layer or deepspeed_zero3_is_enabled:
961
  # under deepspeed zero3 all gpus must run in sync
962
 
963
- past_key_value = past_key_values[idx] if past_key_values is not None else None
 
 
964
 
965
  if self.gradient_checkpointing and self.training:
966
 
@@ -978,7 +1128,9 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
978
  encoder_hidden_states,
979
  encoder_attention_mask,
980
  head_mask[idx] if head_mask is not None else None,
981
- cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
 
 
982
  None,
983
  )
984
  else:
@@ -987,9 +1139,13 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
987
  attention_mask=combined_attention_mask,
988
  encoder_hidden_states=encoder_hidden_states,
989
  encoder_attention_mask=encoder_attention_mask,
990
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
 
 
991
  cross_attn_layer_head_mask=(
992
- cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
 
 
993
  ),
994
  past_key_value=past_key_value,
995
  output_attentions=output_attentions,
@@ -1019,7 +1175,13 @@ class IndicTransDecoder(IndicTransPreTrainedModel):
1019
  if not return_dict:
1020
  return tuple(
1021
  v
1022
- for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
1023
  if v is not None
1024
  )
1025
  return BaseModelOutputWithPastAndCrossAttentions(
@@ -1037,7 +1199,7 @@ class IndicTransModel(IndicTransPreTrainedModel):
1037
 
1038
  def __init__(self, config: IndicTransConfig):
1039
  super().__init__(config)
1040
-
1041
  self.encoder = IndicTransEncoder(config)
1042
  self.decoder = IndicTransDecoder(config)
1043
 
@@ -1068,12 +1230,20 @@ class IndicTransModel(IndicTransPreTrainedModel):
1068
  output_hidden_states: Optional[bool] = None,
1069
  return_dict: Optional[bool] = None,
1070
  ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
1071
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
 
 
 
1072
  output_hidden_states = (
1073
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
 
 
1074
  )
1075
  use_cache = use_cache if use_cache is not None else self.config.use_cache
1076
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
1077
 
1078
  if encoder_outputs is None:
1079
  encoder_outputs = self.encoder(
@@ -1128,17 +1298,20 @@ class IndicTransModel(IndicTransPreTrainedModel):
1128
  class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1129
  base_model_prefix = "model"
1130
  _tied_weights_keys = None
 
1131
 
1132
  def __init__(self, config: IndicTransConfig):
1133
  super().__init__(config)
1134
  self.model = IndicTransModel(config)
1135
- self.lm_head = nn.Linear(config.decoder_embed_dim, config.decoder_vocab_size, bias=False)
 
 
1136
 
1137
  if config.share_decoder_input_output_embed:
1138
  self.lm_head.weight = self.model.decoder.embed_tokens.weight
1139
-
1140
  self.post_init()
1141
-
1142
  def tie_weights(self):
1143
  pass
1144
 
@@ -1153,6 +1326,9 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1153
 
1154
  def set_output_embeddings(self, new_embeddings):
1155
  self.lm_head = new_embeddings
 
 
 
1156
 
1157
  def forward(
1158
  self,
@@ -1181,7 +1357,9 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1181
 
1182
  Returns:
1183
  """
1184
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
 
1185
 
1186
  if labels is not None:
1187
  if decoder_input_ids is None:
@@ -1212,12 +1390,18 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1212
  if labels is not None:
1213
  # move labels to the correct device to enable PP
1214
  labels = labels.to(lm_logits.device)
1215
- loss_fct = nn.CrossEntropyLoss()
1216
- masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.decoder_vocab_size), labels.view(-1))
 
 
 
 
1217
 
1218
  if not return_dict:
1219
  output = (lm_logits,) + outputs[1:]
1220
- return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
 
 
1221
 
1222
  return Seq2SeqLMOutput(
1223
  loss=masked_lm_loss,
@@ -1263,5 +1447,9 @@ class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1263
  def _reorder_cache(past_key_values, beam_idx):
1264
  reordered_past = ()
1265
  for layer_past in past_key_values:
1266
- reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
1267
- return reordered_past
 
 
 
 
 
34
  from transformers.utils import logging
35
  from transformers.modeling_utils import PreTrainedModel
36
 
37
+ from configuration_indictrans import IndicTransConfig
38
 
39
 
40
  logger = logging.get_logger(__name__)
 
45
 
46
 
47
  # Copied from transformers.models.bart.modeling_bart.shift_tokens_right
48
+ def shift_tokens_right(
49
+ input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int
50
+ ):
51
  """
52
  Shift input ids one token to the right.
53
  """
 
65
 
66
  # Copied from transformers.models.bart.modeling_bart._make_causal_mask
67
  def _make_causal_mask(
68
+ input_ids_shape: torch.Size,
69
+ dtype: torch.dtype,
70
+ device: torch.device,
71
+ past_key_values_length: int = 0,
72
  ):
73
  """
74
  Make causal mask used for bi-directional self-attention.
 
80
  mask = mask.to(dtype)
81
 
82
  if past_key_values_length > 0:
83
+ mask = torch.cat(
84
+ [
85
+ torch.zeros(
86
+ tgt_len, past_key_values_length, dtype=dtype, device=device
87
+ ),
88
+ mask,
89
+ ],
90
+ dim=-1,
91
+ )
92
+ return mask[None, None, :, :].expand(
93
+ bsz, 1, tgt_len, tgt_len + past_key_values_length
94
+ )
95
 
96
 
97
  # Copied from transformers.models.bart.modeling_bart._expand_mask
 
106
 
107
  inverted_mask = 1.0 - expanded_mask
108
 
109
+ return inverted_mask.masked_fill(
110
+ inverted_mask.to(torch.bool), torch.finfo(dtype).min
111
+ )
112
 
113
 
114
+ def create_position_ids_from_input_ids(
115
+ input_ids, padding_idx, past_key_values_length=0
116
+ ):
117
  """
118
  Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
119
  are ignored. This is modified from fairseq's `utils.make_positions`.
120
  """
121
  # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
122
  mask = input_ids.ne(padding_idx).int()
123
+ incremental_indices = (
124
+ torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length
125
+ ) * mask
126
  return incremental_indices.long() + padding_idx
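As a toy illustration of the docstring above (values invented for this example, not taken from the diff): with padding_idx = 1 and no cached past, padded slots keep the padding index while real tokens are numbered from padding_idx + 1.

import torch

input_ids = torch.tensor([[5, 7, 9, 1, 1]])  # token id 1 is the pad token here
padding_idx = 1
past_key_values_length = 0

mask = input_ids.ne(padding_idx).int()  # tensor([[1, 1, 1, 0, 0]])
incremental_indices = (
    torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length
) * mask
print(incremental_indices.long() + padding_idx)  # tensor([[2, 3, 4, 1, 1]])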
127
 
128
 
 
130
  class IndicTransSinusoidalPositionalEmbedding(nn.Module):
131
  """This module produces sinusoidal positional embeddings of any length."""
132
 
133
+ def __init__(
134
+ self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None
135
+ ):
136
  super().__init__()
137
  self.offset = 2
138
  self.embedding_dim = embedding_dim
139
  self.padding_idx = padding_idx
140
  self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
141
 
142
+ def make_weights(
143
+ self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
144
+ ):
145
  emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
146
  if hasattr(self, "weights"):
147
  # in forward put the weights on the correct dtype and device of the param
148
+ emb_weights = emb_weights.to(
149
+ dtype=self.weights.dtype, device=self.weights.device
150
+ )
151
 
152
  self.register_buffer("weights", emb_weights, persistent=False)
153
 
154
  @staticmethod
155
+ def get_embedding(
156
+ num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
157
+ ):
158
  """
159
  Build sinusoidal embeddings.
160
 
 
164
  half_dim = embedding_dim // 2
165
  emb = math.log(10000) / (half_dim - 1)
166
  emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
167
+ emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(
168
+ 1
169
+ ) * emb.unsqueeze(0)
170
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(
171
+ num_embeddings, -1
172
+ )
173
  if embedding_dim % 2 == 1:
174
  # zero pad
175
  emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
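A quick shape check of the sinusoidal table built above, offered only as a sketch: it assumes this file is importable as modeling_indictrans and that, as in the upstream M2M100 code it mirrors, the padding_idx row is zeroed before the table is returned.

from modeling_indictrans import IndicTransSinusoidalPositionalEmbedding

# One row per position; the feature dimension holds [sin | cos] halves.
weights = IndicTransSinusoidalPositionalEmbedding.get_embedding(
    num_embeddings=10, embedding_dim=8, padding_idx=1
)
print(weights.shape)  # torch.Size([10, 8])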
 
180
 
181
  @torch.no_grad()
182
  def forward(
183
+ self,
184
+ input_ids: torch.Tensor = None,
185
+ inputs_embeds: torch.Tensor = None,
186
+ past_key_values_length: int = 0,
187
  ):
188
  if input_ids is not None:
189
  bsz, seq_len = input_ids.size()
190
  # Create the position ids from the input token ids. Any padded tokens remain padded.
191
+ position_ids = create_position_ids_from_input_ids(
192
+ input_ids, self.padding_idx, past_key_values_length
193
+ ).to(input_ids.device)
194
  else:
195
  bsz, seq_len = inputs_embeds.size()[:-1]
196
+ position_ids = self.create_position_ids_from_inputs_embeds(
197
+ inputs_embeds, past_key_values_length
198
+ )
199
 
200
  # expand embeddings if needed
201
  max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
202
  if max_pos > self.weights.size(0):
203
+ self.make_weights(
204
+ max_pos + self.offset, self.embedding_dim, self.padding_idx
205
+ )
206
 
207
+ return (
208
+ self.weights.index_select(0, position_ids.view(-1))
209
+ .view(bsz, seq_len, self.weights.shape[-1])
210
+ .detach()
211
+ )
212
 
213
+ def create_position_ids_from_inputs_embeds(
214
+ self, inputs_embeds, past_key_values_length
215
+ ):
216
  """
217
  We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
218
 
 
225
  sequence_length = input_shape[1]
226
 
227
  position_ids = torch.arange(
228
+ self.padding_idx + 1,
229
+ sequence_length + self.padding_idx + 1,
230
+ dtype=torch.long,
231
+ device=inputs_embeds.device,
232
+ )
233
+ return (
234
+ position_ids.unsqueeze(0).expand(input_shape).contiguous()
235
+ + past_key_values_length
236
  )
 
237
 
238
 
239
  # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->IndicTrans
 
268
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
269
 
270
  def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
271
+ return (
272
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
273
+ .transpose(1, 2)
274
+ .contiguous()
275
+ )
276
 
277
  def forward(
278
  self,
 
349
  raise ValueError(
350
  f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
351
  )
352
+ attn_weights = (
353
+ attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
354
+ + attention_mask
355
+ )
356
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
357
 
358
  attn_weights = F.softmax(attn_weights, dim=-1)
 
363
  f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
364
  f" {layer_head_mask.size()}"
365
  )
366
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
367
+ bsz, self.num_heads, tgt_len, src_len
368
+ )
369
  attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
370
 
371
  if output_attentions:
 
373
  # make sure that attn_weights keeps its gradient.
374
  # In order to do so, attn_weights have to be reshaped
375
  # twice and have to be reused in the following
376
+ attn_weights_reshaped = attn_weights.view(
377
+ bsz, self.num_heads, tgt_len, src_len
378
+ )
379
+ attn_weights = attn_weights_reshaped.view(
380
+ bsz * self.num_heads, tgt_len, src_len
381
+ )
382
  else:
383
  attn_weights_reshaped = None
384
 
 
459
  if self.normalize_before:
460
  hidden_states = self.final_layer_norm(hidden_states)
461
  hidden_states = self.activation_fn(self.fc1(hidden_states))
462
+ hidden_states = F.dropout(
463
+ hidden_states, p=self.activation_dropout, training=self.training
464
+ )
465
  hidden_states = self.fc2(hidden_states)
466
  hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
467
  hidden_states = residual + hidden_states
 
472
  torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
473
  ):
474
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
475
+ hidden_states = torch.clamp(
476
+ hidden_states, min=-clamp_value, max=clamp_value
477
+ )
478
 
479
  outputs = (hidden_states,)
480
 
 
549
 
550
  # Self Attention
551
  # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
552
+ self_attn_past_key_value = (
553
+ past_key_value[:2] if past_key_value is not None else None
554
+ )
555
  # add present self-attn cache to positions 1,2 of present_key_value tuple
556
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
557
  hidden_states=hidden_states,
 
574
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
575
 
576
  # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
577
+ cross_attn_past_key_value = (
578
+ past_key_value[-2:] if past_key_value is not None else None
579
+ )
580
+ (
581
+ hidden_states,
582
+ cross_attn_weights,
583
+ cross_attn_present_key_value,
584
+ ) = self.encoder_attn(
585
  hidden_states=hidden_states,
586
  key_value_states=encoder_hidden_states,
587
  attention_mask=encoder_attention_mask,
 
589
  past_key_value=cross_attn_past_key_value,
590
  output_attentions=output_attentions,
591
  )
592
+ hidden_states = F.dropout(
593
+ hidden_states, p=self.dropout, training=self.training
594
+ )
595
  hidden_states = residual + hidden_states
596
  if not self.normalize_before:
597
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
 
604
  if self.normalize_before:
605
  hidden_states = self.final_layer_norm(hidden_states)
606
  hidden_states = self.activation_fn(self.fc1(hidden_states))
607
+ hidden_states = F.dropout(
608
+ hidden_states, p=self.activation_dropout, training=self.training
609
+ )
610
  hidden_states = self.fc2(hidden_states)
611
  hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
612
  hidden_states = residual + hidden_states
 
658
  embed_tokens (nn.Embedding): output embedding
659
  """
660
 
661
+ def __init__(
662
+ self, config: IndicTransConfig, embed_tokens: Optional[nn.Embedding] = None
663
+ ):
664
  super().__init__(config)
665
 
666
  self.dropout = config.dropout
 
671
  self.max_source_positions = config.max_source_positions
672
  self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
673
 
674
+ self.embed_tokens = nn.Embedding(
675
+ config.encoder_vocab_size, embed_dim, self.padding_idx
676
+ )
677
 
678
  if embed_tokens is not None:
679
  self.embed_tokens.weight = embed_tokens.weight
 
683
  embed_dim,
684
  self.padding_idx,
685
  )
686
+ self.layers = nn.ModuleList(
687
+ [IndicTransEncoderLayer(config) for _ in range(config.encoder_layers)]
688
+ )
689
+ self.layer_norm = (
690
+ nn.LayerNorm(embed_dim) if config.encoder_normalize_before else None
691
+ )
692
+ self.layernorm_embedding = (
693
+ nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
694
+ )
695
 
696
  self.gradient_checkpointing = False
697
  # Initialize weights and apply final processing
 
743
  return_dict (`bool`, *optional*):
744
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
745
  """
746
+ output_attentions = (
747
+ output_attentions
748
+ if output_attentions is not None
749
+ else self.config.output_attentions
750
+ )
751
  output_hidden_states = (
752
+ output_hidden_states
753
+ if output_hidden_states is not None
754
+ else self.config.output_hidden_states
755
+ )
756
+ return_dict = (
757
+ return_dict if return_dict is not None else self.config.use_return_dict
758
  )
 
759
 
760
  # retrieve input_ids and inputs_embeds
761
  if input_ids is not None and inputs_embeds is not None:
762
+ raise ValueError(
763
+ "You cannot specify both input_ids and inputs_embeds at the same time"
764
+ )
765
  elif input_ids is not None:
766
  self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
767
  input_shape = input_ids.size()
 
806
  # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
807
  dropout_probability = torch.rand([])
808
 
809
+ skip_the_layer = (
810
+ True
811
+ if self.training and (dropout_probability < self.layerdrop)
812
+ else False
813
+ )
814
  if not skip_the_layer or deepspeed_zero3_is_enabled:
815
  # under deepspeed zero3 all gpus must run in sync
816
 
 
832
  layer_outputs = encoder_layer(
833
  hidden_states,
834
  attention_mask,
835
+ layer_head_mask=(
836
+ head_mask[idx] if head_mask is not None else None
837
+ ),
838
  output_attentions=output_attentions,
839
  )
840
 
 
853
  encoder_states = encoder_states + (hidden_states,)
854
 
855
  if not return_dict:
856
+ return tuple(
857
+ v
858
+ for v in [hidden_states, encoder_states, all_attentions]
859
+ if v is not None
860
+ )
861
  return BaseModelOutput(
862
+ last_hidden_state=hidden_states,
863
+ hidden_states=encoder_states,
864
+ attentions=all_attentions,
865
  )
866
 
867
 
 
875
  embed_tokens (nn.Embedding): output embedding
876
  """
877
 
878
+ def __init__(
879
+ self, config: IndicTransConfig, embed_tokens: Optional[nn.Embedding] = None
880
+ ):
881
  super().__init__(config)
882
  self.dropout = config.dropout
883
  self.layerdrop = config.decoder_layerdrop
 
887
  self.max_target_positions = config.max_target_positions
888
  self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
889
 
890
+ self.embed_tokens = nn.Embedding(
891
+ config.decoder_vocab_size, embed_dim, self.padding_idx
892
+ )
893
 
894
  if embed_tokens is not None:
895
  self.embed_tokens.weight = embed_tokens.weight
 
899
  embed_dim,
900
  self.padding_idx,
901
  )
902
+ self.layers = nn.ModuleList(
903
+ [IndicTransDecoderLayer(config) for _ in range(config.decoder_layers)]
904
+ )
905
+ self.layer_norm = (
906
+ nn.LayerNorm(embed_dim) if config.decoder_normalize_before else None
907
+ )
908
+ self.layernorm_embedding = (
909
+ nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
910
+ )
911
 
912
  self.gradient_checkpointing = False
913
  # Initialize weights and apply final processing
 
993
  return_dict (`bool`, *optional*):
994
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
995
  """
996
+ output_attentions = (
997
+ output_attentions
998
+ if output_attentions is not None
999
+ else self.config.output_attentions
1000
+ )
1001
  output_hidden_states = (
1002
+ output_hidden_states
1003
+ if output_hidden_states is not None
1004
+ else self.config.output_hidden_states
1005
  )
1006
  use_cache = use_cache if use_cache is not None else self.config.use_cache
1007
+ return_dict = (
1008
+ return_dict if return_dict is not None else self.config.use_return_dict
1009
+ )
1010
 
1011
  # retrieve input_ids and inputs_embeds
1012
  if input_ids is not None and inputs_embeds is not None:
1013
+ raise ValueError(
1014
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
1015
+ )
1016
  elif input_ids is not None:
1017
  input_shape = input_ids.size()
1018
  input_ids = input_ids.view(-1, input_shape[-1])
1019
  elif inputs_embeds is not None:
1020
  input_shape = inputs_embeds.size()[:-1]
1021
  else:
1022
+ raise ValueError(
1023
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
1024
+ )
1025
 
1026
  # past_key_values_length
1027
+ past_key_values_length = (
1028
+ past_key_values[0][0].shape[2] if past_key_values is not None else 0
1029
+ )
1030
 
1031
  if inputs_embeds is None:
1032
  inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
 
1051
  # expand encoder attention mask
1052
  if encoder_hidden_states is not None and encoder_attention_mask is not None:
1053
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1054
+ encoder_attention_mask = _expand_mask(
1055
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1056
+ )
1057
 
1058
  # embed positions
1059
+ positions = self.embed_positions(
1060
+ input_ids, inputs_embeds, past_key_values_length
1061
+ )
1062
  positions = positions.to(inputs_embeds.device)
1063
 
1064
  hidden_states = inputs_embeds + positions
 
1070
  if self.gradient_checkpointing and self.training:
1071
  if use_cache:
1072
  logger.warning_once(
1073
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting"
1074
+ " `use_cache=False`..."
1075
  )
1076
  use_cache = False
1077
 
 
1082
  next_decoder_cache = () if use_cache else None
1083
 
1084
  # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
1085
+ for attn_mask, mask_name in zip(
1086
+ [head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
1087
+ ):
1088
  if attn_mask is not None:
1089
  if attn_mask.size()[0] != len(self.layers):
1090
  raise ValueError(
 
1100
  # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1101
  dropout_probability = torch.rand([])
1102
 
1103
+ skip_the_layer = (
1104
+ True
1105
+ if self.training and (dropout_probability < self.layerdrop)
1106
+ else False
1107
+ )
1108
  if not skip_the_layer or deepspeed_zero3_is_enabled:
1109
  # under deepspeed zero3 all gpus must run in sync
1110
 
1111
+ past_key_value = (
1112
+ past_key_values[idx] if past_key_values is not None else None
1113
+ )
1114
 
1115
  if self.gradient_checkpointing and self.training:
1116
 
 
1128
  encoder_hidden_states,
1129
  encoder_attention_mask,
1130
  head_mask[idx] if head_mask is not None else None,
1131
+ cross_attn_head_mask[idx]
1132
+ if cross_attn_head_mask is not None
1133
+ else None,
1134
  None,
1135
  )
1136
  else:
 
1139
  attention_mask=combined_attention_mask,
1140
  encoder_hidden_states=encoder_hidden_states,
1141
  encoder_attention_mask=encoder_attention_mask,
1142
+ layer_head_mask=(
1143
+ head_mask[idx] if head_mask is not None else None
1144
+ ),
1145
  cross_attn_layer_head_mask=(
1146
+ cross_attn_head_mask[idx]
1147
+ if cross_attn_head_mask is not None
1148
+ else None
1149
  ),
1150
  past_key_value=past_key_value,
1151
  output_attentions=output_attentions,
 
1175
  if not return_dict:
1176
  return tuple(
1177
  v
1178
+ for v in [
1179
+ hidden_states,
1180
+ next_cache,
1181
+ all_hidden_states,
1182
+ all_self_attns,
1183
+ all_cross_attentions,
1184
+ ]
1185
  if v is not None
1186
  )
1187
  return BaseModelOutputWithPastAndCrossAttentions(
 
1199
 
1200
  def __init__(self, config: IndicTransConfig):
1201
  super().__init__(config)
1202
+
1203
  self.encoder = IndicTransEncoder(config)
1204
  self.decoder = IndicTransDecoder(config)
1205
 
 
1230
  output_hidden_states: Optional[bool] = None,
1231
  return_dict: Optional[bool] = None,
1232
  ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
1233
+ output_attentions = (
1234
+ output_attentions
1235
+ if output_attentions is not None
1236
+ else self.config.output_attentions
1237
+ )
1238
  output_hidden_states = (
1239
+ output_hidden_states
1240
+ if output_hidden_states is not None
1241
+ else self.config.output_hidden_states
1242
  )
1243
  use_cache = use_cache if use_cache is not None else self.config.use_cache
1244
+ return_dict = (
1245
+ return_dict if return_dict is not None else self.config.use_return_dict
1246
+ )
1247
 
1248
  if encoder_outputs is None:
1249
  encoder_outputs = self.encoder(
 
1298
  class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
1299
  base_model_prefix = "model"
1300
  _tied_weights_keys = None
1301
+ _label_smoothing = 0.0
1302
 
1303
  def __init__(self, config: IndicTransConfig):
1304
  super().__init__(config)
1305
  self.model = IndicTransModel(config)
1306
+ self.lm_head = nn.Linear(
1307
+ config.decoder_embed_dim, config.decoder_vocab_size, bias=False
1308
+ )
1309
 
1310
  if config.share_decoder_input_output_embed:
1311
  self.lm_head.weight = self.model.decoder.embed_tokens.weight
1312
+
1313
  self.post_init()
1314
+
1315
  def tie_weights(self):
1316
  pass
1317
 
 
1326
 
1327
  def set_output_embeddings(self, new_embeddings):
1328
  self.lm_head = new_embeddings
1329
+
1330
+ def set_label_smoothing(self, label_smoothing):
1331
+ self._label_smoothing = label_smoothing
1332
 
1333
  def forward(
1334
  self,
 
1357
 
1358
  Returns:
1359
  """
1360
+ return_dict = (
1361
+ return_dict if return_dict is not None else self.config.use_return_dict
1362
+ )
1363
 
1364
  if labels is not None:
1365
  if decoder_input_ids is None:
 
1390
  if labels is not None:
1391
  # move labels to the correct device to enable PP
1392
  labels = labels.to(lm_logits.device)
1393
+ masked_lm_loss = F.cross_entropy(
1394
+ input=lm_logits.view(-1, self.config.decoder_vocab_size),
1395
+ target=labels.view(-1),
1396
+ ignore_index=self.config.pad_token_id,
1397
+ label_smoothing=self._label_smoothing,
1398
+ )
1399
 
1400
  if not return_dict:
1401
  output = (lm_logits,) + outputs[1:]
1402
+ return (
1403
+ ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1404
+ )
1405
 
1406
  return Seq2SeqLMOutput(
1407
  loss=masked_lm_loss,
 
1447
  def _reorder_cache(past_key_values, beam_idx):
1448
  reordered_past = ()
1449
  for layer_past in past_key_values:
1450
+ reordered_past += (
1451
+ tuple(
1452
+ past_state.index_select(0, beam_idx) for past_state in layer_past
1453
+ ),
1454
+ )
1455
+ return reordered_past
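The loss hunk in this commit swaps nn.CrossEntropyLoss() for a functional call that skips padded positions via ignore_index and exposes label smoothing through the new set_label_smoothing hook (default 0.0). A hedged usage sketch; the repository id is a placeholder and assumes the checkpoint resolves to IndicTransForConditionalGeneration via remote code:

from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(
    "ai4bharat/indictrans2-en-indic-1B", trust_remote_code=True
)

# 0.0 (the class default) recovers plain cross-entropy; any value in (0, 1) spreads
# that much probability mass over non-target tokens when labels are passed to forward().
model.set_label_smoothing(0.1)

With labels supplied, the forward pass then computes F.cross_entropy(lm_logits.view(-1, decoder_vocab_size), labels.view(-1), ignore_index=pad_token_id, label_smoothing=0.1), as in the hunk above.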