Commit
•
9d452e1
1
Parent(s):
1521005
codebase added
Browse files- .gitattributes +12 -0
- IndicTransTokenizer/__init__.py +0 -0
- IndicTransTokenizer/__pycache__/__init__.cpython-39.pyc +0 -0
- IndicTransTokenizer/__pycache__/tokenizer.cpython-39.pyc +0 -0
- IndicTransTokenizer/__pycache__/utils.cpython-39.pyc +0 -0
- IndicTransTokenizer/en-indic/dict.SRC.json +3 -0
- IndicTransTokenizer/en-indic/dict.TGT.json +3 -0
- IndicTransTokenizer/en-indic/model.SRC +3 -0
- IndicTransTokenizer/en-indic/model.TGT +3 -0
- IndicTransTokenizer/indic-en/dict.SRC.json +3 -0
- IndicTransTokenizer/indic-en/dict.TGT.json +3 -0
- IndicTransTokenizer/indic-en/model.SRC +3 -0
- IndicTransTokenizer/indic-en/model.TGT +3 -0
- IndicTransTokenizer/tokenizer.py +259 -0
- IndicTransTokenizer/utils.py +591 -0
- README.md +62 -0
- configuration_indictrans.py +307 -0
- convert_indictrans_checkpoint_to_pytorch.py +107 -0
- example.py +125 -0
- handler.py +194 -0
- install.sh +52 -0
- modeling_indictrans.py +1449 -0
- sample.srt +699 -0
.gitattributes
CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
IndicTransTokenizer/en-indic/model.TGT filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.TGT filter=lfs diff=lfs merge=lfs -text
|
38 |
+
*.SRC filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.SRC.json filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.TGT.json filter=lfs diff=lfs merge=lfs -text
|
41 |
+
IndicTransTokenizer/en-indic/dict.SRC.json filter=lfs diff=lfs merge=lfs -text
|
42 |
+
IndicTransTokenizer/en-indic/dict.TGT.json filter=lfs diff=lfs merge=lfs -text
|
43 |
+
IndicTransTokenizer/en-indic/model.SRC filter=lfs diff=lfs merge=lfs -text
|
44 |
+
IndicTransTokenizer/indic-en/dict.SRC.json filter=lfs diff=lfs merge=lfs -text
|
45 |
+
IndicTransTokenizer/indic-en/dict.TGT.json filter=lfs diff=lfs merge=lfs -text
|
46 |
+
IndicTransTokenizer/indic-en/model.SRC filter=lfs diff=lfs merge=lfs -text
|
47 |
+
IndicTransTokenizer/indic-en/model.TGT filter=lfs diff=lfs merge=lfs -text
|
IndicTransTokenizer/__init__.py
ADDED
File without changes
|
IndicTransTokenizer/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (156 Bytes). View file
|
|
IndicTransTokenizer/__pycache__/tokenizer.cpython-39.pyc
ADDED
Binary file (10.1 kB). View file
|
|
IndicTransTokenizer/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (14.1 kB). View file
|
|
IndicTransTokenizer/en-indic/dict.SRC.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99cabf338bf3db11eafae2769584b8b5d3aa579989feb7e9f72236bdf69810e1
|
3 |
+
size 645274
|
IndicTransTokenizer/en-indic/dict.TGT.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f7817850c9e4b99c59fad57d0611c7720f1921f215e6f247cf25d52eff7f1146
|
3 |
+
size 3390440
|
IndicTransTokenizer/en-indic/model.SRC
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3cedc5cbcc740369b76201942a0f096fec7287fee039b55bdb956f301235b914
|
3 |
+
size 759425
|
IndicTransTokenizer/en-indic/model.TGT
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
|
3 |
+
size 3256903
|
IndicTransTokenizer/indic-en/dict.SRC.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4f9cdb988b42c4e0f4fce5e44cc66975e5088a96d111a149b9ac7d55059b8ec1
|
3 |
+
size 3391183
|
IndicTransTokenizer/indic-en/dict.TGT.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13c3a162fe655dbe99c790a413675c5d0634cd771fadcefe8d407676a7d1a311
|
3 |
+
size 644755
|
IndicTransTokenizer/indic-en/model.SRC
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
|
3 |
+
size 3256903
|
IndicTransTokenizer/indic-en/model.TGT
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3cedc5cbcc740369b76201942a0f096fec7287fee039b55bdb956f301235b914
|
3 |
+
size 759425
|
IndicTransTokenizer/tokenizer.py
ADDED
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
from transformers import BatchEncoding
|
6 |
+
from typing import Dict, List, Tuple, Union
|
7 |
+
from sentencepiece import SentencePieceProcessor
|
8 |
+
|
9 |
+
_PATH = os.path.dirname(os.path.realpath(__file__))
|
10 |
+
|
11 |
+
|
12 |
+
class IndicTransTokenizer:
|
13 |
+
def __init__(
|
14 |
+
self,
|
15 |
+
src_vocab_fp=None,
|
16 |
+
tgt_vocab_fp=None,
|
17 |
+
src_spm_fp=None,
|
18 |
+
tgt_spm_fp=None,
|
19 |
+
unk_token="<unk>",
|
20 |
+
bos_token="<s>",
|
21 |
+
eos_token="</s>",
|
22 |
+
pad_token="<pad>",
|
23 |
+
direction="indic-en",
|
24 |
+
model_max_length=256,
|
25 |
+
):
|
26 |
+
self.model_max_length = model_max_length
|
27 |
+
|
28 |
+
self.supported_langs = [
|
29 |
+
"asm_Beng",
|
30 |
+
"ben_Beng",
|
31 |
+
"brx_Deva",
|
32 |
+
"doi_Deva",
|
33 |
+
"eng_Latn",
|
34 |
+
"gom_Deva",
|
35 |
+
"guj_Gujr",
|
36 |
+
"hin_Deva",
|
37 |
+
"kan_Knda",
|
38 |
+
"kas_Arab",
|
39 |
+
"kas_Deva",
|
40 |
+
"mai_Deva",
|
41 |
+
"mal_Mlym",
|
42 |
+
"mar_Deva",
|
43 |
+
"mni_Beng",
|
44 |
+
"mni_Mtei",
|
45 |
+
"npi_Deva",
|
46 |
+
"ory_Orya",
|
47 |
+
"pan_Guru",
|
48 |
+
"san_Deva",
|
49 |
+
"sat_Olck",
|
50 |
+
"snd_Arab",
|
51 |
+
"snd_Deva",
|
52 |
+
"tam_Taml",
|
53 |
+
"tel_Telu",
|
54 |
+
"urd_Arab",
|
55 |
+
]
|
56 |
+
|
57 |
+
self.src_vocab_fp = (
|
58 |
+
src_vocab_fp
|
59 |
+
if (src_vocab_fp is not None)
|
60 |
+
else os.path.join(_PATH, direction, "dict.SRC.json")
|
61 |
+
)
|
62 |
+
self.tgt_vocab_fp = (
|
63 |
+
tgt_vocab_fp
|
64 |
+
if (tgt_vocab_fp is not None)
|
65 |
+
else os.path.join(_PATH, direction, "dict.TGT.json")
|
66 |
+
)
|
67 |
+
self.src_spm_fp = (
|
68 |
+
src_spm_fp
|
69 |
+
if (src_spm_fp is not None)
|
70 |
+
else os.path.join(_PATH, direction, "model.SRC")
|
71 |
+
)
|
72 |
+
self.tgt_spm_fp = (
|
73 |
+
tgt_spm_fp
|
74 |
+
if (tgt_spm_fp is not None)
|
75 |
+
else os.path.join(_PATH, direction, "model.TGT")
|
76 |
+
)
|
77 |
+
|
78 |
+
self.unk_token = unk_token
|
79 |
+
self.pad_token = pad_token
|
80 |
+
self.eos_token = eos_token
|
81 |
+
self.bos_token = bos_token
|
82 |
+
|
83 |
+
self.encoder = self._load_json(self.src_vocab_fp)
|
84 |
+
if self.unk_token not in self.encoder:
|
85 |
+
raise KeyError("<unk> token must be in vocab")
|
86 |
+
assert self.pad_token in self.encoder
|
87 |
+
self.encoder_rev = {v: k for k, v in self.encoder.items()}
|
88 |
+
|
89 |
+
self.decoder = self._load_json(self.tgt_vocab_fp)
|
90 |
+
if self.unk_token not in self.encoder:
|
91 |
+
raise KeyError("<unk> token must be in vocab")
|
92 |
+
assert self.pad_token in self.encoder
|
93 |
+
self.decoder_rev = {v: k for k, v in self.decoder.items()}
|
94 |
+
|
95 |
+
# load SentencePiece model for pre-processing
|
96 |
+
self.src_spm = self._load_spm(self.src_spm_fp)
|
97 |
+
self.tgt_spm = self._load_spm(self.tgt_spm_fp)
|
98 |
+
|
99 |
+
def is_special_token(self, x: str):
|
100 |
+
return (x == self.pad_token) or (x == self.bos_token) or (x == self.eos_token)
|
101 |
+
|
102 |
+
def get_vocab_size(self, src: bool) -> int:
|
103 |
+
"""Returns the size of the vocabulary"""
|
104 |
+
return len(self.encoder) if src else len(self.decoder)
|
105 |
+
|
106 |
+
def _load_spm(self, path: str) -> SentencePieceProcessor:
|
107 |
+
return SentencePieceProcessor(model_file=path)
|
108 |
+
|
109 |
+
def _save_json(self, data, path: str) -> None:
|
110 |
+
with open(path, "w", encoding="utf-8") as f:
|
111 |
+
json.dump(data, f, indent=2)
|
112 |
+
|
113 |
+
def _load_json(self, path: str) -> Union[Dict, List]:
|
114 |
+
with open(path, "r", encoding="utf-8") as f:
|
115 |
+
return json.load(f)
|
116 |
+
|
117 |
+
def _convert_token_to_id(self, token: str, src: bool) -> int:
|
118 |
+
"""Converts an token (str) into an index (integer) using the source/target vocabulary map."""
|
119 |
+
return (
|
120 |
+
self.encoder.get(token, self.encoder[self.unk_token])
|
121 |
+
if src
|
122 |
+
else self.decoder.get(token, self.encoder[self.unk_token])
|
123 |
+
)
|
124 |
+
|
125 |
+
def _convert_id_to_token(self, index: int, src: bool) -> str:
|
126 |
+
"""Converts an index (integer) into a token (str) using the source/target vocabulary map."""
|
127 |
+
return (
|
128 |
+
self.encoder_rev.get(index, self.unk_token)
|
129 |
+
if src
|
130 |
+
else self.decoder_rev.get(index, self.unk_token)
|
131 |
+
)
|
132 |
+
|
133 |
+
def _convert_tokens_to_string(self, tokens: List[str], src: bool) -> str:
|
134 |
+
"""Uses sentencepiece model for detokenization"""
|
135 |
+
if src:
|
136 |
+
if tokens[0] in self.supported_langs and tokens[1] in self.supported_langs:
|
137 |
+
tokens = tokens[2:]
|
138 |
+
return " ".join(tokens)
|
139 |
+
else:
|
140 |
+
return " ".join(tokens)
|
141 |
+
|
142 |
+
def _remove_translation_tags(self, text: str) -> Tuple[List, str]:
|
143 |
+
"""Removes the translation tags before text normalization and tokenization."""
|
144 |
+
tokens = text.split(" ")
|
145 |
+
return tokens[:2], " ".join(tokens[2:])
|
146 |
+
|
147 |
+
def _tokenize_src_line(self, line: str) -> List[str]:
|
148 |
+
"""Tokenizes a source line."""
|
149 |
+
tags, text = self._remove_translation_tags(line)
|
150 |
+
tokens = self.src_spm.encode(text, out_type=str)
|
151 |
+
return tags + tokens
|
152 |
+
|
153 |
+
def _tokenize_tgt_line(self, line: str) -> List[str]:
|
154 |
+
"""Tokenizes a target line."""
|
155 |
+
return self.tgt_spm.encode(line, out_type=str)
|
156 |
+
|
157 |
+
def tokenize(self, text: str, src: bool) -> List[str]:
|
158 |
+
"""Tokenizes a string into tokens using the source/target vocabulary."""
|
159 |
+
return self._tokenize_src_line(text) if src else self._tokenize_tgt_line(text)
|
160 |
+
|
161 |
+
def batch_tokenize(self, batch: List[str], src: bool) -> List[List[str]]:
|
162 |
+
"""Tokenizes a list of strings into tokens using the source/target vocabulary."""
|
163 |
+
return [self.tokenize(line, src) for line in batch]
|
164 |
+
|
165 |
+
def _create_attention_mask(self, ids: List[int], max_seq_len: int) -> List[int]:
|
166 |
+
"""Creates a attention mask for the input sequence."""
|
167 |
+
return ([0] * (max_seq_len - len(ids))) + ([1] * (len(ids) + 1))
|
168 |
+
|
169 |
+
def _pad_batch(self, tokens: List[str], max_seq_len: int) -> List[str]:
|
170 |
+
"""Pads a batch of tokens and adds BOS/EOS tokens."""
|
171 |
+
return (
|
172 |
+
([self.pad_token] * (max_seq_len - len(tokens))) + tokens + [self.eos_token]
|
173 |
+
)
|
174 |
+
|
175 |
+
def _decode_line(self, ids: List[int], src: bool) -> List[str]:
|
176 |
+
return [self._convert_id_to_token(_id, src) for _id in ids]
|
177 |
+
|
178 |
+
def _encode_line(self, tokens: List[str], src: bool) -> List[int]:
|
179 |
+
return [self._convert_token_to_id(token, src) for token in tokens]
|
180 |
+
|
181 |
+
def _strip_special_tokens(self, tokens: List[str]) -> List[str]:
|
182 |
+
return [token for token in tokens if not self.is_special_token(token)]
|
183 |
+
|
184 |
+
def _single_input_preprocessing(
|
185 |
+
self, tokens: List[str], src: bool, max_seq_len: int
|
186 |
+
) -> Tuple[List[int], List[int], int]:
|
187 |
+
"""Tokenizes a string into tokens and also converts them into integers using source/target vocabulary map."""
|
188 |
+
attention_mask = self._create_attention_mask(tokens, max_seq_len)
|
189 |
+
padded_tokens = self._pad_batch(tokens, max_seq_len)
|
190 |
+
input_ids = self._encode_line(padded_tokens, src)
|
191 |
+
return input_ids, attention_mask
|
192 |
+
|
193 |
+
def _single_output_postprocessing(self, ids: List[int], src: bool) -> str:
|
194 |
+
"""Detokenizes a list of integer ids into a string using the source/target vocabulary."""
|
195 |
+
tokens = self._decode_line(ids, src)
|
196 |
+
tokens = self._strip_special_tokens(tokens)
|
197 |
+
return self._convert_tokens_to_string(tokens, src)
|
198 |
+
|
199 |
+
def __call__(
|
200 |
+
self,
|
201 |
+
batch: Union[list, str],
|
202 |
+
src: bool,
|
203 |
+
truncation: bool = False,
|
204 |
+
padding: str = "longest",
|
205 |
+
max_length: int = None,
|
206 |
+
return_tensors: str = "pt",
|
207 |
+
return_attention_mask: bool = True,
|
208 |
+
return_length: bool = False,
|
209 |
+
) -> BatchEncoding:
|
210 |
+
"""Tokenizes a string into tokens and also converts them into integers using source/target vocabulary map."""
|
211 |
+
assert padding in [
|
212 |
+
"longest",
|
213 |
+
"max_length",
|
214 |
+
], "padding should be either 'longest' or 'max_length'"
|
215 |
+
|
216 |
+
if not isinstance(batch, list):
|
217 |
+
raise TypeError(
|
218 |
+
f"batch must be a list, but current batch is of type {type(batch)}"
|
219 |
+
)
|
220 |
+
|
221 |
+
# tokenize the source sentences
|
222 |
+
batch = self.batch_tokenize(batch, src)
|
223 |
+
|
224 |
+
# truncate the sentences if needed
|
225 |
+
if truncation and max_length is not None:
|
226 |
+
batch = [ids[:max_length] for ids in batch]
|
227 |
+
|
228 |
+
lengths = [len(ids) for ids in batch]
|
229 |
+
|
230 |
+
max_seq_len = max(lengths) if padding == "longest" else max_length
|
231 |
+
|
232 |
+
input_ids, attention_mask = zip(
|
233 |
+
*[
|
234 |
+
self._single_input_preprocessing(
|
235 |
+
tokens=tokens, src=src, max_seq_len=max_seq_len
|
236 |
+
)
|
237 |
+
for tokens in batch
|
238 |
+
]
|
239 |
+
)
|
240 |
+
|
241 |
+
_data = {"input_ids": input_ids}
|
242 |
+
|
243 |
+
if return_attention_mask:
|
244 |
+
_data["attention_mask"] = attention_mask
|
245 |
+
|
246 |
+
if return_length:
|
247 |
+
_data["lengths"] = lengths
|
248 |
+
|
249 |
+
return BatchEncoding(_data, tensor_type=return_tensors)
|
250 |
+
|
251 |
+
def batch_decode(
|
252 |
+
self, batch: Union[list, torch.Tensor], src: bool
|
253 |
+
) -> List[List[str]]:
|
254 |
+
"""Detokenizes a list of integer ids or a tensor into a list of strings using the source/target vocabulary."""
|
255 |
+
|
256 |
+
if isinstance(batch, torch.Tensor):
|
257 |
+
batch = batch.detach().cpu().tolist()
|
258 |
+
|
259 |
+
return [self._single_output_postprocessing(ids=ids, src=src) for ids in batch]
|
IndicTransTokenizer/utils.py
ADDED
@@ -0,0 +1,591 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import regex as re
|
2 |
+
from joblib import Parallel, delayed
|
3 |
+
from nltk.tokenize import sent_tokenize
|
4 |
+
from typing import List, Tuple, Union
|
5 |
+
|
6 |
+
from sacremoses import MosesPunctNormalizer
|
7 |
+
from indicnlp.normalize import indic_normalize
|
8 |
+
from sacremoses import MosesTokenizer, MosesDetokenizer
|
9 |
+
from indicnlp.transliterate import unicode_transliterate
|
10 |
+
from indicnlp.tokenize import indic_tokenize, indic_detokenize
|
11 |
+
from indicnlp.tokenize.sentence_tokenize import sentence_split, DELIM_PAT_NO_DANDA
|
12 |
+
|
13 |
+
en_tok = MosesTokenizer(lang="en")
|
14 |
+
en_normalizer = MosesPunctNormalizer()
|
15 |
+
en_detok = MosesDetokenizer(lang="en")
|
16 |
+
xliterator = unicode_transliterate.UnicodeIndicTransliterator()
|
17 |
+
|
18 |
+
|
19 |
+
flores_codes = {
|
20 |
+
"asm_Beng": "as",
|
21 |
+
"awa_Deva": "hi",
|
22 |
+
"ben_Beng": "bn",
|
23 |
+
"bho_Deva": "hi",
|
24 |
+
"brx_Deva": "hi",
|
25 |
+
"doi_Deva": "hi",
|
26 |
+
"eng_Latn": "en",
|
27 |
+
"gom_Deva": "kK",
|
28 |
+
"guj_Gujr": "gu",
|
29 |
+
"hin_Deva": "hi",
|
30 |
+
"hne_Deva": "hi",
|
31 |
+
"kan_Knda": "kn",
|
32 |
+
"kas_Arab": "ur",
|
33 |
+
"kas_Deva": "hi",
|
34 |
+
"kha_Latn": "en",
|
35 |
+
"lus_Latn": "en",
|
36 |
+
"mag_Deva": "hi",
|
37 |
+
"mai_Deva": "hi",
|
38 |
+
"mal_Mlym": "ml",
|
39 |
+
"mar_Deva": "mr",
|
40 |
+
"mni_Beng": "bn",
|
41 |
+
"mni_Mtei": "hi",
|
42 |
+
"npi_Deva": "ne",
|
43 |
+
"ory_Orya": "or",
|
44 |
+
"pan_Guru": "pa",
|
45 |
+
"san_Deva": "hi",
|
46 |
+
"sat_Olck": "or",
|
47 |
+
"snd_Arab": "ur",
|
48 |
+
"snd_Deva": "hi",
|
49 |
+
"tam_Taml": "ta",
|
50 |
+
"tel_Telu": "te",
|
51 |
+
"urd_Arab": "ur",
|
52 |
+
}
|
53 |
+
|
54 |
+
|
55 |
+
flores_to_iso = {
|
56 |
+
"asm_Beng": "as",
|
57 |
+
"awa_Deva": "awa",
|
58 |
+
"ben_Beng": "bn",
|
59 |
+
"bho_Deva": "bho",
|
60 |
+
"brx_Deva": "brx",
|
61 |
+
"doi_Deva": "doi",
|
62 |
+
"eng_Latn": "en",
|
63 |
+
"gom_Deva": "gom",
|
64 |
+
"guj_Gujr": "gu",
|
65 |
+
"hin_Deva": "hi",
|
66 |
+
"hne_Deva": "hne",
|
67 |
+
"kan_Knda": "kn",
|
68 |
+
"kas_Arab": "ksa",
|
69 |
+
"kas_Deva": "ksd",
|
70 |
+
"kha_Latn": "kha",
|
71 |
+
"lus_Latn": "lus",
|
72 |
+
"mag_Deva": "mag",
|
73 |
+
"mai_Deva": "mai",
|
74 |
+
"mal_Mlym": "ml",
|
75 |
+
"mar_Deva": "mr",
|
76 |
+
"mni_Beng": "mnib",
|
77 |
+
"mni_Mtei": "mnim",
|
78 |
+
"npi_Deva": "ne",
|
79 |
+
"ory_Orya": "or",
|
80 |
+
"pan_Guru": "pa",
|
81 |
+
"san_Deva": "sa",
|
82 |
+
"sat_Olck": "sat",
|
83 |
+
"snd_Arab": "sda",
|
84 |
+
"snd_Deva": "sdd",
|
85 |
+
"tam_Taml": "ta",
|
86 |
+
"tel_Telu": "te",
|
87 |
+
"urd_Arab": "ur",
|
88 |
+
}
|
89 |
+
|
90 |
+
|
91 |
+
INDIC_NUM_MAP = {
|
92 |
+
"\u09e6": "0",
|
93 |
+
"0": "0",
|
94 |
+
"\u0ae6": "0",
|
95 |
+
"\u0ce6": "0",
|
96 |
+
"\u0966": "0",
|
97 |
+
"\u0660": "0",
|
98 |
+
"\uabf0": "0",
|
99 |
+
"\u0b66": "0",
|
100 |
+
"\u0a66": "0",
|
101 |
+
"\u1c50": "0",
|
102 |
+
"\u06f0": "0",
|
103 |
+
"\u09e7": "1",
|
104 |
+
"1": "1",
|
105 |
+
"\u0ae7": "1",
|
106 |
+
"\u0967": "1",
|
107 |
+
"\u0ce7": "1",
|
108 |
+
"\u06f1": "1",
|
109 |
+
"\uabf1": "1",
|
110 |
+
"\u0b67": "1",
|
111 |
+
"\u0a67": "1",
|
112 |
+
"\u1c51": "1",
|
113 |
+
"\u0c67": "1",
|
114 |
+
"\u09e8": "2",
|
115 |
+
"2": "2",
|
116 |
+
"\u0ae8": "2",
|
117 |
+
"\u0968": "2",
|
118 |
+
"\u0ce8": "2",
|
119 |
+
"\u06f2": "2",
|
120 |
+
"\uabf2": "2",
|
121 |
+
"\u0b68": "2",
|
122 |
+
"\u0a68": "2",
|
123 |
+
"\u1c52": "2",
|
124 |
+
"\u0c68": "2",
|
125 |
+
"\u09e9": "3",
|
126 |
+
"3": "3",
|
127 |
+
"\u0ae9": "3",
|
128 |
+
"\u0969": "3",
|
129 |
+
"\u0ce9": "3",
|
130 |
+
"\u06f3": "3",
|
131 |
+
"\uabf3": "3",
|
132 |
+
"\u0b69": "3",
|
133 |
+
"\u0a69": "3",
|
134 |
+
"\u1c53": "3",
|
135 |
+
"\u0c69": "3",
|
136 |
+
"\u09ea": "4",
|
137 |
+
"4": "4",
|
138 |
+
"\u0aea": "4",
|
139 |
+
"\u096a": "4",
|
140 |
+
"\u0cea": "4",
|
141 |
+
"\u06f4": "4",
|
142 |
+
"\uabf4": "4",
|
143 |
+
"\u0b6a": "4",
|
144 |
+
"\u0a6a": "4",
|
145 |
+
"\u1c54": "4",
|
146 |
+
"\u0c6a": "4",
|
147 |
+
"\u09eb": "5",
|
148 |
+
"5": "5",
|
149 |
+
"\u0aeb": "5",
|
150 |
+
"\u096b": "5",
|
151 |
+
"\u0ceb": "5",
|
152 |
+
"\u06f5": "5",
|
153 |
+
"\uabf5": "5",
|
154 |
+
"\u0b6b": "5",
|
155 |
+
"\u0a6b": "5",
|
156 |
+
"\u1c55": "5",
|
157 |
+
"\u0c6b": "5",
|
158 |
+
"\u09ec": "6",
|
159 |
+
"6": "6",
|
160 |
+
"\u0aec": "6",
|
161 |
+
"\u096c": "6",
|
162 |
+
"\u0cec": "6",
|
163 |
+
"\u06f6": "6",
|
164 |
+
"\uabf6": "6",
|
165 |
+
"\u0b6c": "6",
|
166 |
+
"\u0a6c": "6",
|
167 |
+
"\u1c56": "6",
|
168 |
+
"\u0c6c": "6",
|
169 |
+
"\u09ed": "7",
|
170 |
+
"7": "7",
|
171 |
+
"\u0aed": "7",
|
172 |
+
"\u096d": "7",
|
173 |
+
"\u0ced": "7",
|
174 |
+
"\u06f7": "7",
|
175 |
+
"\uabf7": "7",
|
176 |
+
"\u0b6d": "7",
|
177 |
+
"\u0a6d": "7",
|
178 |
+
"\u1c57": "7",
|
179 |
+
"\u0c6d": "7",
|
180 |
+
"\u09ee": "8",
|
181 |
+
"8": "8",
|
182 |
+
"\u0aee": "8",
|
183 |
+
"\u096e": "8",
|
184 |
+
"\u0cee": "8",
|
185 |
+
"\u06f8": "8",
|
186 |
+
"\uabf8": "8",
|
187 |
+
"\u0b6e": "8",
|
188 |
+
"\u0a6e": "8",
|
189 |
+
"\u1c58": "8",
|
190 |
+
"\u0c6e": "8",
|
191 |
+
"\u09ef": "9",
|
192 |
+
"9": "9",
|
193 |
+
"\u0aef": "9",
|
194 |
+
"\u096f": "9",
|
195 |
+
"\u0cef": "9",
|
196 |
+
"\u06f9": "9",
|
197 |
+
"\uabf9": "9",
|
198 |
+
"\u0b6f": "9",
|
199 |
+
"\u0a6f": "9",
|
200 |
+
"\u1c59": "9",
|
201 |
+
"\u0c6f": "9",
|
202 |
+
}
|
203 |
+
|
204 |
+
|
205 |
+
multispace_regex = re.compile("[ ]{2,}")
|
206 |
+
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
|
207 |
+
digit_space_percent = re.compile(r"(\d) %")
|
208 |
+
double_quot_punc = re.compile(r"\"([,\.]+)")
|
209 |
+
digit_nbsp_digit = re.compile(r"(\d) (\d)")
|
210 |
+
|
211 |
+
|
212 |
+
def punc_norm(text, lang="en"):
|
213 |
+
text = (
|
214 |
+
text.replace("\r", "")
|
215 |
+
.replace("(", " (")
|
216 |
+
.replace(")", ") ")
|
217 |
+
.replace("( ", "(")
|
218 |
+
.replace(" )", ")")
|
219 |
+
.replace(" :", ":")
|
220 |
+
.replace(" ;", ";")
|
221 |
+
.replace("`", "'")
|
222 |
+
.replace("„", '"')
|
223 |
+
.replace("“", '"')
|
224 |
+
.replace("”", '"')
|
225 |
+
.replace("–", "-")
|
226 |
+
.replace("—", " - ")
|
227 |
+
.replace("´", "'")
|
228 |
+
.replace("‘", "'")
|
229 |
+
.replace("‚", "'")
|
230 |
+
.replace("’", "'")
|
231 |
+
.replace("''", '"')
|
232 |
+
.replace("´´", '"')
|
233 |
+
.replace("…", "...")
|
234 |
+
.replace(" « ", ' "')
|
235 |
+
.replace("« ", '"')
|
236 |
+
.replace("«", '"')
|
237 |
+
.replace(" » ", '" ')
|
238 |
+
.replace(" »", '"')
|
239 |
+
.replace("»", '"')
|
240 |
+
.replace(" %", "%")
|
241 |
+
.replace("nº ", "nº ")
|
242 |
+
.replace(" :", ":")
|
243 |
+
.replace(" ºC", " ºC")
|
244 |
+
.replace(" cm", " cm")
|
245 |
+
.replace(" ?", "?")
|
246 |
+
.replace(" !", "!")
|
247 |
+
.replace(" ;", ";")
|
248 |
+
.replace(", ", ", ")
|
249 |
+
)
|
250 |
+
|
251 |
+
text = multispace_regex.sub(" ", text)
|
252 |
+
text = end_bracket_space_punc_regex.sub(r")\1", text)
|
253 |
+
text = digit_space_percent.sub(r"\1%", text)
|
254 |
+
text = double_quot_punc.sub(
|
255 |
+
r'\1"', text
|
256 |
+
) # English "quotation," followed by comma, style
|
257 |
+
text = digit_nbsp_digit.sub(r"\1.\2", text) # What does it mean?
|
258 |
+
return text.strip(" ")
|
259 |
+
|
260 |
+
|
261 |
+
URL_PATTERN = r"\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b"
|
262 |
+
EMAIL_PATTERN = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"
|
263 |
+
# handles dates, time, percentages, proportion, ratio, etc
|
264 |
+
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
|
265 |
+
# handles upi, social media handles and hashtags
|
266 |
+
OTHER_PATTERN = r"[A-Za-z0-9]*[#|@]\w+"
|
267 |
+
|
268 |
+
|
269 |
+
def normalize_indic_numerals(line: str):
|
270 |
+
"""
|
271 |
+
Normalize the numerals in Indic languages from native script to Roman script (if present).
|
272 |
+
|
273 |
+
Args:
|
274 |
+
line (str): an input string with Indic numerals to be normalized.
|
275 |
+
|
276 |
+
Returns:
|
277 |
+
str: an input string with the all Indic numerals normalized to Roman script.
|
278 |
+
"""
|
279 |
+
return "".join([INDIC_NUM_MAP.get(c, c) for c in line])
|
280 |
+
|
281 |
+
|
282 |
+
def wrap_with_placeholders(text: str, patterns: list) -> Tuple[str, dict]:
|
283 |
+
"""
|
284 |
+
Wraps substrings with matched patterns in the given text with placeholders and returns
|
285 |
+
the modified text along with a mapping of the placeholders to their original value.
|
286 |
+
|
287 |
+
Args:
|
288 |
+
text (str): an input string which needs to be wrapped with the placeholders.
|
289 |
+
pattern (list): list of patterns to search for in the input string.
|
290 |
+
|
291 |
+
Returns:
|
292 |
+
Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
|
293 |
+
placeholders to their original values.
|
294 |
+
"""
|
295 |
+
serial_no = 1
|
296 |
+
|
297 |
+
placeholder_entity_map = dict()
|
298 |
+
|
299 |
+
for pattern in patterns:
|
300 |
+
matches = set(re.findall(pattern, text))
|
301 |
+
|
302 |
+
# wrap common match with placeholder tags
|
303 |
+
for match in matches:
|
304 |
+
if pattern == URL_PATTERN:
|
305 |
+
# Avoids false positive URL matches for names with initials.
|
306 |
+
temp = match.replace(".", "")
|
307 |
+
if len(temp) < 4:
|
308 |
+
continue
|
309 |
+
if pattern == NUMERAL_PATTERN:
|
310 |
+
# Short numeral patterns do not need placeholder based handling.
|
311 |
+
temp = match.replace(" ", "").replace(".", "").replace(":", "")
|
312 |
+
if len(temp) < 4:
|
313 |
+
continue
|
314 |
+
|
315 |
+
# Set of Translations of "ID" in all the suppported languages have been collated.
|
316 |
+
# This has been added to deal with edge cases where placeholders might get translated.
|
317 |
+
indic_failure_cases = [
|
318 |
+
"آی ڈی ",
|
319 |
+
"ꯑꯥꯏꯗꯤ",
|
320 |
+
"आईडी",
|
321 |
+
"आई . डी . ",
|
322 |
+
"आई . डी .",
|
323 |
+
"आई. डी. ",
|
324 |
+
"आई. डी.",
|
325 |
+
"ऐटि",
|
326 |
+
"آئی ڈی ",
|
327 |
+
"ᱟᱭᱰᱤ ᱾",
|
328 |
+
"आयडी",
|
329 |
+
"ऐडि",
|
330 |
+
"आइडि",
|
331 |
+
]
|
332 |
+
placeholder = "<ID{}>".format(serial_no)
|
333 |
+
alternate_placeholder = "< ID{} >".format(serial_no)
|
334 |
+
placeholder_entity_map[placeholder] = match
|
335 |
+
placeholder_entity_map[alternate_placeholder] = match
|
336 |
+
placeholder = "<ID{}]".format(serial_no)
|
337 |
+
alternate_placeholder = "< ID{} ]".format(serial_no)
|
338 |
+
placeholder_entity_map[placeholder] = match
|
339 |
+
placeholder_entity_map[alternate_placeholder] = match
|
340 |
+
|
341 |
+
for i in indic_failure_cases:
|
342 |
+
placeholder_temp = "<{}{}>".format(i, serial_no)
|
343 |
+
placeholder_entity_map[placeholder_temp] = match
|
344 |
+
placeholder_temp = "< {}{} >".format(i, serial_no)
|
345 |
+
placeholder_entity_map[placeholder_temp] = match
|
346 |
+
placeholder_temp = "< {} {} >".format(i, serial_no)
|
347 |
+
placeholder_entity_map[placeholder_temp] = match
|
348 |
+
placeholder_temp = "<{} {}]".format(i, serial_no)
|
349 |
+
placeholder_entity_map[placeholder_temp] = match
|
350 |
+
placeholder_temp = "< {} {} ]".format(i, serial_no)
|
351 |
+
placeholder_entity_map[placeholder_temp] = match
|
352 |
+
placeholder_temp = "[{} {}]".format(i, serial_no)
|
353 |
+
placeholder_entity_map[placeholder_temp] = match
|
354 |
+
placeholder_temp = "[ {} {} ]".format(i, serial_no)
|
355 |
+
placeholder_entity_map[placeholder_temp] = match
|
356 |
+
|
357 |
+
text = text.replace(match, placeholder)
|
358 |
+
serial_no += 1
|
359 |
+
|
360 |
+
text = re.sub("\s+", " ", text)
|
361 |
+
|
362 |
+
# Regex has failure cases in trailing "/" in URLs, so this is a workaround.
|
363 |
+
text = text.replace(">/", ">")
|
364 |
+
text = text.replace("]/", "]")
|
365 |
+
|
366 |
+
return text, placeholder_entity_map
|
367 |
+
|
368 |
+
|
369 |
+
def normalize(
|
370 |
+
text: str,
|
371 |
+
patterns: list = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN],
|
372 |
+
) -> Tuple[str, dict]:
|
373 |
+
"""
|
374 |
+
Normalizes and wraps the spans of input string with placeholder tags. It first normalizes
|
375 |
+
the Indic numerals in the input string to Roman script. Later, it uses the input string with normalized
|
376 |
+
Indic numerals to wrap the spans of text matching the pattern with placeholder tags.
|
377 |
+
|
378 |
+
Args:
|
379 |
+
text (str): input string.
|
380 |
+
pattern (list): list of patterns to search for in the input string.
|
381 |
+
|
382 |
+
Returns:
|
383 |
+
Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
|
384 |
+
placeholders to their original values.
|
385 |
+
"""
|
386 |
+
text = normalize_indic_numerals(text.strip("\n"))
|
387 |
+
text, placeholder_entity_map = wrap_with_placeholders(text, patterns)
|
388 |
+
return text, placeholder_entity_map
|
389 |
+
|
390 |
+
|
391 |
+
def split_sentences(paragraph: str, lang: str) -> List[str]:
|
392 |
+
"""
|
393 |
+
Splits the input text paragraph into sentences. It uses `moses` for English and
|
394 |
+
`indic-nlp` for Indic languages.
|
395 |
+
|
396 |
+
Args:
|
397 |
+
paragraph (str): input text paragraph.
|
398 |
+
lang (str): flores language code.
|
399 |
+
|
400 |
+
Returns:
|
401 |
+
List[str] -> list of sentences.
|
402 |
+
"""
|
403 |
+
# fails to handle sentence splitting in case of
|
404 |
+
# with MosesSentenceSplitter(lang) as splitter:
|
405 |
+
# return splitter([paragraph])
|
406 |
+
return (
|
407 |
+
sent_tokenize(paragraph)
|
408 |
+
if lang == "eng_Latn"
|
409 |
+
else sentence_split(
|
410 |
+
paragraph, lang=flores_codes[lang], delim_pat=DELIM_PAT_NO_DANDA
|
411 |
+
)
|
412 |
+
)
|
413 |
+
|
414 |
+
|
415 |
+
def apply_lang_tags(sents: List[str], src_lang: str, tgt_lang: str) -> List[str]:
|
416 |
+
"""
|
417 |
+
Add special tokens indicating source and target language to the start of the each input sentence.
|
418 |
+
Each resulting input sentence will have the format: "`{src_lang} {tgt_lang} {input_sentence}`".
|
419 |
+
|
420 |
+
Args:
|
421 |
+
sent (str): input sentence to be translated.
|
422 |
+
src_lang (str): flores lang code of the input sentence.
|
423 |
+
tgt_lang (str): flores lang code in which the input sentence will be translated.
|
424 |
+
|
425 |
+
Returns:
|
426 |
+
List[str]: list of input sentences with the special tokens added to the start.
|
427 |
+
"""
|
428 |
+
return Parallel(n_jobs=-1)(
|
429 |
+
delayed(lambda x: f"{src_lang} {tgt_lang} {x.strip()}")(sent) for sent in sents
|
430 |
+
)
|
431 |
+
|
432 |
+
|
433 |
+
def preprocess_sent(
|
434 |
+
sent: str,
|
435 |
+
normalizer: Union[MosesPunctNormalizer, indic_normalize.IndicNormalizerFactory],
|
436 |
+
lang: str,
|
437 |
+
) -> str:
|
438 |
+
"""
|
439 |
+
Preprocess an input text sentence by normalizing, tokenization, and possibly transliterating it.
|
440 |
+
|
441 |
+
Args:
|
442 |
+
sent (str): input text sentence to preprocess.
|
443 |
+
normalizer (Union[MosesPunctNormalizer, indic_normalize.IndicNormalizerFactory]): an object that performs normalization on the text.
|
444 |
+
lang (str): flores language code of the input text sentence.
|
445 |
+
|
446 |
+
Returns:
|
447 |
+
Tuple[str, dict]: a tuple of preprocessed input text sentence and also a corresponding dictionary
|
448 |
+
mapping placeholders to their original values.
|
449 |
+
"""
|
450 |
+
iso_lang = flores_codes[lang]
|
451 |
+
sent = punc_norm(sent, iso_lang)
|
452 |
+
sent, placeholder_entity_map = normalize(sent)
|
453 |
+
|
454 |
+
transliterate = True
|
455 |
+
if lang.split("_")[1] in ["Arab", "Aran", "Olck", "Mtei", "Latn"]:
|
456 |
+
transliterate = False
|
457 |
+
|
458 |
+
if iso_lang == "en":
|
459 |
+
processed_sent = " ".join(
|
460 |
+
en_tok.tokenize(en_normalizer.normalize(sent.strip()), escape=False)
|
461 |
+
)
|
462 |
+
elif transliterate:
|
463 |
+
# transliterates from the any specific language to devanagari
|
464 |
+
# which is why we specify lang2_code as "hi".
|
465 |
+
processed_sent = xliterator.transliterate(
|
466 |
+
" ".join(
|
467 |
+
indic_tokenize.trivial_tokenize(
|
468 |
+
normalizer.normalize(sent.strip()), iso_lang
|
469 |
+
)
|
470 |
+
),
|
471 |
+
iso_lang,
|
472 |
+
"hi",
|
473 |
+
).replace(" ् ", "्")
|
474 |
+
else:
|
475 |
+
# we only need to transliterate for joint training
|
476 |
+
processed_sent = " ".join(
|
477 |
+
indic_tokenize.trivial_tokenize(
|
478 |
+
normalizer.normalize(sent.strip()), iso_lang
|
479 |
+
)
|
480 |
+
)
|
481 |
+
|
482 |
+
return processed_sent, placeholder_entity_map
|
483 |
+
|
484 |
+
|
485 |
+
def preprocess(sents: List[str], lang: str):
|
486 |
+
"""
|
487 |
+
Preprocess an array of sentences by normalizing, tokenization, and possibly transliterating it.
|
488 |
+
|
489 |
+
Args:
|
490 |
+
batch (List[str]): input list of sentences to preprocess.
|
491 |
+
lang (str): flores language code of the input text sentences.
|
492 |
+
|
493 |
+
Returns:
|
494 |
+
Tuple[List[str], List[dict]]: a tuple of list of preprocessed input text sentences and also a corresponding list of dictionary
|
495 |
+
mapping placeholders to their original values.
|
496 |
+
"""
|
497 |
+
|
498 |
+
normalizer = (
|
499 |
+
indic_normalize.IndicNormalizerFactory().get_normalizer(flores_codes[lang])
|
500 |
+
if lang != "eng_Latn"
|
501 |
+
else None
|
502 |
+
)
|
503 |
+
|
504 |
+
processed_sents, placeholder_entity_map_sents = zip(
|
505 |
+
*[preprocess_sent(sent, normalizer, lang) for sent in sents]
|
506 |
+
)
|
507 |
+
|
508 |
+
return processed_sents, placeholder_entity_map_sents
|
509 |
+
|
510 |
+
|
511 |
+
def preprocess_batch(batch: List[str], src_lang: str, tgt_lang: str) -> List[str]:
|
512 |
+
"""
|
513 |
+
Preprocess an array of sentences by normalizing, tokenization, and possibly transliterating it. It also tokenizes the
|
514 |
+
normalized text sequences using sentence piece tokenizer and also adds language tags.
|
515 |
+
|
516 |
+
Args:
|
517 |
+
batch (List[str]): input list of sentences to preprocess.
|
518 |
+
src_lang (str): flores language code of the input text sentences.
|
519 |
+
tgt_lang (str): flores language code of the output text sentences.
|
520 |
+
|
521 |
+
Returns:
|
522 |
+
Tuple[List[str], List[dict]]: a tuple of list of preprocessed input text sentences and also a corresponding list of dictionary
|
523 |
+
mapping placeholders to their original values.
|
524 |
+
"""
|
525 |
+
preprocessed_sents, placeholder_entity_map_sents = preprocess(batch, lang=src_lang)
|
526 |
+
tagged_sents = apply_lang_tags(preprocessed_sents, src_lang, tgt_lang)
|
527 |
+
return tagged_sents, placeholder_entity_map_sents
|
528 |
+
|
529 |
+
|
530 |
+
def postprocess_batch(
|
531 |
+
sents: List[str],
|
532 |
+
placeholder_entity_map: List[dict],
|
533 |
+
lang: str,
|
534 |
+
common_lang: str = "hin_Deva",
|
535 |
+
) -> List[str]:
|
536 |
+
"""
|
537 |
+
Postprocesses a batch of input sentences after the translation generations.
|
538 |
+
|
539 |
+
Args:
|
540 |
+
sents (List[str]): batch of translated sentences to postprocess.
|
541 |
+
placeholder_entity_map (List[dict]): dictionary mapping placeholders to the original entity values.
|
542 |
+
lang (str): flores language code of the input sentences.
|
543 |
+
common_lang (str, optional): flores language code of the transliterated language (defaults: hin_Deva).
|
544 |
+
|
545 |
+
Returns:
|
546 |
+
List[str]: postprocessed batch of input sentences.
|
547 |
+
"""
|
548 |
+
|
549 |
+
lang_code, script_code = lang.split("_")
|
550 |
+
|
551 |
+
for i in range(len(sents)):
|
552 |
+
sents[i] = sents[i].replace(" ", "").replace("▁", " ").strip()
|
553 |
+
|
554 |
+
# Fixes for Perso-Arabic scripts
|
555 |
+
# TODO: Move these normalizations inside indic-nlp-library
|
556 |
+
if script_code in {"Arab", "Aran"}:
|
557 |
+
# UrduHack adds space before punctuations. Since the model was trained without fixing this issue, let's fix it now
|
558 |
+
sents[i] = sents[i].replace(" ؟", "؟").replace(" ۔", "۔").replace(" ،", "،")
|
559 |
+
# Kashmiri bugfix for palatalization: https://github.com/AI4Bharat/IndicTrans2/issues/11
|
560 |
+
sents[i] = sents[i].replace("ٮ۪", "ؠ")
|
561 |
+
|
562 |
+
# Oriya bug: indic-nlp-library produces ଯ଼ instead of ୟ when converting from Devanagari to Odia
|
563 |
+
# TODO: Find out what's the issue with unicode transliterator for Oriya and fix it
|
564 |
+
if lang_code == "or":
|
565 |
+
sents[i] = sents[i].replace("ଯ଼", "ୟ")
|
566 |
+
|
567 |
+
assert len(sents) == len(placeholder_entity_map)
|
568 |
+
|
569 |
+
# Replace the placeholders entity
|
570 |
+
for i in range(0, len(sents)):
|
571 |
+
for key in placeholder_entity_map[i].keys():
|
572 |
+
sents[i] = sents[i].replace(key, placeholder_entity_map[i][key])
|
573 |
+
|
574 |
+
# Detokenize and transliterate to native scripts if applicable
|
575 |
+
|
576 |
+
if lang == "eng_Latn":
|
577 |
+
postprocessed_sents = [en_detok.detokenize(sent.split(" ")) for sent in sents]
|
578 |
+
else:
|
579 |
+
postprocessed_sents = [
|
580 |
+
indic_detokenize.trivial_detokenize(
|
581 |
+
xliterator.transliterate(
|
582 |
+
s, flores_codes[common_lang], flores_codes[lang]
|
583 |
+
),
|
584 |
+
flores_codes[lang],
|
585 |
+
)
|
586 |
+
for s in sents
|
587 |
+
]
|
588 |
+
|
589 |
+
assert len(postprocessed_sents) == len(placeholder_entity_map)
|
590 |
+
|
591 |
+
return postprocessed_sents
|
README.md
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# IndicTrans2 HF Compatible Models
|
2 |
+
|
3 |
+
In this section, we provide details on how to use our [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) models which were originally trained with the [fairseq](https://github.com/facebookresearch/fairseq) to [HuggingFace transformers](https://huggingface.co/docs/transformers/index) for inference purpose. Our scripts for HuggingFace compatible models are adapted from [M2M100 repository](https://github.com/huggingface/transformers/tree/main/src/transformers/models/m2m_100).
|
4 |
+
|
5 |
+
|
6 |
+
### Setup
|
7 |
+
|
8 |
+
To get started, follow these steps to set up the environment:
|
9 |
+
|
10 |
+
```
|
11 |
+
# Clone the github repository and navigate to the project directory.
|
12 |
+
git clone https://github.com/AI4Bharat/IndicTrans2
|
13 |
+
cd IndicTrans2
|
14 |
+
|
15 |
+
# Install all the dependencies and requirements associated with the project for running HF compatible models.
|
16 |
+
source install.sh
|
17 |
+
```
|
18 |
+
|
19 |
+
> Note: The `install.sh` script in this directory is specifically for running HF compatible models for inference.
|
20 |
+
|
21 |
+
|
22 |
+
### Converting
|
23 |
+
|
24 |
+
In order to convert the fairseq checkpoint to a PyTorch checkpoint that is compatible with HuggingFace Transformers, use the following command:
|
25 |
+
|
26 |
+
```bash
|
27 |
+
python3 convert_indictrans_checkpoint_to_pytorch.py --fairseq_path <fairseq_checkpoint_best.pt> --pytorch_dump_folder_path <hf_output_dir>
|
28 |
+
```
|
29 |
+
- `<fairseq_checkpoint_best.pt>`: path to the fairseq `checkpoint_best.pt` that needs to be converted to HF compatible models
|
30 |
+
- `<hf_output_dir>`: path to the output directory where the HF compatible models will be saved
|
31 |
+
|
32 |
+
|
33 |
+
### Models
|
34 |
+
|
35 |
+
| Model | 🤗 HuggingFace Checkpoints |
|
36 |
+
|----------|-----------------------------------|
|
37 |
+
| Preprint En-Indic | [ai4bharat/indictrans2-en-indic-1B](https://huggingface.co/ai4bharat/indictrans2-en-indic-1B) |
|
38 |
+
| Preprint Indic-En | [ai4bharat/indictrans2-indic-en-1B](https://huggingface.co/ai4bharat/indictrans2-indic-en-1B) |
|
39 |
+
|
40 |
+
|
41 |
+
### Inference
|
42 |
+
|
43 |
+
With the conversion complete, you can now perform inference using the HuggingFace Transformers.
|
44 |
+
|
45 |
+
You can start with the provided `example.py` script and customize it for your specific translation use case:
|
46 |
+
|
47 |
+
```bash
|
48 |
+
python3 example.py
|
49 |
+
```
|
50 |
+
|
51 |
+
Feel free to modify the `example.py` script to suit your translation needs.
|
52 |
+
|
53 |
+
### Citation
|
54 |
+
|
55 |
+
```
|
56 |
+
@article{ai4bharat2023indictrans2,
|
57 |
+
title = {IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
|
58 |
+
author = {AI4Bharat and Jay Gala and Pranjal A. Chitale and Raghavan AK and Sumanth Doddapaneni and Varun Gumma and Aswanth Kumar and Janki Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M. Khapra and Raj Dabre and Anoop Kunchukuttan},
|
59 |
+
year = {2023},
|
60 |
+
journal = {arXiv preprint arXiv: 2305.16307}
|
61 |
+
}
|
62 |
+
```
|
configuration_indictrans.py
ADDED
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The IndicTrans2 Authors and AI4Bharat team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
""" PyTorch IndicTrans config."""
|
16 |
+
|
17 |
+
|
18 |
+
from collections import OrderedDict
|
19 |
+
from typing import Any, Mapping, Optional
|
20 |
+
|
21 |
+
from transformers import PreTrainedTokenizer
|
22 |
+
from transformers.configuration_utils import PretrainedConfig
|
23 |
+
from transformers.onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
|
24 |
+
from transformers.onnx.utils import compute_effective_axis_dimension
|
25 |
+
from transformers.utils import TensorType, is_torch_available
|
26 |
+
|
27 |
+
|
28 |
+
# Copied from transformers.models.m2m_100.configuration_m2m_100.M2M100Config->IndicTrans
|
29 |
+
class IndicTransConfig(PretrainedConfig):
|
30 |
+
r"""
|
31 |
+
This is the configuration class to store the configuration of a [`IT2Model`]. It is used to instantiate an
|
32 |
+
IT2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
33 |
+
with the defaults will yield a similar configuration to that of the IT2
|
34 |
+
|
35 |
+
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
36 |
+
documentation from [`PretrainedConfig`] for more information.
|
37 |
+
|
38 |
+
|
39 |
+
Args:
|
40 |
+
vocab_size (`int`, *optional*, defaults to 50265):
|
41 |
+
Vocabulary size of the IT2 model. Defines the number of different tokens that can be represented by the
|
42 |
+
`inputs_ids` passed when calling [`IT2Model`] or
|
43 |
+
d_model (`int`, *optional*, defaults to 1024):
|
44 |
+
Dimensionality of the layers and the pooler layer.
|
45 |
+
encoder_layers (`int`, *optional*, defaults to 12):
|
46 |
+
Number of encoder layers.
|
47 |
+
decoder_layers (`int`, *optional*, defaults to 12):
|
48 |
+
Number of decoder layers.
|
49 |
+
encoder_attention_heads (`int`, *optional*, defaults to 16):
|
50 |
+
Number of attention heads for each attention layer in the Transformer encoder.
|
51 |
+
decoder_attention_heads (`int`, *optional*, defaults to 16):
|
52 |
+
Number of attention heads for each attention layer in the Transformer decoder.
|
53 |
+
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
|
54 |
+
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
55 |
+
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
|
56 |
+
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
|
57 |
+
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
|
58 |
+
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
59 |
+
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
60 |
+
dropout (`float`, *optional*, defaults to 0.1):
|
61 |
+
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
62 |
+
attention_dropout (`float`, *optional*, defaults to 0.0):
|
63 |
+
The dropout ratio for the attention probabilities.
|
64 |
+
activation_dropout (`float`, *optional*, defaults to 0.0):
|
65 |
+
The dropout ratio for activations inside the fully connected layer.
|
66 |
+
classifier_dropout (`float`, *optional*, defaults to 0.0):
|
67 |
+
The dropout ratio for classifier.
|
68 |
+
max_position_embeddings (`int`, *optional*, defaults to 1024):
|
69 |
+
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
70 |
+
just in case (e.g., 512 or 1024 or 2048).
|
71 |
+
init_std (`float`, *optional*, defaults to 0.02):
|
72 |
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
73 |
+
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
|
74 |
+
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
|
75 |
+
for more details.
|
76 |
+
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
|
77 |
+
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
|
78 |
+
for more details.
|
79 |
+
use_cache (`bool`, *optional*, defaults to `True`):
|
80 |
+
Whether or not the model should return the last key/values attentions (not used by all models).
|
81 |
+
```"""
|
82 |
+
model_type = "IndicTrans"
|
83 |
+
keys_to_ignore_at_inference = ["past_key_values"]
|
84 |
+
attribute_map = {
|
85 |
+
"num_attention_heads": "encoder_attention_heads",
|
86 |
+
"hidden_size": "d_model",
|
87 |
+
}
|
88 |
+
|
89 |
+
def __init__(
|
90 |
+
self,
|
91 |
+
encoder_vocab_size=None,
|
92 |
+
decoder_vocab_size=None,
|
93 |
+
encoder_embed_dim=512,
|
94 |
+
decoder_embed_dim=512,
|
95 |
+
max_source_positions=210,
|
96 |
+
max_target_positions=210,
|
97 |
+
encoder_layers=6,
|
98 |
+
encoder_ffn_dim=2048,
|
99 |
+
encoder_attention_heads=8,
|
100 |
+
decoder_layers=6,
|
101 |
+
decoder_ffn_dim=2048,
|
102 |
+
decoder_attention_heads=8,
|
103 |
+
encoder_layerdrop=0.00,
|
104 |
+
decoder_layerdrop=0.00,
|
105 |
+
use_cache=True,
|
106 |
+
is_encoder_decoder=True,
|
107 |
+
activation_function="relu",
|
108 |
+
encoder_normalize_before=False,
|
109 |
+
decoder_normalize_before=False,
|
110 |
+
layernorm_embedding=False,
|
111 |
+
share_decoder_input_output_embed=False,
|
112 |
+
dropout=0.1,
|
113 |
+
attention_dropout=0.0,
|
114 |
+
activation_dropout=0.0,
|
115 |
+
init_std=0.02,
|
116 |
+
scale_embedding=True,
|
117 |
+
decoder_start_token_id=2,
|
118 |
+
pad_token_id=1,
|
119 |
+
bos_token_id=0,
|
120 |
+
eos_token_id=2,
|
121 |
+
**kwargs,
|
122 |
+
):
|
123 |
+
self.encoder_vocab_size = encoder_vocab_size
|
124 |
+
self.decoder_vocab_size = decoder_vocab_size
|
125 |
+
self.encoder_normalize_before = encoder_normalize_before
|
126 |
+
self.decoder_normalize_before = decoder_normalize_before
|
127 |
+
self.layernorm_embedding = layernorm_embedding
|
128 |
+
self.max_source_positions = max_source_positions
|
129 |
+
self.max_target_positions = max_target_positions
|
130 |
+
self.encoder_embed_dim = encoder_embed_dim
|
131 |
+
self.decoder_embed_dim = decoder_embed_dim
|
132 |
+
self.encoder_ffn_dim = encoder_ffn_dim
|
133 |
+
self.encoder_layers = encoder_layers
|
134 |
+
self.encoder_attention_heads = encoder_attention_heads
|
135 |
+
self.decoder_ffn_dim = decoder_ffn_dim
|
136 |
+
self.decoder_layers = decoder_layers
|
137 |
+
self.decoder_attention_heads = decoder_attention_heads
|
138 |
+
self.dropout = dropout
|
139 |
+
self.attention_dropout = attention_dropout
|
140 |
+
self.activation_dropout = activation_dropout
|
141 |
+
self.activation_function = activation_function
|
142 |
+
self.init_std = init_std
|
143 |
+
self.encoder_layerdrop = encoder_layerdrop
|
144 |
+
self.decoder_layerdrop = decoder_layerdrop
|
145 |
+
self.use_cache = use_cache
|
146 |
+
self.num_hidden_layers = encoder_layers
|
147 |
+
self.scale_embedding = scale_embedding
|
148 |
+
self.share_decoder_input_output_embed = share_decoder_input_output_embed
|
149 |
+
|
150 |
+
super().__init__(
|
151 |
+
pad_token_id=pad_token_id,
|
152 |
+
bos_token_id=bos_token_id,
|
153 |
+
eos_token_id=eos_token_id,
|
154 |
+
is_encoder_decoder=is_encoder_decoder,
|
155 |
+
decoder_start_token_id=decoder_start_token_id,
|
156 |
+
**kwargs,
|
157 |
+
)
|
158 |
+
|
159 |
+
|
160 |
+
class IndicTransOnnxConfig(OnnxSeq2SeqConfigWithPast):
|
161 |
+
@property
|
162 |
+
def inputs(self) -> Mapping[str, Mapping[int, str]]:
|
163 |
+
common_inputs = OrderedDict(
|
164 |
+
[
|
165 |
+
("input_ids", {0: "batch", 1: "encoder_sequence"}),
|
166 |
+
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
|
167 |
+
]
|
168 |
+
)
|
169 |
+
|
170 |
+
if self.use_past:
|
171 |
+
common_inputs["decoder_input_ids"] = {0: "batch"}
|
172 |
+
common_inputs["decoder_attention_mask"] = {
|
173 |
+
0: "batch",
|
174 |
+
1: "past_decoder_sequence + sequence",
|
175 |
+
}
|
176 |
+
else:
|
177 |
+
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
|
178 |
+
common_inputs["decoder_attention_mask"] = {
|
179 |
+
0: "batch",
|
180 |
+
1: "decoder_sequence",
|
181 |
+
}
|
182 |
+
|
183 |
+
if self.use_past:
|
184 |
+
self.fill_with_past_key_values_(common_inputs, direction="inputs")
|
185 |
+
return common_inputs
|
186 |
+
|
187 |
+
# Copied from BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering
|
188 |
+
# A better name would be _generate_dummy_inputs_for_encoder_and_decoder because sequence classification and question
|
189 |
+
# answering are not supported for IT2, but this name is preserved to be able to check that the copy matches what
|
190 |
+
# was done for BART so that it can be updated if need be.
|
191 |
+
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
192 |
+
self,
|
193 |
+
tokenizer: PreTrainedTokenizer,
|
194 |
+
batch_size: int = -1,
|
195 |
+
seq_length: int = -1,
|
196 |
+
is_pair: bool = False,
|
197 |
+
framework: Optional[TensorType] = None,
|
198 |
+
) -> Mapping[str, Any]:
|
199 |
+
# Copied from OnnxConfig.generate_dummy_inputs
|
200 |
+
# Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
|
201 |
+
# If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
|
202 |
+
batch_size = compute_effective_axis_dimension(
|
203 |
+
batch_size,
|
204 |
+
fixed_dimension=OnnxConfig.default_fixed_batch,
|
205 |
+
num_token_to_add=0,
|
206 |
+
)
|
207 |
+
|
208 |
+
# If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
|
209 |
+
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
|
210 |
+
seq_length = compute_effective_axis_dimension(
|
211 |
+
seq_length,
|
212 |
+
fixed_dimension=OnnxConfig.default_fixed_sequence,
|
213 |
+
num_token_to_add=token_to_add,
|
214 |
+
)
|
215 |
+
|
216 |
+
# Generate dummy inputs according to compute batch and sequence
|
217 |
+
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
|
218 |
+
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
|
219 |
+
return common_inputs
|
220 |
+
|
221 |
+
# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_default_and_seq2seq_lm
|
222 |
+
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
|
223 |
+
self,
|
224 |
+
tokenizer: PreTrainedTokenizer,
|
225 |
+
batch_size: int = -1,
|
226 |
+
seq_length: int = -1,
|
227 |
+
is_pair: bool = False,
|
228 |
+
framework: Optional[TensorType] = None,
|
229 |
+
) -> Mapping[str, Any]:
|
230 |
+
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
231 |
+
tokenizer, batch_size, seq_length, is_pair, framework
|
232 |
+
)
|
233 |
+
|
234 |
+
# Generate decoder inputs
|
235 |
+
decoder_seq_length = seq_length if not self.use_past else 1
|
236 |
+
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
|
237 |
+
tokenizer, batch_size, decoder_seq_length, is_pair, framework
|
238 |
+
)
|
239 |
+
decoder_inputs = {
|
240 |
+
f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()
|
241 |
+
}
|
242 |
+
common_inputs = dict(**encoder_inputs, **decoder_inputs)
|
243 |
+
|
244 |
+
if self.use_past:
|
245 |
+
if not is_torch_available():
|
246 |
+
raise ValueError(
|
247 |
+
"Cannot generate dummy past_keys inputs without PyTorch installed."
|
248 |
+
)
|
249 |
+
else:
|
250 |
+
import torch
|
251 |
+
batch, encoder_seq_length = common_inputs["input_ids"].shape
|
252 |
+
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
|
253 |
+
(
|
254 |
+
num_encoder_attention_heads,
|
255 |
+
num_decoder_attention_heads,
|
256 |
+
) = self.num_attention_heads
|
257 |
+
encoder_shape = (
|
258 |
+
batch,
|
259 |
+
num_encoder_attention_heads,
|
260 |
+
encoder_seq_length,
|
261 |
+
self._config.hidden_size // num_encoder_attention_heads,
|
262 |
+
)
|
263 |
+
decoder_past_length = decoder_seq_length + 3
|
264 |
+
decoder_shape = (
|
265 |
+
batch,
|
266 |
+
num_decoder_attention_heads,
|
267 |
+
decoder_past_length,
|
268 |
+
self._config.hidden_size // num_decoder_attention_heads,
|
269 |
+
)
|
270 |
+
|
271 |
+
common_inputs["decoder_attention_mask"] = torch.cat(
|
272 |
+
[
|
273 |
+
common_inputs["decoder_attention_mask"],
|
274 |
+
torch.ones(batch, decoder_past_length),
|
275 |
+
],
|
276 |
+
dim=1,
|
277 |
+
)
|
278 |
+
|
279 |
+
common_inputs["past_key_values"] = []
|
280 |
+
# If the number of encoder and decoder layers are present in the model configuration, both are considered
|
281 |
+
num_encoder_layers, num_decoder_layers = self.num_layers
|
282 |
+
min_num_layers = min(num_encoder_layers, num_decoder_layers)
|
283 |
+
max_num_layers = (
|
284 |
+
max(num_encoder_layers, num_decoder_layers) - min_num_layers
|
285 |
+
)
|
286 |
+
remaining_side_name = (
|
287 |
+
"encoder" if num_encoder_layers > num_decoder_layers else "decoder"
|
288 |
+
)
|
289 |
+
|
290 |
+
for _ in range(min_num_layers):
|
291 |
+
common_inputs["past_key_values"].append(
|
292 |
+
(
|
293 |
+
torch.zeros(decoder_shape),
|
294 |
+
torch.zeros(decoder_shape),
|
295 |
+
torch.zeros(encoder_shape),
|
296 |
+
torch.zeros(encoder_shape),
|
297 |
+
)
|
298 |
+
)
|
299 |
+
# TODO: test this.
|
300 |
+
shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
|
301 |
+
for _ in range(min_num_layers, max_num_layers):
|
302 |
+
common_inputs["past_key_values"].append(
|
303 |
+
(torch.zeros(shape), torch.zeros(shape))
|
304 |
+
)
|
305 |
+
return common_inputs
|
306 |
+
|
307 |
+
generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm
|
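For reference, a minimal sketch (not part of the repository) of the per-layer cache entries that _generate_dummy_inputs_for_default_and_seq2seq_lm fills with zeros above; the tensor sizes and layer count here are illustrative assumptions, not the model's actual configuration.

import torch

batch, heads, hidden_size = 2, 16, 1024      # illustrative sizes, not the real config
seq_len = 8
head_dim = hidden_size // heads
decoder_past_length = seq_len + 3            # mirrors the "+ 3" used above

encoder_shape = (batch, heads, seq_len, head_dim)
decoder_shape = (batch, heads, decoder_past_length, head_dim)

# one tuple per shared layer: (self-attn K, self-attn V, cross-attn K, cross-attn V)
past_key_values = [
    (
        torch.zeros(decoder_shape),
        torch.zeros(decoder_shape),
        torch.zeros(encoder_shape),
        torch.zeros(encoder_shape),
    )
    for _ in range(6)                        # assumed layer count, purely illustrative
]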
convert_indictrans_checkpoint_to_pytorch.py
ADDED
@@ -0,0 +1,107 @@
1 |
+
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import argparse
|
16 |
+
|
17 |
+
import torch
|
18 |
+
import torch.nn as nn
|
19 |
+
|
20 |
+
from configuration_indictrans import IndicTransConfig
|
21 |
+
from modeling_indictrans import IndicTransForConditionalGeneration
|
22 |
+
|
23 |
+
|
24 |
+
def remove_ignore_keys_(state_dict):
|
25 |
+
ignore_keys = [
|
26 |
+
"encoder.version",
|
27 |
+
"decoder.version",
|
28 |
+
"model.encoder.version",
|
29 |
+
"model.decoder.version",
|
30 |
+
"_float_tensor",
|
31 |
+
"encoder.embed_positions._float_tensor",
|
32 |
+
"decoder.embed_positions._float_tensor",
|
33 |
+
]
|
34 |
+
for k in ignore_keys:
|
35 |
+
state_dict.pop(k, None)
|
36 |
+
|
37 |
+
|
38 |
+
def make_linear_from_emb(emb):
|
39 |
+
vocab_size, emb_size = emb.shape
|
40 |
+
lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
|
41 |
+
lin_layer.weight.data = emb.data
|
42 |
+
return lin_layer
|
43 |
+
|
44 |
+
|
45 |
+
def convert_fairseq_IT2_checkpoint_from_disk(checkpoint_path):
|
46 |
+
model = torch.load(checkpoint_path, map_location="cpu")
|
47 |
+
args = model["args"] or model["cfg"]["model"]
|
48 |
+
state_dict = model["model"]
|
49 |
+
remove_ignore_keys_(state_dict)
|
50 |
+
encoder_vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
|
51 |
+
decoder_vocab_size = state_dict["decoder.embed_tokens.weight"].shape[0]
|
52 |
+
|
53 |
+
config = IndicTransConfig(
|
54 |
+
encoder_vocab_size=encoder_vocab_size,
|
55 |
+
decoder_vocab_size=decoder_vocab_size,
|
56 |
+
max_source_positions=args.max_source_positions,
|
57 |
+
max_target_positions=args.max_target_positions,
|
58 |
+
encoder_layers=args.encoder_layers,
|
59 |
+
decoder_layers=args.decoder_layers,
|
60 |
+
layernorm_embedding=args.layernorm_embedding,
|
61 |
+
encoder_normalize_before=args.encoder_normalize_before,
|
62 |
+
decoder_normalize_before=args.decoder_normalize_before,
|
63 |
+
encoder_attention_heads=args.encoder_attention_heads,
|
64 |
+
decoder_attention_heads=args.decoder_attention_heads,
|
65 |
+
encoder_ffn_dim=args.encoder_ffn_embed_dim,
|
66 |
+
decoder_ffn_dim=args.decoder_ffn_embed_dim,
|
67 |
+
encoder_embed_dim=args.encoder_embed_dim,
|
68 |
+
decoder_embed_dim=args.decoder_embed_dim,
|
69 |
+
encoder_layerdrop=args.encoder_layerdrop,
|
70 |
+
decoder_layerdrop=args.decoder_layerdrop,
|
71 |
+
dropout=args.dropout,
|
72 |
+
attention_dropout=args.attention_dropout,
|
73 |
+
activation_dropout=args.activation_dropout,
|
74 |
+
activation_function=args.activation_fn,
|
75 |
+
share_decoder_input_output_embed=args.share_decoder_input_output_embed,
|
76 |
+
scale_embedding=not args.no_scale_embedding,
|
77 |
+
)
|
78 |
+
|
79 |
+
model = IndicTransForConditionalGeneration(config)
|
80 |
+
model.model.load_state_dict(state_dict, strict=False)
|
81 |
+
if not args.share_decoder_input_output_embed:
|
82 |
+
model.lm_head = make_linear_from_emb(
|
83 |
+
state_dict["decoder.output_projection.weight"]
|
84 |
+
)
|
85 |
+
print(model)
|
86 |
+
return model
|
87 |
+
|
88 |
+
|
89 |
+
if __name__ == "__main__":
|
90 |
+
parser = argparse.ArgumentParser()
|
91 |
+
# Required parameters
|
92 |
+
parser.add_argument(
|
93 |
+
"--fairseq_path",
|
94 |
+
default="indic-en/model/checkpoint_best.pt",
|
95 |
+
type=str,
|
96 |
+
help="path to a model.pt on local filesystem.",
|
97 |
+
)
|
98 |
+
parser.add_argument(
|
99 |
+
"--pytorch_dump_folder_path",
|
100 |
+
default="indic-en/hf_model",
|
101 |
+
type=str,
|
102 |
+
help="Path to the output PyTorch model.",
|
103 |
+
)
|
104 |
+
|
105 |
+
args = parser.parse_args()
|
106 |
+
model = convert_fairseq_IT2_checkpoint_from_disk(args.fairseq_path)
|
107 |
+
model.save_pretrained(args.pytorch_dump_folder_path)
|
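A short usage sketch for the converter above (not part of the script): the checkpoint path is the script's own default and is assumed to point to a real fairseq checkpoint on disk.

from convert_indictrans_checkpoint_to_pytorch import (
    convert_fairseq_IT2_checkpoint_from_disk,
)

# load the fairseq checkpoint, rebuild it as an IndicTransForConditionalGeneration,
# then write a Hugging Face style folder that AutoModelForSeq2SeqLM can load
model = convert_fairseq_IT2_checkpoint_from_disk("indic-en/model/checkpoint_best.pt")
model.save_pretrained("indic-en/hf_model")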
example.py
ADDED
@@ -0,0 +1,125 @@
1 |
+
import sys
|
2 |
+
import torch
|
3 |
+
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
|
4 |
+
from IndicTransTokenizer.utils import preprocess_batch, postprocess_batch
|
5 |
+
from IndicTransTokenizer.tokenizer import IndicTransTokenizer
|
6 |
+
|
7 |
+
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
|
8 |
+
|
9 |
+
BATCH_SIZE = 16
|
10 |
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
11 |
+
|
12 |
+
if len(sys.argv)>1:
|
13 |
+
quantization = sys.argv[1]
|
14 |
+
else:
|
15 |
+
quantization = ""
|
16 |
+
|
17 |
+
|
18 |
+
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
|
19 |
+
if quantization == "4-bit":
|
20 |
+
qconfig = BitsAndBytesConfig(
|
21 |
+
load_in_4bit=True,
|
22 |
+
bnb_4bit_use_double_quant=True,
|
23 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
24 |
+
)
|
25 |
+
elif quantization == "8-bit":
|
26 |
+
qconfig = BitsAndBytesConfig(
|
27 |
+
load_in_8bit=True,
|
28 |
+
bnb_8bit_use_double_quant=True,
|
29 |
+
bnb_8bit_compute_dtype=torch.bfloat16,
|
30 |
+
)
|
31 |
+
else:
|
32 |
+
qconfig = None
|
33 |
+
|
34 |
+
tokenizer = IndicTransTokenizer(direction=direction)
|
35 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(
|
36 |
+
ckpt_dir,
|
37 |
+
trust_remote_code=True,
|
38 |
+
low_cpu_mem_usage=True,
|
39 |
+
quantization_config=qconfig
|
40 |
+
)
|
41 |
+
|
42 |
+
if qconfig is None:
|
43 |
+
model = model.to(DEVICE)
|
44 |
+
model.half()
|
45 |
+
|
46 |
+
model.eval()
|
47 |
+
|
48 |
+
return tokenizer, model
|
49 |
+
|
50 |
+
|
51 |
+
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer):
|
52 |
+
translations = []
|
53 |
+
for i in range(0, len(input_sentences), BATCH_SIZE):
|
54 |
+
batch = input_sentences[i : i + BATCH_SIZE]
|
55 |
+
|
56 |
+
# Preprocess the batch and extract entity mappings
|
57 |
+
batch, entity_map = preprocess_batch(
|
58 |
+
batch, src_lang=src_lang, tgt_lang=tgt_lang
|
59 |
+
)
|
60 |
+
|
61 |
+
# Tokenize the batch and generate input encodings
|
62 |
+
inputs = tokenizer(
|
63 |
+
batch,
|
64 |
+
src=True,
|
65 |
+
truncation=True,
|
66 |
+
padding="longest",
|
67 |
+
return_tensors="pt",
|
68 |
+
return_attention_mask=True,
|
69 |
+
).to(DEVICE)
|
70 |
+
|
71 |
+
# Generate translations using the model
|
72 |
+
with torch.no_grad():
|
73 |
+
generated_tokens = model.generate(
|
74 |
+
**inputs,
|
75 |
+
use_cache=True,
|
76 |
+
min_length=0,
|
77 |
+
max_length=256,
|
78 |
+
num_beams=5,
|
79 |
+
num_return_sequences=1,
|
80 |
+
)
|
81 |
+
|
82 |
+
# Decode the generated tokens into text
|
83 |
+
generated_tokens = tokenizer.batch_decode(
|
84 |
+
generated_tokens.detach().cpu().tolist(), src=False
|
85 |
+
)
|
86 |
+
|
87 |
+
# Postprocess the translations, including entity replacement
|
88 |
+
translations += postprocess_batch(
|
89 |
+
generated_tokens, lang=tgt_lang, placeholder_entity_map=entity_map
|
90 |
+
)
|
91 |
+
|
92 |
+
del inputs
|
93 |
+
torch.cuda.empty_cache()
|
94 |
+
|
95 |
+
return translations
|
96 |
+
|
97 |
+
|
98 |
+
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(
|
99 |
+
en_indic_ckpt_dir, "en-indic", quantization
|
100 |
+
)
|
101 |
+
|
102 |
+
# ---------------------------------------------------------------------------
|
103 |
+
# English to Hindi
|
104 |
+
# ---------------------------------------------------------------------------
|
105 |
+
en_sents = [
|
106 |
+
"When I was young, I used to go to the park every day.",
|
107 |
+
"He has many old books, which he inherited from his ancestors.",
|
108 |
+
"I can't figure out how to solve my problem.",
|
109 |
+
"She is very hardworking and intelligent, which is why she got all the good marks.",
|
110 |
+
"We watched a new movie last week, which was very inspiring.",
|
111 |
+
"If you had met me at that time, we would have gone out to eat.",
|
112 |
+
"She went to the market with her sister to buy a new sari.",
|
113 |
+
"Raj told me that he is going to his grandmother's house next month.",
|
114 |
+
"All the kids were having fun at the party and were eating lots of sweets.",
|
115 |
+
"My friend has invited me to his birthday party, and I will give him a gift.",
|
116 |
+
]
|
117 |
+
src_lang, tgt_lang = "eng_Latn", "hin_Deva"
|
118 |
+
hi_translations = batch_translate(
|
119 |
+
en_sents, src_lang, tgt_lang, en_indic_model, en_indic_tokenizer
|
120 |
+
)
|
121 |
+
|
122 |
+
print(f"\n{src_lang} - {tgt_lang}")
|
123 |
+
for input_sentence, translation in zip(en_sents, hi_translations):
|
124 |
+
print(f"{src_lang}: {input_sentence}")
|
125 |
+
print(f"{tgt_lang}: {translation}")
|
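The same helpers can be reused for other directions supported by the en-indic model; a small sketch assuming the model and tokenizer initialized above and the tel_Telu target code used elsewhere in this repository.

# sketch only: English -> Telugu with the already-initialized model/tokenizer
te_translations = batch_translate(
    en_sents, "eng_Latn", "tel_Telu", en_indic_model, en_indic_tokenizer
)
for src, tgt in zip(en_sents, te_translations):
    print(f"eng_Latn: {src}")
    print(f"tel_Telu: {tgt}")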
handler.py
ADDED
@@ -0,0 +1,194 @@
1 |
+
from typing import Dict, List, Any
|
2 |
+
import sys, os, re
|
3 |
+
from tqdm import tqdm
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
|
7 |
+
from IndicTransTokenizer.utils import preprocess_batch, postprocess_batch
|
8 |
+
from IndicTransTokenizer.tokenizer import IndicTransTokenizer
|
9 |
+
|
10 |
+
|
11 |
+
class EndpointHandler():
|
12 |
+
def __init__(self, direction = "en-indic", quantization = ""):
|
13 |
+
self.model_name = "ai4bharat/indictrans2-en-indic-1B"
|
14 |
+
|
15 |
+
self.utterance_pattern = re.compile(r"^\d+$")
|
16 |
+
self.timestamp_pattern = re.compile(r"(\d+:\d+:\d+,\d+)\s*-->\s*(\d+:\d+:\d+,\d+)")
|
17 |
+
|
18 |
+
self.BATCH_SIZE = 16
|
19 |
+
self.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
20 |
+
|
21 |
+
self.model = None
|
22 |
+
self.tokenizer = None
|
23 |
+
|
24 |
+
if quantization == "4-bit":
|
25 |
+
qconfig = BitsAndBytesConfig(
|
26 |
+
load_in_4bit=True,
|
27 |
+
bnb_4bit_use_double_quant=True,
|
28 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
29 |
+
)
|
30 |
+
elif quantization == "8-bit":
|
31 |
+
qconfig = BitsAndBytesConfig(
|
32 |
+
load_in_8bit=True,
|
33 |
+
bnb_8bit_use_double_quant=True,
|
34 |
+
bnb_8bit_compute_dtype=torch.bfloat16,
|
35 |
+
)
|
36 |
+
else:
|
37 |
+
qconfig = None
|
38 |
+
|
39 |
+
self.tokenizer = IndicTransTokenizer(direction=direction)
|
40 |
+
self.model = AutoModelForSeq2SeqLM.from_pretrained(
|
41 |
+
self.model_name,
|
42 |
+
trust_remote_code=True,
|
43 |
+
low_cpu_mem_usage=True,
|
44 |
+
quantization_config=qconfig
|
45 |
+
)
|
46 |
+
|
47 |
+
if qconfig is None:
|
48 |
+
self.model = self.model.to(self.DEVICE)
|
49 |
+
self.model.half()
|
50 |
+
|
51 |
+
self.model.eval()
|
52 |
+
|
53 |
+
|
54 |
+
def batch_translate(self, input_sentences, src_lang, tgt_lang):
|
55 |
+
translations = []
|
56 |
+
for i in range(0, len(input_sentences), self.BATCH_SIZE):
|
57 |
+
batch = input_sentences[i : i + self.BATCH_SIZE]
|
58 |
+
|
59 |
+
# Preprocess the batch and extract entity mappings
|
60 |
+
batch, entity_map = preprocess_batch(
|
61 |
+
batch, src_lang=src_lang, tgt_lang=tgt_lang
|
62 |
+
)
|
63 |
+
|
64 |
+
# Tokenize the batch and generate input encodings
|
65 |
+
inputs = self.tokenizer(
|
66 |
+
batch,
|
67 |
+
src=True,
|
68 |
+
truncation=True,
|
69 |
+
padding="longest",
|
70 |
+
return_tensors="pt",
|
71 |
+
return_attention_mask=True,
|
72 |
+
).to(self.DEVICE)
|
73 |
+
|
74 |
+
# Generate translations using the model
|
75 |
+
with torch.no_grad():
|
76 |
+
generated_tokens = self.model.generate(
|
77 |
+
**inputs,
|
78 |
+
use_cache=True,
|
79 |
+
min_length=0,
|
80 |
+
max_length=256,
|
81 |
+
num_beams=5,
|
82 |
+
num_return_sequences=1,
|
83 |
+
)
|
84 |
+
|
85 |
+
# Decode the generated tokens into text
|
86 |
+
generated_tokens = self.tokenizer.batch_decode(
|
87 |
+
generated_tokens.detach().cpu().tolist(), src=False
|
88 |
+
)
|
89 |
+
|
90 |
+
# Postprocess the translations, including entity replacement
|
91 |
+
translations += postprocess_batch(
|
92 |
+
generated_tokens, lang=tgt_lang, placeholder_entity_map=entity_map
|
93 |
+
)
|
94 |
+
|
95 |
+
del inputs
|
96 |
+
if torch.cuda.is_available():
|
97 |
+
torch.cuda.empty_cache()
|
98 |
+
|
99 |
+
return translations
|
100 |
+
|
101 |
+
|
102 |
+
def read_srt(self, srt_path):
|
103 |
+
data = []
|
104 |
+
with open(srt_path, 'r', encoding='utf-8') as fp:
|
105 |
+
utterance_ind = ""
|
106 |
+
start_end = ""
|
107 |
+
text = ""
|
108 |
+
for ind, line in enumerate(fp.readlines()):
|
109 |
+
line = line.strip()
|
110 |
+
if re.search(self.utterance_pattern, line) is not None:
|
111 |
+
utterance_ind = line
|
112 |
+
elif re.search(self.timestamp_pattern, line) is not None:
|
113 |
+
start_end = line
|
114 |
+
else:
|
115 |
+
text += line
|
116 |
+
|
117 |
+
if utterance_ind!='' and start_end!='' and text!='':
|
118 |
+
data.append({'utterance_ind': utterance_ind, 'start_end': start_end, 'text': text})
|
119 |
+
utterance_ind = ''
|
120 |
+
start_end = ''
|
121 |
+
text = ''
|
122 |
+
|
123 |
+
return data
|
124 |
+
|
125 |
+
def test(self, inputs) -> List[Dict[str, Any]]:
|
126 |
+
"""
|
127 |
+
data args:
|
128 |
+
inputs (:obj:`dict`): {"transcript_path": str, "src_lang": str, "tgt_lang": str}
|
129 |
+
kwargs
|
130 |
+
Return:
|
131 |
+
A :obj:`list` | `dict`: will be serialized and returned
|
132 |
+
"""
|
133 |
+
|
134 |
+
src_lang = inputs["src_lang"]
|
135 |
+
tgt_lang = inputs["tgt_lang"]
|
136 |
+
transcript_path = inputs["transcript_path"]
|
137 |
+
|
138 |
+
output_translations = []
|
139 |
+
if self.model is not None:
|
140 |
+
transcriptions = self.read_srt(transcript_path)
|
141 |
+
trans_sents = [entry['text'] for entry in transcriptions]
|
142 |
+
indic_translations = self.batch_translate(trans_sents, src_lang, tgt_lang)
|
143 |
+
|
144 |
+
for i in tqdm(range(len(transcriptions))):
|
145 |
+
entry = transcriptions[i]
|
146 |
+
entry['text'] = indic_translations[i]
|
147 |
+
output_translations.append(entry)
|
148 |
+
|
149 |
+
return output_translations
|
150 |
+
else:
|
151 |
+
return []
|
152 |
+
|
153 |
+
def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
154 |
+
"""
|
155 |
+
data args:
|
156 |
+
inputs (:obj:`dict`): {"transcript_path": str, "src_lang": str, "tgt_lang": str}
|
157 |
+
kwargs
|
158 |
+
Return:
|
159 |
+
A :obj:`list` | `dict`: will be serialized and returned
|
160 |
+
"""
|
161 |
+
|
162 |
+
inputs = data.pop("inputs",data)
|
163 |
+
|
164 |
+
src_lang = inputs["src_lang"]
|
165 |
+
tgt_lang = inputs["tgt_lang"]
|
166 |
+
transcript_path = inputs["transcript_path"]
|
167 |
+
|
168 |
+
output_translations = []
|
169 |
+
if self.model is not None:
|
170 |
+
transcriptions = self.read_srt(transcript_path)
|
171 |
+
trans_sents = [entry['text'] for entry in transcriptions]
indic_translations = self.batch_translate(trans_sents, src_lang, tgt_lang)
|
172 |
+
|
173 |
+
for i in tqdm(range(len(transcriptions))):
|
174 |
+
entry = transcriptions[i]
|
175 |
+
entry['text'] = indic_translations[i]
|
176 |
+
output_translations.append(entry)
|
177 |
+
|
178 |
+
return output_translations
|
179 |
+
else:
|
180 |
+
return []
|
181 |
+
|
182 |
+
|
183 |
+
if __name__ == "__main__":
|
184 |
+
endpoint = EndpointHandler(quantization = "8-bit")
|
185 |
+
inputs = {}
|
186 |
+
inputs['src_lang'] = 'eng_Latn'
|
187 |
+
inputs['tgt_lang'] = 'tel_Telu'
|
188 |
+
inputs['transcript_path'] = './sample.srt'
|
189 |
+
|
190 |
+
outputs = endpoint.test(inputs)
|
191 |
+
|
192 |
+
print("Outputs: ")
|
193 |
+
for entry in outputs:
|
194 |
+
print(entry)
|
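The handler returns the translated entries as dicts with utterance_ind, start_end and text keys; a minimal sketch (not part of handler.py) of serializing them back into an .srt file.

def write_srt(entries, srt_path):
    # entries: list of dicts as returned by EndpointHandler.test / __call__
    with open(srt_path, "w", encoding="utf-8") as fp:
        for entry in entries:
            fp.write(f"{entry['utterance_ind']}\n")
            fp.write(f"{entry['start_end']}\n")
            fp.write(f"{entry['text']}\n\n")

# e.g. write_srt(outputs, "./sample.tel_Telu.srt")  # hypothetical output path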
install.sh
ADDED
@@ -0,0 +1,52 @@
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
root_dir=$(pwd)
|
4 |
+
echo "Setting up the environment in the $root_dir"
|
5 |
+
|
6 |
+
# --------------------------------------------------------------
|
7 |
+
# create and activate the virtual environment
|
8 |
+
# --------------------------------------------------------------
|
9 |
+
echo "Creating a virtual environment with python3"
|
10 |
+
conda create -n itv2_hf python=3.9 -y
|
11 |
+
conda activate itv2_hf
|
12 |
+
|
13 |
+
echo "Installing all the dependencies"
|
14 |
+
conda install pip
|
15 |
+
python3 -m pip install --upgrade pip
|
16 |
+
|
17 |
+
|
18 |
+
# --------------------------------------------------------------
|
19 |
+
# PyTorch Installation
|
20 |
+
# --------------------------------------------------------------
|
21 |
+
python3 -m pip install torch --extra-index-url https://download.pytorch.org/whl/cu118
|
22 |
+
|
23 |
+
|
24 |
+
# --------------------------------------------------------------
|
25 |
+
# Install IndicNLP library and necessary resources
|
26 |
+
# --------------------------------------------------------------
|
27 |
+
git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
|
28 |
+
export INDIC_RESOURCES_PATH=$root_dir/indic_nlp_resources
|
29 |
+
|
30 |
+
# we use version 0.92 which is the latest in the github repo
|
31 |
+
git clone https://github.com/anoopkunchukuttan/indic_nlp_library.git
|
32 |
+
cd indic_nlp_library
|
33 |
+
python3 -m pip install ./
|
34 |
+
cd $root_dir
|
35 |
+
|
36 |
+
|
37 |
+
# --------------------------------------------------------------
|
38 |
+
# Install additional utility packages
|
39 |
+
# --------------------------------------------------------------
|
40 |
+
python3 -m pip install sacremoses pandas regex mock transformers==4.33.2 urduhack[tf] mosestokenizer
|
41 |
+
python3 -c "import urduhack; urduhack.download()"
|
42 |
+
python3 -m pip install bitsandbytes scipy accelerate datasets
|
43 |
+
|
44 |
+
|
45 |
+
# --------------------------------------------------------------
|
46 |
+
# Sentencepiece for tokenization
|
47 |
+
# --------------------------------------------------------------
|
48 |
+
# build the cpp binaries from the source repo in order to use the command line utility
|
49 |
+
# source repo: https://github.com/google/sentencepiece
|
50 |
+
python3 -m pip install sentencepiece
|
51 |
+
|
52 |
+
echo "Setup completed!"
|
modeling_indictrans.py
ADDED
@@ -0,0 +1,1449 @@
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2023 The IndicTrans2 Authors and AI4Bharat team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
""" PyTorch IndicTrans model."""
|
16 |
+
|
17 |
+
|
18 |
+
import math
|
19 |
+
from typing import List, Optional, Tuple, Union
|
20 |
+
|
21 |
+
import torch
|
22 |
+
import torch.nn as nn
|
23 |
+
from torch.nn import functional as F
|
24 |
+
|
25 |
+
from transformers.activations import ACT2FN
|
26 |
+
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
|
27 |
+
from transformers.modeling_outputs import (
|
28 |
+
BaseModelOutput,
|
29 |
+
BaseModelOutputWithPastAndCrossAttentions,
|
30 |
+
Seq2SeqLMOutput,
|
31 |
+
Seq2SeqModelOutput,
|
32 |
+
)
|
33 |
+
|
34 |
+
from transformers.utils import logging
|
35 |
+
from transformers.modeling_utils import PreTrainedModel
|
36 |
+
|
37 |
+
from configuration_indictrans import IndicTransConfig
|
38 |
+
|
39 |
+
|
40 |
+
logger = logging.get_logger(__name__)
|
41 |
+
|
42 |
+
_CONFIG_FOR_DOC = "IndicTransConfig"
|
43 |
+
|
44 |
+
INDICTRANS_PRETRAINED_MODEL_ARCHIVE_LIST = [""]
|
45 |
+
|
46 |
+
|
47 |
+
# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
|
48 |
+
def shift_tokens_right(
|
49 |
+
input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int
|
50 |
+
):
|
51 |
+
"""
|
52 |
+
Shift input ids one token to the right.
|
53 |
+
"""
|
54 |
+
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
|
55 |
+
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
|
56 |
+
shifted_input_ids[:, 0] = decoder_start_token_id
|
57 |
+
|
58 |
+
if pad_token_id is None:
|
59 |
+
raise ValueError("self.model.config.pad_token_id has to be defined.")
|
60 |
+
# replace possible -100 values in labels by `pad_token_id`
|
61 |
+
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
|
62 |
+
|
63 |
+
return shifted_input_ids
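# Illustrative example (not in the original source):
#   >>> labels = torch.tensor([[5, 6, -100, -100]])
#   >>> shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
#   tensor([[2, 5, 6, 1]])
# The decoder start token is prepended, everything shifts right by one position,
# and any -100 (ignore-index) left in the shifted ids is replaced by pad_token_id.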
|
64 |
+
|
65 |
+
|
66 |
+
# Copied from transformers.models.bart.modeling_bart._make_causal_mask
|
67 |
+
def _make_causal_mask(
|
68 |
+
input_ids_shape: torch.Size,
|
69 |
+
dtype: torch.dtype,
|
70 |
+
device: torch.device,
|
71 |
+
past_key_values_length: int = 0,
|
72 |
+
):
|
73 |
+
"""
|
74 |
+
Make causal mask used for bi-directional self-attention.
|
75 |
+
"""
|
76 |
+
bsz, tgt_len = input_ids_shape
|
77 |
+
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
|
78 |
+
mask_cond = torch.arange(mask.size(-1), device=device)
|
79 |
+
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
|
80 |
+
mask = mask.to(dtype)
|
81 |
+
|
82 |
+
if past_key_values_length > 0:
|
83 |
+
mask = torch.cat(
|
84 |
+
[
|
85 |
+
torch.zeros(
|
86 |
+
tgt_len, past_key_values_length, dtype=dtype, device=device
|
87 |
+
),
|
88 |
+
mask,
|
89 |
+
],
|
90 |
+
dim=-1,
|
91 |
+
)
|
92 |
+
return mask[None, None, :, :].expand(
|
93 |
+
bsz, 1, tgt_len, tgt_len + past_key_values_length
|
94 |
+
)
|
95 |
+
|
96 |
+
|
97 |
+
# Copied from transformers.models.bart.modeling_bart._expand_mask
|
98 |
+
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
99 |
+
"""
|
100 |
+
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
101 |
+
"""
|
102 |
+
bsz, src_len = mask.size()
|
103 |
+
tgt_len = tgt_len if tgt_len is not None else src_len
|
104 |
+
|
105 |
+
expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
|
106 |
+
|
107 |
+
inverted_mask = 1.0 - expanded_mask
|
108 |
+
|
109 |
+
return inverted_mask.masked_fill(
|
110 |
+
inverted_mask.to(torch.bool), torch.finfo(dtype).min
|
111 |
+
)
|
112 |
+
|
113 |
+
|
114 |
+
def create_position_ids_from_input_ids(
|
115 |
+
input_ids, padding_idx, past_key_values_length=0
|
116 |
+
):
|
117 |
+
"""
|
118 |
+
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
|
119 |
+
are ignored. This is modified from fairseq's `utils.make_positions`.
|
120 |
+
"""
|
121 |
+
# The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
|
122 |
+
mask = input_ids.ne(padding_idx).int()
|
123 |
+
incremental_indices = (
|
124 |
+
torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length
|
125 |
+
) * mask
|
126 |
+
return incremental_indices.long() + padding_idx
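# Illustrative example (not in the original source), with padding_idx = 1:
#   >>> ids = torch.tensor([[7, 8, 9, 1, 1]])
#   >>> create_position_ids_from_input_ids(ids, padding_idx=1)
#   tensor([[2, 3, 4, 1, 1]])
# Real tokens get positions counting up from padding_idx + 1, while padded
# positions keep the padding index itself.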
|
127 |
+
|
128 |
+
|
129 |
+
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding->IndicTrans
|
130 |
+
class IndicTransSinusoidalPositionalEmbedding(nn.Module):
|
131 |
+
"""This module produces sinusoidal positional embeddings of any length."""
|
132 |
+
|
133 |
+
def __init__(
|
134 |
+
self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None
|
135 |
+
):
|
136 |
+
super().__init__()
|
137 |
+
self.offset = 2
|
138 |
+
self.embedding_dim = embedding_dim
|
139 |
+
self.padding_idx = padding_idx
|
140 |
+
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
|
141 |
+
|
142 |
+
def make_weights(
|
143 |
+
self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
|
144 |
+
):
|
145 |
+
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
|
146 |
+
if hasattr(self, "weights"):
|
147 |
+
# in forward put the weights on the correct dtype and device of the param
|
148 |
+
emb_weights = emb_weights.to(
|
149 |
+
dtype=self.weights.dtype, device=self.weights.device
|
150 |
+
)
|
151 |
+
|
152 |
+
self.register_buffer("weights", emb_weights, persistent=False)
|
153 |
+
|
154 |
+
@staticmethod
|
155 |
+
def get_embedding(
|
156 |
+
num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
|
157 |
+
):
|
158 |
+
"""
|
159 |
+
Build sinusoidal embeddings.
|
160 |
+
|
161 |
+
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
|
162 |
+
"Attention Is All You Need".
|
163 |
+
"""
|
164 |
+
half_dim = embedding_dim // 2
|
165 |
+
emb = math.log(10000) / (half_dim - 1)
|
166 |
+
emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
|
167 |
+
emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(
|
168 |
+
1
|
169 |
+
) * emb.unsqueeze(0)
|
170 |
+
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(
|
171 |
+
num_embeddings, -1
|
172 |
+
)
|
173 |
+
if embedding_dim % 2 == 1:
|
174 |
+
# zero pad
|
175 |
+
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
|
176 |
+
if padding_idx is not None:
|
177 |
+
emb[padding_idx, :] = 0
|
178 |
+
|
179 |
+
return emb.to(torch.get_default_dtype())
|
180 |
+
|
181 |
+
@torch.no_grad()
|
182 |
+
def forward(
|
183 |
+
self,
|
184 |
+
input_ids: torch.Tensor = None,
|
185 |
+
inputs_embeds: torch.Tensor = None,
|
186 |
+
past_key_values_length: int = 0,
|
187 |
+
):
|
188 |
+
if input_ids is not None:
|
189 |
+
bsz, seq_len = input_ids.size()
|
190 |
+
# Create the position ids from the input token ids. Any padded tokens remain padded.
|
191 |
+
position_ids = create_position_ids_from_input_ids(
|
192 |
+
input_ids, self.padding_idx, past_key_values_length
|
193 |
+
).to(input_ids.device)
|
194 |
+
else:
|
195 |
+
bsz, seq_len = inputs_embeds.size()[:-1]
|
196 |
+
position_ids = self.create_position_ids_from_inputs_embeds(
|
197 |
+
inputs_embeds, past_key_values_length
|
198 |
+
)
|
199 |
+
|
200 |
+
# expand embeddings if needed
|
201 |
+
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
|
202 |
+
if max_pos > self.weights.size(0):
|
203 |
+
self.make_weights(
|
204 |
+
max_pos + self.offset, self.embedding_dim, self.padding_idx
|
205 |
+
)
|
206 |
+
|
207 |
+
return (
|
208 |
+
self.weights.index_select(0, position_ids.view(-1))
|
209 |
+
.view(bsz, seq_len, self.weights.shape[-1])
|
210 |
+
.detach()
|
211 |
+
)
|
212 |
+
|
213 |
+
def create_position_ids_from_inputs_embeds(
|
214 |
+
self, inputs_embeds, past_key_values_length
|
215 |
+
):
|
216 |
+
"""
|
217 |
+
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
|
218 |
+
|
219 |
+
Args:
|
220 |
+
inputs_embeds: torch.Tensor
|
221 |
+
|
222 |
+
Returns: torch.Tensor
|
223 |
+
"""
|
224 |
+
input_shape = inputs_embeds.size()[:-1]
|
225 |
+
sequence_length = input_shape[1]
|
226 |
+
|
227 |
+
position_ids = torch.arange(
|
228 |
+
self.padding_idx + 1,
|
229 |
+
sequence_length + self.padding_idx + 1,
|
230 |
+
dtype=torch.long,
|
231 |
+
device=inputs_embeds.device,
|
232 |
+
)
|
233 |
+
return (
|
234 |
+
position_ids.unsqueeze(0).expand(input_shape).contiguous()
|
235 |
+
+ past_key_values_length
|
236 |
+
)
|
237 |
+
|
238 |
+
|
239 |
+
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->IndicTrans
|
240 |
+
class IndicTransAttention(nn.Module):
|
241 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
242 |
+
|
243 |
+
def __init__(
|
244 |
+
self,
|
245 |
+
embed_dim: int,
|
246 |
+
num_heads: int,
|
247 |
+
dropout: float = 0.0,
|
248 |
+
is_decoder: bool = False,
|
249 |
+
bias: bool = True,
|
250 |
+
):
|
251 |
+
super().__init__()
|
252 |
+
self.embed_dim = embed_dim
|
253 |
+
self.num_heads = num_heads
|
254 |
+
self.dropout = dropout
|
255 |
+
self.head_dim = embed_dim // num_heads
|
256 |
+
|
257 |
+
if (self.head_dim * num_heads) != self.embed_dim:
|
258 |
+
raise ValueError(
|
259 |
+
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
|
260 |
+
f" and `num_heads`: {num_heads})."
|
261 |
+
)
|
262 |
+
self.scaling = self.head_dim**-0.5
|
263 |
+
self.is_decoder = is_decoder
|
264 |
+
|
265 |
+
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
266 |
+
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
267 |
+
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
268 |
+
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
|
269 |
+
|
270 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
271 |
+
return (
|
272 |
+
tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
|
273 |
+
.transpose(1, 2)
|
274 |
+
.contiguous()
|
275 |
+
)
|
276 |
+
|
277 |
+
def forward(
|
278 |
+
self,
|
279 |
+
hidden_states: torch.Tensor,
|
280 |
+
key_value_states: Optional[torch.Tensor] = None,
|
281 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
282 |
+
attention_mask: Optional[torch.Tensor] = None,
|
283 |
+
layer_head_mask: Optional[torch.Tensor] = None,
|
284 |
+
output_attentions: bool = False,
|
285 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
286 |
+
"""Input shape: Batch x Time x Channel"""
|
287 |
+
|
288 |
+
# if key_value_states are provided this layer is used as a cross-attention layer
|
289 |
+
# for the decoder
|
290 |
+
is_cross_attention = key_value_states is not None
|
291 |
+
|
292 |
+
bsz, tgt_len, _ = hidden_states.size()
|
293 |
+
|
294 |
+
# get query proj
|
295 |
+
query_states = self.q_proj(hidden_states) * self.scaling
|
296 |
+
# get key, value proj
|
297 |
+
# `past_key_value[0].shape[2] == key_value_states.shape[1]`
|
298 |
+
# is checking that the `sequence_length` of the `past_key_value` is the same as
|
299 |
+
# the provided `key_value_states` to support prefix tuning
|
300 |
+
if (
|
301 |
+
is_cross_attention
|
302 |
+
and past_key_value is not None
|
303 |
+
and past_key_value[0].shape[2] == key_value_states.shape[1]
|
304 |
+
):
|
305 |
+
# reuse k,v, cross_attentions
|
306 |
+
key_states = past_key_value[0]
|
307 |
+
value_states = past_key_value[1]
|
308 |
+
elif is_cross_attention:
|
309 |
+
# cross_attentions
|
310 |
+
key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
|
311 |
+
value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
|
312 |
+
elif past_key_value is not None:
|
313 |
+
# reuse k, v, self_attention
|
314 |
+
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
|
315 |
+
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
|
316 |
+
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
317 |
+
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
318 |
+
else:
|
319 |
+
# self_attention
|
320 |
+
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
|
321 |
+
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
|
322 |
+
|
323 |
+
if self.is_decoder:
|
324 |
+
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
|
325 |
+
# Further calls to cross_attention layer can then reuse all cross-attention
|
326 |
+
# key/value_states (first "if" case)
|
327 |
+
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
|
328 |
+
# all previous decoder key/value_states. Further calls to uni-directional self-attention
|
329 |
+
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
|
330 |
+
# if encoder bi-directional self-attention `past_key_value` is always `None`
|
331 |
+
past_key_value = (key_states, value_states)
|
332 |
+
|
333 |
+
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
|
334 |
+
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
|
335 |
+
key_states = key_states.reshape(*proj_shape)
|
336 |
+
value_states = value_states.reshape(*proj_shape)
|
337 |
+
|
338 |
+
src_len = key_states.size(1)
|
339 |
+
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
|
340 |
+
|
341 |
+
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
342 |
+
raise ValueError(
|
343 |
+
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
344 |
+
f" {attn_weights.size()}"
|
345 |
+
)
|
346 |
+
|
347 |
+
if attention_mask is not None:
|
348 |
+
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
349 |
+
raise ValueError(
|
350 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
|
351 |
+
)
|
352 |
+
attn_weights = (
|
353 |
+
attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
354 |
+
+ attention_mask
|
355 |
+
)
|
356 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
357 |
+
|
358 |
+
attn_weights = F.softmax(attn_weights, dim=-1)
|
359 |
+
|
360 |
+
if layer_head_mask is not None:
|
361 |
+
if layer_head_mask.size() != (self.num_heads,):
|
362 |
+
raise ValueError(
|
363 |
+
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
|
364 |
+
f" {layer_head_mask.size()}"
|
365 |
+
)
|
366 |
+
attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
|
367 |
+
bsz, self.num_heads, tgt_len, src_len
|
368 |
+
)
|
369 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
370 |
+
|
371 |
+
if output_attentions:
|
372 |
+
# this operation is a bit awkward, but it's required to
|
373 |
+
# make sure that attn_weights keeps its gradient.
|
374 |
+
# In order to do so, attn_weights have to be reshaped
|
375 |
+
# twice and have to be reused in the following
|
376 |
+
attn_weights_reshaped = attn_weights.view(
|
377 |
+
bsz, self.num_heads, tgt_len, src_len
|
378 |
+
)
|
379 |
+
attn_weights = attn_weights_reshaped.view(
|
380 |
+
bsz * self.num_heads, tgt_len, src_len
|
381 |
+
)
|
382 |
+
else:
|
383 |
+
attn_weights_reshaped = None
|
384 |
+
|
385 |
+
attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
|
386 |
+
|
387 |
+
attn_output = torch.bmm(attn_probs, value_states)
|
388 |
+
|
389 |
+
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
390 |
+
raise ValueError(
|
391 |
+
f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
|
392 |
+
f" {attn_output.size()}"
|
393 |
+
)
|
394 |
+
|
395 |
+
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
|
396 |
+
attn_output = attn_output.transpose(1, 2)
|
397 |
+
|
398 |
+
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
|
399 |
+
# partitioned across GPUs when using tensor-parallelism.
|
400 |
+
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
|
401 |
+
|
402 |
+
attn_output = self.out_proj(attn_output)
|
403 |
+
|
404 |
+
return attn_output, attn_weights_reshaped, past_key_value
|
405 |
+
|
406 |
+
|
407 |
+
# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->IndicTrans
|
408 |
+
class IndicTransEncoderLayer(nn.Module):
|
409 |
+
def __init__(self, config: IndicTransConfig):
|
410 |
+
super().__init__()
|
411 |
+
self.embed_dim = config.encoder_embed_dim
|
412 |
+
self.self_attn = IndicTransAttention(
|
413 |
+
embed_dim=self.embed_dim,
|
414 |
+
num_heads=config.encoder_attention_heads,
|
415 |
+
dropout=config.attention_dropout,
|
416 |
+
)
|
417 |
+
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
418 |
+
self.dropout = config.dropout
|
419 |
+
self.activation_fn = ACT2FN[config.activation_function]
|
420 |
+
self.activation_dropout = config.activation_dropout
|
421 |
+
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
|
422 |
+
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
|
423 |
+
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
424 |
+
self.normalize_before = config.encoder_normalize_before
|
425 |
+
|
426 |
+
def forward(
|
427 |
+
self,
|
428 |
+
hidden_states: torch.Tensor,
|
429 |
+
attention_mask: torch.Tensor,
|
430 |
+
layer_head_mask: torch.Tensor,
|
431 |
+
output_attentions: bool = False,
|
432 |
+
) -> torch.Tensor:
|
433 |
+
"""
|
434 |
+
Args:
|
435 |
+
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
436 |
+
attention_mask (`torch.FloatTensor`): attention mask of size
|
437 |
+
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
438 |
+
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
|
439 |
+
`(encoder_attention_heads,)`.
|
440 |
+
output_attentions (`bool`, *optional*):
|
441 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
442 |
+
returned tensors for more detail.
|
443 |
+
"""
|
444 |
+
residual = hidden_states
|
445 |
+
if self.normalize_before:
|
446 |
+
hidden_states = self.self_attn_layer_norm(hidden_states)
|
447 |
+
hidden_states, attn_weights, _ = self.self_attn(
|
448 |
+
hidden_states=hidden_states,
|
449 |
+
attention_mask=attention_mask,
|
450 |
+
layer_head_mask=layer_head_mask,
|
451 |
+
output_attentions=output_attentions,
|
452 |
+
)
|
453 |
+
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
454 |
+
hidden_states = residual + hidden_states
|
455 |
+
if not self.normalize_before:
|
456 |
+
hidden_states = self.self_attn_layer_norm(hidden_states)
|
457 |
+
|
458 |
+
residual = hidden_states
|
459 |
+
if self.normalize_before:
|
460 |
+
hidden_states = self.final_layer_norm(hidden_states)
|
461 |
+
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
462 |
+
hidden_states = F.dropout(
|
463 |
+
hidden_states, p=self.activation_dropout, training=self.training
|
464 |
+
)
|
465 |
+
hidden_states = self.fc2(hidden_states)
|
466 |
+
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
467 |
+
hidden_states = residual + hidden_states
|
468 |
+
if not self.normalize_before:
|
469 |
+
hidden_states = self.final_layer_norm(hidden_states)
|
470 |
+
|
471 |
+
if hidden_states.dtype == torch.float16 and (
|
472 |
+
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
|
473 |
+
):
|
474 |
+
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
|
475 |
+
hidden_states = torch.clamp(
|
476 |
+
hidden_states, min=-clamp_value, max=clamp_value
|
477 |
+
)
|
478 |
+
|
479 |
+
outputs = (hidden_states,)
|
480 |
+
|
481 |
+
if output_attentions:
|
482 |
+
outputs += (attn_weights,)
|
483 |
+
|
484 |
+
return outputs
|
485 |
+
|
486 |
+
|
487 |
+
# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->IndicTrans
|
488 |
+
class IndicTransDecoderLayer(nn.Module):
|
489 |
+
def __init__(self, config: IndicTransConfig):
|
490 |
+
super().__init__()
|
491 |
+
self.embed_dim = config.decoder_embed_dim
|
492 |
+
|
493 |
+
self.self_attn = IndicTransAttention(
|
494 |
+
embed_dim=self.embed_dim,
|
495 |
+
num_heads=config.decoder_attention_heads,
|
496 |
+
dropout=config.attention_dropout,
|
497 |
+
is_decoder=True,
|
498 |
+
)
|
499 |
+
self.dropout = config.dropout
|
500 |
+
self.activation_fn = ACT2FN[config.activation_function]
|
501 |
+
self.activation_dropout = config.activation_dropout
|
502 |
+
|
503 |
+
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
504 |
+
self.encoder_attn = IndicTransAttention(
|
505 |
+
self.embed_dim,
|
506 |
+
config.decoder_attention_heads,
|
507 |
+
dropout=config.attention_dropout,
|
508 |
+
is_decoder=True,
|
509 |
+
)
|
510 |
+
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
|
511 |
+
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
|
512 |
+
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
|
513 |
+
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
|
514 |
+
self.normalize_before = config.decoder_normalize_before
|
515 |
+
|
516 |
+
def forward(
|
517 |
+
self,
|
518 |
+
hidden_states: torch.Tensor,
|
519 |
+
attention_mask: Optional[torch.Tensor] = None,
|
520 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
521 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
522 |
+
layer_head_mask: Optional[torch.Tensor] = None,
|
523 |
+
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
|
524 |
+
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
525 |
+
output_attentions: Optional[bool] = False,
|
526 |
+
use_cache: Optional[bool] = True,
|
527 |
+
) -> torch.Tensor:
|
528 |
+
"""
|
529 |
+
Args:
|
530 |
+
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
531 |
+
attention_mask (`torch.FloatTensor`): attention mask of size
|
532 |
+
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
533 |
+
encoder_hidden_states (`torch.FloatTensor`):
|
534 |
+
cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
|
535 |
+
encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
|
536 |
+
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
537 |
+
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
|
538 |
+
`(encoder_attention_heads,)`.
|
539 |
+
cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
|
540 |
+
size `(decoder_attention_heads,)`.
|
541 |
+
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
|
542 |
+
output_attentions (`bool`, *optional*):
|
543 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
544 |
+
returned tensors for more detail.
|
545 |
+
"""
|
546 |
+
residual = hidden_states
|
547 |
+
if self.normalize_before:
|
548 |
+
hidden_states = self.self_attn_layer_norm(hidden_states)
|
549 |
+
|
550 |
+
# Self Attention
|
551 |
+
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
552 |
+
self_attn_past_key_value = (
|
553 |
+
past_key_value[:2] if past_key_value is not None else None
|
554 |
+
)
|
555 |
+
# add present self-attn cache to positions 1,2 of present_key_value tuple
|
556 |
+
hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
557 |
+
hidden_states=hidden_states,
|
558 |
+
past_key_value=self_attn_past_key_value,
|
559 |
+
attention_mask=attention_mask,
|
560 |
+
layer_head_mask=layer_head_mask,
|
561 |
+
output_attentions=output_attentions,
|
562 |
+
)
|
563 |
+
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
564 |
+
hidden_states = residual + hidden_states
|
565 |
+
if not self.normalize_before:
|
566 |
+
hidden_states = self.self_attn_layer_norm(hidden_states)
|
567 |
+
|
568 |
+
# Cross-Attention Block
|
569 |
+
cross_attn_present_key_value = None
|
570 |
+
cross_attn_weights = None
|
571 |
+
if encoder_hidden_states is not None:
|
572 |
+
residual = hidden_states
|
573 |
+
if self.normalize_before:
|
574 |
+
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
575 |
+
|
576 |
+
# cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
|
577 |
+
cross_attn_past_key_value = (
|
578 |
+
past_key_value[-2:] if past_key_value is not None else None
|
579 |
+
)
|
580 |
+
(
|
581 |
+
hidden_states,
|
582 |
+
cross_attn_weights,
|
583 |
+
cross_attn_present_key_value,
|
584 |
+
) = self.encoder_attn(
|
585 |
+
hidden_states=hidden_states,
|
586 |
+
key_value_states=encoder_hidden_states,
|
587 |
+
attention_mask=encoder_attention_mask,
|
588 |
+
layer_head_mask=cross_attn_layer_head_mask,
|
589 |
+
past_key_value=cross_attn_past_key_value,
|
590 |
+
output_attentions=output_attentions,
|
591 |
+
)
|
592 |
+
hidden_states = F.dropout(
|
593 |
+
hidden_states, p=self.dropout, training=self.training
|
594 |
+
)
|
595 |
+
hidden_states = residual + hidden_states
|
596 |
+
if not self.normalize_before:
|
597 |
+
hidden_states = self.encoder_attn_layer_norm(hidden_states)
|
598 |
+
|
599 |
+
# add cross-attn to positions 3,4 of present_key_value tuple
|
600 |
+
present_key_value = present_key_value + cross_attn_present_key_value
|
601 |
+
|
602 |
+
# Fully Connected
|
603 |
+
residual = hidden_states
|
604 |
+
if self.normalize_before:
|
605 |
+
hidden_states = self.final_layer_norm(hidden_states)
|
606 |
+
hidden_states = self.activation_fn(self.fc1(hidden_states))
|
607 |
+
hidden_states = F.dropout(
|
608 |
+
hidden_states, p=self.activation_dropout, training=self.training
|
609 |
+
)
|
610 |
+
hidden_states = self.fc2(hidden_states)
|
611 |
+
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
612 |
+
hidden_states = residual + hidden_states
|
613 |
+
if not self.normalize_before:
|
614 |
+
hidden_states = self.final_layer_norm(hidden_states)
|
615 |
+
|
616 |
+
outputs = (hidden_states,)
|
617 |
+
|
618 |
+
if output_attentions:
|
619 |
+
outputs += (self_attn_weights, cross_attn_weights)
|
620 |
+
|
621 |
+
if use_cache:
|
622 |
+
outputs += (present_key_value,)
|
623 |
+
|
624 |
+
return outputs
|
625 |
+
|
626 |
+
|
627 |
+
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100PretrainedModel->IndicTrans
|
628 |
+
class IndicTransPreTrainedModel(PreTrainedModel):
|
629 |
+
config_class = IndicTransConfig
|
630 |
+
base_model_prefix = "model"
|
631 |
+
supports_gradient_checkpointing = True
|
632 |
+
_no_split_modules = ["IndicTransAttention"]
|
633 |
+
|
634 |
+
def _init_weights(self, module):
|
635 |
+
std = self.config.init_std
|
636 |
+
if isinstance(module, nn.Linear):
|
637 |
+
module.weight.data.normal_(mean=0.0, std=std)
|
638 |
+
if module.bias is not None:
|
639 |
+
module.bias.data.zero_()
|
640 |
+
elif isinstance(module, nn.Embedding):
|
641 |
+
module.weight.data.normal_(mean=0.0, std=std)
|
642 |
+
if module.padding_idx is not None:
|
643 |
+
module.weight.data[module.padding_idx].zero_()
|
644 |
+
|
645 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
646 |
+
if isinstance(module, (IndicTransDecoder, IndicTransEncoder)):
|
647 |
+
module.gradient_checkpointing = value
|
648 |
+
|
649 |
+
|
650 |
+
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100Encoder->IndicTrans
|
651 |
+
class IndicTransEncoder(IndicTransPreTrainedModel):
|
652 |
+
"""
|
653 |
+
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
|
654 |
+
[`IndicTransEncoderLayer`].
|
655 |
+
|
656 |
+
Args:
|
657 |
+
config: IndicTransConfig
|
658 |
+
embed_tokens (nn.Embedding): output embedding
|
659 |
+
"""
|
660 |
+
|
661 |
+
def __init__(
|
662 |
+
self, config: IndicTransConfig, embed_tokens: Optional[nn.Embedding] = None
|
663 |
+
):
|
664 |
+
super().__init__(config)
|
665 |
+
|
666 |
+
self.dropout = config.dropout
|
667 |
+
self.layerdrop = config.encoder_layerdrop
|
668 |
+
|
669 |
+
embed_dim = config.encoder_embed_dim
|
670 |
+
self.padding_idx = config.pad_token_id
|
671 |
+
self.max_source_positions = config.max_source_positions
|
672 |
+
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
673 |
+
|
674 |
+
self.embed_tokens = nn.Embedding(
|
675 |
+
config.encoder_vocab_size, embed_dim, self.padding_idx
|
676 |
+
)
|
677 |
+
|
678 |
+
if embed_tokens is not None:
|
679 |
+
self.embed_tokens.weight = embed_tokens.weight
|
680 |
+
|
681 |
+
self.embed_positions = IndicTransSinusoidalPositionalEmbedding(
|
682 |
+
config.max_source_positions,
|
683 |
+
embed_dim,
|
684 |
+
self.padding_idx,
|
685 |
+
)
|
686 |
+
self.layers = nn.ModuleList(
|
687 |
+
[IndicTransEncoderLayer(config) for _ in range(config.encoder_layers)]
|
688 |
+
)
|
689 |
+
self.layer_norm = (
|
690 |
+
nn.LayerNorm(embed_dim) if config.encoder_normalize_before else None
|
691 |
+
)
|
692 |
+
self.layernorm_embedding = (
|
693 |
+
nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
|
694 |
+
)
|
695 |
+
|
696 |
+
self.gradient_checkpointing = False
|
697 |
+
# Initialize weights and apply final processing
|
698 |
+
self.post_init()
|
699 |
+
|
700 |
+
def forward(
|
701 |
+
self,
|
702 |
+
input_ids: Optional[torch.Tensor] = None,
|
703 |
+
attention_mask: Optional[torch.Tensor] = None,
|
704 |
+
head_mask: Optional[torch.Tensor] = None,
|
705 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
706 |
+
output_attentions: Optional[bool] = None,
|
707 |
+
output_hidden_states: Optional[bool] = None,
|
708 |
+
return_dict: Optional[bool] = None,
|
709 |
+
):
|
710 |
+
r"""
|
711 |
+
Args:
|
712 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
713 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
|
714 |
+
provide it.
|
715 |
+
|
716 |
+
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
717 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
718 |
+
|
719 |
+
[What are input IDs?](../glossary#input-ids)
|
720 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
721 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
722 |
+
|
723 |
+
- 1 for tokens that are **not masked**,
|
724 |
+
- 0 for tokens that are **masked**.
|
725 |
+
|
726 |
+
[What are attention masks?](../glossary#attention-mask)
|
727 |
+
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
|
728 |
+
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
729 |
+
|
730 |
+
- 1 indicates the head is **not masked**,
|
731 |
+
- 0 indicates the head is **masked**.
|
732 |
+
|
733 |
+
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
734 |
+
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
735 |
+
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
736 |
+
than the model's internal embedding lookup matrix.
|
737 |
+
output_attentions (`bool`, *optional*):
|
738 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
739 |
+
returned tensors for more detail.
|
740 |
+
output_hidden_states (`bool`, *optional*):
|
741 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
742 |
+
for more detail.
|
743 |
+
return_dict (`bool`, *optional*):
|
744 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
745 |
+
"""
|
746 |
+
output_attentions = (
|
747 |
+
output_attentions
|
748 |
+
if output_attentions is not None
|
749 |
+
else self.config.output_attentions
|
750 |
+
)
|
751 |
+
output_hidden_states = (
|
752 |
+
output_hidden_states
|
753 |
+
if output_hidden_states is not None
|
754 |
+
else self.config.output_hidden_states
|
755 |
+
)
|
756 |
+
return_dict = (
|
757 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
758 |
+
)
|
759 |
+
|
760 |
+
# retrieve input_ids and inputs_embeds
|
761 |
+
if input_ids is not None and inputs_embeds is not None:
|
762 |
+
raise ValueError(
|
763 |
+
"You cannot specify both input_ids and inputs_embeds at the same time"
|
764 |
+
)
|
765 |
+
elif input_ids is not None:
|
766 |
+
self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
|
767 |
+
input_shape = input_ids.size()
|
768 |
+
input_ids = input_ids.view(-1, input_shape[-1])
|
769 |
+
elif inputs_embeds is not None:
|
770 |
+
input_shape = inputs_embeds.size()[:-1]
|
771 |
+
else:
|
772 |
+
raise ValueError("You have to specify either input_ids or inputs_embeds")
|
773 |
+
|
774 |
+
if inputs_embeds is None:
|
775 |
+
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
|
776 |
+
|
777 |
+
embed_pos = self.embed_positions(input_ids, inputs_embeds)
|
778 |
+
embed_pos = embed_pos.to(inputs_embeds.device)
|
779 |
+
|
780 |
+
hidden_states = inputs_embeds + embed_pos
|
781 |
+
if self.layernorm_embedding is not None:
|
782 |
+
hidden_states = self.layernorm_embedding(hidden_states)
|
783 |
+
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
784 |
+
|
785 |
+
# expand attention_mask
|
786 |
+
if attention_mask is not None:
|
787 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
788 |
+
attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
|
789 |
+
|
790 |
+
encoder_states = () if output_hidden_states else None
|
791 |
+
all_attentions = () if output_attentions else None
|
792 |
+
|
793 |
+
# check if head_mask has a correct number of layers specified if desired
|
794 |
+
if head_mask is not None:
|
795 |
+
if head_mask.size()[0] != len(self.layers):
|
796 |
+
raise ValueError(
|
797 |
+
f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
|
798 |
+
f" {head_mask.size()[0]}."
|
799 |
+
)
|
800 |
+
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
|
801 |
+
|
802 |
+
for idx, encoder_layer in enumerate(self.layers):
|
803 |
+
if output_hidden_states:
|
804 |
+
encoder_states = encoder_states + (hidden_states,)
|
805 |
+
|
806 |
+
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
807 |
+
dropout_probability = torch.rand([])
|
808 |
+
|
809 |
+
skip_the_layer = (
|
810 |
+
True
|
811 |
+
if self.training and (dropout_probability < self.layerdrop)
|
812 |
+
else False
|
813 |
+
)
|
814 |
+
if not skip_the_layer or deepspeed_zero3_is_enabled:
|
815 |
+
# under deepspeed zero3 all gpus must run in sync
|
816 |
+
|
817 |
+
if self.gradient_checkpointing and self.training:
|
818 |
+
# create gradient checkpointing function
|
819 |
+
def create_custom_forward(module):
|
820 |
+
def custom_forward(*inputs):
|
821 |
+
return module(*inputs, output_attentions)
|
822 |
+
|
823 |
+
return custom_forward
|
824 |
+
|
825 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
826 |
+
create_custom_forward(encoder_layer),
|
827 |
+
hidden_states,
|
828 |
+
attention_mask,
|
829 |
+
(head_mask[idx] if head_mask is not None else None),
|
830 |
+
)
|
831 |
+
else:
|
832 |
+
layer_outputs = encoder_layer(
|
833 |
+
hidden_states,
|
834 |
+
attention_mask,
|
835 |
+
layer_head_mask=(
|
836 |
+
head_mask[idx] if head_mask is not None else None
|
837 |
+
),
|
838 |
+
output_attentions=output_attentions,
|
839 |
+
)
|
840 |
+
|
841 |
+
hidden_states = layer_outputs[0]
|
842 |
+
|
843 |
+
if skip_the_layer:
|
844 |
+
layer_outputs = (None, None)
|
845 |
+
|
846 |
+
if output_attentions:
|
847 |
+
all_attentions = all_attentions + (layer_outputs[1],)
|
848 |
+
|
849 |
+
if self.layer_norm is not None:
|
850 |
+
hidden_states = self.layer_norm(hidden_states)
|
851 |
+
|
852 |
+
if output_hidden_states:
|
853 |
+
encoder_states = encoder_states + (hidden_states,)
|
854 |
+
|
855 |
+
if not return_dict:
|
856 |
+
return tuple(
|
857 |
+
v
|
858 |
+
for v in [hidden_states, encoder_states, all_attentions]
|
859 |
+
if v is not None
|
860 |
+
)
|
861 |
+
return BaseModelOutput(
|
862 |
+
last_hidden_state=hidden_states,
|
863 |
+
hidden_states=encoder_states,
|
864 |
+
attentions=all_attentions,
|
865 |
+
)
|
866 |
+
|
867 |
+
|
868 |
+
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100Decoder->IndicTrans
|
869 |
+
class IndicTransDecoder(IndicTransPreTrainedModel):
|
870 |
+
"""
|
871 |
+
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`IndicTransDecoderLayer`]
|
872 |
+
|
873 |
+
Args:
|
874 |
+
config: IndicTransConfig
|
875 |
+
embed_tokens (nn.Embedding): output embedding
|
876 |
+
"""
|
877 |
+
|
878 |
+
def __init__(
|
879 |
+
self, config: IndicTransConfig, embed_tokens: Optional[nn.Embedding] = None
|
880 |
+
):
|
881 |
+
super().__init__(config)
|
882 |
+
self.dropout = config.dropout
|
883 |
+
self.layerdrop = config.decoder_layerdrop
|
884 |
+
|
885 |
+
embed_dim = config.decoder_embed_dim
|
886 |
+
self.padding_idx = config.pad_token_id
|
887 |
+
self.max_target_positions = config.max_target_positions
|
888 |
+
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
|
889 |
+
|
890 |
+
self.embed_tokens = nn.Embedding(
|
891 |
+
config.decoder_vocab_size, embed_dim, self.padding_idx
|
892 |
+
)
|
893 |
+
|
894 |
+
if embed_tokens is not None:
|
895 |
+
self.embed_tokens.weight = embed_tokens.weight
|
896 |
+
|
897 |
+
self.embed_positions = IndicTransSinusoidalPositionalEmbedding(
|
898 |
+
config.max_target_positions,
|
899 |
+
embed_dim,
|
900 |
+
self.padding_idx,
|
901 |
+
)
|
902 |
+
self.layers = nn.ModuleList(
|
903 |
+
[IndicTransDecoderLayer(config) for _ in range(config.decoder_layers)]
|
904 |
+
)
|
905 |
+
self.layer_norm = (
|
906 |
+
nn.LayerNorm(embed_dim) if config.decoder_normalize_before else None
|
907 |
+
)
|
908 |
+
self.layernorm_embedding = (
|
909 |
+
nn.LayerNorm(embed_dim) if config.layernorm_embedding else None
|
910 |
+
)
|
911 |
+
|
912 |
+
self.gradient_checkpointing = False
|
913 |
+
# Initialize weights and apply final processing
|
914 |
+
self.post_init()
|
915 |
+
|
916 |
+
def forward(
|
917 |
+
self,
|
918 |
+
input_ids: Optional[torch.Tensor] = None,
|
919 |
+
attention_mask: Optional[torch.Tensor] = None,
|
920 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
921 |
+
encoder_attention_mask: Optional[torch.Tensor] = None,
|
922 |
+
head_mask: Optional[torch.Tensor] = None,
|
923 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
924 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
925 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
926 |
+
use_cache: Optional[bool] = None,
|
927 |
+
output_attentions: Optional[bool] = None,
|
928 |
+
output_hidden_states: Optional[bool] = None,
|
929 |
+
return_dict: Optional[bool] = None,
|
930 |
+
):
|
931 |
+
r"""
|
932 |
+
Args:
|
933 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
934 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
|
935 |
+
provide it.
|
936 |
+
|
937 |
+
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
938 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
939 |
+
|
940 |
+
[What are input IDs?](../glossary#input-ids)
|
941 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
942 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
943 |
+
|
944 |
+
- 1 for tokens that are **not masked**,
|
945 |
+
- 0 for tokens that are **masked**.
|
946 |
+
|
947 |
+
[What are attention masks?](../glossary#attention-mask)
|
948 |
+
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
|
949 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
|
950 |
+
of the decoder.
|
951 |
+
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
|
952 |
+
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
|
953 |
+
selected in `[0, 1]`:
|
954 |
+
|
955 |
+
- 1 for tokens that are **not masked**,
|
956 |
+
- 0 for tokens that are **masked**.
|
957 |
+
|
958 |
+
[What are attention masks?](../glossary#attention-mask)
|
959 |
+
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
960 |
+
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
|
961 |
+
|
962 |
+
- 1 indicates the head is **not masked**,
|
963 |
+
- 0 indicates the head is **masked**.
|
964 |
+
|
965 |
+
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
|
966 |
+
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
|
967 |
+
cross-attention on hidden heads. Mask values selected in `[0, 1]`:
|
968 |
+
|
969 |
+
- 1 indicates the head is **not masked**,
|
970 |
+
- 0 indicates the head is **masked**.
|
971 |
+
|
972 |
+
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
973 |
+
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
|
974 |
+
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
|
975 |
+
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
976 |
+
|
977 |
+
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
|
978 |
+
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
979 |
+
|
980 |
+
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
|
981 |
+
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
|
982 |
+
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of
|
983 |
+
shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
|
984 |
+
`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
|
985 |
+
control over how to convert `input_ids` indices into associated vectors than the model's internal
|
986 |
+
embedding lookup matrix.
|
987 |
+
output_attentions (`bool`, *optional*):
|
988 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
989 |
+
returned tensors for more detail.
|
990 |
+
output_hidden_states (`bool`, *optional*):
|
991 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
992 |
+
for more detail.
|
993 |
+
return_dict (`bool`, *optional*):
|
994 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
995 |
+
"""
|
996 |
+
output_attentions = (
|
997 |
+
output_attentions
|
998 |
+
if output_attentions is not None
|
999 |
+
else self.config.output_attentions
|
1000 |
+
)
|
1001 |
+
output_hidden_states = (
|
1002 |
+
output_hidden_states
|
1003 |
+
if output_hidden_states is not None
|
1004 |
+
else self.config.output_hidden_states
|
1005 |
+
)
|
1006 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
1007 |
+
return_dict = (
|
1008 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
1009 |
+
)
|
1010 |
+
|
1011 |
+
# retrieve input_ids and inputs_embeds
|
1012 |
+
if input_ids is not None and inputs_embeds is not None:
|
1013 |
+
raise ValueError(
|
1014 |
+
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
|
1015 |
+
)
|
1016 |
+
elif input_ids is not None:
|
1017 |
+
input_shape = input_ids.size()
|
1018 |
+
input_ids = input_ids.view(-1, input_shape[-1])
|
1019 |
+
elif inputs_embeds is not None:
|
1020 |
+
input_shape = inputs_embeds.size()[:-1]
|
1021 |
+
else:
|
1022 |
+
raise ValueError(
|
1023 |
+
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
|
1024 |
+
)
|
1025 |
+
|
1026 |
+
# past_key_values_length
|
1027 |
+
past_key_values_length = (
|
1028 |
+
past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
1029 |
+
)
|
1030 |
+
|
1031 |
+
if inputs_embeds is None:
|
1032 |
+
inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
|
1033 |
+
|
1034 |
+
# create causal mask
|
1035 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
1036 |
+
combined_attention_mask = None
|
1037 |
+
if input_shape[-1] > 1:
|
1038 |
+
combined_attention_mask = _make_causal_mask(
|
1039 |
+
input_shape,
|
1040 |
+
inputs_embeds.dtype,
|
1041 |
+
device=inputs_embeds.device,
|
1042 |
+
past_key_values_length=past_key_values_length,
|
1043 |
+
)
|
1044 |
+
|
1045 |
+
if attention_mask is not None and combined_attention_mask is not None:
|
1046 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
1047 |
+
combined_attention_mask = combined_attention_mask + _expand_mask(
|
1048 |
+
attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
|
1049 |
+
)
|
1050 |
+
|
1051 |
+
# expand encoder attention mask
|
1052 |
+
if encoder_hidden_states is not None and encoder_attention_mask is not None:
|
1053 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
1054 |
+
encoder_attention_mask = _expand_mask(
|
1055 |
+
encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
|
1056 |
+
)
|
1057 |
+
|
1058 |
+
# embed positions
|
1059 |
+
positions = self.embed_positions(
|
1060 |
+
input_ids, inputs_embeds, past_key_values_length
|
1061 |
+
)
|
1062 |
+
positions = positions.to(inputs_embeds.device)
|
1063 |
+
|
1064 |
+
hidden_states = inputs_embeds + positions
|
1065 |
+
if self.layernorm_embedding is not None:
|
1066 |
+
hidden_states = self.layernorm_embedding(hidden_states)
|
1067 |
+
|
1068 |
+
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
|
1069 |
+
|
1070 |
+
if self.gradient_checkpointing and self.training:
|
1071 |
+
if use_cache:
|
1072 |
+
logger.warning_once(
|
1073 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting"
|
1074 |
+
" `use_cache=False`..."
|
1075 |
+
)
|
1076 |
+
use_cache = False
|
1077 |
+
|
1078 |
+
# decoder layers
|
1079 |
+
all_hidden_states = () if output_hidden_states else None
|
1080 |
+
all_self_attns = () if output_attentions else None
|
1081 |
+
all_cross_attentions = () if output_attentions else None
|
1082 |
+
next_decoder_cache = () if use_cache else None
|
1083 |
+
|
1084 |
+
# check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
|
1085 |
+
for attn_mask, mask_name in zip(
|
1086 |
+
[head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
|
1087 |
+
):
|
1088 |
+
if attn_mask is not None:
|
1089 |
+
if attn_mask.size()[0] != len(self.layers):
|
1090 |
+
raise ValueError(
|
1091 |
+
f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
|
1092 |
+
f" {head_mask.size()[0]}."
|
1093 |
+
)
|
1094 |
+
deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
|
1095 |
+
|
1096 |
+
for idx, decoder_layer in enumerate(self.layers):
|
1097 |
+
if output_hidden_states:
|
1098 |
+
all_hidden_states += (hidden_states,)
|
1099 |
+
|
1100 |
+
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
|
1101 |
+
dropout_probability = torch.rand([])
|
1102 |
+
|
1103 |
+
skip_the_layer = (
|
1104 |
+
True
|
1105 |
+
if self.training and (dropout_probability < self.layerdrop)
|
1106 |
+
else False
|
1107 |
+
)
|
1108 |
+
if not skip_the_layer or deepspeed_zero3_is_enabled:
|
1109 |
+
# under deepspeed zero3 all gpus must run in sync
|
1110 |
+
|
1111 |
+
past_key_value = (
|
1112 |
+
past_key_values[idx] if past_key_values is not None else None
|
1113 |
+
)
|
1114 |
+
|
1115 |
+
if self.gradient_checkpointing and self.training:
|
1116 |
+
|
1117 |
+
def create_custom_forward(module):
|
1118 |
+
def custom_forward(*inputs):
|
1119 |
+
# None for past_key_value
|
1120 |
+
return module(*inputs, output_attentions, use_cache)
|
1121 |
+
|
1122 |
+
return custom_forward
|
1123 |
+
|
1124 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
1125 |
+
create_custom_forward(decoder_layer),
|
1126 |
+
hidden_states,
|
1127 |
+
combined_attention_mask,
|
1128 |
+
encoder_hidden_states,
|
1129 |
+
encoder_attention_mask,
|
1130 |
+
head_mask[idx] if head_mask is not None else None,
|
1131 |
+
cross_attn_head_mask[idx]
|
1132 |
+
if cross_attn_head_mask is not None
|
1133 |
+
else None,
|
1134 |
+
None,
|
1135 |
+
)
|
1136 |
+
else:
|
1137 |
+
layer_outputs = decoder_layer(
|
1138 |
+
hidden_states,
|
1139 |
+
attention_mask=combined_attention_mask,
|
1140 |
+
encoder_hidden_states=encoder_hidden_states,
|
1141 |
+
encoder_attention_mask=encoder_attention_mask,
|
1142 |
+
layer_head_mask=(
|
1143 |
+
head_mask[idx] if head_mask is not None else None
|
1144 |
+
),
|
1145 |
+
cross_attn_layer_head_mask=(
|
1146 |
+
cross_attn_head_mask[idx]
|
1147 |
+
if cross_attn_head_mask is not None
|
1148 |
+
else None
|
1149 |
+
),
|
1150 |
+
past_key_value=past_key_value,
|
1151 |
+
output_attentions=output_attentions,
|
1152 |
+
use_cache=use_cache,
|
1153 |
+
)
|
1154 |
+
|
1155 |
+
hidden_states = layer_outputs[0]
|
1156 |
+
|
1157 |
+
if skip_the_layer:
|
1158 |
+
continue
|
1159 |
+
|
1160 |
+
if use_cache:
|
1161 |
+
next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
|
1162 |
+
|
1163 |
+
if output_attentions:
|
1164 |
+
all_self_attns += (layer_outputs[1],)
|
1165 |
+
all_cross_attentions += (layer_outputs[2],)
|
1166 |
+
|
1167 |
+
if self.layer_norm is not None:
|
1168 |
+
hidden_states = self.layer_norm(hidden_states)
|
1169 |
+
|
1170 |
+
# add hidden states from the last decoder layer
|
1171 |
+
if output_hidden_states:
|
1172 |
+
all_hidden_states += (hidden_states,)
|
1173 |
+
|
1174 |
+
next_cache = next_decoder_cache if use_cache else None
|
1175 |
+
if not return_dict:
|
1176 |
+
return tuple(
|
1177 |
+
v
|
1178 |
+
for v in [
|
1179 |
+
hidden_states,
|
1180 |
+
next_cache,
|
1181 |
+
all_hidden_states,
|
1182 |
+
all_self_attns,
|
1183 |
+
all_cross_attentions,
|
1184 |
+
]
|
1185 |
+
if v is not None
|
1186 |
+
)
|
1187 |
+
return BaseModelOutputWithPastAndCrossAttentions(
|
1188 |
+
last_hidden_state=hidden_states,
|
1189 |
+
past_key_values=next_cache,
|
1190 |
+
hidden_states=all_hidden_states,
|
1191 |
+
attentions=all_self_attns,
|
1192 |
+
cross_attentions=all_cross_attentions,
|
1193 |
+
)
|
1194 |
+
|
1195 |
+
|
1196 |
+
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100Model->IndicTrans
|
1197 |
+
class IndicTransModel(IndicTransPreTrainedModel):
|
1198 |
+
_tied_weights_keys = None
|
1199 |
+
|
1200 |
+
def __init__(self, config: IndicTransConfig):
|
1201 |
+
super().__init__(config)
|
1202 |
+
|
1203 |
+
self.encoder = IndicTransEncoder(config)
|
1204 |
+
self.decoder = IndicTransDecoder(config)
|
1205 |
+
|
1206 |
+
# Initialize weights and apply final processing
|
1207 |
+
self.post_init()
|
1208 |
+
|
1209 |
+
def get_encoder(self):
|
1210 |
+
return self.encoder
|
1211 |
+
|
1212 |
+
def get_decoder(self):
|
1213 |
+
return self.decoder
|
1214 |
+
|
1215 |
+
def forward(
|
1216 |
+
self,
|
1217 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1218 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1219 |
+
decoder_input_ids: Optional[torch.LongTensor] = None,
|
1220 |
+
decoder_attention_mask: Optional[torch.LongTensor] = None,
|
1221 |
+
head_mask: Optional[torch.Tensor] = None,
|
1222 |
+
decoder_head_mask: Optional[torch.Tensor] = None,
|
1223 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
1224 |
+
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
1225 |
+
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
1226 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1227 |
+
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
|
1228 |
+
use_cache: Optional[bool] = None,
|
1229 |
+
output_attentions: Optional[bool] = None,
|
1230 |
+
output_hidden_states: Optional[bool] = None,
|
1231 |
+
return_dict: Optional[bool] = None,
|
1232 |
+
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
|
1233 |
+
output_attentions = (
|
1234 |
+
output_attentions
|
1235 |
+
if output_attentions is not None
|
1236 |
+
else self.config.output_attentions
|
1237 |
+
)
|
1238 |
+
output_hidden_states = (
|
1239 |
+
output_hidden_states
|
1240 |
+
if output_hidden_states is not None
|
1241 |
+
else self.config.output_hidden_states
|
1242 |
+
)
|
1243 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
1244 |
+
return_dict = (
|
1245 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
1246 |
+
)
|
1247 |
+
|
1248 |
+
if encoder_outputs is None:
|
1249 |
+
encoder_outputs = self.encoder(
|
1250 |
+
input_ids=input_ids,
|
1251 |
+
attention_mask=attention_mask,
|
1252 |
+
head_mask=head_mask,
|
1253 |
+
inputs_embeds=inputs_embeds,
|
1254 |
+
output_attentions=output_attentions,
|
1255 |
+
output_hidden_states=output_hidden_states,
|
1256 |
+
return_dict=return_dict,
|
1257 |
+
)
|
1258 |
+
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
|
1259 |
+
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
1260 |
+
encoder_outputs = BaseModelOutput(
|
1261 |
+
last_hidden_state=encoder_outputs[0],
|
1262 |
+
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
|
1263 |
+
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
|
1264 |
+
)
|
1265 |
+
|
1266 |
+
# decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
|
1267 |
+
decoder_outputs = self.decoder(
|
1268 |
+
input_ids=decoder_input_ids,
|
1269 |
+
attention_mask=decoder_attention_mask,
|
1270 |
+
encoder_hidden_states=encoder_outputs[0],
|
1271 |
+
encoder_attention_mask=attention_mask,
|
1272 |
+
head_mask=decoder_head_mask,
|
1273 |
+
cross_attn_head_mask=cross_attn_head_mask,
|
1274 |
+
past_key_values=past_key_values,
|
1275 |
+
inputs_embeds=decoder_inputs_embeds,
|
1276 |
+
use_cache=use_cache,
|
1277 |
+
output_attentions=output_attentions,
|
1278 |
+
output_hidden_states=output_hidden_states,
|
1279 |
+
return_dict=return_dict,
|
1280 |
+
)
|
1281 |
+
|
1282 |
+
if not return_dict:
|
1283 |
+
return decoder_outputs + encoder_outputs
|
1284 |
+
|
1285 |
+
return Seq2SeqModelOutput(
|
1286 |
+
last_hidden_state=decoder_outputs.last_hidden_state,
|
1287 |
+
past_key_values=decoder_outputs.past_key_values,
|
1288 |
+
decoder_hidden_states=decoder_outputs.hidden_states,
|
1289 |
+
decoder_attentions=decoder_outputs.attentions,
|
1290 |
+
cross_attentions=decoder_outputs.cross_attentions,
|
1291 |
+
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
|
1292 |
+
encoder_hidden_states=encoder_outputs.hidden_states,
|
1293 |
+
encoder_attentions=encoder_outputs.attentions,
|
1294 |
+
)
|
1295 |
+
|
1296 |
+
|
1297 |
+
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ForConditionalGeneration->IndicTrans
|
1298 |
+
class IndicTransForConditionalGeneration(IndicTransPreTrainedModel):
|
1299 |
+
base_model_prefix = "model"
|
1300 |
+
_tied_weights_keys = None
|
1301 |
+
|
1302 |
+
def __init__(self, config: IndicTransConfig):
|
1303 |
+
super().__init__(config)
|
1304 |
+
self.model = IndicTransModel(config)
|
1305 |
+
self.lm_head = nn.Linear(
|
1306 |
+
config.decoder_embed_dim, config.decoder_vocab_size, bias=False
|
1307 |
+
)
|
1308 |
+
|
1309 |
+
if config.share_decoder_input_output_embed:
|
1310 |
+
self.lm_head.weight = self.model.decoder.embed_tokens.weight
|
1311 |
+
|
1312 |
+
self.post_init()
|
1313 |
+
|
1314 |
+
def tie_weights(self):
|
1315 |
+
pass
|
1316 |
+
|
1317 |
+
def get_encoder(self):
|
1318 |
+
return self.model.get_encoder()
|
1319 |
+
|
1320 |
+
def get_decoder(self):
|
1321 |
+
return self.model.get_decoder()
|
1322 |
+
|
1323 |
+
def get_output_embeddings(self):
|
1324 |
+
return self.lm_head
|
1325 |
+
|
1326 |
+
def set_output_embeddings(self, new_embeddings):
|
1327 |
+
self.lm_head = new_embeddings
|
1328 |
+
|
1329 |
+
def forward(
|
1330 |
+
self,
|
1331 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1332 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1333 |
+
decoder_input_ids: Optional[torch.LongTensor] = None,
|
1334 |
+
decoder_attention_mask: Optional[torch.LongTensor] = None,
|
1335 |
+
head_mask: Optional[torch.Tensor] = None,
|
1336 |
+
decoder_head_mask: Optional[torch.Tensor] = None,
|
1337 |
+
cross_attn_head_mask: Optional[torch.Tensor] = None,
|
1338 |
+
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
1339 |
+
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
|
1340 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
1341 |
+
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
|
1342 |
+
labels: Optional[torch.LongTensor] = None,
|
1343 |
+
use_cache: Optional[bool] = None,
|
1344 |
+
output_attentions: Optional[bool] = None,
|
1345 |
+
output_hidden_states: Optional[bool] = None,
|
1346 |
+
return_dict: Optional[bool] = None,
|
1347 |
+
) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
|
1348 |
+
r"""
|
1349 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
1350 |
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
1351 |
+
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
1352 |
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
1353 |
+
|
1354 |
+
Returns:
|
1355 |
+
"""
|
1356 |
+
return_dict = (
|
1357 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
1358 |
+
)
|
1359 |
+
|
1360 |
+
if labels is not None:
|
1361 |
+
if decoder_input_ids is None:
|
1362 |
+
decoder_input_ids = shift_tokens_right(
|
1363 |
+
labels, self.config.pad_token_id, self.config.decoder_start_token_id
|
1364 |
+
)
|
1365 |
+
|
1366 |
+
outputs = self.model(
|
1367 |
+
input_ids,
|
1368 |
+
attention_mask=attention_mask,
|
1369 |
+
decoder_input_ids=decoder_input_ids,
|
1370 |
+
encoder_outputs=encoder_outputs,
|
1371 |
+
decoder_attention_mask=decoder_attention_mask,
|
1372 |
+
head_mask=head_mask,
|
1373 |
+
decoder_head_mask=decoder_head_mask,
|
1374 |
+
cross_attn_head_mask=cross_attn_head_mask,
|
1375 |
+
past_key_values=past_key_values,
|
1376 |
+
inputs_embeds=inputs_embeds,
|
1377 |
+
decoder_inputs_embeds=decoder_inputs_embeds,
|
1378 |
+
use_cache=use_cache,
|
1379 |
+
output_attentions=output_attentions,
|
1380 |
+
output_hidden_states=output_hidden_states,
|
1381 |
+
return_dict=return_dict,
|
1382 |
+
)
|
1383 |
+
lm_logits = self.lm_head(outputs[0])
|
1384 |
+
|
1385 |
+
masked_lm_loss = None
|
1386 |
+
if labels is not None:
|
1387 |
+
# move labels to the correct device to enable PP
|
1388 |
+
labels = labels.to(lm_logits.device)
|
1389 |
+
loss_fct = nn.CrossEntropyLoss()
|
1390 |
+
masked_lm_loss = loss_fct(
|
1391 |
+
lm_logits.view(-1, self.config.decoder_vocab_size), labels.view(-1)
|
1392 |
+
)
|
1393 |
+
|
1394 |
+
if not return_dict:
|
1395 |
+
output = (lm_logits,) + outputs[1:]
|
1396 |
+
return (
|
1397 |
+
((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
1398 |
+
)
|
1399 |
+
|
1400 |
+
return Seq2SeqLMOutput(
|
1401 |
+
loss=masked_lm_loss,
|
1402 |
+
logits=lm_logits,
|
1403 |
+
past_key_values=outputs.past_key_values,
|
1404 |
+
decoder_hidden_states=outputs.decoder_hidden_states,
|
1405 |
+
decoder_attentions=outputs.decoder_attentions,
|
1406 |
+
cross_attentions=outputs.cross_attentions,
|
1407 |
+
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
|
1408 |
+
encoder_hidden_states=outputs.encoder_hidden_states,
|
1409 |
+
encoder_attentions=outputs.encoder_attentions,
|
1410 |
+
)
|
1411 |
+
|
1412 |
+
def prepare_inputs_for_generation(
|
1413 |
+
self,
|
1414 |
+
decoder_input_ids,
|
1415 |
+
past_key_values=None,
|
1416 |
+
attention_mask=None,
|
1417 |
+
head_mask=None,
|
1418 |
+
decoder_head_mask=None,
|
1419 |
+
cross_attn_head_mask=None,
|
1420 |
+
use_cache=None,
|
1421 |
+
encoder_outputs=None,
|
1422 |
+
**kwargs,
|
1423 |
+
):
|
1424 |
+
# cut decoder_input_ids if past is used
|
1425 |
+
if past_key_values is not None:
|
1426 |
+
decoder_input_ids = decoder_input_ids[:, -1:]
|
1427 |
+
|
1428 |
+
return {
|
1429 |
+
"input_ids": None, # encoder_outputs is defined. input_ids not needed
|
1430 |
+
"encoder_outputs": encoder_outputs,
|
1431 |
+
"past_key_values": past_key_values,
|
1432 |
+
"decoder_input_ids": decoder_input_ids,
|
1433 |
+
"attention_mask": attention_mask,
|
1434 |
+
"head_mask": head_mask,
|
1435 |
+
"decoder_head_mask": decoder_head_mask,
|
1436 |
+
"cross_attn_head_mask": cross_attn_head_mask,
|
1437 |
+
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
|
1438 |
+
}
|
1439 |
+
|
1440 |
+
@staticmethod
|
1441 |
+
def _reorder_cache(past_key_values, beam_idx):
|
1442 |
+
reordered_past = ()
|
1443 |
+
for layer_past in past_key_values:
|
1444 |
+
reordered_past += (
|
1445 |
+
tuple(
|
1446 |
+
past_state.index_select(0, beam_idx) for past_state in layer_past
|
1447 |
+
),
|
1448 |
+
)
|
1449 |
+
return reordered_past
|
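A quick smoke-test sketch of the classes added in modeling_indictrans.py above. The IndicTransConfig keyword arguments below are assumptions inferred from the config attributes the modeling code reads (encoder_vocab_size, decoder_vocab_size, encoder_embed_dim, and so on); the authoritative names and defaults live in configuration_indictrans.py, and the tensors are arbitrary dummies.

import torch

from configuration_indictrans import IndicTransConfig
from modeling_indictrans import IndicTransForConditionalGeneration

# Hypothetical tiny configuration: the kwarg names are inferred from the
# attributes accessed in modeling_indictrans.py, not copied from the real defaults.
config = IndicTransConfig(
    encoder_vocab_size=64,
    decoder_vocab_size=64,
    encoder_embed_dim=32,
    decoder_embed_dim=32,
    encoder_layers=2,
    decoder_layers=2,
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    encoder_ffn_dim=64,
    decoder_ffn_dim=64,
    pad_token_id=1,
)

model = IndicTransForConditionalGeneration(config).eval()

# Dummy batch: 2 source sequences of 10 tokens and 2 decoder prefixes of 5 tokens.
input_ids = torch.randint(2, 64, (2, 10))
decoder_input_ids = torch.randint(2, 64, (2, 5))

with torch.no_grad():
    out = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

# Logits cover the decoder vocabulary at every decoder position.
print(out.logits.shape)  # torch.Size([2, 5, 64])

With real checkpoints the same classes would instead be loaded via from_pretrained and decoded with model.generate; the repository ids and tokenizer calls for that path are documented in README.md and example.py rather than shown here.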
sample.srt
ADDED
@@ -0,0 +1,699 @@
1 |
+
1
|
2 |
+
00:00:00,000 --> 00:00:01,845
|
3 |
+
Sadhguru: If you activate this dimension of energy,
|
4 |
+
2
|
5 |
+
00:00:01,845 --> 00:00:05,013
|
6 |
+
other dimensions of life will open up.
|
7 |
+
3
|
8 |
+
00:00:05,013 --> 00:00:07,545
|
9 |
+
One thing is,
|
10 |
+
4
|
11 |
+
00:00:07,545 --> 00:00:10,377
|
12 |
+
are you ready for those dimensions?
|
13 |
+
5
|
14 |
+
00:00:10,377 --> 00:00:11,280
|
15 |
+
Kundalini Yoga,
|
16 |
+
6
|
17 |
+
00:00:11,280 --> 00:00:12,318
|
18 |
+
in its essence,
|
19 |
+
7
|
20 |
+
00:00:12,318 --> 00:00:16,637
|
21 |
+
is the most dangerous form of yoga.
|
22 |
+
8
|
23 |
+
00:00:16,637 --> 00:00:18,008
|
24 |
+
I'm saying dangerous,
|
25 |
+
9
|
26 |
+
00:00:18,008 --> 00:00:20,538
|
27 |
+
because it's the most potent also.
|
28 |
+
10
|
29 |
+
00:00:20,538 --> 00:00:22,890
|
30 |
+
If you have to jump into an abyss,
|
31 |
+
11
|
32 |
+
00:00:22,890 --> 00:00:28,613
|
33 |
+
you should be insane or you should have enormous trust in somebody.
|
34 |
+
12
|
35 |
+
00:00:28,613 --> 00:00:37,465
|
36 |
+
So what is Kundalini?
|
37 |
+
13
|
38 |
+
00:00:37,465 --> 00:00:41,788
|
39 |
+
Right now,
|
40 |
+
14
|
41 |
+
00:00:41,788 --> 00:00:43,100
|
42 |
+
I'm speaking,
|
43 |
+
15
|
44 |
+
00:00:43,100 --> 00:00:46,495
|
45 |
+
this is Kundalini.
|
46 |
+
16
|
47 |
+
00:00:46,495 --> 00:00:47,843
|
48 |
+
You are alert and listening.
|
49 |
+
17
|
50 |
+
00:00:47,843 --> 00:00:49,483
|
51 |
+
If you are alert and listening,
|
52 |
+
18
|
53 |
+
00:00:49,483 --> 00:00:51,278
|
54 |
+
that is Kundalini
|
55 |
+
19
|
56 |
+
00:00:51,278 --> 00:00:54,528
|
57 |
+
A flower is blossoming,
|
58 |
+
20
|
59 |
+
00:00:54,528 --> 00:00:55,680
|
60 |
+
that is Kundalini.
|
61 |
+
21
|
62 |
+
00:00:55,680 --> 00:00:57,645
|
63 |
+
A dog is barking,
|
64 |
+
22
|
65 |
+
00:00:57,645 --> 00:00:59,480
|
66 |
+
that is also Kundalini,
|
67 |
+
23
|
68 |
+
00:00:59,480 --> 00:01:00,553
|
69 |
+
or in other words,
|
70 |
+
24
|
71 |
+
00:01:00,553 --> 00:01:03,545
|
72 |
+
the fundamental life force in the existence,
|
73 |
+
25
|
74 |
+
00:01:03,545 --> 00:01:04,880
|
75 |
+
we call it Kundalini.
|
76 |
+
26
|
77 |
+
00:01:04,880 --> 00:01:07,513
|
78 |
+
Now,
|
79 |
+
27
|
80 |
+
00:01:07,513 --> 00:01:08,555
|
81 |
+
within the system,
|
82 |
+
28
|
83 |
+
00:01:08,555 --> 00:01:10,377
|
84 |
+
within the human system,
|
85 |
+
29
|
86 |
+
00:01:10,377 --> 00:01:14,065
|
87 |
+
if you look at this as a kind of a life package,
|
88 |
+
30
|
89 |
+
00:01:14,065 --> 00:01:15,287
|
90 |
+
it's a piece of life.
|
91 |
+
31
|
92 |
+
00:01:15,287 --> 00:01:18,540
|
93 |
+
This piece of life is packed in a certain way,
|
94 |
+
32
|
95 |
+
00:01:18,540 --> 00:01:20,725
|
96 |
+
with layers of this energy.
|
97 |
+
33
|
98 |
+
00:01:20,725 --> 00:01:24,887
|
99 |
+
One dimension of energy comes alive immediately,
|
100 |
+
34
|
101 |
+
00:01:24,887 --> 00:01:28,667
|
102 |
+
because that is necessary for your survival process.
|
103 |
+
35
|
104 |
+
00:01:28,667 --> 00:01:37,369
|
105 |
+
The other dimensions of energy will not come alive unless you do something about it.
|
106 |
+
36
|
107 |
+
00:01:37,369 --> 00:01:40,203
|
108 |
+
Unless you're aware of it and activate it in a certain way,
|
109 |
+
37
|
110 |
+
00:01:40,203 --> 00:01:42,093
|
111 |
+
they do not come into existence.
|
112 |
+
38
|
113 |
+
00:01:42,093 --> 00:01:43,980
|
114 |
+
They remain dormant.
|
115 |
+
39
|
116 |
+
00:01:43,980 --> 00:01:49,932
|
117 |
+
The dormant energy is way bigger than the energy that is in use right now.
|
118 |
+
40
|
119 |
+
00:01:49,932 --> 00:01:53,582
|
120 |
+
To take care of your survival process,
|
121 |
+
41
|
122 |
+
00:01:53,582 --> 00:01:57,169
|
123 |
+
to live a physical life completely,
|
124 |
+
42
|
125 |
+
00:01:57,169 --> 00:02:00,902
|
126 |
+
to live a complete physical and intellectual life,
|
127 |
+
43
|
128 |
+
00:02:00,902 --> 00:02:08,451
|
129 |
+
you need to activate only about twenty-one of your chakras.
|
130 |
+
44
|
131 |
+
00:02:08,451 --> 00:02:12,246
|
132 |
+
Out of this one hundred and fourteen,
|
133 |
+
45
|
134 |
+
00:02:12,246 --> 00:02:14,444
|
135 |
+
if about twenty-one of them are on,
|
136 |
+
46
|
137 |
+
00:02:14,444 --> 00:02:16,248
|
138 |
+
you will live a complete life,
|
139 |
+
47
|
140 |
+
00:02:16,248 --> 00:02:18,322
|
141 |
+
you will not feel any inadequacy.
|
142 |
+
48
|
143 |
+
00:02:18,322 --> 00:02:21,267
|
144 |
+
You will live a complete physical life.
|
145 |
+
49
|
146 |
+
00:02:21,267 --> 00:02:23,112
|
147 |
+
There'll be no problem with your life,
|
148 |
+
50
|
149 |
+
00:02:23,112 --> 00:02:25,249
|
150 |
+
you will think you're a great success,
|
151 |
+
51
|
152 |
+
00:02:25,249 --> 00:02:29,458
|
153 |
+
but you're only twenty-one,
|
154 |
+
52
|
155 |
+
00:02:29,458 --> 00:02:34,000
|
156 |
+
that is less than twenty-one percent out of one hundred and fourteen,
|
157 |
+
53
|
158 |
+
00:02:34,000 --> 00:02:35,949
|
159 |
+
less than twenty percent.
|
160 |
+
54
|
161 |
+
00:02:35,949 --> 00:02:37,733
|
162 |
+
At let… less than twenty percent,
|
163 |
+
55
|
164 |
+
00:02:37,733 --> 00:02:41,984
|
165 |
+
you will feel like a complete life without any inadequacies.
|
166 |
+
56
|
167 |
+
00:02:41,984 --> 00:02:44,618
|
168 |
+
The remaining percentage of life,
|
169 |
+
57
|
170 |
+
00:02:44,618 --> 00:02:46,339
|
171 |
+
what is it about?
|
172 |
+
58
|
173 |
+
00:02:46,339 --> 00:02:50,632
|
174 |
+
It is not even needed if your intention is just to live well.
|
175 |
+
59
|
176 |
+
00:02:50,632 --> 00:02:53,846
|
177 |
+
If you activate this dimension of energy,
|
178 |
+
60
|
179 |
+
00:02:53,846 --> 00:02:57,143
|
180 |
+
other dimensions of life will open up.
|
181 |
+
61
|
182 |
+
00:02:57,143 --> 00:02:58,865
|
183 |
+
One thing is,
|
184 |
+
62
|
185 |
+
00:02:58,865 --> 00:03:00,980
|
186 |
+
“Are you ready for those dimensions?”
|
187 |
+
63
|
188 |
+
00:03:00,980 --> 00:03:05,003
|
189 |
+
The question is not about whether it's good or bad.
|
190 |
+
64
|
191 |
+
00:03:05,003 --> 00:03:06,890
|
192 |
+
The question is just about,
|
193 |
+
65
|
194 |
+
00:03:06,890 --> 00:03:08,902
|
195 |
+
“Are you ready for it?”
|
196 |
+
66
|
197 |
+
00:03:08,902 --> 00:03:13,920
|
198 |
+
Because even if the best things in life come to your life,
|
199 |
+
67
|
200 |
+
00:03:13,920 --> 00:03:16,803
|
201 |
+
when you are not ready for it,
|
202 |
+
68
|
203 |
+
00:03:16,803 --> 00:03:19,084
|
204 |
+
it will not be a good thing for you.
|
205 |
+
69
|
206 |
+
00:03:19,084 --> 00:03:22,713
|
207 |
+
In your experience it will not be a good thing
|
208 |
+
70
|
209 |
+
00:03:22,713 --> 00:03:25,223
|
210 |
+
if something came to you when you're not ready for it,
|
211 |
+
71
|
212 |
+
00:03:25,223 --> 00:03:26,467
|
213 |
+
isn't it so?
|
214 |
+
72
|
215 |
+
00:03:26,467 --> 00:03:28,250
|
216 |
+
Even if it's a greatest thing,
|
217 |
+
73
|
218 |
+
00:03:28,250 --> 00:03:29,702
|
219 |
+
it may be the greatest thing,
|
220 |
+
74
|
221 |
+
00:03:29,702 --> 00:03:33,539
|
222 |
+
but it came to you when you are not ready for it.
|
223 |
+
75
|
224 |
+
00:03:33,539 --> 00:03:34,969
|
225 |
+
Then it is not a good thing,
|
226 |
+
76
|
227 |
+
00:03:34,969 --> 00:03:36,525
|
228 |
+
isn't it?
|
229 |
+
77
|
230 |
+
00:03:36,525 --> 00:03:38,018
|
231 |
+
So are you ready for it,
|
232 |
+
78
|
233 |
+
00:03:38,018 --> 00:03:39,034
|
234 |
+
is the first question.
|
235 |
+
79
|
236 |
+
00:03:39,034 --> 00:03:40,071
|
237 |
+
If you're ready for it,
|
238 |
+
80
|
239 |
+
00:03:40,071 --> 00:03:41,709
|
240 |
+
what can we do for it?
|
241 |
+
81
|
242 |
+
00:03:41,709 --> 00:03:44,073
|
243 |
+
What can we do to activate it?
|
244 |
+
82
|
245 |
+
00:03:44,073 --> 00:03:46,541
|
246 |
+
The various ways of doing this,
|
247 |
+
83
|
248 |
+
00:03:46,541 --> 00:03:47,143
|
249 |
+
many,
|
250 |
+
84
|
251 |
+
00:03:47,143 --> 00:03:48,200
|
252 |
+
many ways,
|
253 |
+
85
|
254 |
+
00:03:48,200 --> 00:03:50,938
|
255 |
+
but the Kundalini Yoga…
|
256 |
+
86
|
257 |
+
00:03:50,938 --> 00:03:56,558
|
258 |
+
are people familiar with Kundalini Yoga practicing or…?
|
259 |
+
87
|
260 |
+
00:03:56,558 --> 00:04:00,083
|
261 |
+
Okay.
|
262 |
+
88
|
263 |
+
00:04:00,083 --> 00:04:01,535
|
264 |
+
Kundalini Yoga.
|
265 |
+
89
|
266 |
+
00:04:01,535 --> 00:04:04,708
|
267 |
+
I…I'm not making a comment about anybody,
|
268 |
+
90
|
269 |
+
00:04:04,708 --> 00:04:07,279
|
270 |
+
okay?
|
271 |
+
91
|
272 |
+
00:04:07,279 --> 00:04:08,378
|
273 |
+
Kundalini Yoga,
|
274 |
+
92
|
275 |
+
00:04:08,378 --> 00:04:13,687
|
276 |
+
in its essence is the most dangerous form of yoga.
|
277 |
+
93
|
278 |
+
00:04:13,687 --> 00:04:15,471
|
279 |
+
I'm saying dangerous,
|
280 |
+
94
|
281 |
+
00:04:15,471 --> 00:04:19,203
|
282 |
+
because it's the most potent also.
|
283 |
+
95
|
284 |
+
00:04:19,203 --> 00:04:27,561
|
285 |
+
What is most potent is always the most dangerous if improperly handled.
|
286 |
+
96
|
287 |
+
00:04:27,561 --> 00:04:31,169
|
288 |
+
There are various kinds of energy in the world right now,
|
289 |
+
97
|
290 |
+
00:04:31,169 --> 00:04:33,886
|
291 |
+
even the electricity is being manufactured in… I mean,
|
292 |
+
98
|
293 |
+
00:04:33,886 --> 00:04:36,126
|
294 |
+
produced in so many different ways.
|
295 |
+
99
|
296 |
+
00:04:36,126 --> 00:04:40,170
|
297 |
+
One of the ways that we do it is through nuclear reactions,
|
298 |
+
100
|
299 |
+
00:04:40,170 --> 00:04:42,907
|
300 |
+
nuclear reactors rather..
|
301 |
+
101
|
302 |
+
00:04:42,907 --> 00:04:46,889
|
303 |
+
It is the most efficient way of producing energy that we know right now,
|
304 |
+
102
|
305 |
+
00:04:46,889 --> 00:04:49,211
|
306 |
+
but it is also the most dangerous way,
|
307 |
+
103
|
308 |
+
00:04:49,211 --> 00:04:51,223
|
309 |
+
isn't it?
|
310 |
+
104
|
311 |
+
00:04:51,223 --> 00:04:52,965
|
312 |
+
When things go wrong,
|
313 |
+
105
|
314 |
+
00:04:52,965 --> 00:04:55,744
|
315 |
+
they go seriously wrong.
|
316 |
+
106
|
317 |
+
00:04:55,744 --> 00:04:57,299
|
318 |
+
When they're going right,
|
319 |
+
107
|
320 |
+
00:04:57,299 --> 00:05:04,288
|
321 |
+
it is the easiest and the best way to produce energy on the planet is nuclear energy actually.
|
322 |
+
108
|
323 |
+
00:05:04,288 --> 00:05:06,258
|
324 |
+
But when it goes bad,
|
325 |
+
109
|
326 |
+
00:05:06,258 --> 00:05:07,502
|
327 |
+
it goes bad,
|
328 |
+
110
|
329 |
+
00:05:07,502 --> 00:05:08,539
|
330 |
+
really bad,
|
331 |
+
111
|
332 |
+
00:05:08,539 --> 00:05:11,256
|
333 |
+
like in ways that you can't fix it.
|
334 |
+
112
|
335 |
+
00:05:11,256 --> 00:05:11,691
|
336 |
+
So,
|
337 |
+
113
|
338 |
+
00:05:11,691 --> 00:05:13,454
|
339 |
+
similarly with Kundalini Yoga,
|
340 |
+
114
|
341 |
+
00:05:13,454 --> 00:05:17,353
|
342 |
+
it is the most potent and it is the most dangerous.
|
343 |
+
115
|
344 |
+
00:05:17,353 --> 00:05:21,106
|
345 |
+
Without the necessary preparation and guidance,
|
346 |
+
116
|
347 |
+
00:05:21,106 --> 00:05:23,387
|
348 |
+
without expert guidance,
|
349 |
+
117
|
350 |
+
00:05:23,387 --> 00:05:25,793
|
351 |
+
constant guidance and observation,
|
352 |
+
118
|
353 |
+
00:05:25,793 --> 00:05:28,219
|
354 |
+
nobody should ever attempt it.
|
355 |
+
119
|
356 |
+
00:05:28,219 --> 00:05:29,588
|
357 |
+
But the problem is,
|
358 |
+
120
|
359 |
+
00:05:29,588 --> 00:05:33,611
|
360 |
+
books have been written about it and everybody wants to do the highest yoga.
|
361 |
+
121
|
362 |
+
00:05:33,611 --> 00:05:35,830
|
363 |
+
Nobody wants to start with A,
|
364 |
+
122
|
365 |
+
00:05:35,830 --> 00:05:38,650
|
366 |
+
everybody wants to start the alphabet with Z.
|
367 |
+
123
|
368 |
+
00:05:38,650 --> 00:05:42,383
|
369 |
+
This attitude itself is dangerous.
|
370 |
+
124
|
371 |
+
00:05:42,383 --> 00:05:50,326
|
372 |
+
What can be a life-transforming force can become a life-destructive force
|
373 |
+
125
|
374 |
+
00:05:50,326 --> 00:05:55,780
|
375 |
+
simply because without the necessary commitment and dedication and focus and understanding
|
376 |
+
126
|
377 |
+
00:05:55,780 --> 00:05:57,294
|
378 |
+
it is being handled.
|
379 |
+
127
|
380 |
+
00:05:57,294 --> 00:05:58,849
|
381 |
+
Anyway,
|
382 |
+
128
|
383 |
+
00:05:58,849 --> 00:06:00,550
|
384 |
+
about raising the Kundalini,
|
385 |
+
129
|
386 |
+
00:06:00,550 --> 00:06:02,292
|
387 |
+
if the Kundalini rises,
|
388 |
+
130
|
389 |
+
00:06:02,292 --> 00:06:08,534
|
390 |
+
the dimensions of your life will change so rapidly that you must be willing
|
391 |
+
131
|
392 |
+
00:06:08,534 --> 00:06:11,562
|
393 |
+
to make the outside admin… adjustments equally,
|
394 |
+
132
|
395 |
+
00:06:11,562 --> 00:06:12,806
|
396 |
+
quick.
|
397 |
+
133
|
398 |
+
00:06:12,806 --> 00:06:14,320
|
399 |
+
Otherwise,
|
400 |
+
134
|
401 |
+
00:06:14,320 --> 00:06:18,426
|
402 |
+
things will fall apart in a big way.
|
403 |
+
135
|
404 |
+
00:06:18,426 --> 00:06:20,624
|
405 |
+
In the classical yogic traditions,
|
406 |
+
136
|
407 |
+
00:06:20,624 --> 00:06:26,161
|
408 |
+
there is a certain type of yoga we teach for people who live in family situations.
|
409 |
+
137
|
410 |
+
00:06:26,161 --> 00:06:30,205
|
411 |
+
There is a certain other type of yoga we teach for ascetics.
|
412 |
+
138
|
413 |
+
00:06:30,205 --> 00:06:35,159
|
414 |
+
In Isha,
|
415 |
+
139
|
416 |
+
00:06:35,159 --> 00:06:32,486
|
417 |
+
we have both the forms,
|
418 |
+
140
|
419 |
+
00:06:32,486 --> 00:06:36,219
|
420 |
+
we have ascetic yoga and we have the general yoga.
|
421 |
+
141
|
422 |
+
00:06:36,219 --> 00:06:38,915
|
423 |
+
We will never teach you the ascetic form.
|
424 |
+
142
|
425 |
+
00:06:38,915 --> 00:06:41,590
|
426 |
+
That is the most pos… potent way to do it.
|
427 |
+
143
|
428 |
+
00:06:41,590 --> 00:06:46,733
|
429 |
+
But it will demand a certain dimension of discipline and focus,
|
430 |
+
144
|
431 |
+
00:06:46,733 --> 00:06:50,113
|
432 |
+
which your regular lives will not allow.
|
433 |
+
145
|
434 |
+
00:06:50,113 --> 00:06:52,498
|
435 |
+
If you do that kind of yoga,
|
436 |
+
146
|
437 |
+
00:06:52,498 --> 00:06:56,273
|
438 |
+
it will dismantle your outside life instantly.
|
439 |
+
147
|
440 |
+
00:06:56,273 --> 00:06:59,632
|
441 |
+
Now this Yoga is not designed to dismantle your life,
|
442 |
+
148
|
443 |
+
00:06:59,632 --> 00:07:05,874
|
444 |
+
this Yoga is designed to make your life happen better.
|
445 |
+
149
|
446 |
+
00:07:05,874 --> 00:07:07,907
|
447 |
+
When life happens better,
|
448 |
+
150
|
449 |
+
00:07:07,907 --> 00:07:09,835
|
450 |
+
when things happen better,
|
451 |
+
151
|
452 |
+
00:07:09,835 --> 00:07:11,328
|
453 |
+
you make more money,
|
454 |
+
152
|
455 |
+
00:07:11,328 --> 00:07:13,112
|
456 |
+
your business is going better,
|
457 |
+
153
|
458 |
+
00:07:13,112 --> 00:07:14,854
|
459 |
+
your profession is happening better.
|
460 |
+
154
|
461 |
+
00:07:14,854 --> 00:07:19,644
|
462 |
+
You're generally unfortunately,
|
463 |
+
155
|
464 |
+
00:07:19,644 --> 00:07:23,833
|
465 |
+
you are longing to seek the higher becomes slower.
|
466 |
+
156
|
467 |
+
00:07:23,833 --> 00:07:25,512
|
468 |
+
Yes.
|
469 |
+
157
|
470 |
+
00:07:25,512 --> 00:07:27,006
|
471 |
+
So in the real sense,
|
472 |
+
158
|
473 |
+
00:07:27,006 --> 00:07:28,873
|
474 |
+
it is not the good way to do it.
|
475 |
+
159
|
476 |
+
00:07:28,873 --> 00:07:31,714
|
477 |
+
But it's the only way it works in today's world.
|
478 |
+
160
|
479 |
+
00:07:31,714 --> 00:07:34,596
|
480 |
+
And it's the only way it works for majority of the people.
|
481 |
+
161
|
482 |
+
00:07:34,596 --> 00:07:36,255
|
483 |
+
For a small number of people,
|
484 |
+
162
|
485 |
+
00:07:36,255 --> 00:07:38,018
|
486 |
+
we can do it other ways.
|
487 |
+
163
|
488 |
+
00:07:38,018 --> 00:07:43,680
|
489 |
+
We can bypass all these things and just do very powerful ways of doing things.
|
490 |
+
164
|
491 |
+
00:07:43,680 --> 00:07:47,329
|
492 |
+
But it will dismantle all social structures around them,
|
493 |
+
165
|
494 |
+
00:07:47,329 --> 00:07:49,631
|
495 |
+
which is not good for everybody to do.
|
496 |
+
166
|
497 |
+
00:07:49,631 --> 00:07:51,809
|
498 |
+
So these are different dimensions.
|
499 |
+
167
|
500 |
+
00:07:51,809 --> 00:07:52,887
|
501 |
+
Kundalini Yoga,
|
502 |
+
168
|
503 |
+
00:07:52,887 --> 00:07:54,443
|
504 |
+
if it has to be practiced,
|
505 |
+
169
|
506 |
+
00:07:54,443 --> 00:07:57,263
|
507 |
+
you must be in a certain kind of atmosphere.
|
508 |
+
170
|
509 |
+
00:07:57,263 --> 00:08:01,618
|
510 |
+
You cannot live in social situations and do Kundalini Yoga
|
511 |
+
171
|
512 |
+
00:08:01,618 --> 00:08:02,468
|
513 |
+
Otherwise,
|
514 |
+
172
|
515 |
+
00:08:02,468 --> 00:08:03,878
|
516 |
+
in the name of Kundalini Yoga,
|
517 |
+
173
|
518 |
+
00:08:03,878 --> 00:08:06,263
|
519 |
+
you're doing something simplistic.
|
520 |
+
174
|
521 |
+
00:08:06,263 --> 00:08:07,176
|
522 |
+
Otherwise,
|
523 |
+
175
|
524 |
+
00:08:07,176 --> 00:08:10,971
|
525 |
+
Kundalini yoga can transform the way you are within days.
|
526 |
+
176
|
527 |
+
00:08:10,971 --> 00:08:16,674
|
528 |
+
Suddenly you find you're a stranger in your own home within two days of practice,
|
529 |
+
177
|
530 |
+
00:08:16,674 --> 00:08:20,469
|
531 |
+
because it will change everything about you.
|
532 |
+
178
|
533 |
+
00:08:20,469 --> 00:08:20,759
|
534 |
+
So,
|
535 |
+
179
|
536 |
+
00:08:20,759 --> 00:08:22,169
|
537 |
+
can we raise the Kundalini?
|
538 |
+
180
|
539 |
+
00:08:22,169 --> 00:08:22,480
|
540 |
+
Yes,
|
541 |
+
181
|
542 |
+
00:08:22,480 --> 00:08:25,135
|
543 |
+
we can.
|
544 |
+
182
|
545 |
+
00:08:25,135 --> 00:08:30,651
|
546 |
+
One way is to create a conducive atmosphere so that slowly it rises.
|
547 |
+
183
|
548 |
+
00:08:30,651 --> 00:08:35,110
|
549 |
+
The other way is to provoke it in such a way that it rises quickly.
|
550 |
+
184
|
551 |
+
00:08:35,110 --> 00:08:37,225
|
552 |
+
If it rises quickly,
|
553 |
+
185
|
554 |
+
00:08:37,225 --> 00:08:39,423
|
555 |
+
then everything changes dramatically.
|
556 |
+
186
|
557 |
+
00:08:39,423 --> 00:08:42,223
|
558 |
+
If it rises slowly over a period of time,
|
559 |
+
187
|
560 |
+
00:08:42,223 --> 00:08:44,234
|
561 |
+
changes will happen slowly,
|
562 |
+
188
|
563 |
+
00:08:44,234 --> 00:08:48,216
|
564 |
+
you will be capable of handling these changes over a period of time.
|
565 |
+
189
|
566 |
+
00:08:48,216 --> 00:08:49,958
|
567 |
+
But if it happens very quick,
|
568 |
+
190
|
569 |
+
00:08:49,958 --> 00:08:53,193
|
570 |
+
then you will not be able to handle the changes,
|
571 |
+
191
|
572 |
+
00:08:53,193 --> 00:08:56,677
|
573 |
+
things will look like things are falling apart.
|
574 |
+
192
|
575 |
+
00:08:56,677 --> 00:08:58,958
|
576 |
+
So there are different ways of doing this.
|
577 |
+
193
|
578 |
+
00:08:58,958 --> 00:08:59,705
|
579 |
+
How many ways?
|
580 |
+
194
|
581 |
+
00:08:59,705 --> 00:09:00,887
|
582 |
+
There are too many ways,
|
583 |
+
195
|
584 |
+
00:09:00,887 --> 00:09:03,023
|
585 |
+
I will not go into how many ways.
|
586 |
+
196
|
587 |
+
00:09:03,023 --> 00:09:04,972
|
588 |
+
There are so many ways of doing it.
|
589 |
+
197
|
590 |
+
00:09:04,972 --> 00:09:05,885
|
591 |
+
Essentially,
|
592 |
+
198
|
593 |
+
00:09:05,885 --> 00:09:10,903
|
594 |
+
there are one hundred and twelve ways of doing it.
|
595 |
+
199
|
596 |
+
00:09:10,903 --> 00:09:14,595
|
597 |
+
There are hundred and twelve ways in which you can take this up,
|
598 |
+
200
|
599 |
+
00:09:14,595 --> 00:09:17,602
|
600 |
+
from the base to…
|
601 |
+
201
|
602 |
+
00:09:17,602 --> 00:09:18,099
|
603 |
+
Oh!
|
604 |
+
202
|
605 |
+
00:09:18,099 --> 00:09:19,675
|
606 |
+
for this you have to know the structure,
|
607 |
+
203
|
608 |
+
00:09:19,675 --> 00:09:22,122
|
609 |
+
otherwise it will become very elaborate.
|
610 |
+
204
|
611 |
+
00:09:22,122 --> 00:09:25,606
|
612 |
+
Hmm,
|
613 |
+
205
|
614 |
+
00:09:25,606 --> 00:09:28,385
|
615 |
+
out of this one hundred and fourteen chakras,
|
616 |
+
206
|
617 |
+
00:09:28,385 --> 00:09:33,093
|
618 |
+
there are seven which we recognize as seven dimensions.
|
619 |
+
207
|
620 |
+
00:09:33,093 --> 00:09:34,462
|
621 |
+
Out of this,
|
622 |
+
208
|
623 |
+
00:09:34,462 --> 00:09:36,287
|
624 |
+
six are within the body,
|
625 |
+
209
|
626 |
+
00:09:36,287 --> 00:09:39,750
|
627 |
+
one just outside the physical body.
|
628 |
+
210
|
629 |
+
00:09:39,750 --> 00:09:40,144
|
630 |
+
So,
|
631 |
+
211
|
632 |
+
00:09:40,144 --> 00:09:43,192
|
633 |
+
if you employ this one hundred and twelve methods,
|
634 |
+
212
|
635 |
+
00:09:43,192 --> 00:09:45,639
|
636 |
+
you will handle the six chakras,
|
637 |
+
213
|
638 |
+
00:09:45,639 --> 00:09:48,190
|
639 |
+
the seventh one you cannot handle.
|
640 |
+
214
|
641 |
+
00:09:48,190 --> 00:09:52,151
|
642 |
+
There are hundred and twelve ways in which you can at… attain to a chakra
|
643 |
+
215
|
644 |
+
00:09:52,151 --> 00:09:53,976
|
645 |
+
which we refer to as Agna,
|
646 |
+
216
|
647 |
+
00:09:53,976 --> 00:09:55,448
|
648 |
+
but from Agna to Sahasrara,
|
649 |
+
217
|
650 |
+
00:09:55,448 --> 00:09:57,398
|
651 |
+
there is no way.
|
652 |
+
218
|
653 |
+
00:09:57,398 --> 00:09:59,264
|
654 |
+
There is no way to do it,
|
655 |
+
219
|
656 |
+
00:09:59,264 --> 00:10:02,520
|
657 |
+
you just have to jump into an abyss.
|
658 |
+
220
|
659 |
+
00:10:02,520 --> 00:10:04,843
|
660 |
+
If you have to jump into an abyss,
|
661 |
+
221
|
662 |
+
00:10:04,843 --> 00:10:11,417
|
663 |
+
you should be insane or you should have enormous trust in somebody.
|
664 |
+
222
|
665 |
+
00:10:11,417 --> 00:10:15,543
|
666 |
+
Somebody says jump and you are jumping because
|
667 |
+
223
|
668 |
+
00:10:15,543 --> 00:10:19,214
|
669 |
+
you have such a deep trust in somebody that when he says jump,
|
670 |
+
224
|
671 |
+
00:10:19,214 --> 00:10:21,226
|
672 |
+
it has to be good for you.
|
673 |
+
225
|
674 |
+
00:10:21,226 --> 00:10:26,078
|
675 |
+
You simply jump into a bottomless pit.
|
676 |
+
226
|
677 |
+
00:10:26,078 --> 00:10:27,011
|
678 |
+
So,
|
679 |
+
227
|
680 |
+
00:10:27,011 --> 00:10:33,088
|
681 |
+
the journey from the Mooladhara to Agna there are one hundred and twelve ways to get there.
|
682 |
+
228
|
683 |
+
00:10:33,088 --> 00:10:34,311
|
684 |
+
But from there to there,
|
685 |
+
229
|
686 |
+
00:10:34,311 --> 00:10:35,307
|
687 |
+
there is no way.
|
688 |
+
230
|
689 |
+
00:10:35,307 --> 00:10:36,841
|
690 |
+
It is just one jump,
|
691 |
+
231
|
692 |
+
00:10:36,841 --> 00:10:38,998
|
693 |
+
that can happen in trust,
|
694 |
+
232
|
695 |
+
00:10:38,998 --> 00:10:42,316
|
696 |
+
in devotion or in madness.
|
697 |
+
233
|
698 |
+
00:10:42,316 --> 00:10:57,780
|
699 |
+
Choice is yours.
|