pranjalchitale commited on
Commit
7645285
1 Parent(s): 7a6b975

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +71 -1
README.md CHANGED
@@ -59,7 +59,77 @@ Please refer to [section 7.6: Distilled Models](https://openreview.net/forum?id=
59
 
60
  Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF compatible IndicTrans2 models for inference.
61
 
62
- **Note: IndicTrans2 is not compatible with AutoTokenizer, therefore we provide [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer)**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  ### Citation
 
59
 
60
  Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF compatible IndicTrans2 models for inference.
61
 
62
+ ```python
63
+ import torch
64
+ from transformers import (
65
+ AutoModelForSeq2SeqLM,
66
+ AutoTokenizer,
67
+ )
68
+ from IndicTransTokenizer import IndicProcessor
69
+
70
+
71
+ model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
72
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
73
+
74
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
75
+
76
+ ip = IndicProcessor(inference=True)
77
+
78
+ input_sentences = [
79
+ "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
80
+ "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
81
+ "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
82
+ "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
83
+ ]
84
+
85
+ src_lang, tgt_lang = "hin_Deva", "eng_Latn"
86
+
87
+ batch = ip.preprocess_batch(
88
+ input_sentences,
89
+ src_lang=src_lang,
90
+ tgt_lang=tgt_lang,
91
+ )
92
+
93
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
94
+
95
+ # Tokenize the sentences and generate input encodings
96
+ inputs = tokenizer(
97
+ batch,
98
+ truncation=True,
99
+ padding="longest",
100
+ return_tensors="pt",
101
+ return_attention_mask=True,
102
+ ).to(DEVICE)
103
+
104
+ # Generate translations using the model
105
+ with torch.no_grad():
106
+     generated_tokens = model.generate(
107
+         **inputs,
108
+         use_cache=True,
109
+         min_length=0,
110
+         max_length=256,
111
+         num_beams=5,
112
+         num_return_sequences=1,
113
+     )
114
+
115
+ # Decode the generated tokens into text
116
+ with tokenizer.as_target_tokenizer():
117
+     generated_tokens = tokenizer.batch_decode(
118
+         generated_tokens.detach().cpu().tolist(),
119
+         skip_special_tokens=True,
120
+         clean_up_tokenization_spaces=True,
121
+     )
122
+
123
+ # Postprocess the translations, including entity replacement
124
+ translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
125
+
126
+ for input_sentence, translation in zip(input_sentences, translations):
127
+     print(f"{src_lang}: {input_sentence}")
128
+     print(f"{tgt_lang}: {translation}")
129
+ ```
130
+
131
+ **Note: IndicTrans2 is now compatible with AutoTokenizer; however, you need to use IndicProcessor from [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer) for preprocessing before tokenization.**
132
+
133
 
134
 
135
  ### Citation