pranjalchitale commited on
Commit
7645285
1 Parent(s): 7a6b975

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +71 -1
README.md CHANGED
@@ -59,7 +59,77 @@ Please refer to [section 7.6: Distilled Models](https://openreview.net/forum?id=
59
 
60
  Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF compatible IndicTrans2 models for inference.
61
 
62
- **Note: IndicTrans2 is not compatible with AutoTokenizer, therefore we provide [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer)**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
 
65
  ### Citation
 
59
 
60
  Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF compatible IndicTrans2 models for inference.
61
 
62
+ ```python
63
+ import torch
64
+ from transformers import (
65
+ AutoModelForSeq2SeqLM,
66
+ AutoTokenizer,
67
+ )
68
+ from IndicTransTokenizer import IndicProcessor
69
+
70
+
71
+ model_name = "ai4bharat/indictrans2-indic-en-dist-200M"
72
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
73
+
74
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
75
+
76
+ ip = IndicProcessor(inference=True)
77
+
78
+ input_sentences = [
79
+ "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
80
+ "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
81
+ "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
82
+ "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
83
+ ]
84
+
85
+ src_lang, tgt_lang = "hin_Deva", "eng_Latn"
86
+
87
+ batch = ip.preprocess_batch(
88
+ input_sentences,
89
+ src_lang=src_lang,
90
+ tgt_lang=tgt_lang,
91
+ )
92
+
93
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
94
+
95
+ # Tokenize the sentences and generate input encodings
96
+ inputs = tokenizer(
97
+ batch,
98
+ truncation=True,
99
+ padding="longest",
100
+ return_tensors="pt",
101
+ return_attention_mask=True,
102
+ ).to(DEVICE)
103
+
104
+ # Generate translations using the model
105
+ with torch.no_grad():
106
+     generated_tokens = model.generate(
107
+         **inputs,
108
+         use_cache=True,
109
+         min_length=0,
110
+         max_length=256,
111
+         num_beams=5,
112
+         num_return_sequences=1,
113
+     )
114
+
115
+ # Decode the generated tokens into text
116
+ with tokenizer.as_target_tokenizer():
117
+     generated_tokens = tokenizer.batch_decode(
118
+         generated_tokens.detach().cpu().tolist(),
119
+         skip_special_tokens=True,
120
+         clean_up_tokenization_spaces=True,
121
+     )
122
+
123
+ # Postprocess the translations, including entity replacement
124
+ translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
125
+
126
+ for input_sentence, translation in zip(input_sentences, translations):
127
+     print(f"{src_lang}: {input_sentence}")
128
+     print(f"{tgt_lang}: {translation}")
129
+ ```
130
+
131
+ **Note: IndicTrans2 is now compatible with AutoTokenizer; however, you need to use IndicProcessor from [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer) for preprocessing before tokenization.**
132
+
133
 
134
 
135
  ### Citation