pranjalchitale committed on
Commit
3657853
1 Parent(s): 2049619

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +71 -1
README.md CHANGED
@@ -55,7 +55,77 @@ Please refer to the [blog](https://ai4bharat.iitm.ac.in/blog/indictrans2-m2m/) f
55
 
56
 Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF compatible IndicTrans2 models for inference.
57
 
58
- **Note: IndicTrans2 is not compatible with AutoTokenizer, therefore we provide [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer)**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  ### Citation
 
55
 
56
 Please refer to the [github repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF compatible IndicTrans2 models for inference.
57
 
58
+ ```python
59
+ import torch
60
+ from transformers import (
61
+ AutoModelForSeq2SeqLM,
62
+ AutoTokenizer,
63
+ )
64
+ from IndicTransTokenizer import IndicProcessor
65
+
66
+
67
+ model_name = "ai4bharat/indictrans2-indic-indic-1B"
68
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
69
+
70
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
71
+
72
+ ip = IndicProcessor(inference=True)
73
+
74
+ input_sentences = [
75
+ "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
76
+ "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
77
+ "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
78
+ "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
79
+ ]
80
+
81
+ src_lang, tgt_lang = "hin_Deva", "tam_Taml"
82
+
83
+ batch = ip.preprocess_batch(
84
+ input_sentences,
85
+ src_lang=src_lang,
86
+ tgt_lang=tgt_lang,
87
+ )
88
+
89
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
90
+
91
+ # Tokenize the sentences and generate input encodings
92
+ inputs = tokenizer(
93
+ batch,
94
+ truncation=True,
95
+ padding="longest",
96
+ return_tensors="pt",
97
+ return_attention_mask=True,
98
+ ).to(DEVICE)
99
+
100
+ # Generate translations using the model
101
+ with torch.no_grad():
102
+ generated_tokens = model.generate(
103
+ **inputs,
104
+ use_cache=True,
105
+ min_length=0,
106
+ max_length=256,
107
+ num_beams=5,
108
+ num_return_sequences=1,
109
+ )
110
+
111
+ # Decode the generated tokens into text
112
+ with tokenizer.as_target_tokenizer():
113
+ generated_tokens = tokenizer.batch_decode(
114
+ generated_tokens.detach().cpu().tolist(),
115
+ skip_special_tokens=True,
116
+ clean_up_tokenization_spaces=True,
117
+ )
118
+
119
+ # Postprocess the translations, including entity replacement
120
+ translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
121
+
122
+ for input_sentence, translation in zip(input_sentences, translations):
123
+ print(f"{src_lang}: {input_sentence}")
124
+ print(f"{tgt_lang}: {translation}")
125
+ ```
126
+
127
+ **Note: IndicTrans2 is now compatible with AutoTokenizer, however you need to use IndicProcessor from [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer) for preprocessing before tokenization.**
128
+
129
 
130
 
131
  ### Citation