pranjalchitale committed on
Commit
c26e5b3
·
1 Parent(s): 4a5d9b4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +69 -1
README.md CHANGED
@@ -62,7 +62,75 @@ Please refer to `Appendix D: Model Card` of the [preprint](https://arxiv.org/abs
62
 
63
  Please refer to the [GitHub repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF-compatible IndicTrans2 models for inference.
64
 
65
- **Note: IndicTrans2 is not compatible with AutoTokenizer, therefore we provide [IndicTransTokenizer](https://github.com/VarunGumma/IndicTransTokenizer)**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
 
68
  ### Citation
 
62
 
63
  Please refer to the [GitHub repository](https://github.com/AI4Bharat/IndicTrans2/tree/main/huggingface_inference) for a detailed description of how to use HF-compatible IndicTrans2 models for inference.
64
 
65
+ ```python
66
+ import torch
67
+ from transformers import (
68
+ AutoModelForSeq2SeqLM,
69
+ AutoTokenizer,
70
+ )
71
+ from IndicTransTokenizer import IndicProcessor
72
+
73
+
74
+ model_name = "ai4bharat/indictrans2-indic-en-1B"
75
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
76
+
77
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)
78
+
79
+ ip = IndicProcessor(inference=True)
80
+
81
+ input_sentences = [
82
+ "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
83
+ "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
84
+ "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
85
+ "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
86
+ ]
87
+
88
+ src_lang, tgt_lang = "hin_Deva", "eng_Latn"
89
+
90
+ batch = ip.preprocess_batch(
91
+ input_sentences,
92
+ src_lang=src_lang,
93
+ tgt_lang=tgt_lang,
94
+ )
95
+
96
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
97
+
98
+ # Tokenize the sentences and generate input encodings
99
+ inputs = tokenizer(
100
+ batch,
101
+ truncation=True,
102
+ padding="longest",
103
+ return_tensors="pt",
104
+ return_attention_mask=True,
105
+ ).to(DEVICE)
106
+
107
+ # Generate translations using the model
108
+ with torch.no_grad():
109
+ generated_tokens = model.generate(
110
+ **inputs,
111
+ use_cache=True,
112
+ min_length=0,
113
+ max_length=256,
114
+ num_beams=5,
115
+ num_return_sequences=1,
116
+ )
117
+
118
+ # Decode the generated tokens into text
119
+ with tokenizer.as_target_tokenizer():
120
+ generated_tokens = tokenizer.batch_decode(
121
+ generated_tokens.detach().cpu().tolist(),
122
+ skip_special_tokens=True,
123
+ clean_up_tokenization_spaces=True,
124
+ )
125
+
126
+ # Postprocess the translations, including entity replacement
127
+ translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)
128
+
129
+ for input_sentence, translation in zip(input_sentences, translations):
130
+ print(f"{src_lang}: {input_sentence}")
131
+ print(f"{tgt_lang}: {translation}")
132
+ ```
133
+
134
 
135
 
136
  ### Citation