kasimali committed
Commit b3a699a · verified · 1 Parent(s): e0f94d7

Upload folder using huggingface_hub

Files changed (3)
  1. README.md +3 -8
  2. app.py +675 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,12 +1,7 @@
  ---
- title: Copy Of Final
- emoji: 🏃
- colorFrom: green
- colorTo: blue
+ title: Copy of final
+ emoji: 🚀
  sdk: gradio
- sdk_version: 5.49.0
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Copy of final
app.py ADDED
@@ -0,0 +1,675 @@
+ # Copy of final
+
+ # ================================================================
+ # = STEP 1: SETUP AND DOWNLOAD (YOUR PROVEN METHOD) =
+ # ================================================================
+ import os
+
+ print("--- 1. Installing All Libraries ---")
+ print("✅ Libraries installed.")
+
+ print("\n--- 2. Cloning IndicLID Repository ---")
+ # Using your proven method of changing directories
+ print("✅ Repository cloned.")
+
+ # Navigate into the correct directory structure
+
+ print("\n--- 3. Downloading and Unzipping IndicLID Models ---")
+ print("✅ Download commands executed. Unzipping now...")
+ print("✅ Unzip commands executed.")
+
+ print("\n🎉🎉🎉 SETUP COMPLETE. You can now proceed to Step 2. 🎉🎉🎉")
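+
+ # Editor's sketch (assumption, not in the original commit): the shell
+ # commands the prints above narrate appear to have been Colab "!" lines
+ # that did not survive the export. Assuming the public AI4Bharat/IndicLID
+ # repository, the setup would look roughly like:
+ #
+ #   git clone https://github.com/AI4Bharat/IndicLID.git
+ #   cd IndicLID/Inference
+ #   # download the IndicLID model archives and unzip them here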
+
+
+ # =========================
+ # = STEP 2: INITIALIZE MODELS (EXACTLY AS YOUR OLD CODE) =
+ # =========================
+ import os
+ import sys
+ import torch
+ print("--- Applying your original add_safe_globals fix... ---")
+
+ if "/content/IndicLID/Inference" not in sys.path:
+     sys.path.append("/content/IndicLID/Inference")
+
+ from transformers.models.bert.modeling_bert import (
+     BertModel, BertPreTrainedModel, BertForSequenceClassification,
+     BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
+     BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput
+ )
+ from transformers.models.bert.configuration_bert import BertConfig
+ import torch.nn as nn
+ from torch.nn.modules.sparse import Embedding
+ from torch.nn.modules.container import ModuleList
+ from torch.nn.modules.linear import Linear
+ from torch.nn.modules.normalization import LayerNorm
+ from torch.nn.modules.dropout import Dropout
+
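+ # Editor's note: this allowlist is needed because PyTorch 2.6 switched
+ # torch.load to weights_only=True by default, which refuses to unpickle any
+ # class not registered via add_safe_globals. IndicLID loads a fully pickled
+ # BERT classifier, so every module class it contains must be listed.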
+ torch.serialization.add_safe_globals([
+     BertModel, BertPreTrainedModel, BertForSequenceClassification,
+     BertEmbeddings, BertEncoder, BertPooler, BertLayer, BertAttention,
+     BertSelfAttention, BertSelfOutput, BertIntermediate, BertOutput, BertConfig,
+     Embedding, ModuleList, Linear, LayerNorm, Dropout,
+ ])
+ print("✅ Comprehensive safe globals added successfully.")
+
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from IndicTransToolkit.processor import IndicProcessor
+ from ai4bharat.IndicLID import IndicLID
+
+ print("--- Loading all models into memory... ---")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Using device: {device}")
+
+ lid = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
+ print("✅ IndicLID model loaded successfully.")
+
+ MODEL_ID = "ai4bharat/indictrans2-indic-en-1B"
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
+ ip = IndicProcessor(inference=True)
+ print("✅ IndicTrans2 1B model loaded.")
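+
+ # Editor's sketch (commented out; it duplicates what the functions below do):
+ # one sentence through the IndicTrans2 pipeline just loaded, mirroring the
+ # preprocess/generate/postprocess calls used later in this file.
+ # pre = ip.preprocess_batch(["तुम कैसे हो?"], src_lang="hin_Deva", tgt_lang="eng_Latn")
+ # batch = tokenizer(pre, return_tensors="pt", padding=True).to(device)
+ # with torch.no_grad():
+ #     out = model.generate(**batch, num_beams=5, max_length=256)
+ # dec = tokenizer.batch_decode(out, skip_special_tokens=True)
+ # print(ip.postprocess_batch(dec, lang="hin_Deva"))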
+
+ print("🎉 ALL MODELS ARE LOADED. Proceed to direct batch prediction tests.")
+
+
+ import sys
+ print(sys.path)
+
+ # `pip show transformers` is a shell command, not valid Python; run it in a
+ # terminal (or as `!pip show transformers` in a notebook) to check the version.
+
+
+
+ # ================================================================
+ # = STEP 2.5: LOAD ROMANSETU (COMPATIBLE WITH 4.40.2) =
+ # ================================================================
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import torch
+
+ print("--- Loading RomanSetu model compatible with transformers 4.40.2... ---")
+
+ # Try smaller, more compatible models first
+ model_options = [
+     "ai4bharat/romansetu-cpt-roman-100m",
+     "ai4bharat/romansetu-cpt-roman-200m"
+ ]
+
+ rs_model = None
+ rs_tokenizer = None
+
+ for model_id in model_options:
+     try:
+         print(f"Trying model: {model_id}")
+         rs_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+         rs_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to(device)
+         print(f"✅ {model_id} loaded successfully.")
+         break
+     except Exception as e:
+         print(f"❌ {model_id} failed: {e}")
+         continue
+
+ if rs_model is None:
+     print("❌ All RomanSetu models failed. Continuing with transliteration-based approach.")
+
+ def translate_with_romansetu(text, max_new_tokens=50):
+     if rs_model is None:
+         # Fallback: use enhanced transliteration + IndicTrans2
+         from indic_transliteration import sanscript
+         from indic_transliteration.sanscript import transliterate
+         try:
+             # Try to transliterate and then translate with IndicTrans2
+             native_text = transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
+             pre = ip.preprocess_batch([native_text], src_lang="hin_Deva", tgt_lang="eng_Latn")
+             inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
+             with torch.no_grad():
+                 out = model.generate(**inputs, num_beams=3, max_length=100)
+             dec = tokenizer.batch_decode(out, skip_special_tokens=True)
+             post = ip.postprocess_batch(dec, lang="hin_Deva")
+             return post[0]
+         except Exception:
+             return text
+
+     try:
+         prompt = f"Translate this romanized Indian text to English: {text}"
+         inputs = rs_tokenizer(prompt, return_tensors="pt").to(device)
+
+         with torch.no_grad():
+             outputs = rs_model.generate(
+                 inputs.input_ids,
+                 max_new_tokens=max_new_tokens,
+                 num_beams=2,
+                 temperature=0.7,
+                 do_sample=True,
+                 pad_token_id=rs_tokenizer.eos_token_id
+             )
+
+         # generate() returns a batch of sequences; decode the first (only) one
+         full_response = rs_tokenizer.decode(outputs[0], skip_special_tokens=True)
+         translation = full_response.replace(prompt, "").strip()
+         return translation if translation and len(translation) > 2 else text
+
+     except Exception as e:
+         return text
+
+ print("✅ RomanSetu/fallback translation function defined.")
+ print("🎉 SETUP COMPLETE with fallback mechanism.")
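+
+ # Editor's sketch (assumption): exercising the fallback path with a romanized
+ # Hindi sentence; output quality depends on which RomanSetu model, if any,
+ # actually loaded above.
+ # print(translate_with_romansetu("tum kaha ho"))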
+
+
+ # ================================================================
+ # = STEP 2.6: LOAD INDICXLIT FOR BETTER TRANSLITERATION (CORRECTED) =
+ # ================================================================
+
+ print("--- Installing and loading IndicXlit for better romanized text handling ---")
+
+ # Install IndicXlit (compatible with your transformers==4.40.2)
+
+ from ai4bharat.transliteration import XlitEngine
+ import torch
+
+ try:
+     # Load IndicXlit engines for different languages (based on official docs)
+     xlit_engines = {
+         "hindi": XlitEngine("hi", beam_width=4, rescore=True),
+         "bengali": XlitEngine("bn", beam_width=4, rescore=True),
+         "tamil": XlitEngine("ta", beam_width=4, rescore=True),
+         "telugu": XlitEngine("te", beam_width=4, rescore=True),
+         "gujarati": XlitEngine("gu", beam_width=4, rescore=True),
+         "kannada": XlitEngine("kn", beam_width=4, rescore=True),
+         "malayalam": XlitEngine("ml", beam_width=4, rescore=True),
+         "punjabi": XlitEngine("pa", beam_width=4, rescore=True),
+         "marathi": XlitEngine("mr", beam_width=4, rescore=True),
+         "urdu": XlitEngine("ur", beam_width=4, rescore=True),
+     }
+     print("✅ Multiple IndicXlit engines loaded successfully.")
+
+ except Exception as e:
+     print(f"❌ Error loading IndicXlit: {e}")
+     print("💡 Falling back to basic transliteration.")
+     xlit_engines = {}
+
+ def enhanced_transliterate_with_xlit(text, target_lang):
+     """
+     Enhanced transliteration using IndicXlit (based on official API)
+     """
+     lang_key = target_lang.lower()
+
+     if not xlit_engines or lang_key not in xlit_engines:
+         # Fallback to your existing transliteration
+         from indic_transliteration import sanscript
+         from indic_transliteration.sanscript import transliterate
+         script_map = {
+             "hindi": sanscript.DEVANAGARI, "bengali": sanscript.BENGALI,
+             "tamil": sanscript.TAMIL, "telugu": sanscript.TELUGU,
+             "kannada": sanscript.KANNADA, "malayalam": sanscript.MALAYALAM,
+             "gujarati": sanscript.GUJARATI, "punjabi": sanscript.GURMUKHI,
+             "marathi": sanscript.DEVANAGARI, "urdu": 'urdu'
+         }
+         return transliterate(text, sanscript.ITRANS, script_map.get(lang_key, sanscript.DEVANAGARI))
+
+     try:
+         # Use IndicXlit for better transliteration (official API)
+         engine = xlit_engines[lang_key]
+         # Engine key → IndicXlit language code (used as the key in results)
+         lang_codes = {"hindi": "hi", "bengali": "bn", "tamil": "ta", "telugu": "te",
+                       "gujarati": "gu", "kannada": "kn", "malayalam": "ml",
+                       "punjabi": "pa", "marathi": "mr", "urdu": "ur"}
+         lang_code = lang_codes.get(lang_key, "hi")
+
+         # For sentences, use translit_sentence (returns dict with lang code as key)
+         if ' ' in text:
+             result = engine.translit_sentence(text)
+             return result.get(lang_code, text)
+         else:
+             # For single words, use translit_word (returns dict with topk results)
+             result = engine.translit_word(text, topk=1)
+             return result.get(lang_code, [text])[0]
+
+     except Exception as e:
+         print(f"IndicXlit error for '{text}': {e}")
+         # Fallback if IndicXlit fails
+         return text
+
+ print("✅ Enhanced transliteration function defined.")
+ print("🎉 INDICXLIT SETUP COMPLETE.")
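+
+ # Editor's sketch (assumption): a quick check of the Hindi engine, if it
+ # loaded; XlitEngine returns a dict keyed by language code.
+ # print(enhanced_transliterate_with_xlit("namaste", "hindi"))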
+
+
+ import pandas as pd
+ from indic_transliteration import sanscript
+ from indic_transliteration.sanscript import transliterate
+
+ # EXPANDED language mapping to handle misdetections
+ LID_TO_TRANSLATE = {
+     # Hindi variants
+     "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+     "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+
+     # Maithili (often confused with Hindi) - map to Hindi
+     "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+     "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+
+     # Bengali variants
+     "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+     "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+
+     # Assamese (often confused with Bengali) - map to Bengali
+     "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+     "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+
+     # Tamil variants
+     "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+     "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+     "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+
+     # Telugu variants
+     "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
+     "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
+
+     # Kannada variants
+     "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
+     "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
+
+     # Malayalam variants
+     "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
+     "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
+
+     # Gujarati variants
+     "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
+     "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
+
+     # Punjabi variants
+     "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
+     "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
+
+     # Marathi variants
+     "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
+     "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
+
+     # Urdu variants
+     "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
+     "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
+
+     # Additional commonly misdetected languages
+     "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi → Hindi
+     "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali → Hindi
+     "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani → Hindi
+     "gom_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Goan Konkani → Hindi
+     "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo → Hindi
+ }
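+ # Editor's note: collapsing close relatives (Maithili, Nepali, Sindhi,
+ # Konkani, Bodo) onto Hindi trades linguistic fidelity for coverage: it
+ # gives IndicTrans2 a source code it supports even when IndicLID's guess
+ # is a language this pipeline does not handle directly.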
+
+ def enhanced_transliterate_robust(text, target_script):
+     """
+     Enhanced transliteration with better romanization handling
+     """
+     try:
+         # Preprocess text for better transliteration
+         cleaned_text = text.lower().strip()
+
+         # Handle common romanization patterns (note: the consonant entries
+         # map to themselves and are effectively no-ops; only the vowel
+         # entries actually change the text)
+         replacements = {
+             'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph',
+             'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh',
+             'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
+         }
+
+         for old, new in replacements.items():
+             cleaned_text = cleaned_text.replace(old, new)
+
+         # Transliterate using your existing library
+         result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
+         return result if result else text
+
+     except Exception as e:
+         print(f"Transliteration error: {e}")
+         return text
+
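+ # Editor's sketch (assumption): enhanced_transliterate_robust("aap kaise hain",
+ # sanscript.DEVANAGARI) should yield something close to "आप कैसे हैं".
+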
+ def detect_and_translate_robust(texts, batch_size=64):
+     """
+     Robust detection and translation with expanded language mapping
+     """
+     results = []
+     preds = lid.batch_predict(texts, batch_size)
+
+     for item in preds:
+         if isinstance(item, dict):
+             text = item.get("text", "")
+             lang_code = item.get("lang", item.get("pred_lang", ""))
+             score = float(item.get("score", 0.0))
+             model_name = item.get("model", "")
+         else:
+             text, lang_code, score, model_name = item
+
+         is_romanized = lang_code.endswith("_Latn")
+
+         if lang_code not in LID_TO_TRANSLATE:
+             translation = f"Language '{lang_code}' not supported for translation"
+             method = "Unsupported"
+         else:
+             try:
+                 lang_info = LID_TO_TRANSLATE[lang_code]
+                 src_code = lang_info["it_code"]
+
+                 if is_romanized:
+                     # Use enhanced transliteration
+                     native_text = enhanced_transliterate_robust(text, lang_info["script"])
+                     method = f"Enhanced Transliteration + IndicTrans2 (detected as {lang_code})"
+                     print(f"Enhanced: '{text}' → '{native_text}' (detected: {lang_code})")
+                 else:
+                     native_text = text
+                     method = f"IndicTrans2 (detected as {lang_code})"
+
+                 # Translate with IndicTrans2
+                 pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
+                 inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
+                 with torch.no_grad():
+                     out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
+                 dec = tokenizer.batch_decode(out, skip_special_tokens=True)
+                 post = ip.postprocess_batch(dec, lang=src_code)
+                 translation = post[0]
+
+             except Exception as e:
+                 translation = f"Translation error: {str(e)}"
+                 method = "Error"
+
+         results.append({
+             "original_text": text,
+             "detected_lang": lang_code,
+             "script_type": "Romanized" if is_romanized else "Native",
+             "confidence": f"{score:.3f}",
+             "translation_method": method,
+             "english_translation": translation
+         })
+
+     return pd.DataFrame(results)
+
+ print("✅ Robust translation function with expanded language mapping defined")
+
+ # Test with the same samples
+ sample_texts = [
+     "यहाँ कितने लोग हैं?",
+     "tum kaha ho",
+     "aaj mausam suhana hai",
+     "aap kaise hain",
+     "আমি ভালো আছি।",
+     "ami bhalo achi",
+     "mera naam rahul hai",
+     "main office jaa raha hun"
+ ]
+
+ print("🔍 Testing robust approach with expanded language mapping...")
+ df_results = detect_and_translate_robust(sample_texts, batch_size=16)
+ print(df_results)  # display() is IPython-only; print() works in a plain script
+
+
+ # ================================================================
+ # = COMPLETE TEST CODE FOR ALL 22 INDIAN LANGUAGES =
+ # ================================================================
+
+ import pandas as pd
+ from indic_transliteration import sanscript
+ from indic_transliteration.sanscript import transliterate
+
+ # Official 22 Indian languages sample sentences (native + romanized)
+ sample_sentences = {
+     "Assamese": ("আপুনি কেনেকৈ আছেন?", "apuni kenekoi asen?"),
+     "Bengali": ("তুমি কেমন আছো?", "tumi kemon acho?"),
+     "Bodo": ("नांगनि फाथै खौ?", "nangni phathai kho?"),
+     "Dogri": ("तुसीं केहे हो?", "tusi kehe ho?"),
+     "Gujarati": ("તમે કેમ છો?", "tame kem cho?"),
+     "Hindi": ("तुम कैसे हो?", "tum kaise ho?"),
+     "Kannada": ("ನೀವು ಹೇಗಿದ್ದೀರಾ?", "neevu hegiddira?"),
+     "Kashmiri": ("तुस की छै?", "tus ki chhai?"),
+     "Konkani": ("तुम कशें आसा?", "tum kashen asa?"),
+     "Maithili": ("अहाँ कथी छी?", "ahaan kathi chhi?"),
+     "Malayalam": ("സുഖമായിരോ?", "sukhamaayiro?"),
+     "Manipuri": ("नमस्कार, नखोंगबा तौ?", "namaskaar, nakhongba tau?"),
+     "Marathi": ("तू कसा आहेस?", "tu kasa ahes?"),
+     "Nepali": ("तिमी कस्तो छौ?", "timi kasto chau?"),
+     "Odia": ("ତୁମେ କେମିତି ଅଛ?", "tume kemiti achha?"),
+     "Punjabi": ("ਤੁਸੀਂ ਕਿਵੇਂ ਹੋ?", "tusi kiven ho?"),
+     "Sanskrit": ("भवतः कथम् अस्ति?", "bhavatah katham asti?"),
+     "Santali": ("ᱥᱟᱱᱛᱟᱲᱤ ᱠᱚᱱᱛᱮᱞᱤ ᱟᱹᱲᱤ?", "santalii konteli adii?"),
+     "Sindhi": ("توهان ڪيئن آهيو؟", "tohan kayn aahiyo?"),
+     "Tamil": ("நீங்கள் எப்படி இருக்கிறீர்கள்?", "neenga epdi irukeenga?"),
+     "Telugu": ("మీరు ఎలా ఉన్నారు?", "meeru ela unnaru?"),
+     "Urdu": ("آپ کیسے ہیں؟", "aap kaise hain?")
+ }
+
+ # Expanded language mapping (covers common misdetections)
+ LID_TO_TRANSLATE = {
+     # Hindi variants
+     "hin_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+     "hin_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+     "mai_Deva": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Maithili→Hindi
+     "mai_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},
+     "nep_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Nepali→Hindi
+     "snd_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Sindhi→Hindi
+     "kok_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Konkani→Hindi
+     "brx_Latn": {"name": "Hindi", "script": sanscript.DEVANAGARI, "it_code": "hin_Deva"},  # Bodo→Hindi
+
+     # Bengali variants
+     "ben_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+     "ben_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+     "asm_Beng": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},  # Assamese→Bengali
+     "asm_Latn": {"name": "Bengali", "script": sanscript.BENGALI, "it_code": "ben_Beng"},
+
+     # Tamil variants
+     "tam_Tamil": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+     "tam_Taml": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+     "tam_Latn": {"name": "Tamil", "script": sanscript.TAMIL, "it_code": "tam_Taml"},
+
+     # Telugu variants
+     "tel_Telu": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
+     "tel_Latn": {"name": "Telugu", "script": sanscript.TELUGU, "it_code": "tel_Telu"},
+
+     # Kannada variants
+     "kan_Knda": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
+     "kan_Latn": {"name": "Kannada", "script": sanscript.KANNADA, "it_code": "kan_Knda"},
+
+     # Malayalam variants
+     "mal_Mlym": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
+     "mal_Latn": {"name": "Malayalam", "script": sanscript.MALAYALAM, "it_code": "mal_Mlym"},
+
+     # Gujarati variants
+     "guj_Gujr": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
+     "guj_Latn": {"name": "Gujarati", "script": sanscript.GUJARATI, "it_code": "guj_Gujr"},
+
+     # Punjabi variants
+     "pan_Guru": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
+     "pan_Latn": {"name": "Punjabi", "script": sanscript.GURMUKHI, "it_code": "pan_Guru"},
+
+     # Marathi variants
+     "mar_Deva": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
+     "mar_Latn": {"name": "Marathi", "script": sanscript.DEVANAGARI, "it_code": "mar_Deva"},
+
+     # Urdu variants
+     "urd_Arab": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
+     "urd_Latn": {"name": "Urdu", "script": 'urdu', "it_code": "urd_Arab"},
+ }
+
+ def enhanced_transliterate_robust(text, target_script):
+     """Enhanced transliteration with better romanization handling"""
+     try:
+         cleaned_text = text.lower().strip()
+         replacements = {
+             'kh': 'kh', 'ch': 'ch', 'th': 'th', 'ph': 'ph',
+             'bh': 'bh', 'dh': 'dh', 'gh': 'gh', 'jh': 'jh',
+             'aa': 'A', 'ee': 'I', 'oo': 'U', 'ou': 'au'
+         }
+         for old, new in replacements.items():
+             cleaned_text = cleaned_text.replace(old, new)
+         result = transliterate(cleaned_text, sanscript.ITRANS, target_script)
+         return result if result else text
+     except Exception as e:
+         print(f"Transliteration error: {e}")
+         return text
+
+ def test_all_22_languages(texts, batch_size=32):
+     """Complete testing function for all 22 languages"""
+     results = []
+     preds = lid.batch_predict(texts, batch_size)
+
+     for item in preds:
+         if isinstance(item, dict):
+             text = item.get("text", "")
+             lang_code = item.get("lang", item.get("pred_lang", ""))
+             score = float(item.get("score", 0.0))
+             model_name = item.get("model", "")
+         else:
+             text, lang_code, score, model_name = item
+
+         is_romanized = lang_code.endswith("_Latn")
+
+         if lang_code not in LID_TO_TRANSLATE:
+             translation = f"Language '{lang_code}' not supported"
+             method = "Unsupported"
+         else:
+             try:
+                 lang_info = LID_TO_TRANSLATE[lang_code]
+                 src_code = lang_info["it_code"]
+
+                 if is_romanized:
+                     native_text = enhanced_transliterate_robust(text, lang_info["script"])
+                     method = f"Transliteration+IndicTrans2 (detected: {lang_code})"
+                     print(f"Romanized: '{text}' → '{native_text}'")
+                 else:
+                     native_text = text
+                     method = f"IndicTrans2 (detected: {lang_code})"
+
+                 # Translate with IndicTrans2
+                 pre = ip.preprocess_batch([native_text], src_lang=src_code, tgt_lang="eng_Latn")
+                 inputs = tokenizer(pre, return_tensors="pt", padding=True).to(device)
+                 with torch.no_grad():
+                     out = model.generate(**inputs, num_beams=5, max_length=256, early_stopping=True)
+                 dec = tokenizer.batch_decode(out, skip_special_tokens=True)
+                 post = ip.postprocess_batch(dec, lang=src_code)
+                 translation = post[0]
+
+             except Exception as e:
+                 translation = f"Translation error: {str(e)}"
+                 method = "Error"
+
+         results.append({
+             "language": text[:20] + "..." if len(text) > 20 else text,
+             "original_text": text,
+             "detected_lang": lang_code,
+             "script_type": "Romanized" if is_romanized else "Native",
+             "confidence": f"{score:.3f}",
+             "method": method,
+             "english_translation": translation
+         })
+
+     return pd.DataFrame(results)
+
+ # Create test dataset with all 44 samples (22 native + 22 romanized)
+ print("🔍 Creating test dataset for all 22 official Indian languages...")
+ all_test_texts = []
+ for lang, (native, roman) in sample_sentences.items():
+     all_test_texts.append(native)
+     all_test_texts.append(roman)
+
+ print(f"📊 Testing {len(all_test_texts)} samples ({len(sample_sentences)} languages × 2 scripts)...")
+
+ # Run the complete test
+ df_results = test_all_22_languages(all_test_texts, batch_size=32)
+
+ # Display results
+ print("\n🎯 COMPLETE TEST RESULTS:")
+ print(df_results)  # display() replaced with print() for plain-script use
+
+ # Summary statistics
+ print("\n📈 SUMMARY STATISTICS:")
+ print(f"Total samples tested: {len(df_results)}")
+ print(f"Languages detected: {df_results['detected_lang'].nunique()}")
+ print(f"Native script samples: {len(df_results[df_results['script_type'] == 'Native'])}")
+ print(f"Romanized samples: {len(df_results[df_results['script_type'] == 'Romanized'])}")
+ print(f"Successfully translated: {len(df_results[~df_results['english_translation'].str.contains('error|not supported', case=False)])}")
+
+
+ import pandas as pd
+
+ def detailed_translation_summary(df_results):
+     """
+     Generate a comprehensive, detailed summary of translation results
+     """
+     # Flag successful translations
+     df_results['successful_translation'] = ~df_results['english_translation'].str.contains('error|not supported', case=False, na=False)
+
+     print("\n=========== OVERALL SUMMARY ===========")
+     print(f"Total samples tested: {len(df_results)}")
+     print(f"Languages detected: {df_results['detected_lang'].nunique()}")
+     print(f"Native script samples: {df_results[df_results['script_type'] == 'Native'].shape[0]}")
+     print(f"Romanized samples: {df_results[df_results['script_type'] == 'Romanized'].shape[0]}")
+     print(f"Successfully translated: {df_results['successful_translation'].sum()}")
+
+     overall_success_rate = (df_results['successful_translation'].sum() / len(df_results) * 100)
+     print(f"Overall success rate: {overall_success_rate:.1f}%")
+
+     print("\n=========== DETAILED LANGUAGE BREAKDOWN ===========")
+     # Per-language analysis
+     lang_summary = df_results.groupby('detected_lang').agg(
+         total_samples=('original_text', 'count'),
+         native_count=('script_type', lambda x: (x == 'Native').sum()),
+         romanized_count=('script_type', lambda x: (x == 'Romanized').sum()),
+         mean_confidence=('confidence', lambda x: pd.to_numeric(x, errors='coerce').mean()),
+         success=('successful_translation', 'sum'),
+         error_count=('successful_translation', lambda x: (~x).sum())
+     ).reset_index().sort_values('total_samples', ascending=False)
+
+     lang_summary['success_rate'] = (lang_summary['success'] / lang_summary['total_samples'] * 100).round(1)
+     print(lang_summary)
+
+     print("\n=========== TOP PERFORMING LANGUAGES ===========")
+     top_performers = lang_summary[lang_summary['success_rate'] >= 90].sort_values('success_rate', ascending=False)
+     if len(top_performers) > 0:
+         print(top_performers[['detected_lang', 'total_samples', 'success_rate']])
+     else:
+         print("No languages with 90%+ success rate")
+
+     print("\n=========== CHALLENGING LANGUAGES ===========")
+     challenging = lang_summary[lang_summary['success_rate'] < 50].sort_values('success_rate')
+     if len(challenging) > 0:
+         print(challenging[['detected_lang', 'total_samples', 'success_rate']])
+     else:
+         print("No languages with <50% success rate")
+
+     print("\n=========== ERROR ANALYSIS ===========")
+     error_df = df_results[~df_results['successful_translation']]
+     print(f"Total errors: {len(error_df)}")
+     if len(error_df) > 0:
+         print("\nError samples:")
+         print(error_df[['original_text', 'detected_lang', 'script_type', 'confidence', 'english_translation']])
+     else:
+         print("No errors found!")
+
+     print("\n=========== SUCCESS BREAKDOWN BY SCRIPT ===========")
+     script_summary = df_results.groupby('script_type').agg(
+         total_samples=('original_text', 'count'),
+         successful=('successful_translation', 'sum'),
+         success_rate=('successful_translation', lambda x: x.mean() * 100)
+     ).round(1)
+     print(script_summary)
+
+     print("\n=========== DETECTION CONFIDENCE ANALYSIS ===========")
+     confidence_summary = lang_summary[['detected_lang', 'mean_confidence']].sort_values('mean_confidence', ascending=False)
+     print("Top 10 most confident detections:")
+     print(confidence_summary.head(10))
+
+     return lang_summary, script_summary, error_df
+
+ # ===== HOW TO USE =====
+ print("✅ Detailed summary function defined")
+ print("\n📋 To run on your test results:")
+ print("    lang_summary, script_summary, error_df = detailed_translation_summary(df_results)")
+ print("    print(lang_summary)")
+ print("    print(error_df)")
+
+
+ lang_summary, script_summary, error_df = detailed_translation_summary(df_results)
+
+
+ print(lang_summary)  # display() is IPython-only; print() works in a plain script
+ print(error_df)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ pandas
+ sentencepiece
+ torch
+ transformers
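+ # Editor's note: app.py also imports IndicTransToolkit, the ai4bharat IndicLID
+ # and transliteration packages, and indic_transliteration; none of these are
+ # listed here, so the Space environment would need to provide them separately.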