anikde committed on
Commit
d04ad72
·
1 Parent(s): c34507c

Translation Added (#2)

Browse files

- Translation Added (39b0c467a8b1ff824e2bdc75e6a70c5801246fd4)

IndicPhotoOCR/translation/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Translation module for IndicPhotoOCR
2
+ from .indictrans_translator import IndicTransTranslator
3
+
4
+ __all__ = ['IndicTransTranslator']
IndicPhotoOCR/translation/indictrans_translator.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ IndicTrans2 Translation Module for IndicPhotoOCR
3
+ Clean implementation with proper error handling and full language support
4
+ Optimized for web app usage with better timeout handling
5
+ """
6
+
7
+ import torch
8
+ from typing import List, Optional, Callable
9
+ import warnings
10
+ import unicodedata
11
+ import re
12
+ warnings.filterwarnings("ignore")
13
+
14
+ try:
15
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
16
+ from IndicTransToolkit import IndicProcessor
17
+ INDICTRANS_AVAILABLE = True
18
+ except ImportError as e:
19
+ print(f"Warning: IndicTransToolkit not available: {e}")
20
+ INDICTRANS_AVAILABLE = False
21
+
22
class IndicTransTranslator:
    """Translate between English and Indic languages with IndicTrans2 models.

    Two checkpoints are used, one per direction (Indic -> English and
    English -> Indic). Indic -> Indic requests are bridged through English.
    The models are loaded lazily, on the first call that actually needs to
    translate something.
    """

    # Script bucket (a key of ``script_ranges``) -> language name reported by
    # detection. Devanagari is always reported as Hindi; the shared
    # Bengali/Assamese block is resolved separately from its letters.
    _SCRIPT_TO_LANG = {
        'devanagari': 'hindi',
        'gujarati': 'gujarati',
        'tamil': 'tamil',
        'telugu': 'telugu',
        'kannada': 'kannada',
        'malayalam': 'malayalam',
        'odia': 'odia',
        'punjabi': 'punjabi',
        'urdu': 'urdu',
    }

    def __init__(self, device: str = "cpu", progress_callback: Optional[Callable] = None):
        """Initialize the IndicTrans2 translator.

        Args:
            device: Torch device string. A falsy value selects "cuda" when
                available, otherwise "cpu".
            progress_callback: Optional callable receiving status messages
                (one string argument) while models are loading.

        Raises:
            ImportError: If the IndicTrans dependencies could not be imported.
        """
        if not INDICTRANS_AVAILABLE:
            raise ImportError("IndicTransToolkit not available. Install with: pip install IndicTransToolkit")

        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.progress_callback = progress_callback
        self.models = {}
        self.tokenizers = {}
        self.processor = None
        self.initialized = False

        # Language code mapping for IndicTrans2
        self.lang_codes = {
            'assamese': 'asm_Beng',
            'bengali': 'ben_Beng',
            'gujarati': 'guj_Gujr',
            'hindi': 'hin_Deva',
            'kannada': 'kan_Knda',
            'malayalam': 'mal_Mlym',
            'marathi': 'mar_Deva',
            'odia': 'ory_Orya',
            'punjabi': 'pan_Guru',
            'tamil': 'tam_Taml',
            'telugu': 'tel_Telu',
            'urdu': 'urd_Arab',
            'english': 'eng_Latn'
        }

        # Inclusive Unicode code-point ranges used for script detection
        self.script_ranges = {
            'devanagari': (0x0900, 0x097F),        # Hindi/Marathi
            'bengali_assamese': (0x0980, 0x09FF),  # Bengali/Assamese
            'gujarati': (0x0A80, 0x0AFF),
            'tamil': (0x0B80, 0x0BFF),
            'telugu': (0x0C00, 0x0C7F),
            'kannada': (0x0C80, 0x0CFF),
            'malayalam': (0x0D00, 0x0D7F),
            'odia': (0x0B00, 0x0B7F),
            'punjabi': (0x0A00, 0x0A7F),           # Gurmukhi
            'urdu': (0x0600, 0x06FF),              # Arabic
        }

    def _clean_text(self, text: str) -> str:
        """Normalize text before/after translation.

        Applies NFC normalization, strips zero-width characters and literal
        ``\\uXXXX`` escape sequences, collapses whitespace, and tidies the
        spacing around punctuation. Falsy input is returned unchanged.
        """
        if not text:
            return text

        # Normalize Unicode and remove problematic characters
        text = unicodedata.normalize('NFC', text)
        text = re.sub(r'[\u200C\u200D\uFEFF]', '', text)   # Remove zero-width chars
        text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)      # Remove literal Unicode escapes
        text = re.sub(r'\s+', ' ', text.strip())           # Normalize whitespace

        # Fix punctuation spacing: none before, exactly one after
        text = re.sub(r'\s+([।,.!?;:])', r'\1', text)
        text = re.sub(r'([।,.!?;:])\s+', r'\1 ', text)

        return text.strip()

    def _update_progress(self, message: str):
        """Print *message* and forward it to the progress callback, if any."""
        if self.progress_callback:
            try:
                self.progress_callback(message)
            except Exception as callback_error:
                # Progress reporting is best-effort: a broken callback must
                # not abort model loading, but it should not fail silently.
                print(f"Progress callback failed: {callback_error}")
        print(message)

    def _load_models(self):
        """Load the processor and both translation models (no-op once loaded).

        Raises:
            Exception: Whatever the underlying loaders raise; the error is
                reported through ``_update_progress`` and then re-raised.
        """
        if self.initialized:
            return

        try:
            self._update_progress("Initializing IndicProcessor...")
            self.processor = IndicProcessor(inference=True)

            # Model names
            model_names = {
                "indic-en": "ai4bharat/indictrans2-indic-en-1B",
                "en-indic": "ai4bharat/indictrans2-en-indic-1B"
            }

            # Half precision on any CUDA device ("cuda", "cuda:0", ...).
            on_cuda = str(self.device).startswith("cuda")

            for key, model_name in model_names.items():
                self._update_progress(f"Loading {model_name}...")

                # Load tokenizer
                tokenizer = AutoTokenizer.from_pretrained(
                    model_name,
                    trust_remote_code=True
                )

                # Load model with memory optimization
                model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_name,
                    trust_remote_code=True,
                    torch_dtype=torch.float16 if on_cuda else torch.float32,
                    low_cpu_mem_usage=True
                )

                self.models[key] = model.to(self.device).eval()
                self.tokenizers[key] = tokenizer

                self._update_progress(f"{key} model loaded successfully")

            self.initialized = True
            self._update_progress("All translation models loaded!")

        except Exception as e:
            self._update_progress(f"Error loading models: {str(e)}")
            raise

    def _count_scripts(self, text: str):
        """Count characters per script bucket in *text*.

        Returns:
            A ``(script_counts, assamese_chars)`` pair. ``script_counts`` has
            one entry per key of ``script_ranges`` plus ``'english'`` (ASCII
            letters). ``assamese_chars`` counts the letters ৰ and ৱ.
        """
        script_counts = {script: 0 for script in self.script_ranges}
        script_counts['english'] = 0
        assamese_chars = 0

        for char in text:
            code_point = ord(char)
            for script, (start, end) in self.script_ranges.items():
                if start <= code_point <= end:
                    script_counts[script] += 1
                    # Special check for Assamese
                    if script == 'bengali_assamese' and char in 'ৰৱ':
                        assamese_chars += 1
                    break
            else:
                # Not in any Indic range: count ASCII letters as English
                if char.isalpha() and code_point < 128:
                    script_counts['english'] += 1

        return script_counts, assamese_chars

    def _dominant_indic_language(self, text: str) -> Optional[str]:
        """Return the language of the most frequent Indic script in *text*.

        English letters are ignored. Returns ``None`` when *text* contains no
        character from any range in ``script_ranges``.
        """
        script_counts, assamese_chars = self._count_scripts(text)
        indic_counts = {
            script: count
            for script, count in script_counts.items()
            if script != 'english' and count > 0
        }
        if not indic_counts:
            return None

        dominant_script = max(indic_counts, key=indic_counts.get)
        if dominant_script == 'bengali_assamese':
            return 'assamese' if assamese_chars > 0 else 'bengali'
        return self._SCRIPT_TO_LANG.get(dominant_script)

    def detect_language_from_script(self, text_lines: List[str]) -> str:
        """Detect the primary language of the text from its Unicode script.

        Args:
            text_lines: A list of lines, or a single string.

        Returns:
            A language name from ``lang_codes``, or ``'mixed'`` when more than
            20% of the counted letters are English and at least one Indic
            character is present. Empty input, or input with no counted
            letters, yields ``'english'``.
        """
        if isinstance(text_lines, str):
            text_lines = [text_lines]

        full_text = " ".join(text_lines)
        if not full_text.strip():
            return 'english'

        script_counts, assamese_chars = self._count_scripts(full_text)

        total_chars = sum(script_counts.values())
        if total_chars == 0:
            return 'english'

        # Find dominant script (ties resolve to the first bucket in dict order)
        dominant_script = max(script_counts, key=script_counts.get)

        # Special handling for Bengali/Assamese.
        # NOTE: this returns before the mixed-content check below, so text
        # dominated by this script is never reported as 'mixed'.
        if dominant_script == 'bengali_assamese':
            return 'assamese' if assamese_chars > 0 else 'bengali'

        # Check for mixed content
        english_ratio = script_counts['english'] / total_chars
        if english_ratio > 0.2 and any(script_counts[s] > 0 for s in self.script_ranges):
            return 'mixed'

        # Map script to language ('english' and unknown buckets -> 'english')
        return self._SCRIPT_TO_LANG.get(dominant_script, 'english')

    def _translate_batch_direct(self, texts: List[str], src_lang: str, tgt_lang: str) -> List[str]:
        """Translate *texts* in one hop using the model for that direction.

        Exactly one of the two languages must be English.

        Args:
            texts: Sentences to translate.
            src_lang: Source language name (key of ``lang_codes``) or code.
            tgt_lang: Target language name (key of ``lang_codes``) or code.

        Returns:
            One string per input. If preprocessing, generation or decoding
            fails, each entry is ``"[Translation failed: <text>]"``.

        Raises:
            ValueError: If neither or both of the languages are English.
        """
        if not texts:
            return []

        self._load_models()

        # Convert to language codes
        src_code = self.lang_codes.get(src_lang.lower(), src_lang)
        tgt_code = self.lang_codes.get(tgt_lang.lower(), tgt_lang)

        # Determine which model to use
        if src_code == 'eng_Latn' and tgt_code != 'eng_Latn':
            model_key = "en-indic"
        elif src_code != 'eng_Latn' and tgt_code == 'eng_Latn':
            model_key = "indic-en"
        else:
            raise ValueError(f"Unsupported direct translation: {src_lang} -> {tgt_lang}")

        model = self.models[model_key]
        tokenizer = self.tokenizers[model_key]

        try:
            # Clean and preprocess
            cleaned_texts = [self._clean_text(text) for text in texts]
            processed_texts = self.processor.preprocess_batch(cleaned_texts, src_lang=src_code, tgt_lang=tgt_code)

            # Tokenize
            inputs = tokenizer(
                processed_texts,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt"
            ).to(self.device)

            # Generate
            with torch.no_grad():
                generated_tokens = model.generate(
                    **inputs,
                    max_length=512,
                    num_beams=5,
                    early_stopping=True,
                    do_sample=False,
                    use_cache=False,
                    pad_token_id=tokenizer.pad_token_id,
                    repetition_penalty=1.1,
                    length_penalty=1.0
                )

            # Decode
            decoded = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )

            # Postprocess
            results = self.processor.postprocess_batch(decoded, lang=tgt_code)

            return [self._clean_text(result) for result in results]

        except Exception as e:
            print(f"Translation error: {str(e)}")
            return [f"[Translation failed: {text}]" for text in texts]

    def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate a single text; blank input yields an empty string."""
        if not text or not text.strip():
            return ""

        result = self.translate_multiple_lines([text], src_lang, tgt_lang)
        return result[0] if result else text

    def translate_multiple_lines(self, lines: List[str], src_lang: str, tgt_lang: str) -> List[str]:
        """Translate each line to *tgt_lang*, detecting its language per line.

        Args:
            lines: Lines to translate.
            src_lang: Accepted for interface compatibility but not consulted;
                each line's language is detected from its script.
            tgt_lang: Target language name (case-insensitive).

        Returns:
            A list the same length as *lines*. Blank lines and lines already
            in the target language are returned unchanged. A line whose
            translation raises or comes back empty is also returned unchanged.
        """
        if not lines:
            return []

        tgt_lang = tgt_lang.lower()
        translated_lines = []

        print(f"Processing {len(lines)} lines for translation to {tgt_lang}")

        for i, line in enumerate(lines):
            if not line or not line.strip():
                translated_lines.append(line)
                continue

            stripped = line.strip()

            # Detect language
            line_lang = self.detect_language_from_script([stripped])
            print(f"Line {i+1}: detected as {line_lang}")

            # Keep if already in target language
            if line_lang == tgt_lang:
                print(" Keeping unchanged")
                translated_lines.append(line)
                continue

            # Handle mixed content
            if line_lang == 'mixed':
                print(" Mixed language - attempting translation")
                if tgt_lang == 'english':
                    # Use the script actually present in the line rather than
                    # assuming Hindi, so e.g. Tamil+English text is handed to
                    # the model as Tamil.
                    assumed_lang = self._dominant_indic_language(stripped) or 'hindi'
                else:
                    assumed_lang = 'english'
                try:
                    result = self._translate_batch_direct([stripped], assumed_lang, tgt_lang)
                    translated_lines.append(result[0] if result and result[0] else line)
                except Exception as e:
                    print(f" Translation error: {e}")
                    translated_lines.append(line)
                continue

            # Translate different language
            try:
                print(f" Translating from {line_lang} to {tgt_lang}")

                if line_lang == "english" or tgt_lang == "english":
                    # Exactly one side is English here (equal languages were
                    # handled above), so a single model covers it.
                    result = self._translate_batch_direct([stripped], line_lang, tgt_lang)
                else:
                    # Indic -> Indic: bridge via English
                    english_result = self._translate_batch_direct([stripped], line_lang, "english")
                    if english_result and english_result[0]:
                        result = self._translate_batch_direct(english_result, "english", tgt_lang)
                    else:
                        result = [stripped]

                translated_lines.append(result[0] if result and result[0] else line)

            except Exception as e:
                print(f" Translation error: {e}")
                translated_lines.append(line)

        print(f" Processing completed for all {len(lines)} lines")
        return translated_lines

    def get_supported_languages(self) -> List[str]:
        """Return the language names accepted as source or target."""
        return list(self.lang_codes)
339
+
IndicPhotoOCR/translation/simple_translator.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple Translator Module for IndicPhotoOCR
3
+ Fallback translator with basic word mappings and script detection
4
+ """
5
+
6
class SimpleTranslator:
    """Dictionary-based fallback translator with script detection.

    Only a handful of common words and phrases are covered, for the
    Hindi<->English and Bengali<->English pairs.
    """

    # (language, first code point, last code point), checked in this order.
    # The first block with any character present in the text wins.
    _SCRIPT_BLOCKS = (
        ('hindi', 0x0900, 0x097F),      # Devanagari (Hindi/Marathi)
        ('bengali', 0x0980, 0x09FF),
        ('gujarati', 0x0A80, 0x0AFF),
        ('tamil', 0x0B80, 0x0BFF),
        ('telugu', 0x0C00, 0x0C7F),
        ('kannada', 0x0C80, 0x0CFF),
        ('malayalam', 0x0D00, 0x0D7F),
        ('odia', 0x0B00, 0x0B7F),
        ('punjabi', 0x0A00, 0x0A7F),    # Gurmukhi
        ('urdu', 0x0600, 0x06FF),       # Arabic
    )

    def __init__(self):
        """Build the phrase tables, keyed ``"<source>_to_<target>"``."""
        # Basic word mappings for common terms
        self.translations = {
            'hindi_to_english': {
                'नमस्ते': 'hello',
                'धन्यवाद': 'thank you',
                'हाँ': 'yes',
                'नहीं': 'no',
                'अच्छा': 'good',
                'बुरा': 'bad',
                'पानी': 'water',
                'खाना': 'food',
                'घर': 'home',
                'स्कूल': 'school'
            },
            'english_to_hindi': {
                'hello': 'नमस्ते',
                'thank you': 'धन्यवाद',
                'yes': 'हाँ',
                'no': 'नहीं',
                'good': 'अच्छा',
                'bad': 'बुरा',
                'water': 'पानी',
                'food': 'खाना',
                'home': 'घर',
                'school': 'स्कूल'
            },
            'bengali_to_english': {
                'নমস্কার': 'hello',
                'ধন্যবাদ': 'thank you',
                'হ্যাঁ': 'yes',
                'না': 'no',
                'ভাল': 'good',
                'খারাপ': 'bad'
            },
            'english_to_bengali': {
                'hello': 'নমস্কার',
                'thank you': 'ধন্যবাদ',
                'yes': 'হ্যাঁ',
                'no': 'না',
                'good': 'ভাল',
                'bad': 'খারাপ'
            }
        }

    def detect_language_from_script(self, text_lines):
        """Guess the language from the Unicode blocks present in the text.

        Args:
            text_lines: A list of lines, or a single string.

        Returns:
            The language of the first block in ``_SCRIPT_BLOCKS`` that has at
            least one character in the text, or ``'english'`` if none match
            or the text is blank.
        """
        if isinstance(text_lines, str):
            text_lines = [text_lines]

        full_text = " ".join(text_lines)
        if not full_text.strip():
            return 'english'

        code_points = {ord(char) for char in full_text}
        for language, start, end in self._SCRIPT_BLOCKS:
            if any(start <= code_point <= end for code_point in code_points):
                return language

        return 'english'

    def translate_text(self, text, src_lang, tgt_lang):
        """Translate *text* by dictionary lookup, longest phrase first.

        Matching is greedy over whitespace-separated words so multi-word
        entries such as ``'thank you'`` are found. Each candidate is tried
        verbatim and then lowercased. Unknown words are kept as they are.

        Returns:
            The translated text with words joined by single spaces. *text* is
            returned unchanged when it is empty or both languages are equal.
            A bracketed notice followed by *text* is returned when no table
            exists for the language pair.
        """
        if not text or src_lang == tgt_lang:
            return text

        # Get translation dictionary
        trans_dict = self.translations.get(f"{src_lang}_to_{tgt_lang}", {})
        if not trans_dict:
            return f"[Simple translation not available: {src_lang} → {tgt_lang}] {text}"

        words = text.split()
        longest_phrase = max(len(phrase.split()) for phrase in trans_dict)

        translated_words = []
        position = 0
        while position < len(words):
            max_size = min(longest_phrase, len(words) - position)
            for size in range(max_size, 0, -1):
                candidate = " ".join(words[position:position + size])
                # Try exact match first, then lowercase
                translation = trans_dict.get(candidate, trans_dict.get(candidate.lower()))
                if translation is not None:
                    translated_words.append(translation)
                    position += size
                    break
            else:
                # No phrase starts here: keep the word untranslated
                translated_words.append(words[position])
                position += 1

        return " ".join(translated_words)

    def translate_multiple_lines(self, lines, src_lang, tgt_lang):
        """Translate each line with ``translate_text`` and return the list."""
        return [self.translate_text(line, src_lang, tgt_lang) for line in lines]
app.py CHANGED
@@ -1,138 +1,331 @@
1
- import gradio as gr
2
- from PIL import Image
3
- import tempfile
4
- import os
5
- from IndicPhotoOCR.ocr import OCR # Ensure OCR class is saved in a file named ocr.py
6
- from IndicPhotoOCR.theme import Seafoam
7
- from IndicPhotoOCR.utils.helper import detect_para
8
-
9
- # Possible values for identifier_lang
10
- VALID_IDENTIFIER_LANGS = ["hindi", "assamese", "bengali", "gujarati", "kannada", "malayalam","odia", "punjabi", "tamil", "telugu", "auto"] # Add more as needed
11
-
12
- def process_image(image, identifier_lang):
13
- """
14
- Processes the uploaded image for text detection and recognition.
15
- - Detects bounding boxes in the image
16
- - Draws bounding boxes on the image and identifies script in each detected area
17
- - Recognizes text in each cropped region and returns the annotated image and recognized text
18
-
19
- Parameters:
20
- image (PIL.Image): The input image to be processed.
21
- identifier_lang (str): The script identifier model to use.
22
-
23
- Returns:
24
- tuple: A PIL.Image with bounding boxes and a string of recognized text.
25
- """
26
-
27
- # Save the input image temporarily
28
- with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_input:
29
- image.save(temp_input.name)
30
- image_path = temp_input.name
31
-
32
- # Initialize OCR with the selected identifier language
33
- ocr = OCR(identifier_lang=identifier_lang, verbose=False)
34
-
35
- # Detect bounding boxes on the image using OCR
36
- detections = ocr.detect(image_path)
37
-
38
- output_path = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
39
-
40
- # Draw bounding boxes on the image and save it as output
41
- ocr.visualize_detection(image_path, detections, save_path=output_path)
42
-
43
- # Load the annotated image with bounding boxes drawn
44
- output_image = Image.open(output_path)
45
-
46
- # Recognize text from the detected areas
47
- recognized_text = ocr.ocr(image_path)
48
- recognized_text = '\n'.join([' '.join(line) for line in recognized_text])
49
-
50
- return output_image, recognized_text
51
-
52
- # Custom HTML for interface header with logos and alignment
53
- interface_html = """
54
- <div style="text-align: left; padding: 10px;">
55
- <div style="background-color: white; padding: 10px; display: inline-block;">
56
- <img src="https://iitj.ac.in/images/logo/Design-of-New-Logo-of-IITJ-2.png" alt="IITJ Logo" style="width: 100px; height: 100px;">
57
- </div>
58
- <img src="https://play-lh.googleusercontent.com/_FXSr4xmhPfBykmNJvKvC0GIAVJmOLhFl6RA5fobCjV-8zVSypxX8yb8ka6zu6-4TEft=w240-h480-rw" alt="Bhashini Logo" style="width: 100px; height: 100px; float: right;">
59
- </div>
60
- """
61
-
62
-
63
-
64
- # Links to GitHub and Dataset repositories with GitHub icon
65
- links_html = """
66
- <div style="text-align: center; padding-top: 20px;">
67
- <a href="https://github.com/Bhashini-IITJ/IndicPhotoOCR" target="_blank" style="margin-right: 20px; font-size: 18px; text-decoration: none;">
68
- GitHub Repository
69
- </a>
70
- <a href="https://github.com/Bhashini-IITJ/BharatSceneTextDataset" target="_blank" style="font-size: 18px; text-decoration: none;">
71
- Dataset Repository
72
- </a>
73
- </div>
74
- """
75
-
76
- # Custom CSS to style the text box and center the title
77
- custom_css = """
78
- .custom-textbox textarea {
79
- font-size: 20px !important;
80
- }
81
-
82
- #title {
83
- text-align: center;
84
- font-size: 28px;
85
- font-weight: bold;
86
- margin-bottom: 20px;
87
- }
88
- """
89
-
90
- # Create an instance of the Seafoam theme for a consistent visual style
91
- seafoam = Seafoam()
92
-
93
- # Clear function
94
- def clear_inputs():
95
- return None, "auto", None, ""
96
-
97
- # Define the Gradio Blocks interface
98
- with gr.Blocks(theme=seafoam, css=custom_css) as demo:
99
-
100
- gr.Markdown("# IndicPhotoOCR - Indic Scene Text Recogniser Toolkit", elem_id="title")
101
- gr.Markdown("# Developed by IIT Jodhpur", elem_id="title")
102
- gr.Markdown(interface_html + links_html)
103
-
104
- with gr.Row():
105
- with gr.Column():
106
- input_image = gr.Image(type="pil", image_mode="RGB", label="Upload Image")
107
- lang_dropdown = gr.Dropdown(VALID_IDENTIFIER_LANGS, label="Identifier Language", value="auto")
108
- run_button = gr.Button("Run OCR")
109
- clear_button = gr.Button("Clear", variant="stop") # Added Clear Button
110
-
111
-
112
- with gr.Column():
113
- output_image = gr.Image(type="pil", label="Processed Image")
114
- output_text = gr.Textbox(label="Recognized Text", lines=10, elem_classes="custom-textbox")
115
-
116
- # Examples shown separately (to avoid schema error)
117
- gr.Examples(
118
- examples=[["test_images/image_88.jpg", "auto"],
119
- ["test_images/image_742.jpg", "hindi"]],
120
- inputs=[input_image, lang_dropdown],
121
- label="Try an example"
122
- )
123
-
124
- # Connect logic
125
- run_button.click(fn=process_image, inputs=[input_image, lang_dropdown], outputs=[output_image, output_text])
126
- clear_button.click(fn=clear_inputs, outputs=[input_image, lang_dropdown, output_image, output_text]) # Clear logic
127
-
128
-
129
- # Launch
130
- demo.launch(share=True)
131
-
132
- # # 👇 Local server launch config
133
- # if __name__ == "__main__":
134
- # demo.launch(
135
- # server_name="0.0.0.0",
136
- # server_port=7866,
137
- # share=False
138
- # )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import tempfile
4
+ import os
5
+ from IndicPhotoOCR.ocr import OCR # Ensure OCR class is saved in a file named ocr.py
6
+ from IndicPhotoOCR.theme import Seafoam
7
+ from IndicPhotoOCR.utils.helper import detect_para
8
+ from IndicPhotoOCR.translation.indictrans_translator import IndicTransTranslator
9
+ from IndicPhotoOCR.translation.simple_translator import SimpleTranslator
10
+
11
+ # Possible values for identifier_lang
12
+ VALID_IDENTIFIER_LANGS = ["hindi", "assamese", "bengali", "gujarati", "kannada", "malayalam","odia", "punjabi", "tamil", "telugu", "auto"] # Add more as needed
13
+
14
+ # Translation target languages (what users can translate TO)
15
+ TRANSLATION_LANGUAGES = ["None", "assamese", "bengali", "english", "gujarati", "hindi", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "urdu"]
16
+
17
+ # Initialize the translators (will be loaded when needed)
18
+ advanced_translator = None
19
+ simple_translator = SimpleTranslator()
20
+
21
def process_image(image, identifier_lang):
    """
    Processes the uploaded image for text detection and recognition only.
    - Detects bounding boxes in the image
    - Draws bounding boxes on the image and identifies script in each detected area
    - Recognizes text in each cropped region

    Parameters:
        image (PIL.Image): The input image to be processed. May be None when
            nothing has been uploaded.
        identifier_lang (str): The script identifier model to use.

    Returns:
        tuple: A PIL.Image with bounding boxes and the recognized text, one
        line per recognized line. When no image was supplied, returns
        (None, <explanatory message>) instead of raising.
    """
    if image is None:
        return None, "No image provided - Please upload an image first"

    # Reserve two temp paths. The handles are closed straight away so the
    # files can be reopened by name by PIL and the OCR pipeline.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_input:
        image_path = temp_input.name
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_output:
        output_path = temp_output.name

    try:
        # Save the input image temporarily
        image.save(image_path)

        # Initialize OCR with the selected identifier language
        ocr = OCR(identifier_lang=identifier_lang, verbose=False, device="cpu")

        # Detect bounding boxes on the image using OCR
        detections = ocr.detect(image_path)

        # Draw bounding boxes on the image and save it as output
        ocr.visualize_detection(image_path, detections, save_path=output_path)

        # Image.open is lazy; copy() forces the pixel data into memory so the
        # result stays usable after the temp file is deleted below.
        with Image.open(output_path) as annotated:
            output_image = annotated.copy()

        # Recognize text from the detected areas
        recognized_text = ocr.ocr(image_path)
        recognized_text_str = '\n'.join(' '.join(line) for line in recognized_text)

        return output_image, recognized_text_str
    finally:
        # Both files were created with delete=False, so remove them here to
        # avoid leaking two temp files per request.
        for temp_path in (image_path, output_path):
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a file that is already gone or still
                # locked must not mask the OCR result or error.
                pass
61
+
62
+ def translate_text(recognized_text, target_lang):
63
+ """
64
+ Translates the recognized text to the target language.
65
+ Uses IndicTrans2 for all translation.
66
+
67
+ Parameters:
68
+ recognized_text (str): The text to translate.
69
+ target_lang (str): Target language for translation.
70
+
71
+ Returns:
72
+ str: Translated text.
73
+ """
74
+ global advanced_translator, simple_translator
75
+
76
+ if not recognized_text or not recognized_text.strip():
77
+ return "No text to translate - Please run OCR first"
78
+
79
+ if target_lang == "None" or target_lang.lower() == "none":
80
+ return "Please select a target language for translation"
81
+
82
+ # Clean problematic Unicode characters from input
83
+ import re
84
+ import unicodedata
85
+
86
+ def clean_input_text(text):
87
+ """Clean problematic Unicode characters from input text"""
88
+ # Normalize Unicode
89
+ text = unicodedata.normalize('NFC', text)
90
+
91
+ # Remove problematic Unicode characters that cause translation issues
92
+ problematic_chars = {
93
+ '\u09BC': '', # Bengali nukta - your specific issue
94
+ '\u093C': '', # Devanagari nukta
95
+ '\u200C': '', # Zero width non-joiner
96
+ '\u200D': '', # Zero width joiner
97
+ '\uFEFF': '', # Byte order mark
98
+ }
99
+
100
+ for char, replacement in problematic_chars.items():
101
+ text = text.replace(char, replacement)
102
+
103
+ # Remove Unicode escape sequences
104
+ text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
105
+
106
+ return text.strip()
107
+
108
+ # Clean the input text
109
+ recognized_text = clean_input_text(recognized_text)
110
+
111
+ # Progress tracking for UI
112
+ progress_messages = []
113
+
114
+ def progress_callback(msg):
115
+ progress_messages.append(msg)
116
+ return f"LOADING MODELS... Please wait (may take 2-5 minutes)\n\n{msg}\n\n Original text:\n{recognized_text[:200]}..."
117
+
118
+ try:
119
+ print(f"\nStarting translation to {target_lang}...")
120
+ print(f"Text to translate (first 200 chars): {recognized_text[:200]}...")
121
+
122
+ # Split text into lines
123
+ text_lines = [line.strip() for line in recognized_text.split('\n') if line.strip()]
124
+
125
+ if not text_lines:
126
+ return "No valid text lines to translate"
127
+
128
+ print(f"Found {len(text_lines)} lines to translate")
129
+
130
+ # Initialize advanced translator if not already done
131
+ if advanced_translator is None:
132
+ try:
133
+ print("Initializing IndicTrans2 translator...")
134
+ print("This may take 2-5 minutes for first-time model download...")
135
+ print("Please be patient - models are large (~1-2GB each)")
136
+
137
+ # Show initial loading message
138
+ loading_msg = "INITIALIZING TRANSLATION MODELS...\n\n"
139
+ loading_msg += "First-time setup may take 2-5 minutes\n"
140
+ loading_msg += "Downloading models (~2GB)\n"
141
+ loading_msg += "Please be patient and don't refresh the page\n\n"
142
+ loading_msg += f"Text to translate:\n{recognized_text[:300]}..."
143
+
144
+ # Initialize with progress callback
145
+ advanced_translator = IndicTransTranslator(device="cpu", progress_callback=progress_callback)
146
+ print("IndicTrans2 translator initialized!")
147
+
148
+ except Exception as init_error:
149
+ print(f"Failed to initialize IndicTrans2: {str(init_error)}")
150
+ print("Falling back to simple translator...")
151
+
152
+ # Fallback to simple translator
153
+ try:
154
+ source_lang = simple_translator.detect_language_from_script(text_lines)
155
+ print(f"Simple translator detected language: {source_lang}")
156
+
157
+ simple_translated_lines = []
158
+ for line in text_lines:
159
+ simple_result = simple_translator.translate_text(line, source_lang, target_lang)
160
+ simple_translated_lines.append(simple_result)
161
+
162
+ fallback_result = '\n'.join(simple_translated_lines)
163
+ return f"Advanced translator unavailable, using simple translation:\n\n{fallback_result}"
164
+
165
+ except Exception as simple_error:
166
+ return f"All translators failed:\nAdvanced: {str(init_error)}\nSimple: {str(simple_error)}"
167
+
168
+ # Detect source language
169
+ source_lang = advanced_translator.detect_language_from_script(text_lines)
170
+ print(f"Detected source language: {source_lang}")
171
+
172
+ if source_lang == target_lang.lower():
173
+ return f"Source and target languages are the same ({source_lang}). No translation needed."
174
+
175
+ # Use the improved batch translation for efficiency
176
+ print("Starting batch translation...")
177
+ print("Model loading in progress... Please wait...")
178
+
179
+ try:
180
+ translated_lines = advanced_translator.translate_multiple_lines(text_lines, source_lang, target_lang)
181
+
182
+ # Combine results
183
+ result = '\n'.join(translated_lines)
184
+
185
+ print("Translation completed successfully!")
186
+
187
+ # Final cleaning of the result
188
+ result = clean_input_text(result)
189
+ return result
190
+
191
+ except Exception as translate_error:
192
+ print(f"Advanced translation failed: {str(translate_error)}")
193
+ print("Trying simple translator fallback...")
194
+
195
+ # Try simple translator as fallback
196
+ source_lang = simple_translator.detect_language_from_script(text_lines)
197
+ simple_translated_lines = []
198
+ for line in text_lines:
199
+ simple_result = simple_translator.translate_text(line, source_lang, target_lang)
200
+ simple_translated_lines.append(simple_result)
201
+
202
+ fallback_result = '\n'.join(simple_translated_lines)
203
+ return f"Advanced translation failed, using simple translation:\n\n{fallback_result}"
204
+
205
+ except Exception as e:
206
+ error_msg = f"Translation error: {str(e)}"
207
+ print(f"ERROR: {error_msg}")
208
+ import traceback
209
+ traceback.print_exc()
210
+
211
+ # Final fallback to simple translator
212
+ try:
213
+ print("Attempting simple translator fallback...")
214
+ text_lines = [line.strip() for line in recognized_text.split('\n') if line.strip()]
215
+ source_lang = simple_translator.detect_language_from_script(text_lines)
216
+
217
+ simple_translated_lines = []
218
+ for line in text_lines:
219
+ simple_result = simple_translator.translate_text(line, source_lang, target_lang)
220
+ simple_translated_lines.append(simple_result)
221
+
222
+ fallback_result = '\n'.join(simple_translated_lines)
223
+ return f"Advanced translation failed with error, using simple translation:\n{error_msg}\n\n{fallback_result}"
224
+
225
+ except Exception as e2:
226
+ return f"All translation methods failed:\nAdvanced: {error_msg}\nSimple: {str(e2)}"
227
+
228
+ # Custom HTML for interface header with logos and alignment
229
+ interface_html = """
230
+ <div style="text-align: left; padding: 10px;">
231
+ <div style="background-color: white; padding: 10px; display: inline-block;">
232
+ <img src="https://iitj.ac.in/images/logo/Design-of-New-Logo-of-IITJ-2.png" alt="IITJ Logo" style="width: 100px; height: 100px;">
233
+ </div>
234
+ <img src="https://play-lh.googleusercontent.com/_FXSr4xmhPfBykmNJvKvC0GIAVJmOLhFl6RA5fobCjV-8zVSypxX8yb8ka6zu6-4TEft=w240-h480-rw" alt="Bhashini Logo" style="width: 100px; height: 100px; float: right;">
235
+ </div>
236
+ """
237
+
238
+
239
+
240
+ # Links to GitHub and Dataset repositories with GitHub icon
241
+ links_html = """
242
+ <div style="text-align: center; padding-top: 20px;">
243
+ <a href="https://github.com/Bhashini-IITJ/IndicPhotoOCR" target="_blank" style="margin-right: 20px; font-size: 18px; text-decoration: none;">
244
+ GitHub Repository
245
+ </a>
246
+ <a href="https://github.com/Bhashini-IITJ/BharatSceneTextDataset" target="_blank" style="font-size: 18px; text-decoration: none;">
247
+ Dataset Repository
248
+ </a>
249
+ </div>
250
+ """
251
+
252
+ # Custom CSS to style the text box and center the title
253
+ custom_css = """
254
+ .custom-textbox textarea {
255
+ font-size: 20px !important;
256
+ }
257
+
258
+ #title {
259
+ text-align: center;
260
+ font-size: 28px;
261
+ font-weight: bold;
262
+ margin-bottom: 20px;
263
+ }
264
+ """
265
+
266
+ # Create an instance of the Seafoam theme for a consistent visual style
267
+ seafoam = Seafoam()
268
+
269
+ # Clear function
270
def clear_inputs():
    """Return the reset value for each component wired to the Clear button.

    The six values line up with the outputs of ``clear_button.click``:
    input image, identifier language, translation language, output image,
    recognized text and translated text.
    """
    reset_values = (None, "auto", "None", None, "", "")
    return reset_values
272
+
273
+ # Define the Gradio Blocks interface
274
# Gradio UI: upload and language controls in one column, OCR and translation
# results in the other.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened listing - confirm it against the original file.
with gr.Blocks(theme=seafoam, css=custom_css) as demo:

    # Page header
    gr.Markdown("# IndicPhotoOCR - Indic Scene Text Recogniser Toolkit", elem_id="title")
    gr.Markdown("# Developed by IIT Jodhpur", elem_id="title")
    gr.Markdown(interface_html + links_html)

    with gr.Row():
        # Input side
        with gr.Column():
            input_image = gr.Image(type="pil", image_mode="RGB", label="Upload Image")
            lang_dropdown = gr.Dropdown(VALID_IDENTIFIER_LANGS, label="Identifier Language", value="auto")
            translation_dropdown = gr.Dropdown(TRANSLATION_LANGUAGES, label="Translate to Language", value="None")

            with gr.Row():
                run_ocr_button = gr.Button("Run OCR", variant="primary")
                translate_button = gr.Button("Translate Text", variant="secondary")

            clear_button = gr.Button("Clear", variant="stop")

        # Output side
        with gr.Column():
            output_image = gr.Image(type="pil", label="Processed Image")
            output_text = gr.Textbox(label="Recognized Text", lines=8, elem_classes="custom-textbox")
            translated_text = gr.Textbox(label="Translated Text", lines=8, elem_classes="custom-textbox")

    # Examples shown separately (to avoid schema error)
    example_rows = [
        ["test_images/image_88.jpg", "auto", "english"],
        ["test_images/image_742.jpg", "hindi", "english"],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=[input_image, lang_dropdown, translation_dropdown],
        label="Try an example"
    )

    # OCR fills the annotated image and the recognized-text box
    run_ocr_button.click(
        fn=process_image,
        inputs=[input_image, lang_dropdown],
        outputs=[output_image, output_text],
    )
    # Translation reads the recognized-text box, so OCR has to run first
    translate_button.click(
        fn=translate_text,
        inputs=[output_text, translation_dropdown],
        outputs=translated_text,
    )
    # Clear resets every input and output; order matches clear_inputs()
    clear_button.click(
        fn=clear_inputs,
        outputs=[
            input_image,
            lang_dropdown,
            translation_dropdown,
            output_image,
            output_text,
            translated_text,
        ],
    )
309
+
310
+
311
+ # Launch
312
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    print("Starting IndicPhotoOCR...")

    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "debug": True,
    }

    try:
        demo.launch(**launch_options)
    except KeyboardInterrupt:
        print("App interrupted by user")
    except Exception as launch_error:
        # Report the failure with a full traceback instead of re-raising
        print(f"Error launching app: {launch_error}")
        import traceback
        traceback.print_exc()
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- aiohappyeyeballs==2.4.3
2
  aiohttp==3.10.10
3
  aiosignal==1.3.1
4
  async-timeout==4.0.3
@@ -45,3 +45,6 @@ easydict==1.13
45
  scipy==1.13.1
46
  transformers==4.45.1
47
  datasets==3.1.0
 
 
 
 
1
+ aiohappyeyeballs>=2.0.0
2
  aiohttp==3.10.10
3
  aiosignal==1.3.1
4
  async-timeout==4.0.3
 
45
  scipy==1.13.1
46
  transformers==4.45.1
47
  datasets==3.1.0
48
+ IndicTransToolkit>=1.0.0
49
+ sentencepiece>=0.1.99
50
+ sacremoses>=0.0.53