Translation Added (#2)
- Translation Added (39b0c467a8b1ff824e2bdc75e6a70c5801246fd4)
- IndicPhotoOCR/translation/__init__.py +4 -0
- IndicPhotoOCR/translation/indictrans_translator.py +339 -0
- IndicPhotoOCR/translation/simple_translator.py +132 -0
- app.py +331 -138
- requirements.txt +4 -1
IndicPhotoOCR/translation/__init__.py
ADDED
@@ -0,0 +1,4 @@
+# Translation module for IndicPhotoOCR
+from .indictrans_translator import IndicTransTranslator
+
+__all__ = ['IndicTransTranslator']
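Because this `__init__.py` re-exports the class, callers can use the shorter package-level import; a minimal sketch (assuming IndicPhotoOCR is on the Python path):

    from IndicPhotoOCR.translation import IndicTransTranslator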
IndicPhotoOCR/translation/indictrans_translator.py
ADDED
@@ -0,0 +1,339 @@
+"""
+IndicTrans2 Translation Module for IndicPhotoOCR
+Clean implementation with proper error handling and full language support
+Optimized for web app usage with better timeout handling
+"""
+
+import torch
+from typing import List, Optional, Callable
+import warnings
+import unicodedata
+import re
+warnings.filterwarnings("ignore")
+
+try:
+    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+    from IndicTransToolkit import IndicProcessor
+    INDICTRANS_AVAILABLE = True
+except ImportError as e:
+    print(f"Warning: IndicTransToolkit not available: {e}")
+    INDICTRANS_AVAILABLE = False
+
+class IndicTransTranslator:
+    def __init__(self, device: str = "cpu", progress_callback: Optional[Callable] = None):
+        """Initialize the IndicTrans2 translator"""
+        if not INDICTRANS_AVAILABLE:
+            raise ImportError("IndicTransToolkit not available. Install with: pip install IndicTransToolkit")
+
+        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
+        self.progress_callback = progress_callback
+        self.models = {}
+        self.tokenizers = {}
+        self.processor = None
+        self.initialized = False
+
+        # Language code mapping for IndicTrans2
+        self.lang_codes = {
+            'assamese': 'asm_Beng',
+            'bengali': 'ben_Beng',
+            'gujarati': 'guj_Gujr',
+            'hindi': 'hin_Deva',
+            'kannada': 'kan_Knda',
+            'malayalam': 'mal_Mlym',
+            'marathi': 'mar_Deva',
+            'odia': 'ory_Orya',
+            'punjabi': 'pan_Guru',
+            'tamil': 'tam_Taml',
+            'telugu': 'tel_Telu',
+            'urdu': 'urd_Arab',
+            'english': 'eng_Latn'
+        }
+
+        # Script ranges for efficient language detection
+        self.script_ranges = {
+            'devanagari': (0x0900, 0x097F),  # Hindi/Marathi
+            'bengali_assamese': (0x0980, 0x09FF),  # Bengali/Assamese
+            'gujarati': (0x0A80, 0x0AFF),
+            'tamil': (0x0B80, 0x0BFF),
+            'telugu': (0x0C00, 0x0C7F),
+            'kannada': (0x0C80, 0x0CFF),
+            'malayalam': (0x0D00, 0x0D7F),
+            'odia': (0x0B00, 0x0B7F),
+            'punjabi': (0x0A00, 0x0A7F),  # Gurmukhi
+            'urdu': (0x0600, 0x06FF),  # Arabic
+        }
+
+    def _clean_text(self, text: str) -> str:
+        """Clean and normalize text for better translation"""
+        if not text:
+            return text
+
+        # Normalize Unicode and remove problematic characters
+        text = unicodedata.normalize('NFC', text)
+        text = re.sub(r'[\u200C\u200D\uFEFF]', '', text)  # Remove zero-width chars
+        text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)  # Remove Unicode escapes
+        text = re.sub(r'\s+', ' ', text.strip())  # Normalize whitespace
+
+        # Fix punctuation spacing
+        text = re.sub(r'\s+([।,.!?;:])', r'\1', text)
+        text = re.sub(r'([।,.!?;:])\s+', r'\1 ', text)
+
+        return text.strip()
+
+    def _update_progress(self, message: str):
+        """Update progress via callback if available"""
+        if self.progress_callback:
+            try:
+                self.progress_callback(message)
+            except Exception:
+                pass
+        print(message)
+
+    def _load_models(self):
+        """Load both translation models"""
+        if self.initialized:
+            return
+
+        try:
+            self._update_progress("Initializing IndicProcessor...")
+            self.processor = IndicProcessor(inference=True)
+
+            # Model names
+            model_names = {
+                "indic-en": "ai4bharat/indictrans2-indic-en-1B",
+                "en-indic": "ai4bharat/indictrans2-en-indic-1B"
+            }
+
+            for key, model_name in model_names.items():
+                self._update_progress(f"Loading {model_name}...")
+
+                # Load tokenizer
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_name,
+                    trust_remote_code=True
+                )
+
+                # Load model with memory optimization
+                model = AutoModelForSeq2SeqLM.from_pretrained(
+                    model_name,
+                    trust_remote_code=True,
+                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+                    low_cpu_mem_usage=True
+                )
+
+                self.models[key] = model.to(self.device).eval()
+                self.tokenizers[key] = tokenizer
+
+                self._update_progress(f"{key} model loaded successfully")
+
+            self.initialized = True
+            self._update_progress("All translation models loaded!")
+
+        except Exception as e:
+            self._update_progress(f"Error loading models: {str(e)}")
+            raise
+
+    def detect_language_from_script(self, text_lines: List[str]) -> str:
+        """Detect primary language based on script characteristics"""
+        if isinstance(text_lines, str):
+            text_lines = [text_lines]
+
+        full_text = " ".join(text_lines)
+        if not full_text.strip():
+            return 'english'
+
+        # Count characters from different scripts
+        script_counts = {script: 0 for script in self.script_ranges.keys()}
+        script_counts['english'] = 0
+        assamese_chars = 0
+
+        for char in full_text:
+            # Check script ranges
+            found_script = False
+            for script, (start, end) in self.script_ranges.items():
+                if start <= ord(char) <= end:
+                    script_counts[script] += 1
+                    # Special check for Assamese
+                    if script == 'bengali_assamese' and char in 'ৰৱ':
+                        assamese_chars += 1
+                    found_script = True
+                    break
+
+            # If not found in Indic scripts, check for English
+            if not found_script and char.isalpha() and ord(char) < 128:
+                script_counts['english'] += 1
+
+        total_chars = sum(script_counts.values())
+        if total_chars == 0:
+            return 'english'
+
+        # Find dominant script
+        dominant_script = max(script_counts, key=script_counts.get)
+        dominant_count = script_counts[dominant_script]
+
+        # Special handling for Bengali/Assamese
+        if dominant_script == 'bengali_assamese':
+            return 'assamese' if assamese_chars > 0 else 'bengali'
+
+        # Check for mixed content
+        english_ratio = script_counts['english'] / total_chars
+        if english_ratio > 0.2 and any(script_counts[s] > 0 for s in self.script_ranges.keys()):
+            return 'mixed'
+
+        # Map script to language
+        script_to_lang = {
+            'devanagari': 'hindi',
+            'gujarati': 'gujarati',
+            'tamil': 'tamil',
+            'telugu': 'telugu',
+            'kannada': 'kannada',
+            'malayalam': 'malayalam',
+            'odia': 'odia',
+            'punjabi': 'punjabi',
+            'urdu': 'urdu',
+            'english': 'english'
+        }
+
+        return script_to_lang.get(dominant_script, 'english')
+
+    def _translate_batch_direct(self, texts: List[str], src_lang: str, tgt_lang: str) -> List[str]:
+        """Direct translation using appropriate model"""
+        if not texts:
+            return []
+
+        self._load_models()
+
+        # Convert to language codes
+        src_code = self.lang_codes.get(src_lang.lower(), src_lang)
+        tgt_code = self.lang_codes.get(tgt_lang.lower(), tgt_lang)
+
+        # Determine which model to use
+        if src_code == 'eng_Latn' and tgt_code != 'eng_Latn':
+            model_key = "en-indic"
+        elif src_code != 'eng_Latn' and tgt_code == 'eng_Latn':
+            model_key = "indic-en"
+        else:
+            raise ValueError(f"Unsupported direct translation: {src_lang} -> {tgt_lang}")
+
+        model = self.models[model_key]
+        tokenizer = self.tokenizers[model_key]
+
+        try:
+            # Clean and preprocess
+            cleaned_texts = [self._clean_text(text) for text in texts]
+            processed_texts = self.processor.preprocess_batch(cleaned_texts, src_lang=src_code, tgt_lang=tgt_code)
+
+            # Tokenize
+            inputs = tokenizer(
+                processed_texts,
+                truncation=True,
+                padding=True,
+                max_length=512,  # Increased for better context
+                return_tensors="pt"
+            ).to(self.device)
+
+            # Generate
+            with torch.no_grad():
+                generated_tokens = model.generate(
+                    **inputs,
+                    max_length=512,
+                    num_beams=5,  # Increased beam search
+                    early_stopping=True,
+                    do_sample=False,
+                    use_cache=False,
+                    pad_token_id=tokenizer.pad_token_id,
+                    repetition_penalty=1.1,  # Reduce repetition
+                    length_penalty=1.0  # Balanced length penalty
+                )
+
+            # Decode
+            decoded = tokenizer.batch_decode(
+                generated_tokens,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            )
+
+            # Postprocess
+            results = self.processor.postprocess_batch(decoded, lang=tgt_code)
+
+            return [self._clean_text(result) for result in results]
+
+        except Exception as e:
+            print(f"Translation error: {str(e)}")
+            return [f"[Translation failed: {text}]" for text in texts]
+
+    def translate_text(self, text: str, src_lang: str, tgt_lang: str) -> str:
+        """Translate single text"""
+        if not text or not text.strip():
+            return ""
+
+        result = self.translate_multiple_lines([text], src_lang, tgt_lang)
+        return result[0] if result else text
+
+    def translate_multiple_lines(self, lines: List[str], src_lang: str, tgt_lang: str) -> List[str]:
+        """Translate multiple lines intelligently"""
+        if not lines:
+            return []
+
+        tgt_lang = tgt_lang.lower()
+        translated_lines = []
+
+        print(f"Processing {len(lines)} lines for translation to {tgt_lang}")
+
+        for i, line in enumerate(lines):
+            if not line or not line.strip():
+                translated_lines.append(line)
+                continue
+
+            # Detect language
+            line_lang = self.detect_language_from_script([line.strip()])
+            print(f"Line {i+1}: detected as {line_lang}")
+
+            # Keep if already in target language
+            if line_lang == tgt_lang:
+                print("  Keeping unchanged")
+                translated_lines.append(line)
+                continue
+
+            # Handle mixed content
+            if line_lang == 'mixed':
+                print("  Mixed language - attempting translation")
+                try:
+                    assumed_lang = 'hindi' if tgt_lang == 'english' else 'english'
+                    result = self._translate_batch_direct([line.strip()], assumed_lang, tgt_lang)
+                    translated_lines.append(result[0] if result and result[0] else line)
+                except Exception:
+                    translated_lines.append(line)
+                continue
+
+            # Translate different language
+            try:
+                print(f"  Translating from {line_lang} to {tgt_lang}")
+
+                if line_lang == "english" and tgt_lang != "english":
+                    # English → Indic
+                    result = self._translate_batch_direct([line.strip()], line_lang, tgt_lang)
+                elif line_lang != "english" and tgt_lang == "english":
+                    # Indic → English
+                    result = self._translate_batch_direct([line.strip()], line_lang, tgt_lang)
+
+                elif line_lang != "english" and tgt_lang != "english":
+                    # Bridge via English
+                    english_result = self._translate_batch_direct([line.strip()], line_lang, "english")
+                    result = self._translate_batch_direct(english_result, "english", tgt_lang) if english_result and english_result[0] else [line.strip()]
+                else:
+                    result = [line.strip()]
+
+                translated_lines.append(result[0] if result and result[0] else line)
+
+            except Exception as e:
+                print(f"  Translation error: {e}")
+                translated_lines.append(line)
+
+        print(f"Processing completed for all {len(lines)} lines")
+        return translated_lines
+
+    def get_supported_languages(self) -> List[str]:
+        """Get supported languages"""
+        return list(self.lang_codes.keys())
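A minimal usage sketch for the translator added above (not part of the commit; it assumes IndicTransToolkit is installed and that the two ai4bharat/indictrans2 1B checkpoints can be downloaded; the sample sentence is illustrative):

    from IndicPhotoOCR.translation.indictrans_translator import IndicTransTranslator

    translator = IndicTransTranslator(device="cpu")        # no weights loaded yet
    lines = ["नमस्ते दुनिया"]                                  # hypothetical OCR output
    src = translator.detect_language_from_script(lines)     # 'hindi' (Devanagari range)
    print(translator.translate_multiple_lines(lines, src, "english"))

Model weights are loaded lazily on the first call that reaches `_translate_batch_direct`, so constructing the translator is cheap; the first real translation triggers the large downloads.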
IndicPhotoOCR/translation/simple_translator.py
ADDED
@@ -0,0 +1,132 @@
+"""
+Simple Translator Module for IndicPhotoOCR
+Fallback translator with basic word mappings and script detection
+"""
+
+class SimpleTranslator:
+    """Simple fallback translator with basic word mappings"""
+
+    def __init__(self):
+        # Basic word mappings for common terms
+        self.translations = {
+            'hindi_to_english': {
+                'नमस्ते': 'hello',
+                'धन्यवाद': 'thank you',
+                'हाँ': 'yes',
+                'नहीं': 'no',
+                'अच्छा': 'good',
+                'बुरा': 'bad',
+                'पानी': 'water',
+                'खाना': 'food',
+                'घर': 'home',
+                'स्कूल': 'school'
+            },
+            'english_to_hindi': {
+                'hello': 'नमस्ते',
+                'thank you': 'धन्यवाद',
+                'yes': 'हाँ',
+                'no': 'नहीं',
+                'good': 'अच्छा',
+                'bad': 'बुरा',
+                'water': 'पानी',
+                'food': 'खाना',
+                'home': 'घर',
+                'school': 'स्कूल'
+            },
+            'bengali_to_english': {
+                'নমস্কার': 'hello',
+                'ধন্যবাদ': 'thank you',
+                'হ্যাঁ': 'yes',
+                'না': 'no',
+                'ভাল': 'good',
+                'খারাপ': 'bad'
+            },
+            'english_to_bengali': {
+                'hello': 'নমস্কার',
+                'thank you': 'ধন্যবাদ',
+                'yes': 'হ্যাঁ',
+                'no': 'না',
+                'good': 'ভাল',
+                'bad': 'খারাপ'
+            }
+        }
+
+    def detect_language_from_script(self, text_lines):
+        """Simple language detection based on Unicode ranges"""
+        if isinstance(text_lines, str):
+            text_lines = [text_lines]
+
+        full_text = " ".join(text_lines)
+
+        if not full_text.strip():
+            return 'english'
+
+        # Check for different scripts
+        # Devanagari (Hindi/Marathi)
+        if any(0x0900 <= ord(char) <= 0x097F for char in full_text):
+            return 'hindi'
+
+        # Bengali script
+        if any(0x0980 <= ord(char) <= 0x09FF for char in full_text):
+            return 'bengali'
+
+        # Gujarati script
+        if any(0x0A80 <= ord(char) <= 0x0AFF for char in full_text):
+            return 'gujarati'
+
+        # Tamil script
+        if any(0x0B80 <= ord(char) <= 0x0BFF for char in full_text):
+            return 'tamil'
+
+        # Telugu script
+        if any(0x0C00 <= ord(char) <= 0x0C7F for char in full_text):
+            return 'telugu'
+
+        # Kannada script
+        if any(0x0C80 <= ord(char) <= 0x0CFF for char in full_text):
+            return 'kannada'
+
+        # Malayalam script
+        if any(0x0D00 <= ord(char) <= 0x0D7F for char in full_text):
+            return 'malayalam'
+
+        # Odia script
+        if any(0x0B00 <= ord(char) <= 0x0B7F for char in full_text):
+            return 'odia'
+
+        # Punjabi script (Gurmukhi)
+        if any(0x0A00 <= ord(char) <= 0x0A7F for char in full_text):
+            return 'punjabi'
+
+        # Urdu script (Arabic)
+        if any(0x0600 <= ord(char) <= 0x06FF for char in full_text):
+            return 'urdu'
+
+        return 'english'
+
+    def translate_text(self, text, src_lang, tgt_lang):
+        """Simple word-by-word translation"""
+        if not text or src_lang == tgt_lang:
+            return text
+
+        # Get translation dictionary
+        dict_key = f"{src_lang}_to_{tgt_lang}"
+        trans_dict = self.translations.get(dict_key, {})
+
+        if not trans_dict:
+            return f"[Simple translation not available: {src_lang} → {tgt_lang}] {text}"
+
+        # Simple word replacement
+        words = text.split()
+        translated_words = []
+
+        for word in words:
+            # Try exact match first, then lowercase
+            translated_word = trans_dict.get(word, trans_dict.get(word.lower(), word))
+            translated_words.append(translated_word)
+
+        return " ".join(translated_words)
+
+    def translate_multiple_lines(self, lines, src_lang, tgt_lang):
+        """Translate multiple lines"""
+        return [self.translate_text(line, src_lang, tgt_lang) for line in lines]
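A quick sketch of the fallback path (illustrative only; the built-in dictionaries cover just a handful of Hindi and Bengali words, and unknown words pass through unchanged):

    from IndicPhotoOCR.translation.simple_translator import SimpleTranslator

    st = SimpleTranslator()
    st.detect_language_from_script("नमस्ते")                  # 'hindi'
    st.translate_text("नमस्ते", "hindi", "english")            # 'hello'
    st.translate_text("hello friend", "english", "hindi")      # 'नमस्ते friend' (unknown word kept)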
app.py
CHANGED
@@ -1,138 +1,331 @@
-import gradio as gr
-from PIL import Image
-import tempfile
-import os
-from IndicPhotoOCR.ocr import OCR  # Ensure OCR class is saved in a file named ocr.py
-from IndicPhotoOCR.theme import Seafoam
-from IndicPhotoOCR.utils.helper import detect_para
-[... remaining 131 removed lines of the previous app.py are not preserved in this view ...]
+import gradio as gr
+from PIL import Image
+import tempfile
+import os
+from IndicPhotoOCR.ocr import OCR  # Ensure OCR class is saved in a file named ocr.py
+from IndicPhotoOCR.theme import Seafoam
+from IndicPhotoOCR.utils.helper import detect_para
+from IndicPhotoOCR.translation.indictrans_translator import IndicTransTranslator
+from IndicPhotoOCR.translation.simple_translator import SimpleTranslator
+
+# Possible values for identifier_lang
+VALID_IDENTIFIER_LANGS = ["hindi", "assamese", "bengali", "gujarati", "kannada", "malayalam", "odia", "punjabi", "tamil", "telugu", "auto"]  # Add more as needed
+
+# Translation target languages (what users can translate TO)
+TRANSLATION_LANGUAGES = ["None", "assamese", "bengali", "english", "gujarati", "hindi", "kannada", "malayalam", "marathi", "odia", "punjabi", "tamil", "telugu", "urdu"]
+
+# Initialize the translators (will be loaded when needed)
+advanced_translator = None
+simple_translator = SimpleTranslator()
+
+def process_image(image, identifier_lang):
+    """
+    Processes the uploaded image for text detection and recognition only.
+    - Detects bounding boxes in the image
+    - Draws bounding boxes on the image and identifies script in each detected area
+    - Recognizes text in each cropped region
+
+    Parameters:
+        image (PIL.Image): The input image to be processed.
+        identifier_lang (str): The script identifier model to use.
+
+    Returns:
+        tuple: A PIL.Image with bounding boxes and recognized text.
+    """
+
+    # Save the input image temporarily
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_input:
+        image.save(temp_input.name)
+        image_path = temp_input.name
+
+    # Initialize OCR with the selected identifier language
+    ocr = OCR(identifier_lang=identifier_lang, verbose=False, device="cpu")
+
+    # Detect bounding boxes on the image using OCR
+    detections = ocr.detect(image_path)
+
+    output_path = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
+
+    # Draw bounding boxes on the image and save it as output
+    ocr.visualize_detection(image_path, detections, save_path=output_path)
+
+    # Load the annotated image with bounding boxes drawn
+    output_image = Image.open(output_path)
+
+    # Recognize text from the detected areas
+    recognized_text = ocr.ocr(image_path)
+    recognized_text_lines = [' '.join(line) for line in recognized_text]
+    recognized_text_str = '\n'.join(recognized_text_lines)
+
+    return output_image, recognized_text_str
+
+def translate_text(recognized_text, target_lang):
+    """
+    Translates the recognized text to the target language.
+    Uses IndicTrans2 for all translation.
+
+    Parameters:
+        recognized_text (str): The text to translate.
+        target_lang (str): Target language for translation.
+
+    Returns:
+        str: Translated text.
+    """
+    global advanced_translator, simple_translator
+
+    if not recognized_text or not recognized_text.strip():
+        return "No text to translate - Please run OCR first"
+
+    if target_lang == "None" or target_lang.lower() == "none":
+        return "Please select a target language for translation"
+
+    # Clean problematic Unicode characters from input
+    import re
+    import unicodedata
+
+    def clean_input_text(text):
+        """Clean problematic Unicode characters from input text"""
+        # Normalize Unicode
+        text = unicodedata.normalize('NFC', text)
+
+        # Remove problematic Unicode characters that cause translation issues
+        problematic_chars = {
+            '\u09BC': '',  # Bengali nukta
+            '\u093C': '',  # Devanagari nukta
+            '\u200C': '',  # Zero width non-joiner
+            '\u200D': '',  # Zero width joiner
+            '\uFEFF': '',  # Byte order mark
+        }
+
+        for char, replacement in problematic_chars.items():
+            text = text.replace(char, replacement)
+
+        # Remove Unicode escape sequences
+        text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
+
+        return text.strip()
+
+    # Clean the input text
+    recognized_text = clean_input_text(recognized_text)
+
+    # Progress tracking for UI
+    progress_messages = []
+
+    def progress_callback(msg):
+        progress_messages.append(msg)
+        return f"LOADING MODELS... Please wait (may take 2-5 minutes)\n\n{msg}\n\nOriginal text:\n{recognized_text[:200]}..."
+
+    try:
+        print(f"\nStarting translation to {target_lang}...")
+        print(f"Text to translate (first 200 chars): {recognized_text[:200]}...")
+
+        # Split text into lines
+        text_lines = [line.strip() for line in recognized_text.split('\n') if line.strip()]
+
+        if not text_lines:
+            return "No valid text lines to translate"
+
+        print(f"Found {len(text_lines)} lines to translate")
+
+        # Initialize advanced translator if not already done
+        if advanced_translator is None:
+            try:
+                print("Initializing IndicTrans2 translator...")
+                print("This may take 2-5 minutes for first-time model download...")
+                print("Please be patient - models are large (~1-2GB each)")
+
+                # Show initial loading message
+                loading_msg = "INITIALIZING TRANSLATION MODELS...\n\n"
+                loading_msg += "First-time setup may take 2-5 minutes\n"
+                loading_msg += "Downloading models (~2GB)\n"
+                loading_msg += "Please be patient and don't refresh the page\n\n"
+                loading_msg += f"Text to translate:\n{recognized_text[:300]}..."
+
+                # Initialize with progress callback
+                advanced_translator = IndicTransTranslator(device="cpu", progress_callback=progress_callback)
+                print("IndicTrans2 translator initialized!")
+
+            except Exception as init_error:
+                print(f"Failed to initialize IndicTrans2: {str(init_error)}")
+                print("Falling back to simple translator...")
+
+                # Fallback to simple translator
+                try:
+                    source_lang = simple_translator.detect_language_from_script(text_lines)
+                    print(f"Simple translator detected language: {source_lang}")
+
+                    simple_translated_lines = []
+                    for line in text_lines:
+                        simple_result = simple_translator.translate_text(line, source_lang, target_lang)
+                        simple_translated_lines.append(simple_result)
+
+                    fallback_result = '\n'.join(simple_translated_lines)
+                    return f"Advanced translator unavailable, using simple translation:\n\n{fallback_result}"
+
+                except Exception as simple_error:
+                    return f"All translators failed:\nAdvanced: {str(init_error)}\nSimple: {str(simple_error)}"
+
+        # Detect source language
+        source_lang = advanced_translator.detect_language_from_script(text_lines)
+        print(f"Detected source language: {source_lang}")
+
+        if source_lang == target_lang.lower():
+            return f"Source and target languages are the same ({source_lang}). No translation needed."
+
+        # Use the improved batch translation for efficiency
+        print("Starting batch translation...")
+        print("Model loading in progress... Please wait...")
+
+        try:
+            translated_lines = advanced_translator.translate_multiple_lines(text_lines, source_lang, target_lang)
+
+            # Combine results
+            result = '\n'.join(translated_lines)
+
+            print("Translation completed successfully!")
+
+            # Final cleaning of the result
+            result = clean_input_text(result)
+            return result
+
+        except Exception as translate_error:
+            print(f"Advanced translation failed: {str(translate_error)}")
+            print("Trying simple translator fallback...")
+
+            # Try simple translator as fallback
+            source_lang = simple_translator.detect_language_from_script(text_lines)
+            simple_translated_lines = []
+            for line in text_lines:
+                simple_result = simple_translator.translate_text(line, source_lang, target_lang)
+                simple_translated_lines.append(simple_result)
+
+            fallback_result = '\n'.join(simple_translated_lines)
+            return f"Advanced translation failed, using simple translation:\n\n{fallback_result}"
+
+    except Exception as e:
+        error_msg = f"Translation error: {str(e)}"
+        print(f"ERROR: {error_msg}")
+        import traceback
+        traceback.print_exc()
+
+        # Final fallback to simple translator
+        try:
+            print("Attempting simple translator fallback...")
+            text_lines = [line.strip() for line in recognized_text.split('\n') if line.strip()]
+            source_lang = simple_translator.detect_language_from_script(text_lines)
+
+            simple_translated_lines = []
+            for line in text_lines:
+                simple_result = simple_translator.translate_text(line, source_lang, target_lang)
+                simple_translated_lines.append(simple_result)
+
+            fallback_result = '\n'.join(simple_translated_lines)
+            return f"Advanced translation failed with error, using simple translation:\n{error_msg}\n\n{fallback_result}"
+
+        except Exception as e2:
+            return f"All translation methods failed:\nAdvanced: {error_msg}\nSimple: {str(e2)}"
+
+# Custom HTML for interface header with logos and alignment
+interface_html = """
+<div style="text-align: left; padding: 10px;">
+    <div style="background-color: white; padding: 10px; display: inline-block;">
+        <img src="https://iitj.ac.in/images/logo/Design-of-New-Logo-of-IITJ-2.png" alt="IITJ Logo" style="width: 100px; height: 100px;">
+    </div>
+    <img src="https://play-lh.googleusercontent.com/_FXSr4xmhPfBykmNJvKvC0GIAVJmOLhFl6RA5fobCjV-8zVSypxX8yb8ka6zu6-4TEft=w240-h480-rw" alt="Bhashini Logo" style="width: 100px; height: 100px; float: right;">
+</div>
+"""
+
+# Links to GitHub and Dataset repositories with GitHub icon
+links_html = """
+<div style="text-align: center; padding-top: 20px;">
+    <a href="https://github.com/Bhashini-IITJ/IndicPhotoOCR" target="_blank" style="margin-right: 20px; font-size: 18px; text-decoration: none;">
+        GitHub Repository
+    </a>
+    <a href="https://github.com/Bhashini-IITJ/BharatSceneTextDataset" target="_blank" style="font-size: 18px; text-decoration: none;">
+        Dataset Repository
+    </a>
+</div>
+"""
+
+# Custom CSS to style the text box and center the title
+custom_css = """
+.custom-textbox textarea {
+    font-size: 20px !important;
+}
+
+#title {
+    text-align: center;
+    font-size: 28px;
+    font-weight: bold;
+    margin-bottom: 20px;
+}
+"""
+
+# Create an instance of the Seafoam theme for a consistent visual style
+seafoam = Seafoam()
+
+# Clear function
+def clear_inputs():
+    return None, "auto", "None", None, "", ""
+
+# Define the Gradio Blocks interface
+with gr.Blocks(theme=seafoam, css=custom_css) as demo:
+
+    gr.Markdown("# IndicPhotoOCR - Indic Scene Text Recogniser Toolkit", elem_id="title")
+    gr.Markdown("# Developed by IIT Jodhpur", elem_id="title")
+    gr.Markdown(interface_html + links_html)
+
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(type="pil", image_mode="RGB", label="Upload Image")
+            lang_dropdown = gr.Dropdown(VALID_IDENTIFIER_LANGS, label="Identifier Language", value="auto")
+            translation_dropdown = gr.Dropdown(TRANSLATION_LANGUAGES, label="Translate to Language", value="None")
+
+            with gr.Row():
+                run_ocr_button = gr.Button("Run OCR", variant="primary")
+                translate_button = gr.Button("Translate Text", variant="secondary")
+
+            clear_button = gr.Button("Clear", variant="stop")
+
+        with gr.Column():
+            output_image = gr.Image(type="pil", label="Processed Image")
+            output_text = gr.Textbox(label="Recognized Text", lines=8, elem_classes="custom-textbox")
+            translated_text = gr.Textbox(label="Translated Text", lines=8, elem_classes="custom-textbox")
+
+    # Examples shown separately (to avoid schema error)
+    gr.Examples(
+        examples=[["test_images/image_88.jpg", "auto", "english"],
+                  ["test_images/image_742.jpg", "hindi", "english"]],
+        inputs=[input_image, lang_dropdown, translation_dropdown],
+        label="Try an example"
+    )
+
+    # Connect logic
+    run_ocr_button.click(fn=process_image, inputs=[input_image, lang_dropdown], outputs=[output_image, output_text])
+    translate_button.click(fn=translate_text, inputs=[output_text, translation_dropdown], outputs=translated_text)
+    clear_button.click(fn=clear_inputs, outputs=[input_image, lang_dropdown, translation_dropdown, output_image, output_text, translated_text])  # Clear logic
+
+# Launch
+if __name__ == "__main__":
+    print("Starting IndicPhotoOCR...")
+    try:
+        demo.launch(
+            share=True,
+            server_name="0.0.0.0",
+            server_port=7860,  # Changed to avoid port conflict
+            debug=True
+            # show_error=True,
+            # quiet=False,
+            # max_threads=40,
+            # inbrowser=False,  # Don't try to open browser
+            # prevent_thread_lock=False  # Keep app running
+        )
+    except KeyboardInterrupt:
+        print("App interrupted by user")
+    except Exception as e:
+        print(f"Error launching app: {e}")
+        import traceback
+        traceback.print_exc()
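The same OCR-then-translate flow can be exercised without the Gradio UI; a rough sketch (assumes the dependencies are installed and reuses the example image referenced in the Examples block above):

    from PIL import Image
    from app import process_image, translate_text

    img = Image.open("test_images/image_88.jpg")
    annotated, recognized = process_image(img, "auto")   # detection + script ID + recognition
    print(recognized)
    print(translate_text(recognized, "english"))         # lazy-loads IndicTrans2, falls back to SimpleTranslator on error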
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-aiohappyeyeballs
+aiohappyeyeballs>=2.0.0
 aiohttp==3.10.10
 aiosignal==1.3.1
 async-timeout==4.0.3
@@ -45,3 +45,6 @@ easydict==1.13
 scipy==1.13.1
 transformers==4.45.1
 datasets==3.1.0
+IndicTransToolkit>=1.0.0
+sentencepiece>=0.1.99
+sacremoses>=0.0.53