# jb100's picture
# Update app.py
# 5e4d3c1 verified
# Code v15
import hmac
import logging
import os
import re
import secrets
import tempfile
import time
from collections import Counter
from typing import Optional, Tuple

import gradio as gr
import PyPDF2
import spaces
import torch
from docx import Document
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Set up logging: configure the root logger at INFO and create the module-level
# logger that the rest of this file writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Authentication credentials from environment variables. When USERNAME or
# PASSWORD is unset, the literal defaults below are used instead.
# NOTE(review): the fallbacks are well-known default credentials — confirm the
# deployment always sets both variables.
# NOTE(review): USERNAME is often pre-set by the OS/shell to the current account
# name — verify that is not the value picked up here.
VALID_USERNAME = os.getenv("USERNAME", "admin")
VALID_PASSWORD = os.getenv("PASSWORD", "password123")
# Session management: in-memory set of the session ids added by authenticate()
# and removed by logout_session(). Nothing here persists it or expires entries.
authenticated_sessions = set()
def authenticate(username: str, password: str) -> tuple:
    """Authenticate user credentials and return session info.

    Args:
        username: Login name submitted by the user.
        password: Password submitted by the user.

    Returns:
        ``(True, session_id)`` when both values match ``VALID_USERNAME`` and
        ``VALID_PASSWORD``; the new ``session_id`` is also added to
        ``authenticated_sessions``. Otherwise ``(False, None)``.
    """
    credentials_ok = False
    if isinstance(username, str) and isinstance(password, str):
        # Compare as UTF-8 bytes because hmac.compare_digest rejects non-ASCII
        # str. Both checks are evaluated before branching so the timing does
        # not reveal which of the two fields was wrong.
        username_ok = hmac.compare_digest(
            username.encode("utf-8"), VALID_USERNAME.encode("utf-8")
        )
        password_ok = hmac.compare_digest(
            password.encode("utf-8"), VALID_PASSWORD.encode("utf-8")
        )
        credentials_ok = username_ok and password_ok

    if not credentials_ok:
        logger.warning(f"Failed login attempt for user: {username}")
        return False, None

    # Random token instead of time + hash(username): the old id was guessable
    # and identical for two logins in the same second, so logging one of them
    # out silently invalidated the other.
    session_id = f"session_{secrets.token_urlsafe(32)}"
    authenticated_sessions.add(session_id)
    logger.info(f"Successful login for user: {username}")
    return True, session_id
def is_authenticated(session_id: str) -> bool:
    """Tell whether ``session_id`` is currently registered as logged in."""
    known_sessions = authenticated_sessions
    return session_id in known_sessions
def logout_session(session_id: str):
    """Forget ``session_id`` if it is registered; unknown ids are ignored."""
    if session_id not in authenticated_sessions:
        return
    authenticated_sessions.discard(session_id)
    logger.info(f"Session logged out: {session_id}")
class NLLBTranslator:
    """Translate text with one of Meta's NLLB-200 checkpoints.

    The tokenizer and model are loaded eagerly by ``__init__`` and moved to the
    GPU when CUDA is available; otherwise they stay on the CPU.
    """

    # Hugging Face checkpoint for each named ``model_size``; any other value
    # selects the 3.3B checkpoint.
    _MODEL_NAMES = {
        "600M": "facebook/nllb-200-distilled-600M",
        "1.3B": "facebook/nllb-200-1.3B",
    }
    _FALLBACK_MODEL_NAME = "facebook/nllb-200-3.3B"
    # Limit used for both tokenizer truncation and generated sequence length.
    _MAX_LENGTH = 512

    def __init__(self, model_size="600M"):
        """Store the requested size and load the matching checkpoint.

        Args:
            model_size: "600M", "1.3B", or any other value for the 3.3B model.

        Raises:
            Exception: whatever ``load_model`` re-raises when loading fails.
        """
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_size = model_size
        self.load_model()

    def load_model(self):
        """Load the NLLB model and tokenizer.

        Uses float16 weights when CUDA is available and float32 otherwise.

        Raises:
            Exception: any loading error is logged and re-raised unchanged.
        """
        try:
            model_name = self._MODEL_NAMES.get(self.model_size, self._FALLBACK_MODEL_NAME)
            logger.info(f"Loading NLLB model: {model_name}")
            if torch.cuda.is_available():
                logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}")
                torch_dtype = torch.float16
            else:
                logger.warning("CUDA not available, using CPU")
                torch_dtype = torch.float32

            logger.info("Loading NLLB tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)

            logger.info("Loading NLLB model...")
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True
            )
            self.model = self.model.to(self.device)
            self.model.eval()
            logger.info("NLLB model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading NLLB model: {str(e)}")
            # Bare raise keeps the original traceback intact.
            raise

    def split_into_sentences(self, text: str) -> tuple:
        """Split text into sentences while preserving paragraph structure.

        Paragraphs are separated by blank lines; sentences end at ``.``, ``!``
        or ``?`` followed by whitespace.

        Returns:
            ``(sentences, markers)`` where ``markers[i]`` is a dict with
            ``is_paragraph_end`` (True for the last sentence of every paragraph
            except the final one) and ``original_sentence``.
        """
        # Drop blank paragraphs first so that trailing blank lines cannot make
        # the real last paragraph look like it is followed by another one.
        paragraphs = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
        last_para_idx = len(paragraphs) - 1

        sentence_list = []
        paragraph_markers = []
        for para_idx, paragraph in enumerate(paragraphs):
            sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', paragraph) if s.strip()]
            last_sent_idx = len(sentences) - 1
            for sent_idx, sentence in enumerate(sentences):
                sentence_list.append(sentence)
                paragraph_markers.append({
                    'is_paragraph_end': sent_idx == last_sent_idx and para_idx != last_para_idx,
                    'original_sentence': sentence
                })
        return sentence_list, paragraph_markers

    def reconstruct_formatting(self, translated_sentences: list, paragraph_markers: list) -> str:
        """Reconstruct text with original paragraph formatting.

        Sentences are joined with a single space, or with a blank line where
        the matching marker has ``is_paragraph_end`` set. If the two lists
        differ in length the sentences are simply space-joined.
        """
        if len(translated_sentences) != len(paragraph_markers):
            return ' '.join(translated_sentences)

        last_idx = len(translated_sentences) - 1
        pieces = []
        for idx, (translation, marker) in enumerate(zip(translated_sentences, paragraph_markers)):
            pieces.append(translation)
            if marker['is_paragraph_end']:
                pieces.append('\n\n')
            elif idx < last_idx:
                pieces.append(' ')
        return ''.join(pieces)

    @spaces.GPU
    def translate_text(self, text: str, source_lang: str, target_lang: str) -> str:
        """Translate text from source language to target language.

        Args:
            text: Text to translate.
            source_lang: Display name of the source language (a ``LANGUAGE_CODES`` key).
            target_lang: Display name of the target language (a ``LANGUAGE_CODES`` key).

        Returns:
            The translation, the input unchanged when both languages are equal,
            or a human-readable message when the language is unsupported, no
            text is found, or translation fails. Exceptions are caught and
            reported in the returned string.
        """
        try:
            source_code = LANGUAGE_CODES.get(source_lang)
            target_code = LANGUAGE_CODES.get(target_lang)
            if not source_code or not target_code:
                return f"Unsupported language: {source_lang} or {target_lang}"
            if source_lang == target_lang:
                return text
            logger.info(f"Translating from {source_lang} ({source_code}) to {target_lang} ({target_code})")

            # For simple test, try a direct approach first
            if text.strip() == "Hello, how are you today?":
                logger.info("Using simple test translation")
                return self.simple_translate(text, source_code, target_code)

            # Short single-line input is sent as one sentence; anything else is
            # split so paragraph breaks can be restored afterwards.
            if '\n' not in text and len(text.split('.')) <= 2:
                stripped = text.strip()
                # Whitespace-only input must not reach the model as "".
                input_sentences = [stripped] if stripped else []
                paragraph_markers = None
            else:
                input_sentences, paragraph_markers = self.split_into_sentences(text)

            if not input_sentences:
                return "No valid text found to translate."
            return self.perform_translation(input_sentences, source_code, target_code, paragraph_markers)
        except Exception as e:
            # logger.exception records the traceback through the logging setup.
            logger.exception(f"Translation error: {str(e)}")
            return f"Error during translation: {str(e)}"

    def _lang_token_id(self, lang_code: str) -> Optional[int]:
        """Return the vocabulary id of the ``lang_code`` tag, or None if the tokenizer lacks it."""
        tokenizer = self.tokenizer
        token_id = None
        try:
            if hasattr(tokenizer, 'lang_code_to_id'):
                token_id = tokenizer.lang_code_to_id[lang_code]
            elif hasattr(tokenizer, 'convert_tokens_to_ids'):
                token_id = tokenizer.convert_tokens_to_ids(lang_code)
                # Unknown tokens come back as the <unk> id rather than raising;
                # forcing <unk> as the first output token would be meaningless.
                if token_id == getattr(tokenizer, 'unk_token_id', None):
                    token_id = None
            else:
                token_id = tokenizer.get_vocab().get(lang_code)
        except (KeyError, AttributeError):
            token_id = None
        if token_id is None:
            logger.warning(f"Could not find target language token for {lang_code}")
        return token_id

    def _generation_kwargs(self, num_beams: int, target_token_id: Optional[int]) -> dict:
        """Build the ``generate`` arguments, forcing the target language tag when known."""
        kwargs = {
            "max_length": self._MAX_LENGTH,
            "num_beams": num_beams,
            "early_stopping": True,
            "do_sample": False,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id
        }
        # Without forced_bos_token_id the output language is not controlled.
        if target_token_id is not None:
            kwargs["forced_bos_token_id"] = target_token_id
        return kwargs

    def _encode(self, texts, source_code: str, padding: bool = False):
        """Tokenize ``texts`` as ``source_code`` input and move the tensors to the model device."""
        self.tokenizer.src_lang = source_code
        return self.tokenizer(
            texts,
            return_tensors="pt",
            padding=padding,
            truncation=True,
            max_length=self._MAX_LENGTH
        ).to(self.device)

    def simple_translate(self, text: str, source_code: str, target_code: str) -> str:
        """Translate a single text in one ``generate`` call (5 beams).

        Returns:
            The translation, a placeholder when the output is empty, or a
            "Simple translation failed: ..." message when an exception occurs.
        """
        try:
            inputs = self._encode(text, source_code)
            # target_code used to be ignored here, leaving the output language
            # up to the model; it is now forced like in perform_translation.
            generation_kwargs = self._generation_kwargs(5, self._lang_token_id(target_code))
            with torch.no_grad():
                outputs = self.model.generate(**inputs, **generation_kwargs)
            translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            logger.info(f"Simple translation result: {translation}")
            return translation.strip() if translation.strip() else "Translation produced empty result"
        except Exception as e:
            logger.error(f"Simple translation failed: {str(e)}")
            return f"Simple translation failed: {str(e)}"

    def _translate_batch(self, batch_sentences: list, source_code: str, target_token_id: Optional[int]) -> list:
        """Translate a batch of sentences together (4 beams); may raise on failure."""
        inputs = self._encode(batch_sentences, source_code, padding=True)
        with torch.no_grad():
            translated_tokens = self.model.generate(
                **inputs, **self._generation_kwargs(4, target_token_id)
            )
        translations = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
        return [t.strip() or "Translation produced empty result" for t in translations]

    def _translate_single(self, sentence: str, source_code: str, target_token_id: Optional[int]) -> str:
        """Translate one sentence on its own (2 beams); failures become a message string."""
        try:
            inputs = self._encode(sentence, source_code)
            with torch.no_grad():
                translated_tokens = self.model.generate(
                    **inputs, **self._generation_kwargs(2, target_token_id)
                )
            translation = self.tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
            return translation.strip() or "Empty translation result"
        except Exception as single_e:
            logger.error(f"Failed to translate sentence '{sentence}': {str(single_e)}")
            return f"Translation failed: {str(single_e)}"

    def perform_translation(self, input_sentences: list, source_code: str, target_code: str, paragraph_markers: list) -> str:
        """Perform the actual translation using NLLB model.

        Sentences are translated in batches of two (one at a time when the
        average sentence exceeds 100 words). A failing batch is retried
        sentence by sentence, so one bad sentence does not lose the others.

        Args:
            input_sentences: Sentences to translate, in order.
            source_code: Language tag of the input (e.g. "eng_Latn").
            target_code: Language tag of the desired output.
            paragraph_markers: Markers from ``split_into_sentences``, or None.

        Returns:
            The translated text, with paragraph breaks restored when the
            markers line up with the translations; otherwise space-joined.
        """
        batch_size = 2  # Conservative batch size for stability
        avg_sentence_length = sum(len(s.split()) for s in input_sentences) / len(input_sentences) if input_sentences else 0
        if avg_sentence_length > 100:
            batch_size = 1
        logger.info(f"Using batch size {batch_size} for average sentence length {avg_sentence_length:.1f} words")
        logger.info(f"Translating from {source_code} to {target_code}")

        # The target tag is the same for every batch, so resolve it once.
        target_token_id = self._lang_token_id(target_code)

        total = len(input_sentences)
        all_translations = []
        for start in range(0, total, batch_size):
            batch_sentences = input_sentences[start:start + batch_size]
            try:
                all_translations.extend(
                    self._translate_batch(batch_sentences, source_code, target_token_id)
                )
                if total > 10:
                    progress = min(100, int(((start + batch_size) / total) * 100))
                    logger.info(f"Translation progress: {progress}%")
            except Exception as e:
                logger.error(f"Translation error in batch: {str(e)}")
                # The per-sentence retry keeps the target language forced; it
                # previously dropped it and could answer in another language.
                for single_sentence in batch_sentences:
                    all_translations.append(
                        self._translate_single(single_sentence, source_code, target_token_id)
                    )

        if paragraph_markers and len(all_translations) == len(paragraph_markers):
            final_translation = self.reconstruct_formatting(all_translations, paragraph_markers)
        else:
            final_translation = ' '.join(all_translations) if all_translations else "Translation failed - no output generated"
        return final_translation
# Display name -> language tag handed to the NLLB tokenizer (src_lang / forced
# target token).
# NOTE(review): several tags below (e.g. the sign-language and many of the
# historical/indigenous entries) do not look like FLORES-200 codes — verify each
# against the NLLB-200 model card before relying on it.
# NOTE(review): some display names share one tag (e.g. "Maya"/"Mam",
# "Mazatec"/"Mazahua", "Manipuri"/"Meitei", "Tripuri"/"Kokborok") — confirm
# that is intended.
LANGUAGE_CODES = {
    # Major European Languages
    "English": "eng_Latn",
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Spanish": "spa_Latn",
    "Italian": "ita_Latn",
    "Portuguese": "por_Latn",
    "Russian": "rus_Cyrl",
    "Dutch": "nld_Latn",
    "Polish": "pol_Latn",
    "Czech": "ces_Latn",
    "Swedish": "swe_Latn",
    "Danish": "dan_Latn",
    "Norwegian": "nob_Latn",
    "Finnish": "fin_Latn",
    "Greek": "ell_Grek",
    "Hungarian": "hun_Latn",
    "Romanian": "ron_Latn",
    "Bulgarian": "bul_Cyrl",
    "Croatian": "hrv_Latn",
    "Slovak": "slk_Latn",
    "Ukrainian": "ukr_Cyrl",
    "Belarusian": "bel_Cyrl",
    "Serbian": "srp_Cyrl",
    "Slovenian": "slv_Latn",
    "Estonian": "est_Latn",
    "Latvian": "lav_Latn",
    "Lithuanian": "lit_Latn",
    "Macedonian": "mkd_Cyrl",
    "Albanian": "als_Latn",
    "Bosnian": "bos_Latn",
    "Montenegrin": "cnr_Latn",
    "Maltese": "mlt_Latn",
    "Luxembourgish": "ltz_Latn",
    # Asian Languages - East Asian
    "Chinese (Simplified)": "zho_Hans",
    "Chinese (Traditional)": "zho_Hant",
    "Japanese": "jpn_Jpan",
    "Korean": "kor_Hang",
    "Mongolian": "khk_Cyrl",
    # Asian Languages - Southeast Asian
    "Vietnamese": "vie_Latn",
    "Thai": "tha_Thai",
    "Indonesian": "ind_Latn",
    "Malay": "zsm_Latn",
    "Filipino": "fil_Latn",
    "Tagalog": "tgl_Latn",
    "Javanese": "jav_Latn",
    "Sundanese": "sun_Latn",
    "Burmese": "mya_Mymr",
    "Khmer": "khm_Khmr",
    "Lao": "lao_Laoo",
    "Cebuano": "ceb_Latn",
    "Minangkabau": "min_Latn",
    "Acehnese": "ace_Latn",
    "Balinese": "ban_Latn",
    "Banjarese": "bjn_Latn",
    "Bugis": "bug_Latn",
    "Madurese": "mad_Latn",
    # Asian Languages - South Asian
    "Hindi": "hin_Deva",
    "Bengali": "ben_Beng",
    "Tamil": "tam_Taml",
    "Telugu": "tel_Telu",
    "Marathi": "mar_Deva",
    "Gujarati": "guj_Gujr",
    "Kannada": "kan_Knda",
    "Malayalam": "mal_Mlym",
    "Punjabi": "pan_Guru",
    "Urdu": "urd_Arab",
    "Nepali": "nep_Deva",
    "Sinhala": "sin_Sinh",
    "Assamese": "asm_Beng",
    "Oriya": "ory_Orya",
    "Sanskrit": "san_Deva",
    "Kashmiri": "kas_Arab",
    "Sindhi": "snd_Arab",
    "Maithili": "mai_Deva",
    "Santali": "sat_Olck",
    "Manipuri": "mni_Beng",
    "Bodo": "brx_Deva",
    "Dogri": "doi_Deva",
    "Konkani": "gom_Deva",
    # Middle Eastern Languages
    "Arabic": "arb_Arab",
    "Hebrew": "heb_Hebr",
    "Persian": "pes_Arab",
    "Turkish": "tur_Latn",
    "Kurdish": "ckb_Arab",
    "Pashto": "pbt_Arab",
    "Dari": "prs_Arab",
    "Azerbaijani": "azj_Latn",
    "Kazakh": "kaz_Cyrl",
    "Kyrgyz": "kir_Cyrl",
    "Uzbek": "uzn_Latn",
    "Tajik": "tgk_Cyrl",
    "Turkmen": "tuk_Latn",
    "Uighur": "uig_Arab",
    "Armenian": "hye_Armn",
    "Georgian": "kat_Geor",
    "Amharic": "amh_Ethi",
    "Tigrinya": "tir_Ethi",
    "Oromo": "orm_Ethi",
    # African Languages
    "Swahili": "swh_Latn",
    "Yoruba": "yor_Latn",
    "Igbo": "ibo_Latn",
    "Hausa": "hau_Latn",
    "Zulu": "zul_Latn",
    "Xhosa": "xho_Latn",
    "Afrikaans": "afr_Latn",
    "Somali": "som_Latn",
    "Shona": "sna_Latn",
    "Kinyarwanda": "kin_Latn",
    "Rundi": "run_Latn",
    "Chichewa": "nya_Latn",
    "Luganda": "lug_Latn",
    "Wolof": "wol_Latn",
    "Fula": "fuv_Latn",
    "Twi": "twi_Latn",
    "Lingala": "lin_Latn",
    "Bambara": "bam_Latn",
    "Mossi": "mos_Latn",
    "Ewe": "ewe_Latn",
    "Akan": "aka_Latn",
    "Malagasy": "plt_Latn",
    "Sesotho": "sot_Latn",
    "Tswana": "tsn_Latn",
    "Venda": "ven_Latn",
    "Tsonga": "tso_Latn",
    "Ndebele": "nso_Latn",
    "Swati": "ssw_Latn",
    # European Celtic & Regional Languages
    "Welsh": "cym_Latn",
    "Irish": "gle_Latn",
    "Scottish Gaelic": "gla_Latn",
    "Breton": "bre_Latn",
    "Cornish": "cor_Latn",
    "Manx": "glv_Latn",
    "Basque": "eus_Latn",
    "Catalan": "cat_Latn",
    "Galician": "glg_Latn",
    "Occitan": "oci_Latn",
    "Sardinian": "srd_Latn",
    "Corsican": "cos_Latn",
    "Faroese": "fao_Latn",
    "Icelandic": "isl_Latn",
    "Frisian": "fry_Latn",
    "Kashubian": "csb_Latn",
    "Sorbian": "hsb_Latn",
    "Romansh": "roh_Latn",
    # Americas Indigenous Languages
    "Quechua": "quy_Latn",
    "Guarani": "grn_Latn",
    "Aymara": "ayr_Latn",
    "Nahuatl": "nah_Latn",
    "Maya": "mam_Latn",
    "Wayuu": "guc_Latn",
    "Otomi": "oto_Latn",
    "Zapotec": "zap_Latn",
    "Mixe": "mie_Latn",
    "Tzeltal": "tzh_Latn",
    "Tzotzil": "tzo_Latn",
    "Tarahumara": "tar_Latn",
    "Huichol": "hch_Latn",
    "Mazatec": "maz_Latn",
    "Chatino": "ctp_Latn",
    "Chinantec": "chq_Latn",
    "Mixtec": "mxt_Latn",
    "Triqui": "trc_Latn",
    "Mazahua": "maz_Latn",
    "Purépecha": "tsz_Latn",
    "Totonac": "top_Latn",
    "Huastec": "hus_Latn",
    "Zoque": "zos_Latn",
    "Chol": "ctu_Latn",
    "Mam": "mam_Latn",
    "Kʼicheʼ": "quc_Latn",
    "Kaqchikel": "cak_Latn",
    "Achuar": "acu_Latn",
    "Shuar": "jiv_Latn",
    "Awajún": "agr_Latn",
    "Shipibo": "shp_Latn",
    "Asháninka": "cni_Latn",
    # Pacific Languages
    "Māori": "mri_Latn",
    "Samoan": "smo_Latn",
    "Tongan": "ton_Latn",
    "Fijian": "fij_Latn",
    "Hawaiian": "haw_Latn",
    "Tahitian": "tah_Latn",
    "Chamorro": "cha_Latn",
    "Palauan": "pau_Latn",
    "Marshallese": "mah_Latn",
    "Chuukese": "chk_Latn",
    "Kosraean": "kos_Latn",
    "Pohnpeian": "pon_Latn",
    "Yapese": "yap_Latn",
    # Additional Asian Languages
    # ("Maithili" is already listed under South Asian; the repeated key that
    # used to sit here mapped to the same tag and has been dropped.)
    "Tibetan": "bod_Tibt",
    "Dzongkha": "dzo_Tibt",
    "Ladakhi": "lbj_Tibt",
    "Sherpa": "xsr_Deva",
    "Newari": "new_Deva",
    "Bhojpuri": "bho_Deva",
    "Magahi": "mag_Deva",
    "Angika": "anp_Deva",
    "Bajjika": "bpy_Beng",
    "Chittagonian": "ctg_Beng",
    "Sylheti": "syl_Beng",
    "Rohingya": "rhg_Arab",
    "Meitei": "mni_Beng",
    "Tripuri": "trp_Latn",
    "Garo": "grt_Beng",
    "Kokborok": "trp_Latn",
    "Mizo": "lus_Latn",
    "Nagamese": "nag_Latn",
    "Khasi": "kha_Latn",
    "Balochi": "bal_Arab",
    "Brahui": "brh_Arab",
    "Burushaski": "bsk_Arab",
    "Gilgiti": "shx_Arab",
    "Hindko": "hno_Arab",
    "Pahari": "phr_Deva",
    "Garhwali": "gbm_Deva",
    "Kumaoni": "kfy_Deva",
    # Additional African Languages
    "Berber": "ber_Latn",
    "Tamazight": "tzm_Latn",
    "Kabyle": "kab_Latn",
    "Tuareg": "taq_Latn",
    "Nuer": "nus_Latn",
    "Dinka": "din_Latn",
    "Kanuri": "knc_Latn",
    "Tiv": "tiv_Latn",
    "Efik": "efi_Latn",
    "Ibibio": "ibb_Latn",
    "Annang": "anw_Latn",
    "Ijaw": "ijc_Latn",
    "Urhobo": "urh_Latn",
    "Edo": "bin_Latn",
    "Igala": "igl_Latn",
    "Idoma": "idu_Latn",
    "Berom": "bom_Latn",
    "Gbagyi": "gbr_Latn",
    "Nupe": "nup_Latn",
    "Jukun": "jbu_Latn",
    "Chadic": "cdc_Latn",
    "Adamawa": "adm_Latn",
    "Gur": "gur_Latn",
    "Kru": "kru_Latn",
    "Mande": "mnd_Latn",
    "Nilotic": "nil_Latn",
    "Cushitic": "cus_Latn",
    "Omotic": "omo_Latn",
    "Khoisan": "khi_Latn",
    # Sign Languages (limited support)
    "American Sign Language": "ase_Sgnw",
    "British Sign Language": "bfi_Sgnw",
    "French Sign Language": "fsl_Sgnw",
    "German Sign Language": "gsg_Sgnw",
    "Japanese Sign Language": "jsl_Sgnw",
    "Chinese Sign Language": "csl_Sgnw",
    # Historical and Classical Languages
    "Latin": "lat_Latn",
    "Ancient Greek": "grc_Grek",
    "Old Church Slavonic": "chu_Cyrl",
    "Middle English": "enm_Latn",
    "Old English": "ang_Latn",
    "Old Norse": "non_Latn",
    "Gothic": "got_Goth",
    "Aramaic": "arc_Armi",
    "Coptic": "cop_Copt",
    "Ge'ez": "gez_Ethi",
    "Akkadian": "akk_Xsux",
    "Sumerian": "sux_Xsux",
    "Hittite": "hit_Xsux",
    "Phoenician": "phn_Phnx",
    "Ugaritic": "uga_Ugar",
    "Pahlavi": "pal_Phlv",
    "Avestan": "ave_Avst",
    "Old Persian": "peo_Xpeo",
    "Sogdian": "sog_Sogd",
    "Tocharian": "txb_Latn",
    "Khotanese": "kho_Brah",
    "Gandhari": "pgd_Khar",
    "Prakrit": "prc_Brah",
    "Pali": "pli_Latn",
}
# Alphabetically sorted display names, used as the dropdown choices in the UI.
LANGUAGE_NAMES = sorted(LANGUAGE_CODES.keys())
def extract_text_from_pdf(file_path: str) -> str:
    """Read a PDF and return its text as blank-line separated paragraphs.

    Each page's extracted text is cut at blank lines; empty pieces are dropped
    and the rest are joined back with a blank line between them.

    Returns:
        The joined text, or an "Error reading PDF: ..." message when anything
        goes wrong while opening or parsing the file.
    """
    try:
        collected = []
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                raw_text = page.extract_text()
                if not raw_text.strip():
                    continue
                collected += [chunk.strip() for chunk in raw_text.split('\n\n') if chunk.strip()]
        return '\n\n'.join(collected)
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {str(e)}")
        return f"Error reading PDF: {str(e)}"
def _read_run_format(run) -> dict:
    """Snapshot the character formatting of one python-docx run as a plain dict.

    Optional attributes are read best-effort: if one cannot be read, its entry
    stays None instead of aborting the whole document.
    """
    font = run.font
    run_format = {
        'text': run.text,
        'bold': run.bold,
        'italic': run.italic,
        'underline': run.underline,
        'font_name': font.name,
        'font_size': font.size,
        'font_color_rgb': None,
        'font_color_theme': None,
        'highlight_color': None,
        'superscript': None,
        'subscript': None,
        'strike': None,
        'double_strike': None,
        'all_caps': None,
        'small_caps': None
    }
    # `except Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit are no longer swallowed; failures are logged at debug level.
    try:
        if font.color and font.color.rgb:
            run_format['font_color_rgb'] = font.color.rgb
    except Exception as exc:
        logger.debug(f"Could not read RGB font color: {exc}")
    try:
        if font.color and font.color.theme_color:
            run_format['font_color_theme'] = font.color.theme_color
    except Exception as exc:
        logger.debug(f"Could not read theme font color: {exc}")
    try:
        if font.highlight_color:
            run_format['highlight_color'] = font.highlight_color
    except Exception as exc:
        logger.debug(f"Could not read highlight color: {exc}")
    try:
        # Read in a fixed order; a failure leaves the remaining flags as None.
        for flag in ('superscript', 'subscript', 'strike',
                     'double_strike', 'all_caps', 'small_caps'):
            run_format[flag] = getattr(font, flag)
    except Exception as exc:
        logger.debug(f"Could not read additional run formatting: {exc}")
    return run_format


def extract_text_from_docx(file_path: str) -> Tuple[str, list]:
    """Extract text from DOCX file while preserving paragraph structure and formatting info.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        ``(text, formatting_info)``. ``text`` holds the non-empty paragraphs
        joined by blank lines. ``formatting_info`` has one dict per kept
        paragraph with its alignment, indents, spacing and a ``runs`` list of
        per-run formatting dicts (runs with only whitespace are skipped).
        On failure returns ``("Error reading DOCX: ...", [])``.
    """
    try:
        doc = Document(file_path)
        paragraphs = []
        formatting_info = []
        for para in doc.paragraphs:
            para_text = para.text.strip()
            if not para_text:
                continue
            paragraphs.append(para_text)
            layout = para.paragraph_format
            formatting_info.append({
                'alignment': para.alignment,
                'left_indent': layout.left_indent,
                'right_indent': layout.right_indent,
                'first_line_indent': layout.first_line_indent,
                'space_before': layout.space_before,
                'space_after': layout.space_after,
                'line_spacing': layout.line_spacing,
                'runs': [_read_run_format(run) for run in para.runs if run.text.strip()]
            })
        return '\n\n'.join(paragraphs), formatting_info
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {str(e)}")
        return f"Error reading DOCX: {str(e)}", []
def _most_common_value(values: list):
    """Return the most frequent item of ``values`` (first seen wins ties), or None if empty."""
    if not values:
        return None
    return Counter(values).most_common(1)[0][0]


def _apply_paragraph_format(paragraph, para_format: dict) -> None:
    """Copy the stored alignment, indents and spacing onto ``paragraph`` (best effort)."""
    try:
        if para_format.get('alignment') is not None:
            paragraph.alignment = para_format['alignment']
        layout = paragraph.paragraph_format
        for attr in ('left_indent', 'right_indent', 'first_line_indent',
                     'space_before', 'space_after', 'line_spacing'):
            value = para_format.get(attr)
            if value is not None:
                setattr(layout, attr, value)
    except Exception as e:
        logger.warning(f"Could not apply paragraph formatting: {e}")


def _apply_dominant_run_format(run, runs_info: list) -> None:
    """Give ``run`` the formatting that dominates the original paragraph's runs.

    Boolean flags are switched on when more than half of the original runs had
    them; font name, size, colour and highlight use the most frequent value.
    """
    total_runs = len(runs_info)

    def has_majority(key: str) -> bool:
        return sum(1 for r in runs_info if r.get(key)) > total_runs / 2

    def most_common(key: str):
        return _most_common_value([r.get(key) for r in runs_info if r.get(key)])

    try:
        if has_majority('bold'):
            run.bold = True
        if has_majority('italic'):
            run.italic = True
        if has_majority('underline'):
            run.underline = True

        font = run.font
        font_name = most_common('font_name')
        if font_name is not None:
            font.name = font_name
        font_size = most_common('font_size')
        if font_size is not None:
            font.size = font_size

        # An explicit RGB colour takes precedence over a theme colour.
        color_rgb = most_common('font_color_rgb')
        color_theme = most_common('font_color_theme')
        if color_rgb is not None:
            font.color.rgb = color_rgb
        elif color_theme is not None:
            font.color.theme_color = color_theme

        highlight = most_common('highlight_color')
        if highlight is not None:
            font.highlight_color = highlight

        for flag in ('superscript', 'subscript', 'strike',
                     'double_strike', 'all_caps', 'small_caps'):
            if has_majority(flag):
                setattr(font, flag, True)
    except Exception as e:
        logger.warning(f"Could not apply some run formatting: {e}")


def create_formatted_docx(translated_paragraphs: list, formatting_info: list, filename: str) -> str:
    """Create a DOCX file with translated text, re-applying the original formatting.

    Each translated paragraph becomes one paragraph with a single run. The
    paragraph-level formatting is copied from the matching ``formatting_info``
    entry; run-level formatting cannot be mapped onto translated words, so the
    dominant formatting of the original runs is applied to the whole paragraph.

    Args:
        translated_paragraphs: Translated paragraph texts, in document order.
        formatting_info: Per-paragraph dicts as built by ``extract_text_from_docx``.
        filename: Path to write the document to.

    Returns:
        ``filename`` on success. If building the document fails, the result of
        ``create_docx_with_text`` for the same text (which may be None).
    """
    try:
        doc = Document()
        # Remove the default paragraph if the new document comes with one.
        if doc.paragraphs:
            default_para = doc.paragraphs[0]
            default_para._element.getparent().remove(default_para._element)

        for para_text, para_format in zip(translated_paragraphs, formatting_info):
            if not para_text.strip():
                continue
            paragraph = doc.add_paragraph()
            _apply_paragraph_format(paragraph, para_format)
            run = paragraph.add_run(para_text)
            runs_info = para_format.get('runs', [])
            if runs_info:
                _apply_dominant_run_format(run, runs_info)

        doc.save(filename)
        logger.info(f"Created formatted DOCX with full formatting preservation: {filename}")
        return filename
    except Exception as e:
        logger.error(f"Error creating formatted DOCX: {str(e)}")
        return create_docx_with_text('\n\n'.join(translated_paragraphs), filename)
def create_docx_with_text(text: str, filename: str) -> str:
    """Write ``text`` to a plain DOCX file, one paragraph per blank-line separated chunk.

    Line breaks inside a chunk are replaced by spaces and empty chunks are skipped.

    Returns:
        ``filename`` on success, or None when the document could not be
        created or saved (the error is logged).
    """
    try:
        document = Document()
        for chunk in text.split('\n\n'):
            if not chunk.strip():
                continue
            document.add_paragraph(chunk.replace('\n', ' ').strip())
        document.save(filename)
    except Exception as e:
        logger.error(f"Error creating DOCX: {str(e)}")
        return None
    return filename
@spaces.GPU
def translate_text_input(text: str, source_lang: str, target_lang: str, session_id: str = "") -> str:
    """Validate a text-translation request from the UI and run it.

    Returns the translator's output, or a short message when the session is
    not logged in, the text is blank, or a language name is unknown.
    """
    if not is_authenticated(session_id):
        return "❌ Please log in to use this feature."

    if not text.strip():
        return "Please enter some text to translate."

    languages_known = source_lang in LANGUAGE_CODES and target_lang in LANGUAGE_CODES
    if not languages_known:
        return "Invalid language selection."

    translated = translator.translate_text(text, source_lang, target_lang)
    return translated
def _format_elapsed(total_seconds: float) -> str:
    """Render a duration as "Xm Ys", or just "Ys" when under a minute."""
    minutes = int(total_seconds // 60)
    seconds = int(total_seconds % 60)
    return f"{minutes}m {seconds}s" if minutes > 0 else f"{seconds}s"


@spaces.GPU
def translate_document(file, source_lang: str, target_lang: str, session_id: str = "") -> Tuple[Optional[str], str]:
    """Handle document translation while preserving original formatting.

    Args:
        file: The uploaded document, either a filesystem path or an object
            exposing the path as ``.name``. Only .pdf and .docx are accepted.
        source_lang: Display name of the source language (a ``LANGUAGE_CODES`` key).
        target_lang: Display name of the target language (a ``LANGUAGE_CODES`` key).
        session_id: Session id obtained at login.

    Returns:
        ``(output_path, status_message)``. ``output_path`` is the translated
        .docx in the system temp directory, or None when nothing was produced;
        ``status_message`` then explains why.
    """
    if not is_authenticated(session_id):
        return None, "❌ Please log in to use this feature."
    if file is None:
        return None, "Please upload a document."
    if source_lang not in LANGUAGE_CODES or target_lang not in LANGUAGE_CODES:
        return None, "Invalid language selection."

    start_time = time.time()
    try:
        # The upload widget in this file is declared with type="filepath", so
        # `file` is a plain path string there and has no `.name`; accept both
        # shapes instead of failing on every upload.
        file_path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name
        file_extension = os.path.splitext(file_path)[1].lower()
        formatting_info = None
        logger.info(f"Starting document translation: {source_lang} -> {target_lang}")

        if file_extension == '.pdf':
            text = extract_text_from_pdf(file_path)
        elif file_extension == '.docx':
            text, formatting_info = extract_text_from_docx(file_path)
        else:
            return None, "Unsupported file format. Please upload PDF or DOCX files only."

        # The extractors report failure through these exact message prefixes;
        # matching them (rather than any "Error...") keeps documents that
        # merely start with the word "Error" translatable.
        if text.startswith(("Error reading PDF:", "Error reading DOCX:")):
            return None, text

        word_count = len(text.split())
        char_count = len(text)
        logger.info(f"Document stats: {word_count} words, {char_count} characters")

        translate_start = time.time()
        translated_text = translator.translate_text(text, source_lang, target_lang)
        translate_duration = time.time() - translate_start
        logger.info(f"Core translation took: {translate_duration:.2f} seconds")

        # translate_text reports failures as a message string; surface it
        # instead of writing it into the output document as if it were text.
        if translated_text.startswith("Error during translation:"):
            return None, f"❌ {translated_text}"

        source_stem = os.path.splitext(os.path.basename(file_path))[0]
        output_filename = f"translated_{source_stem}.docx"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        if formatting_info and file_extension == '.docx':
            translated_paragraphs = translated_text.split('\n\n')
            if len(translated_paragraphs) == len(formatting_info):
                created = create_formatted_docx(translated_paragraphs, formatting_info, output_path)
            else:
                logger.warning(f"Paragraph count mismatch, using fallback")
                created = create_docx_with_text(translated_text, output_path)
        else:
            created = create_docx_with_text(translated_text, output_path)

        # Both writers return None when saving failed; do not hand the UI a
        # path to a file that was never written.
        if not created:
            return None, "❌ Error during document translation: could not create the output file."

        total_duration = time.time() - start_time
        time_str = _format_elapsed(total_duration)

        if word_count > 0 and total_duration > 0:
            words_per_minute = int((word_count / total_duration) * 60)
            speed_info = f" • Speed: {words_per_minute} words/min"
        else:
            speed_info = ""

        translation_type = "Same language processed" if source_lang == target_lang else "NLLB translation"
        status_message = (
            f"✅ Translation completed successfully!\n"
            f"⏱️ Time taken: {time_str}\n"
            f"📄 Document: {word_count} words, {char_count} characters\n"
            f"🔄 Type: {translation_type}{speed_info}\n"
            f"📁 Original formatting preserved in output file."
        )
        logger.info(f"Document translation completed in {total_duration:.2f} seconds")
        return output_path, status_message
    except Exception as e:
        time_str = _format_elapsed(time.time() - start_time)
        logger.error(f"Document translation error after {time_str}: {str(e)}")
        return None, f"❌ Error during document translation (after {time_str}): {str(e)}"
# Initialize translator
# Module-level instance used by translate_text_input() and translate_document().
# Constructing it calls load_model(), so the checkpoint is loaded at import time.
print("Initializing NLLB Translator...")
translator = NLLBTranslator(model_size="600M") # Use smaller model for stability
# Create the Gradio app
# Layout: a login column (visible at start) and a main translator column
# (hidden at start); handle_login / handle_logout below swap their visibility.
with gr.Blocks(title="NLLB Universal Translator", theme=gr.themes.Soft()) as demo:
# Holds the session id returned by authenticate(); "" means "not logged in".
session_state = gr.State("")
# Login interface
with gr.Column(visible=True) as login_column:
gr.Markdown("""
# 🌍 NLLB Universal Translator - Authentication Required
Translate between **200+ languages** using Meta's NLLB (No Language Left Behind) model.
Please enter your credentials to access the translation tool.
""")
with gr.Row():
# Empty side columns (scale 1 : 2 : 1) around the login form.
with gr.Column(scale=1):
pass
with gr.Column(scale=2):
with gr.Group():
gr.Markdown("### Login")
username_input = gr.Textbox(
label="Username",
placeholder="Enter username",
type="text"
)
password_input = gr.Textbox(
label="Password",
placeholder="Enter password",
type="password"
)
login_btn = gr.Button("Login", variant="primary", size="lg")
login_status = gr.Markdown("")
with gr.Column(scale=1):
pass
gr.Markdown("""
---
**Features:**
- 🔒 Secure authentication system
- 🌍 Support for **200+ languages** using Meta's NLLB model
- 📄 Document translation with formatting preservation
- 🚀 High-quality neural machine translation
- 💾 Preserves original document formatting and styling
- 🗺️ Includes indigenous, regional, and low-resource languages
- 📚 Historical and classical languages support
""")
# Main translator interface
with gr.Column(visible=False) as main_column:
gr.Markdown("""
# 🌍 NLLB Universal Translator
Translate text and documents between **200+ languages** using Meta's NLLB model.
Supports major world languages plus indigenous, regional, and low-resource languages.
""")
with gr.Tabs():
# Text Translation Tab
with gr.TabItem("📝 Text Translation"):
with gr.Row():
with gr.Column():
text_input = gr.Textbox(
label="Input Text",
placeholder="Enter text to translate...",
lines=6
)
with gr.Row():
# Both dropdowns offer every LANGUAGE_NAMES entry.
source_lang_text = gr.Dropdown(
choices=LANGUAGE_NAMES,
label="Source Language",
value="English",
filterable=True
)
target_lang_text = gr.Dropdown(
choices=LANGUAGE_NAMES,
label="Target Language",
value="Spanish",
filterable=True
)
translate_text_btn = gr.Button("🔄 Translate Text", variant="primary", size="lg")
with gr.Column():
text_output = gr.Textbox(
label="Translated Text",
lines=6,
interactive=False
)
gr.Markdown("""
**Supported Languages (200+):**
- 🇪🇺 **European**: English, Spanish, French, German, Italian, Russian, etc.
- 🇨🇳 **East Asian**: Chinese, Japanese, Korean, Mongolian
- 🇮🇳 **South Asian**: Hindi, Bengali, Tamil, Telugu, Urdu, Sanskrit, etc.
- 🇸🇦 **Middle Eastern**: Arabic, Persian, Hebrew, Turkish, Kurdish
- 🌍 **African**: Swahili, Yoruba, Hausa, Zulu, Amharic, Berber
- 🇻🇳 **Southeast Asian**: Vietnamese, Thai, Indonesian, Filipino, Burmese
- 🏝️ **Pacific**: Māori, Samoan, Hawaiian, Fijian, Tahitian
- 🏛️ **Historical**: Latin, Ancient Greek, Sanskrit, Aramaic
- 🗺️ **Indigenous**: Quechua, Guarani, Nahuatl, Maya, and many more
- 🔤 **Regional**: Welsh, Basque, Catalan, Breton, Faroese
""")
# Document Translation Tab
with gr.TabItem("📄 Document Translation"):
with gr.Row():
with gr.Column():
# NOTE(review): with type="filepath" the handler presumably receives a path
# string, while translate_document() reads file.name — confirm they agree.
file_input = gr.File(
label="📁 Upload Document",
file_types=[".pdf", ".docx"],
type="filepath"
)
with gr.Row():
source_lang_doc = gr.Dropdown(
choices=LANGUAGE_NAMES,
label="Source Language",
value="English",
filterable=True
)
target_lang_doc = gr.Dropdown(
choices=LANGUAGE_NAMES,
label="Target Language",
value="French",
filterable=True
)
translate_doc_btn = gr.Button("🔄 Translate Document", variant="primary", size="lg")
gr.Markdown("""
**Document Features:**
- 📝 Preserves original formatting
- 📋 Maintains paragraph structure
- 🎨 Keeps basic styling (bold, italic, underline)
- 📊 Supports PDF and DOCX files
- 💾 Outputs formatted DOCX file
""")
with gr.Column():
doc_status = gr.Textbox(
label="📊 Translation Status",
lines=6,
interactive=False
)
doc_output = gr.File(
label="📥 Download Translated Document"
)
# Examples
# The example fn passes an empty session id, so if it is ever executed
# translate_text_input() answers with the "please log in" message.
# cache_examples=False means results are not precomputed.
gr.Examples(
examples=[
["Hello, how are you today?", "English", "Spanish"],
["Bonjour, comment allez-vous?", "French", "English"],
["你好,你今天好吗?", "Chinese (Simplified)", "English"],
["नमस्ते, आप कैसे हैं?", "Hindi", "English"],
["مرحبا، كيف حالك؟", "Arabic", "English"],
["Machine learning is transforming the world.", "English", "French"],
],
inputs=[text_input, source_lang_text, target_lang_text],
outputs=[text_output],
fn=lambda text, src, tgt: translate_text_input(text, src, tgt, ""),
cache_examples=False,
label="Try these examples:"
)
# Logout functionality
with gr.Row():
logout_btn = gr.Button("🔓 Logout", variant="secondary", size="sm")
# Login handler: returns, in order, the status markdown, the login column,
# the main column, and the new session id ("" on failure) — matching the
# `outputs` list of login_btn.click below.
def handle_login(username, password):
success, session_id = authenticate(username, password)
if success:
return (
gr.Markdown("✅ **Login successful!** Welcome to the NLLB Universal Translator."),
gr.Column(visible=False),
gr.Column(visible=True),
session_id
)
else:
return (
gr.Markdown("❌ **Invalid credentials.** Please check your username and password."),
gr.Column(visible=True),
gr.Column(visible=False),
""
)
# Logout handler: drops the session (if any), shows the login column again,
# clears the stored session id and both credential boxes, and sets the status
# text — matching the `outputs` list of logout_btn.click below.
def handle_logout(session_id):
if session_id:
logout_session(session_id)
return (
gr.Column(visible=True),
gr.Column(visible=False),
"",
gr.Textbox(value=""),
gr.Textbox(value=""),
gr.Markdown("🔓 **Logged out successfully.** Please login again to continue.")
)
# Event handlers
login_btn.click(
fn=handle_login,
inputs=[username_input, password_input],
outputs=[login_status, login_column, main_column, session_state]
)
logout_btn.click(
fn=handle_logout,
inputs=[session_state],
outputs=[login_column, main_column, session_state, username_input, password_input, login_status]
)
# The two translate buttons forward the current session id so the handlers
# can check authentication themselves.
translate_text_btn.click(
fn=lambda text, src, tgt, session: translate_text_input(text, src, tgt, session),
inputs=[text_input, source_lang_text, target_lang_text, session_state],
outputs=[text_output]
)
translate_doc_btn.click(
fn=lambda file, src, tgt, session: translate_document(file, src, tgt, session),
inputs=[file_input, source_lang_doc, target_lang_doc, session_state],
outputs=[doc_output, doc_status]
)
print("NLLB Universal Translator initialized successfully!")
# Launch the app
# Only when run as a script; share=True is passed to demo.launch().
if __name__ == "__main__":
demo.launch(share=True)