"""Document Translation Toolkit.

Gradio app that translates plain text and documents (PDF, DOCX, TXT)
between English, Hindi and Marathi using Meta's NLLB-200 model.  An
idiom-dictionary pre-processing pass maps common English idioms to
natural Hindi/Marathi phrases so they are not translated literally.
"""

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
from PyPDF2 import PdfReader
import docx
import os
import re
import torch
from datetime import datetime
import pytz
from io import BytesIO
from docx import Document
import tempfile


def load_translation_model():
    """Load the NLLB-200 tokenizer and seq2seq model.

    Returns:
        tuple: ``(tokenizer, model)`` on success, ``(None, None)`` when
        loading fails (e.g. no network access or missing weights).
    """
    try:
        model_name = "facebook/nllb-200-distilled-600M"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None


# Load once at import time.  MODELS stays None when loading failed; the
# translation entry points check for that and return an error string.
tokenizer, model = load_translation_model()
MODELS = {"nllb": (tokenizer, model)} if tokenizer and model else None


def extract_text(file):
    """Extract plain text from a PDF, DOCX or TXT file.

    Args:
        file: Either a filesystem path (str) or a file-like object with a
            ``name`` attribute (as supplied by Gradio uploads).

    Returns:
        str: The extracted text, stripped of surrounding whitespace.

    Raises:
        Exception: wrapping any extraction failure; ``ValueError`` (also
            wrapped) for an unsupported extension.
    """
    try:
        # Determine the extension from the path or the upload's filename.
        if isinstance(file, str):
            ext = os.path.splitext(file)[1].lower()
        else:
            ext = os.path.splitext(file.name)[1].lower()

        if ext == ".pdf":
            file_content = None
            try:
                # Buffer the whole PDF in memory so PdfReader can seek freely.
                if isinstance(file, str):
                    with open(file, 'rb') as f:
                        file_content = BytesIO(f.read())
                else:
                    file_content = BytesIO(file.read())
                reader = PdfReader(file_content)
                pages = []
                for page in reader.pages:
                    # extract_text() may return None for image-only pages.
                    pages.append((page.extract_text() or "") + "\n")
                return "".join(pages).strip()
            except Exception as e:
                raise Exception(f"PDF extraction error: {str(e)}")
            finally:
                if file_content is not None:
                    file_content.close()

        elif ext == ".docx":
            # python-docx accepts both paths and file-like objects.
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs).strip()

        elif ext == ".txt":
            if isinstance(file, str):
                with open(file, 'r', encoding='utf-8') as f:
                    return f.read().strip()
            return file.read().decode("utf-8").strip()

        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        raise Exception(f"Error extracting text: {str(e)}")


def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with natural target-language phrases.

    Args:
        text: Input text.
        src_lang: Short source-language code (``"en"``).
        tgt_lang: Short target-language code (``"hi"`` or ``"mr"``).

    Returns:
        str: Text with idioms substituted; returned unchanged for any
        unsupported language pair.
    """
    idiom_map = {}
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला",
            "a blessing in disguise": "छुपा हुआ वरदान",
            "actions speak louder than words": "कर्म शब्दों से अधिक प्रभावी होते हैं",
            "add fuel to the fire": "आग में घी डालना",
            "barking up the wrong tree": "गलत दिशा में प्रयास करना",
            "best of both worlds": "दोनों चीजों का लाभ",
            "cut to the chase": "मुद्दे पर आना",
            "don't judge a book by its cover": "किसी को उसके रूप से मत आंकिए",
            "easy does it": "धीरे-धीरे करो",
            "every cloud has a silver lining": "हर मुश्किल में आशा की किरण होती है",
            "get a taste of your own medicine": "जैसा किया वैसा भुगतो",
            "hit the sack": "सोने जाना",
            "let the cat out of the bag": "राज़ खोल देना",
            "miss the boat": "मौका चूक जाना",
            "no pain no gain": "बिना मेहनत के कुछ नहीं मिलता",
            "on the ball": "सचेत और सतर्क",
            "pull the plug": "काम रोक देना",
            "spill the beans": "राज़ खोलना",
            "the ball is in your court": "अब निर्णय तुम्हारे हाथ में है",
            "through thick and thin": "हर परिस्थिति में",
            "you can't have your cake and eat it too": "दोनों फायदे एक साथ नहीं हो सकते"
        }
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "piece of cake": "अतिशय सोपं काम",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
            "with a little perseverance": "थोड्या धीराने",
            "break a leg": "खूप शुभेच्छा",
            "hit the nail on the head": "अगदी बरोबर बोललात",
            "once in a blue moon": "क्वचितच, कधीतरी",
            "under the weather": "तब्येत ठीक नसणे",
            "cost an arm and a leg": "खूप महाग",
            "beating around the bush": "गोल गोल फिरवणे",
            "call it a day": "दिवसाचं काम संपवणे",
            "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
            "get the ball rolling": "सुरुवात करणे",
            "pull yourself together": "स्वतःला सावरा",
            "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
            "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
            "the last straw": "सहनशक्तीची शेवटची मर्यादा",
            "time flies": "वेळ पंख लावून उडतो",
            "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
            "cut corners": "कमी वेळात काम उरकणे",
            "back to square one": "पुन्हा सुरुवातीला",
            "blessing in disguise": "आशीर्वाद लपलेला",
            "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
            "keep your chin up": "धीर धरा",
            # Work-related idioms
            "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
            "raise the bar": "पातळी उंचावणे",
            "learning curve": "शिकण्याची प्रक्रिया",
            "up and running": "सुरू आणि कार्यरत",
            "back to the drawing board": "पुन्हा नव्याने योजना आखणे",
            # Project-related phrases
            "running into issues": "अडचणींना सामोरे जाणे",
            "iron out the bugs": "त्रुटी दूर करणे",
            "in the pipeline": "विचाराधीन",
            "moving forward": "पुढे जाताना",
            "touch base": "संपर्कात राहणे",
            # Technical phrases
            "user-friendly": "वापरकर्त्यास सोयीस्कर",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
            "proof of concept": "संकल्पनेची सिद्धता",
            "game changer": "खेळ बदलणारी गोष्ट",
            "a blessing in disguise": "छुपलेले वरदान",
            "actions speak louder than words": "कृती शब्दांपेक्षा प्रभावी असतात",
            "add fuel to the fire": "आग ला फुंकर घालणे",
            "barking up the wrong tree": "चुकीच्या गोष्टीकडे लक्ष देणे",
            "best of both worlds": "दोनही गोष्टींचा लाभ",
            "cut to the chase": "थेट मुद्द्यावर येणे",
            "don't judge a book by its cover": "फक्त बाह्यरूप पाहून अंदाज लावू नका",
            "easy does it": "हळूहळू करा",
            "every cloud has a silver lining": "प्रत्येक संकटात संधी असते",
            "get a taste of your own medicine": "जसे कराल तसे भराल",
            "hit the sack": "झोपायला जाणे",
            "let the cat out of the bag": "गुपित उघड करणे",
            "miss the boat": "संधी गमावणे",
            "no pain no gain": "कष्टाशिवाय यश नाही",
            "on the ball": "सतर्क असणे",
            "pull the plug": "काम बंद करणे",
            "spill the beans": "गुपित सांगणे",
            "the ball is in your court": "निर्णय तुमच्या हाती आहे",
            "through thick and thin": "संकटसमयीही साथ देणे",
            "you can't have your cake and eat it too": "सगळं काही मिळवता येत नाही"
        }

    if idiom_map:
        # Longest-first ordering so multi-word idioms win over idioms they
        # contain (regex alternation takes the first alternative that matches).
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        pattern = '|'.join(map(re.escape, sorted_idioms))
        if pattern:
            regex = re.compile(pattern, flags=re.IGNORECASE)
            # Map keys are lowercase, so lowercase the match before lookup.
            text = regex.sub(lambda m: idiom_map[m.group(0).lower()], text)
    return text


def translate_text(text, src_lang, tgt_lang):
    """Translate text between English/Hindi/Marathi with NLLB-200.

    Args:
        text: Source text.
        src_lang: Source language name ("English", "Hindi" or "Marathi").
        tgt_lang: Target language name.

    Returns:
        str: The translation, or an "Error..." message string on failure
        (this function never raises — callers rely on the string contract).
    """
    if src_lang == tgt_lang:
        return text

    # NLLB FLORES-200 language codes.
    lang_map = {"English": "eng_Latn", "Hindi": "hin_Deva", "Marathi": "mar_Deva"}
    # Short codes for the idiom preprocessor.  BUGFIX: the previous code
    # used src_lang[:2].lower(), which maps "Marathi" to "ma" and silently
    # disabled the Marathi idiom table (which expects "mr").
    short_code = {"English": "en", "Hindi": "hi", "Marathi": "mr"}

    src_lang_code = lang_map.get(src_lang)
    tgt_lang_code = lang_map.get(tgt_lang)
    if not src_lang_code or not tgt_lang_code:
        return "Error: Unsupported language combination"
    # Explicit guard: previously MODELS[...] on None raised an opaque
    # TypeError that surfaced as a confusing error string.
    if MODELS is None:
        return "Error during translation: translation model is not loaded"

    try:
        # Apply idiom substitution before neural translation.
        preprocessed_text = preprocess_idioms(
            text, short_code[src_lang], short_code[tgt_lang])

        tokenizer, model = MODELS["nllb"]

        # Pack sentences into chunks below the model's context budget.
        # The capture group keeps the terminators ('.', '!', '?', '।') as
        # separate split items so they are re-appended to their sentence.
        chunks = []
        current_chunk = ""
        for sentence in re.split('([.!?।]+)', preprocessed_text):
            if sentence.strip():
                if len(current_chunk) + len(sentence) < 450:
                    current_chunk += sentence
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence
        if current_chunk:
            chunks.append(current_chunk)

        translated_parts = []
        for chunk in chunks:
            if not chunk.strip():
                continue
            # BUGFIX: tell the tokenizer the source language; NLLB otherwise
            # tokenizes everything as its default source (English).
            tokenizer.src_lang = src_lang_code
            inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)
            # Force decoding to start with the target-language token.
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )
            translated_parts.append(
                tokenizer.decode(translated[0], skip_special_tokens=True))

        return " ".join(translated_parts).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"


def _translate_block_text(text, source_lang, target_lang):
    """Translate text line by line, preserving paragraph/blank-line layout.

    Paragraphs are assumed to be separated by blank lines ('\\n\\n');
    empty lines inside a paragraph are kept verbatim.
    """
    translated_paragraphs = []
    for paragraph in text.split('\n\n'):
        translated_lines = []
        for line in paragraph.split('\n'):
            if line.strip():
                translated_lines.append(
                    translate_text(line, source_lang, target_lang))
            else:
                translated_lines.append('')  # preserve empty lines
        translated_paragraphs.append('\n'.join(translated_lines))
    return '\n\n'.join(translated_paragraphs)


def _translate_pdf(file, source_lang, target_lang, output_path):
    """Translate a PDF page by page and write the result as UTF-8 text.

    Returns (translated_text, output_path); raises on PDF errors.
    """
    file_content = None
    try:
        # Buffer the PDF in memory so PdfReader can seek.
        if isinstance(file, str):
            with open(file, 'rb') as f:
                file_content = BytesIO(f.read())
        else:
            file_content = BytesIO(file.read())
        reader = PdfReader(file_content)

        translated_pages = []
        for page in reader.pages:
            # BUGFIX: extract_text() can return None (image-only pages);
            # the previous code called .strip() on it and crashed.
            page_text = page.extract_text() or ""
            if not page_text.strip():
                continue
            translated_pages.append(
                _translate_block_text(page_text, source_lang, target_lang))

        final_text = '\n\n'.join(translated_pages)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        return final_text, output_path
    except Exception as e:
        raise Exception(f"PDF processing error: {str(e)}")
    finally:
        if file_content is not None:
            file_content.close()


def _translate_docx(file, source_lang, target_lang, output_path):
    """Translate a DOCX, approximately preserving run-level formatting.

    Returns (translated_text, output_path).
    """
    doc = Document(file)
    new_doc = Document()

    # Copy custom styles so translated paragraphs can reference them.
    for style in doc.styles:
        if style.name not in new_doc.styles:
            new_doc.styles.add_style(
                style.name, style.type,
                True if style.base_style else False)

    for para in doc.paragraphs:
        if not para.text.strip():
            new_doc.add_paragraph()  # preserve empty paragraphs
            continue

        new_para = new_doc.add_paragraph(
            style=para.style.name if para.style else None)

        # Collect the paragraph's text plus per-run formatting so the
        # translation can be redistributed across equivalent runs.
        runs_buffer = []
        formatting_map = []
        for run in para.runs:
            if run.text.strip():
                runs_buffer.append(run.text)
                formatting_map.append({
                    'bold': run.bold,
                    'italic': run.italic,
                    'underline': run.underline,
                    'font_size': run.font.size if run.font.size else None,
                    'font_name': run.font.name if run.font.name else None,
                    'color': (run.font.color.rgb
                              if run.font.color and run.font.color.rgb
                              else None),
                })

        if not runs_buffer:
            continue

        # Translate the whole paragraph at once for better context.
        combined_text = " ".join(runs_buffer)
        translated = translate_text(combined_text, source_lang, target_lang)

        # Heuristic: split the translated words evenly across the original
        # runs; the last run absorbs any remainder.  Word alignment across
        # languages is approximate by nature.
        words = translated.split()
        avg_len = len(words) // len(formatting_map)
        current_index = 0
        for i, fmt in enumerate(formatting_map):
            end_index = min(current_index + avg_len, len(words))
            if i == len(formatting_map) - 1:
                end_index = len(words)  # last run gets all remaining text
            chunk_text = " ".join(words[current_index:end_index])
            current_index = end_index

            new_run = new_para.add_run(chunk_text + " ")
            new_run.bold = fmt['bold']
            new_run.italic = fmt['italic']
            new_run.underline = fmt['underline']
            if fmt['font_size']:
                new_run.font.size = fmt['font_size']
            if fmt['font_name']:
                new_run.font.name = fmt['font_name']
            if fmt['color']:
                new_run.font.color.rgb = fmt['color']

    new_doc.save(output_path)
    text_content = "\n".join(
        para.text for para in new_doc.paragraphs if para.text.strip())
    return text_content, output_path


def translate_document(file, source_lang, target_lang):
    """Translate an uploaded document, preserving its basic structure.

    Args:
        file: Gradio upload (file-like with a ``name`` attribute) or None.
        source_lang: Source language name.
        target_lang: Target language name.

    Returns:
        tuple: ``(translated_text_or_error_message, output_path_or_None)``.
        The output file lands in the system temp directory; PDFs are
        written back as ``.txt`` since the PDF layout cannot be rebuilt.
    """
    try:
        if file is None:
            return "Please upload a file", None

        input_ext = os.path.splitext(file.name)[1].lower()
        temp_dir = tempfile.gettempdir()
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        out_ext = '.txt' if input_ext == '.pdf' else input_ext
        output_path = os.path.join(temp_dir, f"translated_{base_name}{out_ext}")

        if input_ext == '.pdf':
            return _translate_pdf(file, source_lang, target_lang, output_path)

        elif input_ext == '.docx':
            return _translate_docx(file, source_lang, target_lang, output_path)

        elif input_ext == '.txt':
            input_text = extract_text(file)
            if not input_text:
                return "Could not extract text from the document", None
            final_text = _translate_block_text(
                input_text, source_lang, target_lang)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(final_text)
            return final_text, output_path

        else:
            # Fallback: plain extraction + single-shot translation.
            input_text = extract_text(file)
            if input_text is None:
                return "Could not extract text from the document", None
            translated_text = translate_text(
                input_text, source_lang, target_lang)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(translated_text)
            return translated_text, output_path
    except Exception as e:
        return f"Error: {str(e)}", None


def translate_text_direct(text, source_lang, target_lang):
    """Translate raw text from the text tab; prompts when input is empty."""
    if not text:
        return "Please enter some text"
    return translate_text(text, source_lang, target_lang)


def get_current_time():
    """Return the current UTC time as 'YYYY-MM-DD HH:MM:SS'."""
    utc_now = datetime.now(pytz.UTC)
    return utc_now.strftime("%Y-%m-%d %H:%M:%S")


def create_interface():
    """Build the Gradio Blocks app with document and text translation tabs."""
    # Header with a timestamp; note it is rendered once at build time,
    # not refreshed per request.
    header = gr.Markdown(
        f"""
        # Document Translation Toolkit
        *Current Date and Time (UTC):* {get_current_time()}
        *Current User's Login:* gauravchand
        """
    )

    doc_interface = gr.Interface(
        fn=translate_document,
        inputs=[
            gr.File(label="Upload Document (PDF, DOCX, or TXT)"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Source Language", value="English"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Target Language", value="Hindi"),
        ],
        outputs=[
            gr.Textbox(label="Translation", lines=10),
            gr.File(label="Download Translation"),
        ],
        title="Document Translation",
        description="Upload a document to translate",
    )

    text_interface = gr.Interface(
        fn=translate_text_direct,
        inputs=[
            gr.Textbox(lines=5, label="Enter text to translate"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Source Language", value="English"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Target Language", value="Hindi"),
        ],
        outputs=gr.Textbox(label="Translation", lines=5),
        title="Text Translation",
        description="Enter text directly to translate",
    )

    demo = gr.Blocks()
    with demo:
        header.render()
        gr.TabbedInterface(
            [doc_interface, text_interface],
            tab_names=["Document Translation", "Text Translation"],
        )
    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()