"""Document Translation Toolkit.

Gradio app that translates plain text and documents (PDF, DOCX, TXT)
between English, Hindi and Marathi using Meta's NLLB-200 model.  An
idiom-dictionary pre-processing pass maps common English idioms to
natural Hindi/Marathi phrases so they are not translated literally.
"""

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
from PyPDF2 import PdfReader
import docx
import os
import re
import torch
from datetime import datetime
import pytz
from io import BytesIO
from docx import Document
import tempfile


def load_translation_model():
    """Load the NLLB-200 tokenizer and seq2seq model.

    Returns:
        tuple: ``(tokenizer, model)`` on success, ``(None, None)`` when
        loading fails (e.g. no network access or missing weights).
    """
    try:
        model_name = "facebook/nllb-200-distilled-600M"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None


# Load once at import time.  MODELS stays None when loading failed; the
# translation entry points check for that and return an error string.
tokenizer, model = load_translation_model()
MODELS = {"nllb": (tokenizer, model)} if tokenizer and model else None


def extract_text(file):
    """Extract plain text from a PDF, DOCX or TXT file.

    Args:
        file: Either a filesystem path (str) or a file-like object with a
            ``name`` attribute (as supplied by Gradio uploads).

    Returns:
        str: The extracted text, stripped of surrounding whitespace.

    Raises:
        Exception: wrapping any extraction failure; ``ValueError`` (also
            wrapped) for an unsupported extension.
    """
    try:
        # Determine the extension from the path or the upload's filename.
        if isinstance(file, str):
            ext = os.path.splitext(file)[1].lower()
        else:
            ext = os.path.splitext(file.name)[1].lower()

        if ext == ".pdf":
            file_content = None
            try:
                # Buffer the whole PDF in memory so PdfReader can seek freely.
                if isinstance(file, str):
                    with open(file, 'rb') as f:
                        file_content = BytesIO(f.read())
                else:
                    file_content = BytesIO(file.read())
                reader = PdfReader(file_content)
                pages = []
                for page in reader.pages:
                    # extract_text() may return None for image-only pages.
                    pages.append((page.extract_text() or "") + "\n")
                return "".join(pages).strip()
            except Exception as e:
                raise Exception(f"PDF extraction error: {str(e)}")
            finally:
                if file_content is not None:
                    file_content.close()

        elif ext == ".docx":
            # python-docx accepts both paths and file-like objects.
            doc = docx.Document(file)
            return "\n".join(para.text for para in doc.paragraphs).strip()

        elif ext == ".txt":
            if isinstance(file, str):
                with open(file, 'r', encoding='utf-8') as f:
                    return f.read().strip()
            return file.read().decode("utf-8").strip()

        else:
            raise ValueError("Unsupported file format")
    except Exception as e:
        raise Exception(f"Error extracting text: {str(e)}")


def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with natural target-language phrases.

    Args:
        text: Input text.
        src_lang: Short source-language code (``"en"``).
        tgt_lang: Short target-language code (``"hi"`` or ``"mr"``).

    Returns:
        str: Text with idioms substituted; returned unchanged for any
        unsupported language pair.
    """
    idiom_map = {}
    if src_lang == "en" and tgt_lang == "hi":
        idiom_map = {
            "no piece of cake": "कोई आसान काम नहीं",
            "piece of cake": "बहुत आसान काम",
            "bite the bullet": "दांतों तले उंगली दबाना",
            "tackle it head-on": "सीधे मुकाबला करना",
            "fell into place": "सब कुछ ठीक हो गया",
            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
            "with a little perseverance": "थोड़े से धैर्य से",
            # Additional common idioms
            "break a leg": "बहुत बहुत शुभकामनाएं",
            "hit the nail on the head": "बिल्कुल सही बात कहना",
            "once in a blue moon": "बहुत कम, कभी-कभार",
            "under the weather": "तबीयत ठीक नहीं",
            "cost an arm and a leg": "बहुत महंगा",
            "beating around the bush": "इधर-उधर की बात करना",
            "call it a day": "काम समाप्त करना",
            "burn the midnight oil": "रात-रात भर जागकर काम करना",
            "get the ball rolling": "शुरुआत करना",
            "pull yourself together": "खुद को संभालो",
            "shoot yourself in the foot": "अपना ही नुकसान करना",
            "take it with a grain of salt": "संदेह से लेना",
            "the last straw": "सहनशीलता की आखिरी सीमा",
            "time flies": "समय पंख लगाकर उड़ता है",
            "wrap your head around": "समझने की कोशिश करना",
            "cut corners": "काम में छोटा रास्ता अपनाना",
            "back to square one": "फिर से शुरू से",
            "blessing in disguise": "छिपा हुआ वरदान",
            "cry over spilled milk": "बीती बात पर पछताना",
            "keep your chin up": "हिम्मत रखना",
            # Work-related idioms
            "think outside the box": "नए तरीके से सोचना",
            "raise the bar": "मानक ऊंचा करना",
            "learning curve": "सीखने की प्रक्रिया",
            "up and running": "चालू और कार्यरत",
            "back to the drawing board": "फिर से योजना बनाना",
            # Project-related phrases
            "running into issues": "समस्याओं का सामना करना",
            "iron out the bugs": "खामियां दूर करना",
            "in the pipeline": "विचाराधीन",
            "moving forward": "आगे बढ़ते हुए",
            "touch base": "संपर्क में रहना",
            # Technical phrases
            "user-friendly": "उपयोगकर्ता के अनुकूल",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "अत्याधुनिक तकनीक",
            "proof of concept": "व्यवहार्यता का प्रमाण",
            "game changer": "खेल बदलने वाला",
            "a blessing in disguise": "छुपा हुआ वरदान",
            "actions speak louder than words": "कर्म शब्दों से अधिक प्रभावी होते हैं",
            "add fuel to the fire": "आग में घी डालना",
            "barking up the wrong tree": "गलत दिशा में प्रयास करना",
            "best of both worlds": "दोनों चीजों का लाभ",
            "cut to the chase": "मुद्दे पर आना",
            "don't judge a book by its cover": "किसी को उसके रूप से मत आंकिए",
            "easy does it": "धीरे-धीरे करो",
            "every cloud has a silver lining": "हर मुश्किल में आशा की किरण होती है",
            "get a taste of your own medicine": "जैसा किया वैसा भुगतो",
            "hit the sack": "सोने जाना",
            "let the cat out of the bag": "राज़ खोल देना",
            "miss the boat": "मौका चूक जाना",
            "no pain no gain": "बिना मेहनत के कुछ नहीं मिलता",
            "on the ball": "सचेत और सतर्क",
            "pull the plug": "काम रोक देना",
            "spill the beans": "राज़ खोलना",
            "the ball is in your court": "अब निर्णय तुम्हारे हाथ में है",
            "through thick and thin": "हर परिस्थिति में",
            "you can't have your cake and eat it too": "दोनों फायदे एक साथ नहीं हो सकते"
        }
    elif src_lang == "en" and tgt_lang == "mr":
        idiom_map = {
            "no piece of cake": "सोपं काम नाही",
            "piece of cake": "अतिशय सोपं काम",
            "bite the bullet": "कठीण निर्णय घेणे",
            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
            "fell into place": "सगळं व्यवस्थित झालं",
            "see the light at the end of the tunnel": "अंधारातून उजेडाची किरण दिसणे",
            "with a little perseverance": "थोड्या धीराने",
            "break a leg": "खूप शुभेच्छा",
            "hit the nail on the head": "अगदी बरोबर बोललात",
            "once in a blue moon": "क्वचितच, कधीतरी",
            "under the weather": "तब्येत ठीक नसणे",
            "cost an arm and a leg": "खूप महाग",
            "beating around the bush": "गोल गोल फिरवणे",
            "call it a day": "दिवसाचं काम संपवणे",
            "burn the midnight oil": "रात्रंदिवस मेहनत करणे",
            "get the ball rolling": "सुरुवात करणे",
            "pull yourself together": "स्वतःला सावरा",
            "shoot yourself in the foot": "स्वतःचेच पाय स्वतः कापणे",
            "take it with a grain of salt": "साशंक दृष्टीने पाहणे",
            "the last straw": "सहनशक्तीची शेवटची मर्यादा",
            "time flies": "वेळ पंख लावून उडतो",
            "wrap your head around": "समजून घेण्याचा प्रयत्न करणे",
            "cut corners": "कमी वेळात काम उरकणे",
            "back to square one": "पुन्हा सुरुवातीला",
            "blessing in disguise": "आशीर्वाद लपलेला",
            "cry over spilled milk": "झालेल्या गोष्टीसाठी रडत बसणे",
            "keep your chin up": "धीर धरा",
            # Work-related idioms
            "think outside the box": "वेगळ्या पद्धतीने विचार करणे",
            "raise the bar": "पातळी उंचावणे",
            "learning curve": "शिकण्याची प्रक्रिया",
            "up and running": "सुरू आणि कार्यरत",
            "back to the drawing board": "पुन्हा नव्याने योजना आखणे",
            # Project-related phrases
            "running into issues": "अडचणींना सामोरे जाणे",
            "iron out the bugs": "त्रुटी दूर करणे",
            "in the pipeline": "विचाराधीन",
            "moving forward": "पुढे जाताना",
            "touch base": "संपर्कात राहणे",
            # Technical phrases
            "user-friendly": "वापरकर्त्यास सोयीस्कर",
            "cutting-edge": "अत्याधुनिक",
            "state of the art": "सर्वोत्कृष्ट तंत्रज्ञान",
            "proof of concept": "संकल्पनेची सिद्धता",
            "game changer": "खेळ बदलणारी गोष्ट",
            "a blessing in disguise": "छुपलेले वरदान",
            "actions speak louder than words": "कृती शब्दांपेक्षा प्रभावी असतात",
            "add fuel to the fire": "आग ला फुंकर घालणे",
            "barking up the wrong tree": "चुकीच्या गोष्टीकडे लक्ष देणे",
            "best of both worlds": "दोनही गोष्टींचा लाभ",
            "cut to the chase": "थेट मुद्द्यावर येणे",
            "don't judge a book by its cover": "फक्त बाह्यरूप पाहून अंदाज लावू नका",
            "easy does it": "हळूहळू करा",
            "every cloud has a silver lining": "प्रत्येक संकटात संधी असते",
            "get a taste of your own medicine": "जसे कराल तसे भराल",
            "hit the sack": "झोपायला जाणे",
            "let the cat out of the bag": "गुपित उघड करणे",
            "miss the boat": "संधी गमावणे",
            "no pain no gain": "कष्टाशिवाय यश नाही",
            "on the ball": "सतर्क असणे",
            "pull the plug": "काम बंद करणे",
            "spill the beans": "गुपित सांगणे",
            "the ball is in your court": "निर्णय तुमच्या हाती आहे",
            "through thick and thin": "संकटसमयीही साथ देणे",
            "you can't have your cake and eat it too": "सगळं काही मिळवता येत नाही"
        }

    if idiom_map:
        # Longest-first ordering so multi-word idioms win over idioms they
        # contain (regex alternation takes the first alternative that matches).
        sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
        pattern = '|'.join(map(re.escape, sorted_idioms))
        if pattern:
            regex = re.compile(pattern, flags=re.IGNORECASE)
            # Map keys are lowercase, so lowercase the match before lookup.
            text = regex.sub(lambda m: idiom_map[m.group(0).lower()], text)
    return text


def translate_text(text, src_lang, tgt_lang):
    """Translate text between English/Hindi/Marathi with NLLB-200.

    Args:
        text: Source text.
        src_lang: Source language name ("English", "Hindi" or "Marathi").
        tgt_lang: Target language name.

    Returns:
        str: The translation, or an "Error..." message string on failure
        (this function never raises — callers rely on the string contract).
    """
    if src_lang == tgt_lang:
        return text

    # NLLB FLORES-200 language codes.
    lang_map = {"English": "eng_Latn", "Hindi": "hin_Deva", "Marathi": "mar_Deva"}
    # Short codes for the idiom preprocessor.  BUGFIX: the previous code
    # used src_lang[:2].lower(), which maps "Marathi" to "ma" and silently
    # disabled the Marathi idiom table (which expects "mr").
    short_code = {"English": "en", "Hindi": "hi", "Marathi": "mr"}

    src_lang_code = lang_map.get(src_lang)
    tgt_lang_code = lang_map.get(tgt_lang)
    if not src_lang_code or not tgt_lang_code:
        return "Error: Unsupported language combination"
    # Explicit guard: previously MODELS[...] on None raised an opaque
    # TypeError that surfaced as a confusing error string.
    if MODELS is None:
        return "Error during translation: translation model is not loaded"

    try:
        # Apply idiom substitution before neural translation.
        preprocessed_text = preprocess_idioms(
            text, short_code[src_lang], short_code[tgt_lang])

        tokenizer, model = MODELS["nllb"]

        # Pack sentences into chunks below the model's context budget.
        # The capture group keeps the terminators ('.', '!', '?', '।') as
        # separate split items so they are re-appended to their sentence.
        chunks = []
        current_chunk = ""
        for sentence in re.split('([.!?।]+)', preprocessed_text):
            if sentence.strip():
                if len(current_chunk) + len(sentence) < 450:
                    current_chunk += sentence
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence
        if current_chunk:
            chunks.append(current_chunk)

        translated_parts = []
        for chunk in chunks:
            if not chunk.strip():
                continue
            # BUGFIX: tell the tokenizer the source language; NLLB otherwise
            # tokenizes everything as its default source (English).
            tokenizer.src_lang = src_lang_code
            inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)
            # Force decoding to start with the target-language token.
            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
            translated = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_id,
                max_length=512,
                num_beams=5,
                length_penalty=1.0,
                no_repeat_ngram_size=3,
            )
            translated_parts.append(
                tokenizer.decode(translated[0], skip_special_tokens=True))

        return " ".join(translated_parts).strip()
    except Exception as e:
        return f"Error during translation: {str(e)}"


def _translate_block_text(text, source_lang, target_lang):
    """Translate text line by line, preserving paragraph/blank-line layout.

    Paragraphs are assumed to be separated by blank lines ('\\n\\n');
    empty lines inside a paragraph are kept verbatim.
    """
    translated_paragraphs = []
    for paragraph in text.split('\n\n'):
        translated_lines = []
        for line in paragraph.split('\n'):
            if line.strip():
                translated_lines.append(
                    translate_text(line, source_lang, target_lang))
            else:
                translated_lines.append('')  # preserve empty lines
        translated_paragraphs.append('\n'.join(translated_lines))
    return '\n\n'.join(translated_paragraphs)


def _translate_pdf(file, source_lang, target_lang, output_path):
    """Translate a PDF page by page and write the result as UTF-8 text.

    Returns (translated_text, output_path); raises on PDF errors.
    """
    file_content = None
    try:
        # Buffer the PDF in memory so PdfReader can seek.
        if isinstance(file, str):
            with open(file, 'rb') as f:
                file_content = BytesIO(f.read())
        else:
            file_content = BytesIO(file.read())
        reader = PdfReader(file_content)

        translated_pages = []
        for page in reader.pages:
            # BUGFIX: extract_text() can return None (image-only pages);
            # the previous code called .strip() on it and crashed.
            page_text = page.extract_text() or ""
            if not page_text.strip():
                continue
            translated_pages.append(
                _translate_block_text(page_text, source_lang, target_lang))

        final_text = '\n\n'.join(translated_pages)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        return final_text, output_path
    except Exception as e:
        raise Exception(f"PDF processing error: {str(e)}")
    finally:
        if file_content is not None:
            file_content.close()


def _translate_docx(file, source_lang, target_lang, output_path):
    """Translate a DOCX, approximately preserving run-level formatting.

    Returns (translated_text, output_path).
    """
    doc = Document(file)
    new_doc = Document()

    # Copy custom styles so translated paragraphs can reference them.
    for style in doc.styles:
        if style.name not in new_doc.styles:
            new_doc.styles.add_style(
                style.name, style.type,
                True if style.base_style else False)

    for para in doc.paragraphs:
        if not para.text.strip():
            new_doc.add_paragraph()  # preserve empty paragraphs
            continue

        new_para = new_doc.add_paragraph(
            style=para.style.name if para.style else None)

        # Collect the paragraph's text plus per-run formatting so the
        # translation can be redistributed across equivalent runs.
        runs_buffer = []
        formatting_map = []
        for run in para.runs:
            if run.text.strip():
                runs_buffer.append(run.text)
                formatting_map.append({
                    'bold': run.bold,
                    'italic': run.italic,
                    'underline': run.underline,
                    'font_size': run.font.size if run.font.size else None,
                    'font_name': run.font.name if run.font.name else None,
                    'color': (run.font.color.rgb
                              if run.font.color and run.font.color.rgb
                              else None),
                })

        if not runs_buffer:
            continue

        # Translate the whole paragraph at once for better context.
        combined_text = " ".join(runs_buffer)
        translated = translate_text(combined_text, source_lang, target_lang)

        # Heuristic: split the translated words evenly across the original
        # runs; the last run absorbs any remainder.  Word alignment across
        # languages is approximate by nature.
        words = translated.split()
        avg_len = len(words) // len(formatting_map)
        current_index = 0
        for i, fmt in enumerate(formatting_map):
            end_index = min(current_index + avg_len, len(words))
            if i == len(formatting_map) - 1:
                end_index = len(words)  # last run gets all remaining text
            chunk_text = " ".join(words[current_index:end_index])
            current_index = end_index

            new_run = new_para.add_run(chunk_text + " ")
            new_run.bold = fmt['bold']
            new_run.italic = fmt['italic']
            new_run.underline = fmt['underline']
            if fmt['font_size']:
                new_run.font.size = fmt['font_size']
            if fmt['font_name']:
                new_run.font.name = fmt['font_name']
            if fmt['color']:
                new_run.font.color.rgb = fmt['color']

    new_doc.save(output_path)
    text_content = "\n".join(
        para.text for para in new_doc.paragraphs if para.text.strip())
    return text_content, output_path


def translate_document(file, source_lang, target_lang):
    """Translate an uploaded document, preserving its basic structure.

    Args:
        file: Gradio upload (file-like with a ``name`` attribute) or None.
        source_lang: Source language name.
        target_lang: Target language name.

    Returns:
        tuple: ``(translated_text_or_error_message, output_path_or_None)``.
        The output file lands in the system temp directory; PDFs are
        written back as ``.txt`` since the PDF layout cannot be rebuilt.
    """
    try:
        if file is None:
            return "Please upload a file", None

        input_ext = os.path.splitext(file.name)[1].lower()
        temp_dir = tempfile.gettempdir()
        base_name = os.path.splitext(os.path.basename(file.name))[0]
        out_ext = '.txt' if input_ext == '.pdf' else input_ext
        output_path = os.path.join(temp_dir, f"translated_{base_name}{out_ext}")

        if input_ext == '.pdf':
            return _translate_pdf(file, source_lang, target_lang, output_path)

        elif input_ext == '.docx':
            return _translate_docx(file, source_lang, target_lang, output_path)

        elif input_ext == '.txt':
            input_text = extract_text(file)
            if not input_text:
                return "Could not extract text from the document", None
            final_text = _translate_block_text(
                input_text, source_lang, target_lang)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(final_text)
            return final_text, output_path

        else:
            # Fallback: plain extraction + single-shot translation.
            input_text = extract_text(file)
            if input_text is None:
                return "Could not extract text from the document", None
            translated_text = translate_text(
                input_text, source_lang, target_lang)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(translated_text)
            return translated_text, output_path
    except Exception as e:
        return f"Error: {str(e)}", None


def translate_text_direct(text, source_lang, target_lang):
    """Translate raw text from the text tab; prompts when input is empty."""
    if not text:
        return "Please enter some text"
    return translate_text(text, source_lang, target_lang)


def get_current_time():
    """Return the current UTC time as 'YYYY-MM-DD HH:MM:SS'."""
    utc_now = datetime.now(pytz.UTC)
    return utc_now.strftime("%Y-%m-%d %H:%M:%S")


def create_interface():
    """Build the Gradio Blocks app with document and text translation tabs."""
    # Header with a timestamp; note it is rendered once at build time,
    # not refreshed per request.
    header = gr.Markdown(
        f"""
        # Document Translation Toolkit
        *Current Date and Time (UTC):* {get_current_time()}
        *Current User's Login:* gauravchand
        """
    )

    doc_interface = gr.Interface(
        fn=translate_document,
        inputs=[
            gr.File(label="Upload Document (PDF, DOCX, or TXT)"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Source Language", value="English"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Target Language", value="Hindi"),
        ],
        outputs=[
            gr.Textbox(label="Translation", lines=10),
            gr.File(label="Download Translation"),
        ],
        title="Document Translation",
        description="Upload a document to translate",
    )

    text_interface = gr.Interface(
        fn=translate_text_direct,
        inputs=[
            gr.Textbox(lines=5, label="Enter text to translate"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Source Language", value="English"),
            gr.Dropdown(choices=["English", "Hindi", "Marathi"],
                        label="Target Language", value="Hindi"),
        ],
        outputs=gr.Textbox(label="Translation", lines=5),
        title="Text Translation",
        description="Enter text directly to translate",
    )

    demo = gr.Blocks()
    with demo:
        header.render()
        gr.TabbedInterface(
            [doc_interface, text_interface],
            tab_names=["Document Translation", "Text Translation"],
        )
    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()