Spaces:

gauravchand11
/

try

Build error

App Files Files Community

gauravchand11 committed 29 days ago

Commit

8b4e117

verified ·

1 Parent(s): 77a6efe

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -87

app.py CHANGED Viewed

@@ -4,14 +4,6 @@ from PyPDF2 import PdfReader
 import docx
 import os
 import re
-from datetime import datetime
-# Page config
-st.set_page_config(
-    page_title="Document Translator (NLLB-200)",
-    page_icon="📄",
-    layout="wide"
-)
 # Load NLLB model and tokenizer
 @st.cache_resource
@@ -27,92 +19,78 @@ def initialize_models():
     tokenizer, model = load_translation_model()
     return {"nllb": (tokenizer, model)}
def split_long_sentence(sentence, max_length=200):
    """Split a long sentence into word-aligned chunks of bounded length.

    Args:
        sentence: The text to split.
        max_length: Maximum number of characters allowed per chunk.

    Returns:
        A list of chunks. A sentence that already fits is returned as a
        single-element list, unmodified. Otherwise the sentence is split on
        whitespace and words are greedily packed, joined by single spaces.
        A single word longer than ``max_length`` is kept whole in its own
        chunk rather than being cut mid-word.
    """
    if len(sentence) <= max_length:
        return [sentence]

    chunks = []
    current_chunk = ""
    for word in sentence.split():
        candidate = f"{current_chunk} {word}" if current_chunk else word
        if len(candidate) <= max_length:
            current_chunk = candidate
            continue
        # Only flush a chunk that actually holds text: when the very first
        # word is over-long there is nothing to flush yet, and emitting ""
        # would hand an empty string to the translator.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = word

    if current_chunk:
        chunks.append(current_chunk)
    return chunks
 def preprocess_idioms(text, src_lang, tgt_lang):
     if src_lang == "en" and tgt_lang == "hi":
         idiom_map = {
-            # Common English-Hindi idiom mappings
             "no piece of cake": "कोई आसान काम नहीं",
             "bite the bullet": "दांतों तले उंगली दबाना",
-            "tackle it head-on": "इसे पूरे मन से हाथ में लेना",
-            "fell into place": "ठीक हो गया",
-            "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखाई देना",
             "with a little perseverance": "थोड़े से धैर्य से",
-            "break the ice": "बातचीत की शुरुआत करना",
-            "on cloud nine": "सातवें आसमान पर होना",
-            "once in a blue moon": "कभी-कभार",
             "beating around the bush": "इधर-उधर की बात करना",
-            "burning the midnight oil": "रात-रात भर जागकर काम करना",
-            "calm before the storm": "तूफान से पहले की शांति",
-            "cost an arm and a leg": "बहुत महंगा होना",
-            "blessing in disguise": "छुपा हुआ वरदान",
-            "kill two birds with one stone": "एक पंथ दो काज",
-            "a piece of cake": "बहुत आसान काम",
-            "under the weather": "तबीयत ठीक न होना",
             "pull yourself together": "खुद को संभालो",
-            "rise and shine": "जल्दी उठो और तैयार हो जाओ",
             "time flies": "समय पंख लगाकर उड़ता है",
-            "actions speak louder than words": "कथनी से करनी बड़ी",
-            "all ears": "पूरा ध्यान से सुन रहा हूं",
-            "back to square one": "वापस शुरुआत में",
-            "better late than never": "देर आये दुरुस्त आये",
             "cry over spilled milk": "बीती बात पर पछताना",
-            "down to earth": "सरल स्वभाव का",
-            "every cloud has a silver lining": "हर मुसीबत में कोई न कोई अच्छाई छिपी होती है",
-            "food for thought": "सोचने वाली बात",
-            "give someone the benefit of the doubt": "शक का फायदा देना",
-            "hit the nail on the head": "सटीक बात कहना",
-            "in hot water": "मुसीबत में होना"
         }
         # Sort idioms by length (longest first) to handle overlapping phrases
         sorted_idioms = sorted(idiom_map.keys(), key=len, reverse=True)
-        # Replace idioms with their translations
-        for idiom in sorted_idioms:
-            pattern = r'\b' + re.escape(idiom) + r'\b'
-            text = re.sub(pattern, idiom_map[idiom], text, flags=re.IGNORECASE)
-    elif src_lang == "en" and tgt_lang == "mr":
-        idiom_map = {
-            "no piece of cake": "सोपं काम नाही",
-            "bite the bullet": "कठीण निर्णय घेणे",
-            "tackle it head-on": "समस्येला थेट सामोरे जाणे",
-            "fell into place": "सगळं व्यवस्थित झालं",
-            "see the light at the end of the tunnel": "अंधारातून प्रकाशाकडे जाणे",
-            "with a little perseverance": "थोड्या धीराने",
-            "break the ice": "संभाषणाची सुरुवात करणे",
-            "on cloud nine": "आनंदात असणे",
-            "once in a blue moon": "क्वचितच",
-            "burning the midnight oil": "रात्रंदिवस मेहनत करणे",
-            "better late than never": "उशीर का होईना पण योग्य वेळी"
-        }
-        for idiom, translation in idiom_map.items():
-            pattern = r'\b' + re.escape(idiom) + r'\b'
-            text = re.sub(pattern, translation, text, flags=re.IGNORECASE)
     return text
 def extract_text(file):
     ext = os.path.splitext(file.name)[1].lower()
@@ -136,6 +114,7 @@ def extract_text(file):
     else:
         raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
 def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text
@@ -147,22 +126,106 @@ def translate_text(text, src_lang, tgt_lang, models):
         return "Error: Unsupported language combination"
     tgt_lang_code = lang_map[tgt_lang]
     tokenizer, model = models["nllb"]
     # Preprocess for idioms
     preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
-    # Split text into smaller chunks (sentences)
-    sentences = re.split(r'(?<=[.!?])\s+', preprocessed_text)
-    translated_text = []
-    for sentence in sentences:
         if sentence.strip():
-            chunks = split_long_sentence(sentence, max_length=200)
-            for chunk in chunks:
-                try:
-                    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-                    translated = model.generate(
-                        **inputs

 import docx
 import os
 import re
 # Load NLLB model and tokenizer
 @st.cache_resource
     tokenizer, model = load_translation_model()
     return {"nllb": (tokenizer, model)}
# Enhanced idiom mapping with more comprehensive translations.
# Keys must stay lowercase: matches are case-folded before lookup.
_EN_HI_IDIOMS = {
    # Basic phrases
    "no piece of cake": "कोई आसान काम नहीं",
    "piece of cake": "बहुत आसान काम",
    "bite the bullet": "दांतों तले उंगली दबाना",
    "tackle it head-on": "सीधे मुकाबला करना",
    "fell into place": "सब कुछ ठीक हो गया",
    "see the light at the end of the tunnel": "मुश्किलों के अंत में उम्मीद की किरण दिखना",
    "with a little perseverance": "थोड़े से धैर्य से",
    # Additional common idioms
    "break a leg": "बहुत बहुत शुभकामनाएं",
    "hit the nail on the head": "बिल्कुल सही बात कहना",
    "once in a blue moon": "बहुत कम, कभी-कभार",
    "under the weather": "तबीयत ठीक नहीं",
    "cost an arm and a leg": "बहुत महंगा",
    "beating around the bush": "इधर-उधर की बात करना",
    "call it a day": "काम समाप्त करना",
    "burn the midnight oil": "रात-रात भर जागकर काम करना",
    "get the ball rolling": "शुरुआत करना",
    "pull yourself together": "खुद को संभालो",
    "shoot yourself in the foot": "अपना ही नुकसान करना",
    "take it with a grain of salt": "संदेह से लेना",
    "the last straw": "सहनशीलता की आखिरी सीमा",
    "time flies": "समय पंख लगाकर उड़ता है",
    "wrap your head around": "समझने की कोशिश करना",
    "cut corners": "काम में छोटा रास्ता अपनाना",
    "back to square one": "फिर से शुरू से",
    "blessing in disguise": "छिपा हुआ वरदान",
    "cry over spilled milk": "बीती बात पर पछताना",
    "keep your chin up": "हिम्मत रखना",
    # Work-related idioms
    "think outside the box": "नए तरीके से सोचना",
    "raise the bar": "मानक ऊंचा करना",
    "learning curve": "सीखने की प्रक्रिया",
    "up and running": "चालू और कार्यरत",
    "back to the drawing board": "फिर से योजना बनाना",
    # Project-related phrases
    "running into issues": "समस्याओं का सामना करना",
    "iron out the bugs": "खामियां दूर करना",
    "in the pipeline": "विचाराधीन",
    "moving forward": "आगे बढ़ते हुए",
    "touch base": "संपर्क में रहना",
    # Technical phrases
    "user-friendly": "उपयोगकर्ता के अनुकूल",
    "cutting-edge": "अत्याधुनिक",
    "state of the art": "अत्याधुनिक तकनीक",
    "proof of concept": "व्यवहार्यता का प्रमाण",
    "game changer": "खेल बदलने वाला",
}

# One alternation for all idioms, longest first so "no piece of cake" wins
# over "piece of cake". The lookarounds require a non-word character (or the
# string edge) on both sides, so an idiom is never replaced inside a larger
# word such as "the last strawberry".
_EN_HI_IDIOM_RE = re.compile(
    r"(?<!\w)(?:"
    + "|".join(
        re.escape(idiom)
        for idiom in sorted(_EN_HI_IDIOMS, key=len, reverse=True)
    )
    + r")(?!\w)",
    flags=re.IGNORECASE,
)


def preprocess_idioms(text, src_lang, tgt_lang):
    """Replace known English idioms with Hindi equivalents before translation.

    Args:
        text: Source text to scan.
        src_lang: Source language code; idioms are only handled for "en".
        tgt_lang: Target language code; idioms are only handled for "hi".

    Returns:
        For the en -> hi pair, the text with every whole-phrase idiom
        (matched case-insensitively, in a single pass) replaced by its Hindi
        rendering. For any other pair the text is returned unchanged.
    """
    if src_lang != "en" or tgt_lang != "hi":
        return text

    def replace_idiom(match):
        matched = match.group(0)
        # Case-insensitive matching can accept characters whose folded form
        # is not the ASCII key; fall back to the original text instead of
        # raising KeyError.
        return _EN_HI_IDIOMS.get(matched.casefold(), matched)

    return _EN_HI_IDIOM_RE.sub(replace_idiom, text)
+# Function to extract text from different file types
 def extract_text(file):
     ext = os.path.splitext(file.name)[1].lower()
     else:
         raise ValueError("Unsupported file format. Please upload PDF, DOCX, or TXT files.")
+# Translation function with improved chunking
 def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text
         return "Error: Unsupported language combination"
     tgt_lang_code = lang_map[tgt_lang]
     tokenizer, model = models["nllb"]
     # Preprocess for idioms
     preprocessed_text = preprocess_idioms(text, src_lang, tgt_lang)
+    # Improved chunking: Split by sentences while preserving context
+    chunks = []
+    current_chunk = ""
+    for sentence in re.split('([.!?।]+)', preprocessed_text):
         if sentence.strip():
+            if len(current_chunk) + len(sentence) < 450:  # Leave room for tokenization
+                current_chunk += sentence
+            else:
+                if current_chunk:
+                    chunks.append(current_chunk)
+                current_chunk = sentence
+    if current_chunk:
+        chunks.append(current_chunk)
+    translated_text = ""
+    for chunk in chunks:
+        if chunk.strip():
+            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
+            translated = model.generate(
+                **inputs,
+                forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang_code],
+                max_length=512,
+                num_beams=5,  # Improved beam search
+                length_penalty=1.0,  # Balanced length penalty
+                no_repeat_ngram_size=3  # Avoid repetition
+            )
+            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
+            translated_text += translated_chunk + " "
+    return translated_text.strip()
# Persist a piece of text next to the app so it can be offered for download.
def save_text_to_file(text, original_filename, prefix="translated"):
    """Write *text* to a UTF-8 ``.txt`` file named after the uploaded file.

    Args:
        text: The content to write.
        original_filename: Name (or path) of the source document; only its
            final path component is used.
        prefix: Label placed in front of the file name.

    Returns:
        The name of the file that was written, ``<prefix>_<basename>.txt``,
        relative to the current working directory.
    """
    base_name = os.path.basename(original_filename)
    destination = "{}_{}.txt".format(prefix, base_name)
    with open(destination, mode="w", encoding="utf-8") as handle:
        handle.write(text)
    return destination
# End-to-end pipeline: extract -> translate -> save.
def process_document(file, source_lang, target_lang, models):
    """Translate an uploaded document and store the outcome on disk.

    Args:
        file: The uploaded file object; it is passed to ``extract_text`` and
            its ``name`` attribute is used to name the output file.
        source_lang: Language code of the document.
        target_lang: Language code to translate into.
        models: Model bundle forwarded to ``translate_text``.

    Returns:
        A ``(output_file, text)`` tuple. ``text`` is the translation, or a
        message starting with ``"Error:"`` when translation reported a
        problem or an exception was raised; in the error cases the file is
        saved with the ``"error"`` prefix.
    """
    try:
        extracted = extract_text(file)
        outcome = translate_text(extracted, source_lang, target_lang, models)
        # translate_text signals failure in-band with an "Error:" prefix;
        # only then override the default file-name prefix.
        save_options = {"prefix": "error"} if outcome.startswith("Error:") else {}
        saved_path = save_text_to_file(outcome, file.name, **save_options)
        return saved_path, outcome
    except Exception as exc:
        failure = "Error: " + str(exc)
        saved_path = save_text_to_file(failure, file.name, prefix="error")
        return saved_path, failure
# Streamlit interface
def main():
    """Render the translator page and run a translation on demand.

    Shows the title and instructions, loads the models, and offers a file
    uploader plus source/target language pickers. The "Translate" button is
    only rendered once a file has been uploaded; pressing it runs
    ``process_document``, shows the resulting text, and offers the saved
    file as a download.
    """
    st.title("Document Translator (NLLB-200)")
    st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages (English, Hindi, Marathi).")

    # Initialize models
    models = initialize_models()

    # File uploader
    uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])

    # Language selection, side by side
    left_column, right_column = st.columns(2)
    with left_column:
        source_lang = st.selectbox("Source Language", ["en", "hi", "mr"], index=0)
    with right_column:
        target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)

    # Nothing to translate yet: do not even render the button.
    if uploaded_file is None:
        return
    if not st.button("Translate"):
        return

    with st.spinner("Translating..."):
        saved_path, translated = process_document(
            uploaded_file, source_lang, target_lang, models
        )

        # Display result
        st.text_area("Translated Text", translated, height=300)

        # Provide download button
        with open(saved_path, "rb") as download_source:
            st.download_button(
                label="Download Translated Document",
                data=download_source,
                file_name=os.path.basename(saved_path),
                mime="text/plain",
            )


if __name__ == "__main__":
    main()