Spaces:

gauravchand11
/

try

Build error

App Files Files Community

gauravchand11 committed 29 days ago

Commit

1337d1b

verified ·

1 Parent(s): 5e3207d

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -48

app.py CHANGED Viewed

@@ -5,13 +5,12 @@ import io
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, MT5ForConditionalGeneration
 import torch
 from pathlib import Path
-import tempfile
 from typing import Union, Tuple, List, Dict
 import os
 import sys
 from datetime import datetime, timezone
 import warnings
-import json
 # Filter warnings
 warnings.filterwarnings('ignore', category=UserWarning)
@@ -105,7 +104,6 @@ class TextBatcher:
     @staticmethod
     def _split_into_sentences(text: str) -> List[str]:
         """Split text into sentences with improved boundary detection"""
-        # Basic sentence boundary detection
         delimiters = ['. ', '! ', '? ', '।', '॥', '\n']
         sentences = []
         current = text
@@ -131,14 +129,12 @@ class ModelManager:
         try:
             device = "cuda" if torch.cuda.is_available() else "cpu"
-            # Load models with improved error handling
             models = {
                 "gemma": ModelManager._load_gemma_model(),
                 "nllb": ModelManager._load_nllb_model(),
                 "mt5": ModelManager._load_mt5_model()
             }
-            # Move models to appropriate device
             if not torch.cuda.is_available():
                 for model_tuple in models.values():
                     model_tuple[1].to(device)
@@ -208,7 +204,6 @@ class TranslationPipeline:
     @torch.no_grad()
     def process_text(self, text: str, source_lang: str, target_lang: str) -> str:
-        # Split text into manageable batches
         batches = TextBatcher.batch_process_text(text)
         final_results = []
@@ -231,10 +226,11 @@ class TranslationPipeline:
             final_results.append(corrected)
-        return " ".join(final_results)
     def _understand_context(self, text: str) -> str:
-        """Enhanced context understanding using Gemma model"""
         tokenizer, model = self.models["gemma"]
         prompt = f"""Analyze and provide context for translation:
@@ -267,12 +263,9 @@ Provide a clear and concise interpretation that maintains:
         return context.replace(prompt, "").strip()
     def _translate_with_context(self, text: str, source_lang: str, target_lang: str) -> str:
-        """Enhanced translation using NLLB model with context awareness"""
         tokenizer, model = self.models["nllb"]
-        source_lang_token = f"___{source_lang}___"
         target_lang_token = f"___{target_lang}___"
         inputs = tokenizer(text, return_tensors="pt", max_length=CONFIG["MAX_BATCH_LENGTH"], truncation=True)
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
@@ -293,7 +286,6 @@ Provide a clear and concise interpretation that maintains:
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     def _correct_grammar(self, text: str, target_lang: str) -> str:
-        """Enhanced grammar correction using MT5 model"""
         tokenizer, model = self.models["mt5"]
         lang_code = CONFIG["MT5_LANG_CODES"][target_lang]
         prompt = CONFIG["GRAMMAR_PROMPTS"][lang_code]
@@ -313,9 +305,20 @@ Provide a clear and concise interpretation that maintains:
         )
         corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        for prefix in CONFIG["GRAMMAR_PROMPTS"].values():
-            corrected = corrected.replace(prefix, "")
-        return corrected.strip()
 class DocumentExporter:
     """Handles document export operations"""
@@ -328,31 +331,23 @@ class DocumentExporter:
         buffer = io.BytesIO()
         doc.save(buffer)
         buffer.seek(0)
-        return buffer
-    @staticmethod
-    def save_as_text(text: str) -> io.BytesIO:
-        buffer = io.BytesIO()
-        buffer.write(text.encode())
-        buffer.seek(0)
         return buffer
 def main():
     st.title("🌐 Enhanced Document Translation App")
-    # Check for HF_TOKEN
-    if not os.environ.get('HF_TOKEN'):
-        st.error("HF_TOKEN not found in environment variables. Please add it in the Spaces settings.")
-        st.stop()
     # Display system info
     st.sidebar.markdown(f"""
     ### System Information
     **Current UTC Time:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}
-    **User:** {os.environ.get('USER', 'unknown')}
     """)
     # Load models
     with st.spinner("Loading models... This may take a few minutes."):
         try:
@@ -412,25 +407,14 @@ def main():
                 key="translation_result"
             )
-            # Download options
-            st.markdown("### Download Options")
-            col1, col2 = st.columns(2)
-            with col1:
-                st.download_button(
-                    label="Download as TXT",
-                    data=DocumentExporter.save_as_text(final_text),
-                    file_name="translated_document.txt",
-                    mime="text/plain"
-                )
-            with col2:
-                st.download_button(
-                    label="Download as DOCX",
-                    data=DocumentExporter.save_as_docx(final_text),
-                    file_name="translated_document.docx",
-                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-                )
             status_text.text("Translation completed successfully!")
             progress_bar.progress(100)

 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, MT5ForConditionalGeneration
 import torch
 from pathlib import Path
 from typing import Union, Tuple, List, Dict
 import os
 import sys
 from datetime import datetime, timezone
 import warnings
+import re
 # Filter warnings
 warnings.filterwarnings('ignore', category=UserWarning)
     @staticmethod
     def _split_into_sentences(text: str) -> List[str]:
         """Split text into sentences with improved boundary detection"""
         delimiters = ['. ', '! ', '? ', '।', '॥', '\n']
         sentences = []
         current = text
         try:
             device = "cuda" if torch.cuda.is_available() else "cpu"
             models = {
                 "gemma": ModelManager._load_gemma_model(),
                 "nllb": ModelManager._load_nllb_model(),
                 "mt5": ModelManager._load_mt5_model()
             }
             if not torch.cuda.is_available():
                 for model_tuple in models.values():
                     model_tuple[1].to(device)
     @torch.no_grad()
     def process_text(self, text: str, source_lang: str, target_lang: str) -> str:
         batches = TextBatcher.batch_process_text(text)
         final_results = []
             final_results.append(corrected)
+        # Clean up the final text
+        final_text = " ".join(final_results)
+        return self._clean_text(final_text)
     def _understand_context(self, text: str) -> str:
         tokenizer, model = self.models["gemma"]
         prompt = f"""Analyze and provide context for translation:
         return context.replace(prompt, "").strip()
     def _translate_with_context(self, text: str, source_lang: str, target_lang: str) -> str:
         tokenizer, model = self.models["nllb"]
         target_lang_token = f"___{target_lang}___"
         inputs = tokenizer(text, return_tensors="pt", max_length=CONFIG["MAX_BATCH_LENGTH"], truncation=True)
         inputs = {k: v.to(model.device) for k, v in inputs.items()}
         return tokenizer.decode(outputs[0], skip_special_tokens=True)
     def _correct_grammar(self, text: str, target_lang: str) -> str:
         tokenizer, model = self.models["mt5"]
         lang_code = CONFIG["MT5_LANG_CODES"][target_lang]
         prompt = CONFIG["GRAMMAR_PROMPTS"][lang_code]
         )
         corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return self._clean_text(corrected.replace(prompt, "").strip())
+    def _clean_text(self, text: str) -> str:
+        """Normalize decoded model output before it is returned.
+
+        Removes ``<extra_id_N>`` sentinel tokens, collapses every run of
+        whitespace (newlines and tabs included) into a single space, and
+        deletes whitespace that sits directly before ``. , ! ?`` or the
+        danda marks ``।`` / ``॥``.
+
+        Args:
+            text: Text to clean.
+
+        Returns:
+            The cleaned text with leading and trailing whitespace stripped.
+        """
+        # Remove MT5 sentinel tokens such as <extra_id_0> left in the output
+        text = re.sub(r'<extra_id_\d+>', '', text)
+        # Collapse any whitespace run to one space; this also flattens line
+        # breaks, and closes gaps left by the token removal above
+        text = re.sub(r'\s+', ' ', text)
+        # Drop whitespace that precedes punctuation so it attaches to the
+        # preceding word
+        text = re.sub(r'\s+([.,!?।॥])', r'\1', text)
+        return text.strip()
 class DocumentExporter:
     """Handles document export operations"""
         buffer = io.BytesIO()
         doc.save(buffer)
         buffer.seek(0)
         return buffer
 def main():
     st.title("🌐 Enhanced Document Translation App")
     # Display system info
     st.sidebar.markdown(f"""
     ### System Information
     **Current UTC Time:** {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}
+    **User:** {os.environ.get('USER', 'gauravchand')}
     """)
+    # Check for HF_TOKEN
+    if not os.environ.get('HF_TOKEN'):
+        st.error("HF_TOKEN not found in environment variables. Please add it in the Spaces settings.")
+        st.stop()
     # Load models
     with st.spinner("Loading models... This may take a few minutes."):
         try:
                 key="translation_result"
             )
+            # Download option
+            st.markdown("### Download Option")
+            st.download_button(
+                label="Download as DOCX",
+                data=DocumentExporter.save_as_docx(final_text),
+                file_name="translated_document.docx",
+                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            )
             status_text.text("Translation completed successfully!")
             progress_bar.progress(100)