gauravchand11 committed on
Commit
5a89d4a
·
verified ·
1 Parent(s): f88b938

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -59
app.py CHANGED
@@ -10,11 +10,26 @@ from typing import Union, Tuple
10
  import os
11
  import sys
12
  from datetime import datetime, timezone
 
13
 
14
- # Display current information in sidebar
15
- st.sidebar.text(f"Current Date and Time (UTC):")
16
- st.sidebar.text(datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'))
17
- st.sidebar.text(f"Current User's Login: {os.environ.get('USER', 'gauravchand')}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Get Hugging Face token from environment variables
20
  HF_TOKEN = os.environ.get('HF_TOKEN')
@@ -61,7 +76,8 @@ def load_models():
61
  nllb_tokenizer = AutoTokenizer.from_pretrained(
62
  "facebook/nllb-200-distilled-600M",
63
  token=HF_TOKEN,
64
- trust_remote_code=True
 
65
  )
66
  nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
67
  "facebook/nllb-200-distilled-600M",
@@ -75,7 +91,9 @@ def load_models():
75
  mt5_tokenizer = AutoTokenizer.from_pretrained(
76
  "google/mt5-small",
77
  token=HF_TOKEN,
78
- trust_remote_code=True
 
 
79
  )
80
  mt5_model = MT5ForConditionalGeneration.from_pretrained(
81
  "google/mt5-small",
@@ -155,7 +173,6 @@ def interpret_context(text: str, gemma_tuple: Tuple) -> str:
155
  """Use Gemma model to interpret context and understand regional nuances."""
156
  tokenizer, model = gemma_tuple
157
 
158
- # Split text into batches
159
  batches = batch_process_text(text)
160
  interpreted_batches = []
161
 
@@ -186,23 +203,21 @@ def translate_text(text: str, source_lang: str, target_lang: str, nllb_tuple: Tu
186
  """Translate text using NLLB model."""
187
  tokenizer, model = nllb_tuple
188
 
189
- # Split text into batches
190
  batches = batch_process_text(text)
191
  translated_batches = []
192
 
193
  for batch in batches:
194
- # Prepare the input text with source language token
195
- inputs = tokenizer(batch, return_tensors="pt", max_length=512, truncation=True)
 
196
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
197
 
198
- # Get target language token ID
199
- target_lang_token = f"___{target_lang}___"
200
- target_lang_id = tokenizer.convert_tokens_to_ids(target_lang_token)
201
 
202
- # Generate translation
203
  outputs = model.generate(
204
  **inputs,
205
- forced_bos_token_id=target_lang_id,
206
  max_length=512,
207
  do_sample=True,
208
  temperature=0.7,
@@ -217,21 +232,16 @@ def translate_text(text: str, source_lang: str, target_lang: str, nllb_tuple: Tu
217
 
218
  @torch.no_grad()
219
  def correct_grammar(text: str, target_lang: str, mt5_tuple: Tuple) -> str:
220
- """
221
- Correct grammar using MT5 model for all supported languages.
222
- Uses a text-to-text approach with language-specific prompts.
223
- """
224
  tokenizer, model = mt5_tuple
225
  lang_code = MT5_LANG_CODES[target_lang]
226
 
227
- # Language-specific prompts for grammar correction
228
  prompts = {
229
  'en': "grammar: ",
230
  'hi': "व्याकरण सुधार: ",
231
  'mr': "व्याकरण सुधारणा: "
232
  }
233
 
234
- # Split text into batches
235
  batches = batch_process_text(text)
236
  corrected_batches = []
237
 
@@ -251,8 +261,6 @@ def correct_grammar(text: str, target_lang: str, mt5_tuple: Tuple) -> str:
251
  )
252
 
253
  corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
254
-
255
- # Clean up any artifacts from the model output
256
  for prefix in prompts.values():
257
  corrected_text = corrected_text.replace(prefix, "")
258
  corrected_text = corrected_text.strip()
@@ -273,7 +281,7 @@ def save_as_docx(text: str) -> io.BytesIO:
273
  return docx_buffer
274
 
275
  def main():
276
- st.title("Document Translation App")
277
 
278
  # Load models
279
  with st.spinner("Loading models... This may take a few minutes."):
@@ -306,40 +314,52 @@ def main():
306
  index=1
307
  )
308
 
309
- if uploaded_file and st.button("Translate"):
310
  try:
311
- with st.spinner("Processing document..."):
312
- # Extract text
313
- text = extract_text_from_file(uploaded_file)
314
-
315
- # Interpret context
316
- with st.spinner("Interpreting context..."):
317
- interpreted_text = interpret_context(text, gemma_tuple)
318
-
319
- # Translate
320
- with st.spinner("Translating..."):
321
- translated_text = translate_text(
322
- interpreted_text,
323
- SUPPORTED_LANGUAGES[source_language],
324
- SUPPORTED_LANGUAGES[target_language],
325
- nllb_tuple
326
- )
327
-
328
- # Grammar correction
329
- with st.spinner("Correcting grammar..."):
330
- corrected_text = correct_grammar(
331
- translated_text,
332
- SUPPORTED_LANGUAGES[target_language],
333
- mt5_tuple
334
- )
335
-
336
- # Display result
337
- st.subheader("Translation Result:")
338
- st.text_area("Translated Text:", value=corrected_text, height=150)
339
-
340
- # Download options
341
- st.subheader("Download Translation:")
342
-
 
 
 
 
 
 
 
 
 
 
 
 
343
  # Text file download
344
  text_buffer = io.BytesIO()
345
  text_buffer.write(corrected_text.encode())
@@ -351,7 +371,8 @@ def main():
351
  file_name="translated_document.txt",
352
  mime="text/plain"
353
  )
354
-
 
355
  # DOCX file download
356
  docx_buffer = save_as_docx(corrected_text)
357
  st.download_button(
@@ -360,7 +381,9 @@ def main():
360
  file_name="translated_document.docx",
361
  mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
362
  )
363
-
 
 
364
  except Exception as e:
365
  st.error(f"An error occurred: {str(e)}")
366
 
 
10
  import os
11
  import sys
12
  from datetime import datetime, timezone
13
+ import warnings
14
 
15
+ # Filter out specific warnings
16
+ warnings.filterwarnings('ignore', category=UserWarning, module='transformers.convert_slow_tokenizer')
17
+ warnings.filterwarnings('ignore', category=UserWarning, module='transformers.tokenization_utils_base')
18
+
19
+ # Custom styling
20
+ st.set_page_config(
21
+ page_title="Document Translation App",
22
+ page_icon="🌐",
23
+ layout="wide"
24
+ )
25
+
26
+ # Display current information in sidebar with proper formatting
27
+ current_time = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
28
+ st.sidebar.markdown("""
29
+ ### System Information
30
+ **Current UTC Time:** {}
31
+ **User:** {}
32
+ """.format(current_time, os.environ.get('USER', 'gauravchand')))
33
 
34
  # Get Hugging Face token from environment variables
35
  HF_TOKEN = os.environ.get('HF_TOKEN')
 
76
  nllb_tokenizer = AutoTokenizer.from_pretrained(
77
  "facebook/nllb-200-distilled-600M",
78
  token=HF_TOKEN,
79
+ trust_remote_code=True,
80
+ use_fast=False # Use slow tokenizer to avoid warnings
81
  )
82
  nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
83
  "facebook/nllb-200-distilled-600M",
 
91
  mt5_tokenizer = AutoTokenizer.from_pretrained(
92
  "google/mt5-small",
93
  token=HF_TOKEN,
94
+ trust_remote_code=True,
95
+ legacy=False, # Use new behavior
96
+ use_fast=False # Use slow tokenizer to avoid warnings
97
  )
98
  mt5_model = MT5ForConditionalGeneration.from_pretrained(
99
  "google/mt5-small",
 
173
  """Use Gemma model to interpret context and understand regional nuances."""
174
  tokenizer, model = gemma_tuple
175
 
 
176
  batches = batch_process_text(text)
177
  interpreted_batches = []
178
 
 
203
  """Translate text using NLLB model."""
204
  tokenizer, model = nllb_tuple
205
 
 
206
  batches = batch_process_text(text)
207
  translated_batches = []
208
 
209
  for batch in batches:
210
+ # Add source language token to input
211
+ batch_with_lang = f"{source_lang} {batch}"
212
+ inputs = tokenizer(batch_with_lang, return_tensors="pt", max_length=512, truncation=True)
213
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
214
 
215
+ # Add target language token
216
+ target_lang_token = tokenizer(target_lang, add_special_tokens=False)["input_ids"][0]
 
217
 
 
218
  outputs = model.generate(
219
  **inputs,
220
+ forced_bos_token_id=target_lang_token,
221
  max_length=512,
222
  do_sample=True,
223
  temperature=0.7,
 
232
 
233
  @torch.no_grad()
234
  def correct_grammar(text: str, target_lang: str, mt5_tuple: Tuple) -> str:
235
+ """Correct grammar using MT5 model for all supported languages."""
 
 
 
236
  tokenizer, model = mt5_tuple
237
  lang_code = MT5_LANG_CODES[target_lang]
238
 
 
239
  prompts = {
240
  'en': "grammar: ",
241
  'hi': "व्याकरण सुधार: ",
242
  'mr': "व्याकरण सुधारणा: "
243
  }
244
 
 
245
  batches = batch_process_text(text)
246
  corrected_batches = []
247
 
 
261
  )
262
 
263
  corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
264
  for prefix in prompts.values():
265
  corrected_text = corrected_text.replace(prefix, "")
266
  corrected_text = corrected_text.strip()
 
281
  return docx_buffer
282
 
283
  def main():
284
+ st.title("🌐 Document Translation App")
285
 
286
  # Load models
287
  with st.spinner("Loading models... This may take a few minutes."):
 
314
  index=1
315
  )
316
 
317
+ if uploaded_file and st.button("Translate", type="primary"):
318
  try:
319
+ progress_bar = st.progress(0)
320
+
321
+ # Extract text
322
+ text = extract_text_from_file(uploaded_file)
323
+ progress_bar.progress(20)
324
+
325
+ # Interpret context
326
+ with st.spinner("Interpreting context..."):
327
+ interpreted_text = interpret_context(text, gemma_tuple)
328
+ progress_bar.progress(40)
329
+
330
+ # Translate
331
+ with st.spinner("Translating..."):
332
+ translated_text = translate_text(
333
+ interpreted_text,
334
+ SUPPORTED_LANGUAGES[source_language],
335
+ SUPPORTED_LANGUAGES[target_language],
336
+ nllb_tuple
337
+ )
338
+ progress_bar.progress(70)
339
+
340
+ # Grammar correction
341
+ with st.spinner("Correcting grammar..."):
342
+ corrected_text = correct_grammar(
343
+ translated_text,
344
+ SUPPORTED_LANGUAGES[target_language],
345
+ mt5_tuple
346
+ )
347
+ progress_bar.progress(90)
348
+
349
+ # Display result
350
+ st.markdown("### Translation Result")
351
+ st.text_area(
352
+ label="Translated Text",
353
+ value=corrected_text,
354
+ height=200,
355
+ key="translation_result"
356
+ )
357
+
358
+ # Download options
359
+ st.markdown("### Download Options")
360
+ col1, col2 = st.columns(2)
361
+
362
+ with col1:
363
  # Text file download
364
  text_buffer = io.BytesIO()
365
  text_buffer.write(corrected_text.encode())
 
371
  file_name="translated_document.txt",
372
  mime="text/plain"
373
  )
374
+
375
+ with col2:
376
  # DOCX file download
377
  docx_buffer = save_as_docx(corrected_text)
378
  st.download_button(
 
381
  file_name="translated_document.docx",
382
  mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
383
  )
384
+
385
+ progress_bar.progress(100)
386
+
387
  except Exception as e:
388
  st.error(f"An error occurred: {str(e)}")
389