Spaces:

gauravchand11
/

try

Build error

App Files Files Community

gauravchand11 committed on Apr 6

Commit

c98f2e3

verified ·

1 Parent(s): 7f47488

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -23

app.py CHANGED Viewed

@@ -2,15 +2,18 @@ import streamlit as st
 import PyPDF2
 import docx
 import io
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 from pathlib import Path
 import tempfile
 from typing import Union, Tuple
-import language_tool_python
-# Initialize language tool for grammar correction
-language_tool = language_tool_python.LanguageTool('en-US')
 # Define supported languages and their codes
 SUPPORTED_LANGUAGES = {
@@ -19,26 +22,53 @@ SUPPORTED_LANGUAGES = {
     'Marathi': 'mar_Deva'
 }
 @st.cache_resource
 def load_models():
-    """Load and cache the translation and context interpretation models."""
     # Load Gemma model for context interpretation
-    gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
     gemma_model = AutoModelForCausalLM.from_pretrained(
         "google/gemma-2b",
         device_map="auto",
-        torch_dtype=torch.float16
     )
     # Load NLLB model for translation
-    nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
     nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
         "facebook/nllb-200-distilled-600M",
         device_map="auto",
-        torch_dtype=torch.float16
     )
-    return (gemma_tokenizer, gemma_model), (nllb_tokenizer, nllb_model)
 def extract_text_from_file(uploaded_file) -> str:
     """Extract text content from uploaded file based on its type."""
@@ -106,17 +136,39 @@ def translate_text(text: str, source_lang: str, target_lang: str, nllb_tuple: Tu
     translated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
     return translated_text
-def correct_grammar(text: str, target_lang: str) -> str:
-    """Correct grammar and ensure tense consistency in the translated text."""
-    # For English target language, use LanguageTool
-    if target_lang == 'eng_Latn':
-        matches = language_tool.check(text)
-        corrected_text = language_tool.correct(text)
-        return corrected_text
-    # For other languages, return as-is (you may want to add specific grammar
-    # correction for Hindi and Marathi in a production environment)
-    return text
 def save_as_docx(text: str) -> io.BytesIO:
     """Save translated text as a DOCX file."""
@@ -134,7 +186,12 @@ def main():
     # Load models
     with st.spinner("Loading models... This may take a few minutes."):
-        gemma_tuple, nllb_tuple = load_models()
     # File upload
     uploaded_file = st.file_uploader(
@@ -182,7 +239,8 @@ def main():
                 with st.spinner("Correcting grammar..."):
                     corrected_text = correct_grammar(
                         translated_text,
-                        SUPPORTED_LANGUAGES[target_language]
                     )
                 # Display result

 import PyPDF2
 import docx
 import io
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
 import torch
 from pathlib import Path
 import tempfile
 from typing import Union, Tuple
+import os
+# Get Hugging Face token from environment variables.
+# The token is forwarded as `token=` to every from_pretrained() call in
+# load_models().
+HF_TOKEN = os.environ.get('HF_TOKEN')
+# A missing or empty token is reported in the UI and the script run is halted
+# here, before any model loading is attempted.
+if not HF_TOKEN:
+    st.error("HF_TOKEN not found in environment variables. Please add it in the Spaces settings.")
+    st.stop()
 # Define supported languages and their codes
 SUPPORTED_LANGUAGES = {
     'Marathi': 'mar_Deva'
 }
+# Language codes for MT5.
+# Keys are the language codes stored as SUPPORTED_LANGUAGES values (the same
+# codes passed to correct_grammar() as `target_lang`); values are the short
+# codes correct_grammar() uses to select its prompt.
+MT5_LANG_CODES = {
+    'eng_Latn': 'en',
+    'hin_Deva': 'hi',
+    'mr_placeholder': 'mr'
+}
 @st.cache_resource
 def load_models():
+    """Load and cache the translation, context interpretation, and grammar correction models.
+
+    Every tokenizer and model is fetched with ``HF_TOKEN``. The function is
+    wrapped in ``st.cache_resource`` so the loaded objects are created once
+    and reused on later script runs.
+
+    Returns:
+        Three ``(tokenizer, model)`` tuples, in this order: Gemma
+        (``google/gemma-2b``), NLLB (``facebook/nllb-200-distilled-600M``)
+        and mT5 (``google/mt5-small``).
+    """
+    # Half precision is only worthwhile on a GPU. On a CPU-only host float16
+    # kernels may be missing or very slow, so fall back to float32 there.
+    gpu_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
     # Load Gemma model for context interpretation
+    gemma_tokenizer = AutoTokenizer.from_pretrained(
+        "google/gemma-2b",
+        token=HF_TOKEN
+    )
     gemma_model = AutoModelForCausalLM.from_pretrained(
         "google/gemma-2b",
         device_map="auto",
+        torch_dtype=gpu_dtype,
+        token=HF_TOKEN
     )
     # Load NLLB model for translation
+    nllb_tokenizer = AutoTokenizer.from_pretrained(
+        "facebook/nllb-200-distilled-600M",
+        token=HF_TOKEN
+    )
     nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
         "facebook/nllb-200-distilled-600M",
         device_map="auto",
+        torch_dtype=gpu_dtype,
+        token=HF_TOKEN
     )
+    # Load MT5 model for grammar correction
+    # NOTE(review): the google/mt5-small model card describes the checkpoint as
+    # pre-trained only (no supervised fine-tuning) - confirm that a
+    # grammar-correction fine-tune is not what is actually needed here.
+    mt5_tokenizer = AutoTokenizer.from_pretrained(
+        "google/mt5-small",
+        token=HF_TOKEN
+    )
+    # T5-family checkpoints are reported to overflow in float16, so the small
+    # mT5 model is always loaded in float32.
+    mt5_model = T5ForConditionalGeneration.from_pretrained(
+        "google/mt5-small",
+        device_map="auto",
+        torch_dtype=torch.float32,
+        token=HF_TOKEN
+    )
+    return (gemma_tokenizer, gemma_model), (nllb_tokenizer, nllb_model), (mt5_tokenizer, mt5_model)
 def extract_text_from_file(uploaded_file) -> str:
     """Extract text content from uploaded file based on its type."""
     translated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
     return translated_text
+def correct_grammar(text: str, target_lang: str, mt5_tuple: Tuple) -> str:
+    """
+    Correct grammar using MT5 model for all supported languages.
+    Uses a text-to-text approach with language-specific prompts.
+
+    Args:
+        text: Translated text to correct.
+        target_lang: Language code of ``text``; correction is attempted only
+            when it is a key of ``MT5_LANG_CODES``.
+        mt5_tuple: ``(tokenizer, model)`` pair for the MT5 model.
+
+    Returns:
+        The corrected text. ``text`` is returned unchanged when it is blank,
+        when ``target_lang`` has no prompt, or when the model produces no
+        usable output.
+    """
+    tokenizer, model = mt5_tuple
+    # Nothing to correct: skip the model call entirely.
+    if not text or not text.strip():
+        return text
+    # Language-specific prompts for grammar correction
+    prefixes = {
+        'en': "grammar:",
+        'hi': "व्याकरण सुधार:",
+        'mr': "व्याकरण सुधारणा:",
+    }
+    prefix = prefixes.get(MT5_LANG_CODES.get(target_lang))
+    if prefix is None:
+        # Unmapped language: return the translation as-is rather than raising
+        # KeyError in the middle of the pipeline.
+        return text
+    prompt = f"{prefix} {text}"
+    # NOTE(review): input beyond 512 tokens is truncated, so the tail of a long
+    # document never reaches the model - confirm callers pass short segments.
+    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(model.device)
+    # Plain beam search: sampling (do_sample/temperature/top_p) would make the
+    # "correction" of the same text differ from run to run.
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_length=512,
+            num_beams=5,
+            do_sample=False
+        )
+    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+    # Clean up an echoed prompt marker, but only at the start of the output so
+    # that the same words inside the document body are left alone.
+    if corrected_text.startswith(prefix):
+        corrected_text = corrected_text[len(prefix):].strip()
+    # An empty generation must not wipe out the translation.
+    return corrected_text or text
 def save_as_docx(text: str) -> io.BytesIO:
     """Save translated text as a DOCX file."""
     # Load models
     with st.spinner("Loading models... This may take a few minutes."):
+        try:
+            gemma_tuple, nllb_tuple, mt5_tuple = load_models()
+        except Exception as e:
+            st.error(f"Error loading models: {str(e)}")
+            st.error("Please check if the HF_TOKEN is valid and has the necessary permissions.")
+            st.stop()
     # File upload
     uploaded_file = st.file_uploader(
                 with st.spinner("Correcting grammar..."):
                     corrected_text = correct_grammar(
                         translated_text,
+                        SUPPORTED_LANGUAGES[target_language],
+                        mt5_tuple
                     )
                 # Display result