gauravchand11 committed
Commit ed75acb · verified · 1 Parent(s): f4c8d2f

Update app.py

Files changed (1)
  1. app.py +107 -95
app.py CHANGED
@@ -12,33 +12,41 @@ import os
 import re
 import torch
 import numpy as np
+from datetime import datetime, timezone
 
 # Load models and tokenizers
 @st.cache_resource
 def load_models():
-    # BERT model for context understanding
-    context_tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-multilingual-cased')
-    context_model = BertModel.from_pretrained('google-bert/bert-base-multilingual-cased')
-
-    # NLLB model for translation
-    nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
-    nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
-
-    # GECToR model for grammar correction
-    grammar_tokenizer = AutoTokenizer.from_pretrained('gotutiyan/gector-bert-base-cased-5k')
-    grammar_model = AutoModelForTokenClassification.from_pretrained('gotutiyan/gector-bert-base-cased-5k')
-
-    return {
-        "context": (context_tokenizer, context_model),
-        "nllb": (nllb_tokenizer, nllb_model),
-        "grammar": (grammar_tokenizer, grammar_model)
-    }
+    try:
+        # BERT model for context understanding
+        context_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
+        context_model = BertModel.from_pretrained('bert-base-multilingual-cased')
+
+        # NLLB model for translation
+        nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
+        nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
+
+        # Grammar correction model
+        grammar_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        grammar_model = AutoModelForTokenClassification.from_pretrained(
+            'bert-base-cased',
+            num_labels=3  # Assuming 3 labels: keep, delete, replace
+        )
+
+        return {
+            "context": (context_tokenizer, context_model),
+            "nllb": (nllb_tokenizer, nllb_model),
+            "grammar": (grammar_tokenizer, grammar_model)
+        }
+    except Exception as e:
+        st.error(f"Error loading models: {str(e)}")
+        raise e
 
 def get_bert_embeddings(text, models):
     """Get contextual embeddings from BERT"""
     tokenizer, model = models["context"]
 
-    # Split text into smaller chunks if needed
+    # Split text into smaller chunks
     max_length = 512
     chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
     contextual_embeddings = []
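
Editor's note on this hunk: the GECToR checkpoint (gotutiyan/gector-bert-base-cased-5k) is swapped for plain bert-base-cased with a fresh 3-label head. That head is randomly initialized rather than trained for grammar correction, so the keep/delete/replace labels are an assumption, as the commit's own comment says. A standalone smoke test of the three loads (editor's sketch, not part of the commit):

# Editor's sketch: load the three checkpoints named in the hunk above outside
# Streamlit. transformers will warn that the token-classification head on
# 'bert-base-cased' is newly initialized, i.e. untrained.
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertModel,
    BertTokenizer,
)

context_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
context_model = BertModel.from_pretrained("bert-base-multilingual-cased")

nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

grammar_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
grammar_model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased", num_labels=3
)

print(grammar_model.config.num_labels)  # 3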
@@ -47,7 +55,7 @@ def get_bert_embeddings(text, models):
         inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
         with torch.no_grad():
             outputs = model(**inputs)
-            embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling
+            embeddings = outputs.last_hidden_state.mean(dim=1)
         contextual_embeddings.append(embeddings)
 
     # Combine embeddings from all chunks
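
Editor's note: the changed line averages BERT's hidden states over the token axis, one vector per chunk. A minimal shape check (sketch, not part of the commit):

# Editor's sketch: for bert-base models, last_hidden_state is
# (batch, seq_len, 768); mean(dim=1) collapses the token axis.
import torch

last_hidden_state = torch.randn(1, 12, 768)  # stand-in for a BERT output
embedding = last_hidden_state.mean(dim=1)    # average pooling over tokens
print(embedding.shape)                       # torch.Size([1, 768])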
@@ -55,7 +63,7 @@ def get_bert_embeddings(text, models):
     return combined_embedding
 
 def apply_grammar_correction(text, models):
-    """Apply grammar correction using GECToR"""
+    """Basic grammar correction using BERT"""
    tokenizer, model = models["grammar"]
 
    sentences = re.split('([.!?।]+)', text)
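
Editor's note: the capturing group in re.split keeps the sentence terminators, including the Devanagari danda '।', as separate list items, so punctuation survives the split. An illustration (sketch, not part of the commit):

# Editor's sketch: capturing groups in the pattern are returned by re.split.
import re

sample = "Hello there! How are you? यह एक वाक्य है।"
print(re.split('([.!?।]+)', sample))
# ['Hello there', '!', ' How are you', '?', ' यह एक वाक्य है', '।', '']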
@@ -63,22 +71,23 @@ def apply_grammar_correction(text, models):
 
     for sentence in sentences:
         if sentence.strip():
+            # Basic tokenization and prediction
             inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)
             with torch.no_grad():
                 outputs = model(**inputs)
                 predictions = torch.argmax(outputs.logits, dim=2)
 
-            # Convert predictions to corrected text
             tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
             corrected_tokens = []
 
             for token, pred in zip(tokens, predictions[0]):
-                if pred == 0:  # Keep token as is
-                    corrected_tokens.append(token)
-                # Handle other prediction cases as needed
+                if pred == 0 or token in ['[CLS]', '[SEP]', '[PAD]']:
+                    if token not in ['[CLS]', '[SEP]', '[PAD]']:
+                        corrected_tokens.append(token)
 
             corrected_text = tokenizer.convert_tokens_to_string(corrected_tokens)
-            corrected_sentences.append(corrected_text)
+            if corrected_text.strip():
+                corrected_sentences.append(corrected_text)
 
     return " ".join(corrected_sentences)
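
Editor's note on this hunk: a token is now kept only when its predicted label is 0 and it is not a special token; any other label silently drops the token, and with the untrained head loaded above those labels are effectively random. The special-token filtering itself works as in this sketch (not part of the commit):

# Editor's sketch: BERT brackets every sentence with special tokens, which
# is why the hunk filters them before convert_tokens_to_string.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-cased")
ids = tok("He go to school.", return_tensors="pt")["input_ids"][0]
tokens = tok.convert_ids_to_tokens(ids)
print(tokens)  # ['[CLS]', 'He', 'go', 'to', 'school', '.', '[SEP]']
kept = [t for t in tokens if t not in ('[CLS]', '[SEP]', '[PAD]')]
print(tok.convert_tokens_to_string(kept))  # He go to school .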
 
@@ -109,7 +118,6 @@ def translate_text(text, src_lang, tgt_lang, models):
     if src_lang == tgt_lang:
         return text
 
-    # Language codes for NLLB
     lang_map = {"en": "eng_Latn", "hi": "hin_Deva", "mr": "mar_Deva"}
 
     if src_lang not in lang_map or tgt_lang not in lang_map:
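
Editor's note: the FLORES-200 codes in lang_map are ordinary vocabulary items to the NLLB tokenizer, so a typo would silently resolve to the <unk> id. A quick sanity check (sketch, not part of the commit):

# Editor's sketch: verify the three language codes exist in the NLLB vocab.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
for code in ("eng_Latn", "hin_Deva", "mar_Deva"):
    assert tok.convert_tokens_to_ids(code) != tok.unk_token_id, code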
@@ -118,62 +126,67 @@ def translate_text(text, src_lang, tgt_lang, models):
     tgt_lang_code = lang_map[tgt_lang]
     tokenizer, model = models["nllb"]
 
-    # Get contextual embeddings
-    context_embedding = get_bert_embeddings(text, models)
-
-    # Split into chunks for translation
-    chunks = []
-    current_chunk = ""
-
-    for sentence in re.split('([.!?।]+)', text):
-        if sentence.strip():
-            if len(current_chunk) + len(sentence) < 450:
-                current_chunk += sentence
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = sentence
-
-    if current_chunk:
-        chunks.append(current_chunk)
-
-    translated_text = ""
-
-    for chunk in chunks:
-        if chunk.strip():
-            # Prepare input with context
-            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
-
-            # Add context embedding to attention mask
-            attention_mask = inputs['attention_mask'].float()
-            attention_mask = attention_mask * (1 + 0.1 * context_embedding.norm())
-
-            # Get target language token ID
-            tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
-
-            # Generate translation
-            with torch.no_grad():
-                translated = model.generate(
-                    input_ids=inputs['input_ids'],
-                    attention_mask=attention_mask,
-                    forced_bos_token_id=tgt_lang_id,
-                    max_length=512,
-                    num_beams=5,
-                    length_penalty=1.0,
-                    no_repeat_ngram_size=3,
-                    do_sample=True,
-                    temperature=0.7
-                )
-            translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
-            translated_text += translated_chunk + " "
-
-    # Apply grammar correction
-    corrected_text = apply_grammar_correction(translated_text.strip(), models)
-
-    return corrected_text
+    try:
+        # Get contextual embeddings
+        context_embedding = get_bert_embeddings(text, models)
+
+        # Split into chunks for translation
+        chunks = []
+        current_chunk = ""
+
+        for sentence in re.split('([.!?।]+)', text):
+            if sentence.strip():
+                if len(current_chunk) + len(sentence) < 450:
+                    current_chunk += sentence
+                else:
+                    if current_chunk:
+                        chunks.append(current_chunk)
+                    current_chunk = sentence
+
+        if current_chunk:
+            chunks.append(current_chunk)
+
+        translated_text = ""
+
+        for chunk in chunks:
+            if chunk.strip():
+                inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
+
+                # Use context embedding to modify attention
+                attention_mask = inputs['attention_mask'].float()
+                context_weight = 0.1 * torch.sigmoid(context_embedding.mean())
+                attention_mask = attention_mask * (1 + context_weight)
+
+                # Get target language token ID
+                tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
+
+                with torch.no_grad():
+                    translated = model.generate(
+                        input_ids=inputs['input_ids'],
+                        attention_mask=attention_mask,
+                        forced_bos_token_id=tgt_lang_id,
+                        max_length=512,
+                        num_beams=5,
+                        length_penalty=1.0,
+                        no_repeat_ngram_size=3,
+                        do_sample=True,
+                        temperature=0.7
+                    )
+                translated_chunk = tokenizer.decode(translated[0], skip_special_tokens=True)
+                translated_text += translated_chunk + " "
+
+        # Apply basic grammar correction
+        corrected_text = apply_grammar_correction(translated_text.strip(), models)
+
+        return corrected_text
+
+    except Exception as e:
+        st.error(f"Translation error: {str(e)}")
+        return f"Error during translation: {str(e)}"
 
 def save_text_to_file(text, original_filename, prefix="translated"):
-    output_filename = f"{prefix}_{os.path.basename(original_filename)}.txt"
+    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
+    output_filename = f"{prefix}_{timestamp}_{os.path.basename(original_filename)}.txt"
     with open(output_filename, "w", encoding="utf-8") as f:
         f.write(text)
     return output_filename
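
Editor's notes on this hunk: multiplying attention_mask by (1 + context_weight) yields non-binary mask values, which recent transformers versions fold into an additive bias, so this "context weighting" is more likely to distort encoder attention than to inject document context; and do_sample=True combined with num_beams=5 selects beam-search sampling, making translations non-deterministic. The core NLLB call, isolated (sketch, not part of the commit; src_lang defaults to eng_Latn and should be set per input):

# Editor's sketch: plain NLLB generation with a forced target-language token.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

inputs = tok("The weather is nice today.", return_tensors="pt")
with torch.no_grad():
    out = model.generate(
        **inputs,
        forced_bos_token_id=tok.convert_tokens_to_ids("hin_Deva"),  # target: Hindi
        max_length=512,
        num_beams=5,
    )
print(tok.decode(out[0], skip_special_tokens=True))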
@@ -183,15 +196,14 @@ def process_document(file, source_lang, target_lang, models):
         # Extract text from uploaded file
         text = extract_text(file)
 
-        # Log the input text for debugging
-        st.sidebar.write("Input text:", text[:500] + "...")
+        # Add debugging information
+        st.sidebar.write("Processing document...")
+        st.sidebar.write(f"Source language: {source_lang}")
+        st.sidebar.write(f"Target language: {target_lang}")
 
-        # Translate the text with context awareness and grammar correction
+        # Translate the text
         translated_text = translate_text(text, source_lang, target_lang, models)
 
-        # Log the output text for debugging
-        st.sidebar.write("Output text:", translated_text[:500] + "...")
-
         # Save the result
         if translated_text.startswith("Error:"):
             output_file = save_text_to_file(translated_text, file.name, prefix="error")
@@ -199,9 +211,10 @@ def process_document(file, source_lang, target_lang, models):
             output_file = save_text_to_file(translated_text, file.name)
 
         return output_file, translated_text
+
     except Exception as e:
         error_message = f"Error: {str(e)}"
-        st.error(f"An error occurred: {error_message}")
+        st.error(error_message)
         output_file = save_text_to_file(error_message, file.name, prefix="error")
         return output_file, error_message
 
@@ -209,10 +222,15 @@ def main():
     st.title("Advanced Document Translator")
     st.write("Upload a document (PDF, DOCX, or TXT) and select source and target languages.")
 
+    # Display current user and timestamp
+    st.sidebar.write(f"Current User: {os.getenv('USER', 'gauravchand')}")
+    st.sidebar.write(f"UTC Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')}")
+
     try:
-        # Initialize models
+        # Initialize models with error handling
         with st.spinner("Loading models..."):
             models = load_models()
+            st.success("Models loaded successfully!")
 
         # File uploader
         uploaded_file = st.file_uploader("Upload Document", type=["pdf", "docx", "txt"])
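
Editor's note: the commit introduces two UTC timestamp formats, one for the sidebar display and one for filenames. Side by side (sketch, not part of the commit):

# Editor's sketch: the two strftime formats used in this commit.
from datetime import datetime, timezone

now = datetime.now(timezone.utc)
print(now.strftime("%Y-%m-%d %H:%M:%S"))  # sidebar display, e.g. 2025-01-01 12:00:00
print(now.strftime("%Y%m%d_%H%M%S"))      # filename-safe, e.g. 20250101_120000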
@@ -224,9 +242,6 @@ def main():
         with col2:
             target_lang = st.selectbox("Target Language", ["en", "hi", "mr"], index=1)
 
-        # Add debug mode toggle
-        debug_mode = st.sidebar.checkbox("Enable Debug Mode")
-
         if uploaded_file is not None and st.button("Translate"):
             with st.spinner("Processing and Translating..."):
                 output_file, result_text = process_document(uploaded_file, source_lang, target_lang, models)
@@ -242,13 +257,10 @@ def main():
                     file_name=os.path.basename(output_file),
                     mime="text/plain"
                 )
-
-            if debug_mode:
-                st.sidebar.write("Translation completed")
-                st.sidebar.write("Output file:", output_file)
 
     except Exception as e:
-        st.error(f"An error occurred while initializing the application: {str(e)}")
+        st.error(f"Application error: {str(e)}")
+        st.warning("Please try refreshing the page or contact support.")
 
 if __name__ == "__main__":
     main()
 