Spaces:

tahirsher
/

Multilingual-Document-Translator

Sleeping

App Files Files Community

tahirsher committed on Jan 31

Commit

c899d24

verified ·

1 Parent(s): f04fe79

Create app.py

Browse files

Files changed (1) hide show

app.py +159 -0

app.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import fitz  # PyMuPDF for PDF processing
+from PIL import Image
+import pytesseract
+from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
+import streamlit as st
+import os
+import re
+from docx import Document
+from langdetect import detect
+import asyncio  # For asynchronous processing
+# Initialize BLIP-2 model and processor for image-to-text
+@st.cache(allow_output_mutation=True)
+def load_blip2_model():
+    processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
+    return processor, model
+processor, model = load_blip2_model()
+# Initialize translation pipeline for Korean to English
+@st.cache(allow_output_mutation=True)
+def load_translation_model():
+    return pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+translator = load_translation_model()
+# Path to the Tesseract executable that pytesseract invokes for OCR.
+# The location is hard-coded; NOTE(review): this looks like the usual Linux
+# package install path - confirm it matches the deployment image.
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+def extract_text_from_image(image):
+    """Extract text from image using OCR or BLIP-2."""
+    # First try using BLIP-2
+    image = image.convert("RGB")
+    inputs = processor(images=image, return_tensors="pt")
+    generated_ids = model.generate(**inputs)
+    decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    # Fallback to OCR if BLIP-2 extraction fails
+    if not decoded_text.strip():
+        decoded_text = pytesseract.image_to_string(image, lang='kor+eng')
+    return decoded_text.strip()
+def extract_from_pdf(pdf_path):
+    """Extract text from PDF by combining direct extraction and OCR fallback."""
+    doc = fitz.open(pdf_path)
+    full_text = ""
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        # Try extracting text directly
+        text = page.get_text()
+        # If no text, fallback to OCR
+        if not text.strip():
+            pix = page.get_pixmap()
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            text = extract_text_from_image(image)
+        full_text += text + "\n"
+    return full_text.strip()
+def extract_from_word(docx_path):
+    doc = Document(docx_path)
+    full_text = ""
+    for para in doc.paragraphs:
+        full_text += para.text + "\n"
+    return full_text.strip()
+def clean_text(text):
+    return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text).strip()
+def translate_text(text):
+    if not text.strip():
+        return "No text available for translation."
+    detected_language = detect(text)
+    st.write(f"Detected language: {detected_language}")
+    if detected_language == "en":
+        return "The text is already in English."
+    chunks = [text[i:i + 50000] for i in range(0, len(text), 50000)]
+    translated_text = ""
+    for chunk in chunks:
+        translated_chunk = translator(chunk, max_length=400)
+        if isinstance(translated_chunk, list) and 'translation_text' in translated_chunk[0]:
+            translated_text += translated_chunk[0]['translation_text'] + " "
+    return translated_text.strip()
+def create_pdf(translated_text, output_path):
+    doc = fitz.open()
+    page = doc.new_page()
+    # Define text insertion rectangle
+    rect = fitz.Rect(50, 50, 550, 750)
+    # Insert text using the defined rectangle
+    page.insert_textbox(
+        rect, translated_text,
+        fontsize=12,
+        fontname="helv",
+        color=(0, 0, 0),
+    )
+    doc.save(output_path)
+async def process_document(uploaded_file):
+    file_extension = uploaded_file.name.split(".")[-1].lower()
+    temp_file_path = f"temp.{file_extension}"
+    with open(temp_file_path, "wb") as f:
+        f.write(uploaded_file.getbuffer())
+    try:
+        if file_extension == "pdf":
+            extracted_text = extract_from_pdf(temp_file_path)
+        elif file_extension in ["jpg", "jpeg", "png"]:
+            image = Image.open(temp_file_path)
+            extracted_text = extract_text_from_image(image)
+        elif file_extension == "docx":
+            extracted_text = extract_from_word(temp_file_path)
+        else:
+            st.error("Unsupported file format.")
+            return
+        extracted_text = clean_text(extracted_text)
+        st.write("Extracted Text (First 50000 characters):", extracted_text[:50000])
+        translated_text = translate_text(extracted_text)
+        st.subheader("Translated Text (English)")
+        st.write(translated_text)
+        if translated_text.strip():
+            output_pdf_path = "translated_document.pdf"
+            create_pdf(translated_text, output_pdf_path)
+            with open(output_pdf_path, "rb") as f:
+                st.download_button(
+                    label="Download Translated PDF",
+                    data=f,
+                    file_name="translated_document.pdf",
+                    mime="application/pdf"
+                )
+        else:
+            st.warning("No content to save in the translated PDF.")
+    finally:
+        if os.path.exists(temp_file_path):
+            os.remove(temp_file_path)
+        if os.path.exists("translated_document.pdf"):
+            os.remove("translated_document.pdf")
+st.title("Multilingual Document Translator")
+uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
+if uploaded_file is not None:
+    with st.spinner("Processing document..."):
+        asyncio.run(process_document(uploaded_file))