Spaces:

tahirsher
/

Multilingual-Document-Translator

Sleeping

File size: 5,507 Bytes

import fitz  # PyMuPDF for PDF processing
from PIL import Image
import pytesseract
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
import streamlit as st
import os
import re
from docx import Document
from langdetect import detect
import asyncio  # For asynchronous processing

# Initialize BLIP-2 model and processor for image-to-text
@st.cache_resource  # Use st.cache_resource for caching models
def load_blip2_model():
    """Load the BLIP-2 processor and model from a single pretrained checkpoint.

    Returns:
        A ``(processor, model)`` tuple, both created with ``from_pretrained``
        for the ``Salesforce/blip2-opt-2.7b`` checkpoint.
    """
    checkpoint = "Salesforce/blip2-opt-2.7b"
    blip2_processor = Blip2Processor.from_pretrained(checkpoint)
    blip2_model = Blip2ForConditionalGeneration.from_pretrained(checkpoint)
    return blip2_processor, blip2_model

# Module-level BLIP-2 handles used by extract_text_from_image(); the loader
# is wrapped in st.cache_resource (see its decorator).
processor, model = load_blip2_model()

# Initialize translation pipeline for Korean to English
@st.cache_resource  # Use st.cache_resource for caching models
def load_translation_model():
    """Build the Korean-to-English translation pipeline.

    Returns:
        The object returned by ``pipeline("translation", ...)`` for the
        ``Helsinki-NLP/opus-mt-ko-en`` model.
    """
    ko_en_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
    return ko_en_pipeline

# Module-level translation pipeline used by translate_text().
translator = load_translation_model()

# Path to Tesseract executable for OCR
# NOTE(review): hard-coded Linux location; presumably matches the deployment
# image -- confirm before running on another platform.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

def extract_text_from_image(image):
    """Return text for an image, trying BLIP-2 first and Tesseract second.

    The image is converted to RGB, run through the module-level BLIP-2
    ``processor``/``model`` pair, and the first decoded sequence is used.
    Only when that result is empty (after stripping whitespace) is
    ``pytesseract`` invoked with the ``kor+eng`` language setting.

    NOTE(review): the OCR fallback runs only when BLIP-2 yields an empty
    string; confirm BLIP-2 output is really the preferred source of text.

    Args:
        image: An image object exposing ``convert("RGB")`` (e.g. a PIL image).

    Returns:
        The extracted text with surrounding whitespace removed.
    """
    rgb_image = image.convert("RGB")

    # BLIP-2 pass.
    model_inputs = processor(images=rgb_image, return_tensors="pt")
    output_ids = model.generate(**model_inputs)
    candidates = processor.batch_decode(output_ids, skip_special_tokens=True)
    blip_text = candidates[0].strip()
    if blip_text:
        return blip_text

    # Tesseract pass, reached only when BLIP-2 produced nothing.
    return pytesseract.image_to_string(rgb_image, lang='kor+eng').strip()

def extract_from_pdf(pdf_path):
    """Extract text from a PDF, falling back to image extraction per page.

    For every page the embedded text layer is read first. If a page has no
    text (only whitespace), the page is rendered to a pixmap and passed to
    ``extract_text_from_image``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace removed.
    """
    page_texts = []

    # The context manager closes the document even if a page fails, so the
    # file handle is released before the caller deletes the file.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Try extracting text directly
            text = page.get_text()

            # If no text, fall back to rendering the page and reading the image
            if not text.strip():
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                text = extract_text_from_image(image)

            page_texts.append(text)

    return "\n".join(page_texts).strip()

def extract_from_word(docx_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Only ``doc.paragraphs`` is read; the result is stripped of leading and
    trailing whitespace.

    Args:
        docx_path: Path of the Word document to read.

    Returns:
        Paragraph texts joined by newlines.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs).strip()

# Control characters to drop: C0 and C1 controls plus DEL, but NOT tab (\x09),
# line feed (\x0a) or carriage return (\x0d). Removing those would glue
# together words that were separated only by a line or paragraph break.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]')


def clean_text(text):
    """Remove non-printable control characters while keeping line structure.

    Args:
        text: The raw extracted text.

    Returns:
        ``text`` without control characters (tabs, newlines and carriage
        returns are preserved), with leading and trailing whitespace removed.
    """
    return _CONTROL_CHARS_RE.sub('', text).strip()

# Sentence-ending punctuation followed by whitespace, or a line break.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?。！？])\s+|\n+')


def _split_into_chunks(text, max_chars):
    """Split text into chunks of at most max_chars, breaking at sentence or word boundaries."""
    pieces = []
    for sentence in _SENTENCE_BOUNDARY_RE.split(text):
        sentence = " ".join(sentence.split())
        if len(sentence) <= max_chars:
            pieces.append(sentence)
            continue
        # Overlong sentence: fall back to words, hard-slicing any single word
        # that is itself longer than the limit.
        for word in sentence.split():
            pieces.extend(
                word[i:i + max_chars] for i in range(0, len(word), max_chars)
            )

    chunks = []
    current = ""
    for piece in pieces:
        if not piece:
            continue
        if current and len(current) + 1 + len(piece) > max_chars:
            chunks.append(current)
            current = piece
        else:
            current = f"{current} {piece}" if current else piece
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, max_chunk_chars=400):
    """Translate text to English with the module-level ``translator`` pipeline.

    The detected language is shown in the Streamlit UI. Text is translated
    in small chunks because the translation model only accepts a limited
    number of tokens per call; very large chunks would be cut off or fail.

    Args:
        text: The text to translate.
        max_chunk_chars: Upper bound on the number of characters sent to the
            translator per call. Chunks break at sentence or word
            boundaries where possible.

    Returns:
        The translated text, or a fixed message when ``text`` is blank or is
        detected as English already.
    """
    if not text.strip():
        return "No text available for translation."

    # Local import: the exception class is only needed here.
    from langdetect import LangDetectException

    try:
        detected_language = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. digits only);
        # carry on and let the translator try instead of crashing the app.
        detected_language = "unknown"
    st.write(f"Detected language: {detected_language}")

    if detected_language == "en":
        return "The text is already in English."

    translated_parts = []
    for chunk in _split_into_chunks(text, max(1, int(max_chunk_chars))):
        # truncation=True is a safety net for chunks that still tokenize to
        # more than the model accepts.
        result = translator(chunk, max_length=400, truncation=True)
        if isinstance(result, list) and result and 'translation_text' in result[0]:
            translated_parts.append(result[0]['translation_text'])
    return " ".join(translated_parts).strip()

def _wrap_text_to_width(text, max_width, fontname, fontsize):
    """Break text into lines whose rendered width does not exceed max_width."""

    def fits(candidate):
        measured = fitz.get_text_length(candidate, fontname=fontname, fontsize=fontsize)
        return measured <= max_width

    wrapped = []
    for paragraph in text.splitlines():
        current = ""
        for word in paragraph.split():
            candidate = f"{current} {word}" if current else word
            if fits(candidate):
                current = candidate
                continue
            if current:
                wrapped.append(current)
            # A single word wider than the line: peel off the longest prefix
            # that fits until the remainder fits.
            while len(word) > 1 and not fits(word):
                cut = len(word) - 1
                while cut > 1 and not fits(word[:cut]):
                    cut -= 1
                wrapped.append(word[:cut])
                word = word[cut:]
            current = word
        # Also records blank paragraphs as empty lines.
        wrapped.append(current)
    return wrapped


def create_pdf(translated_text, output_path):
    """Write text to a PDF, adding pages as needed.

    The text is wrapped to the width of the text area and split across as
    many pages as it takes. A single fixed text box is not used because
    ``insert_textbox`` writes nothing at all when the text does not fit,
    which would produce a blank page for any long document.

    Args:
        translated_text: The text to write.
        output_path: Path of the PDF file to create.
    """
    fontname = "helv"
    fontsize = 12

    # Text area on each page (same margins on every page)
    rect = fitz.Rect(50, 50, 550, 750)

    # Conservative line count: 40 lines * 12pt * ~1.4 line spacing = 672pt,
    # which stays inside the 700pt-high text area.
    lines_per_page = 40

    lines = _wrap_text_to_width(translated_text, rect.width, fontname, fontsize)

    doc = fitz.open()
    try:
        # max(..., 1) guarantees one page, so empty text still yields a valid PDF.
        for start in range(0, max(len(lines), 1), lines_per_page):
            page = doc.new_page()
            page_lines = lines[start:start + lines_per_page]
            if not any(page_lines):
                continue
            page.insert_text(
                # insert_text positions the baseline of the first line.
                fitz.Point(rect.x0, rect.y0 + fontsize),
                "\n".join(page_lines),
                fontsize=fontsize,
                fontname=fontname,
                color=(0, 0, 0),
            )
        doc.save(output_path)
    finally:
        doc.close()

async def process_document(uploaded_file):
    """Extract, translate and offer for download the content of an upload.

    The upload is written to a private temporary directory, text is
    extracted according to the file extension, cleaned, translated, shown in
    the Streamlit UI, and finally rendered to a PDF offered through a
    download button. The temporary directory (upload and generated PDF) is
    removed when the function exits, including on errors.

    Args:
        uploaded_file: The uploaded file object; ``name`` and ``getbuffer()``
            are read from it.

    Returns:
        None. Results are reported through Streamlit calls.
    """
    # Local import: only this function needs it.
    import tempfile

    file_extension = uploaded_file.name.split(".")[-1].lower()
    if file_extension not in ("pdf", "jpg", "jpeg", "png", "docx"):
        st.error("Unsupported file format.")
        return

    # A per-call directory keeps concurrent sessions from overwriting each
    # other's files, which fixed names in the working directory would allow.
    with tempfile.TemporaryDirectory() as work_dir:
        temp_file_path = os.path.join(work_dir, f"temp.{file_extension}")
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        if file_extension == "pdf":
            extracted_text = extract_from_pdf(temp_file_path)
        elif file_extension == "docx":
            extracted_text = extract_from_word(temp_file_path)
        else:
            # Close the image file before the directory is cleaned up.
            with Image.open(temp_file_path) as image:
                extracted_text = extract_text_from_image(image)

        extracted_text = clean_text(extracted_text)
        st.write("Extracted Text (First 50000 characters):", extracted_text[:50000])

        translated_text = translate_text(extracted_text)

        st.subheader("Translated Text (English)")
        st.write(translated_text)

        if not translated_text.strip():
            st.warning("No content to save in the translated PDF.")
            return

        output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
        create_pdf(translated_text, output_pdf_path)

        # Read the bytes up front so the button does not depend on a file
        # that is deleted when this function returns.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

        st.download_button(
            label="Download Translated PDF",
            data=pdf_bytes,
            file_name="translated_document.pdf",
            mime="application/pdf"
        )

# --- Streamlit page: title, uploader, and processing trigger ---
st.title("Multilingual Document Translator")
# The accepted extensions here mirror the ones process_document() dispatches on.
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])

# Run the pipeline only once a file has been provided.
if uploaded_file is not None:
    with st.spinner("Processing document..."):
        # process_document is a coroutine function, so it is driven to
        # completion with asyncio.run().
        asyncio.run(process_document(uploaded_file))