tahirsher committed on
Commit
c899d24
·
verified ·
1 Parent(s): f04fe79

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -0
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF for PDF processing
2
+ from PIL import Image
3
+ import pytesseract
4
+ from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
5
+ import streamlit as st
6
+ import os
7
+ import re
8
+ from docx import Document
9
+ from langdetect import detect
10
+ import asyncio # For asynchronous processing
11
+
12
# Initialize BLIP-2 model and processor for image-to-text.
# st.cache_resource replaces the deprecated st.cache(allow_output_mutation=True)
# and is the caching primitive intended for unserializable global resources
# such as ML models: the objects are created once and shared across reruns.
@st.cache_resource
def load_blip2_model():
    """Load the BLIP-2 processor and model once and reuse them across reruns.

    Returns:
        A ``(processor, model)`` tuple for the
        ``Salesforce/blip2-opt-2.7b`` checkpoint.
    """
    processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
    model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
    return processor, model


processor, model = load_blip2_model()
20
+
21
# Initialize translation pipeline for Korean to English.
# st.cache_resource replaces the deprecated st.cache(allow_output_mutation=True)
# so the pipeline is built once and shared across reruns.
@st.cache_resource
def load_translation_model():
    """Return the Korean-to-English translation pipeline (cached)."""
    return pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")


translator = load_translation_model()
27
+
28
# Path to the Tesseract executable used for OCR. Defaults to the usual Linux
# location; set the TESSERACT_CMD environment variable to point elsewhere
# without editing the code.
pytesseract.pytesseract.tesseract_cmd = os.environ.get("TESSERACT_CMD", "/usr/bin/tesseract")
30
+
31
def extract_text_from_image(image):
    """Extract text from an image, preferring OCR and falling back to BLIP-2.

    BLIP-2 is called here without a text prompt, in which case it produces a
    caption describing the picture rather than a transcription of the text in
    it, and that caption is practically never empty. Running it first would
    therefore make the OCR path unreachable and hand an English caption to the
    translator. Tesseract is tried first; BLIP-2 is used only when OCR finds
    nothing.

    Args:
        image: A PIL image.

    Returns:
        The recognized text (or, failing that, the BLIP-2 caption), stripped
        of surrounding whitespace.
    """
    image = image.convert("RGB")

    # Primary path: real OCR with the Korean and English language packs.
    ocr_text = pytesseract.image_to_string(image, lang='kor+eng').strip()
    if ocr_text:
        return ocr_text

    # Fallback: no machine-readable text found, so describe the image instead.
    inputs = processor(images=image, return_tensors="pt")
    generated_ids = model.generate(**inputs)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0].strip() if decoded else ""
44
+
45
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, using the image extractor for text-less pages.

    Each page's embedded text layer is read first. Pages that yield only
    whitespace (typically scans) are rendered to an image and passed to
    ``extract_text_from_image``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, stripped of surrounding
        whitespace.
    """
    page_texts = []
    # The context manager closes the document even if a page fails to process;
    # previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()

            if not text.strip():
                # alpha=False keeps the sample buffer at 3 bytes per pixel,
                # matching the "RGB" mode passed to Image.frombytes.
                pix = page.get_pixmap(alpha=False)
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                text = extract_text_from_image(image)

            page_texts.append(text)

    return "\n".join(page_texts).strip()
64
+
65
def extract_from_word(docx_path):
    """Return the text of every paragraph in a .docx file, one per line.

    Only ``Document.paragraphs`` is read. The result is stripped of
    surrounding whitespace.
    """
    paragraphs = Document(docx_path).paragraphs
    joined = "\n".join(paragraph.text for paragraph in paragraphs)
    return joined.strip()
71
+
72
def clean_text(text):
    """Remove non-printable control characters while keeping line structure.

    Strips C0/C1 control characters and DEL, but deliberately keeps tab,
    line feed and carriage return. Removing those would fuse the last word
    of one line with the first word of the next ("hello\\nworld" ->
    "helloworld"), corrupting the text before language detection and
    translation.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # \x09 (tab), \x0a (LF) and \x0d (CR) are intentionally excluded.
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text).strip()
74
+
75
def _split_into_chunks(text, max_chars):
    """Split text on whitespace into pieces of at most ``max_chars`` characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single token longer than the limit has to be hard-split.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]

        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_chars:
            current = f"{current} {word}"
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, max_chunk_chars=300):
    """Translate text to English with the module-level ``translator`` pipeline.

    The text is translated in small chunks. The previous 50000-character
    chunks are far larger than the translation model can take in one pass,
    and ``max_length=400`` capped the output, so all but the beginning of a
    document was dropped.

    Args:
        text: The text to translate.
        max_chunk_chars: Upper bound on characters per translation request.
            The default is a conservative guess meant to stay below the
            model's input limit; ``truncation=True`` guards the rest.

    Returns:
        The English translation, or a short message when the text is empty
        or already detected as English.
    """
    if not text.strip():
        return "No text available for translation."

    # Local import: the file only imports ``detect`` from langdetect.
    from langdetect import LangDetectException

    try:
        detected_language = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits or
        # punctuation); attempt the translation anyway instead of crashing.
        detected_language = "unknown"
    st.write(f"Detected language: {detected_language}")

    if detected_language == "en":
        return "The text is already in English."

    translated_parts = []
    for chunk in _split_into_chunks(text, max_chunk_chars):
        result = translator(chunk, max_length=512, truncation=True)
        if isinstance(result, list) and result and 'translation_text' in result[0]:
            translated_parts.append(result[0]['translation_text'])
    return " ".join(translated_parts).strip()
92
+
93
def _wrap_paragraph(paragraph, max_width, font_name, font_size):
    """Greedily wrap one paragraph into lines no wider than ``max_width`` points."""
    words = paragraph.split()
    if not words:
        return [""]  # keep blank lines so paragraph breaks survive

    lines = []
    current = words[0]
    for word in words[1:]:
        candidate = f"{current} {word}"
        width = fitz.get_text_length(candidate, fontname=font_name, fontsize=font_size)
        if width <= max_width:
            current = candidate
        else:
            lines.append(current)
            current = word
    lines.append(current)
    return lines


def create_pdf(translated_text, output_path):
    """Write ``translated_text`` to a PDF at ``output_path``, adding pages as needed.

    The previous implementation placed all text in one ``insert_textbox``
    call on a single page. That call writes nothing at all when the text does
    not fit its rectangle, so longer translations produced a blank PDF. Here
    the text is wrapped to the printable width and flowed line by line across
    as many pages as required.

    Args:
        translated_text: The text to render.
        output_path: Destination path of the PDF file.

    Note:
        A single word wider than the printable area is not broken and will
        run past the right margin.
    """
    font_name = "helv"
    font_size = 12
    # Same printable area as before: 50pt margins on a default-size page.
    rect = fitz.Rect(50, 50, 550, 750)
    line_height = font_size * 1.2
    lines_per_page = max(1, int(rect.height // line_height))

    lines = []
    for paragraph in translated_text.splitlines() or [""]:
        lines.extend(_wrap_paragraph(paragraph, rect.width, font_name, font_size))

    doc = fitz.open()
    try:
        for start in range(0, len(lines), lines_per_page):
            page = doc.new_page()
            # insert_text positions the baseline, so start one font size
            # below the top of the printable area.
            y = rect.y0 + font_size
            for line in lines[start:start + lines_per_page]:
                if line:
                    page.insert_text(
                        fitz.Point(rect.x0, y),
                        line,
                        fontsize=font_size,
                        fontname=font_name,
                        color=(0, 0, 0),
                    )
                y += line_height
        doc.save(output_path)
    finally:
        doc.close()
108
+
109
async def process_document(uploaded_file):
    """Extract, translate and offer for download the content of an uploaded file.

    Supported inputs are PDF, DOCX and JPG/JPEG/PNG images. The extracted
    text and its English translation are shown in the Streamlit page, and the
    translation is offered as a downloadable PDF.

    All intermediate files live in a private temporary directory. The earlier
    fixed names in the working directory (``temp.<ext>``,
    ``translated_document.pdf``) could be overwritten or deleted by another
    session processing a document at the same time.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``; its
            ``name`` and ``getbuffer()`` are used.
    """
    # Local import keeps this block self-contained.
    import tempfile

    file_extension = uploaded_file.name.split(".")[-1].lower()
    if file_extension not in ("pdf", "jpg", "jpeg", "png", "docx"):
        # Reject before writing anything to disk.
        st.error("Unsupported file format.")
        return

    # The directory and everything in it is removed on exit, even on error.
    with tempfile.TemporaryDirectory() as work_dir:
        input_path = os.path.join(work_dir, f"input.{file_extension}")
        with open(input_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        if file_extension == "pdf":
            extracted_text = extract_from_pdf(input_path)
        elif file_extension == "docx":
            extracted_text = extract_from_word(input_path)
        else:
            # Close the image handle before the temporary directory is removed.
            with Image.open(input_path) as image:
                extracted_text = extract_text_from_image(image)

        extracted_text = clean_text(extracted_text)
        st.write("Extracted Text (First 50000 characters):", extracted_text[:50000])

        translated_text = translate_text(extracted_text)

        st.subheader("Translated Text (English)")
        st.write(translated_text)

        if not translated_text.strip():
            st.warning("No content to save in the translated PDF.")
            return

        output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
        create_pdf(translated_text, output_pdf_path)

        # Read the bytes up front so the button does not depend on a file
        # that is deleted when the temporary directory goes away.
        with open(output_pdf_path, "rb") as f:
            pdf_bytes = f.read()

    st.download_button(
        label="Download Translated PDF",
        data=pdf_bytes,
        file_name="translated_document.pdf",
        mime="application/pdf"
    )
153
+
154
# --- Streamlit page ---------------------------------------------------------
st.title("Multilingual Document Translator")

# The uploader only accepts the formats process_document knows how to read.
uploaded_file = st.file_uploader(
    "Upload a document (PDF, Word, or Image)",
    type=["pdf", "docx", "jpg", "jpeg", "png"],
)

# process_document is a coroutine function, so drive it to completion here
# while the spinner is displayed.
if uploaded_file is not None:
    with st.spinner("Processing document..."):
        asyncio.run(process_document(uploaded_file))