import streamlit as st
import logging
from concurrent.futures import ThreadPoolExecutor
import subprocess
import sys
# Attempt to import libraries, with fallback
try:
    import pytesseract
    import cv2
    import numpy as np
    from PIL import Image
    import fitz  # PyMuPDF for PDF processing
    from transformers import pipeline
except ImportError:
    st.error("Required libraries are missing. Please install them using pip.")
    st.stop()
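# The exact dependency set is an assumption (nothing is pinned in this script);
# something along these lines is typically enough to satisfy the imports above:
#   pip install streamlit pytesseract opencv-python-headless numpy Pillow PyMuPDF transformers torch sentencepiece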
# Setup logging
def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )
# Tesseract installation check and guide
def check_tesseract():
    try:
        # Try to get the Tesseract version; failure means it is not installed
        subprocess.check_output(['tesseract', '--version'],
                                stderr=subprocess.STDOUT).decode('utf-8')
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Provide installation instructions based on the operating system
        st.error("Tesseract OCR is not installed.")
        st.markdown("### Tesseract Installation Guide:")
        if sys.platform.startswith('linux'):
            st.code("""
            # For Ubuntu/Debian
            sudo apt-get update
            sudo apt-get install -y tesseract-ocr

            # For Fedora
            sudo dnf install -y tesseract

            # For CentOS/RHEL
            sudo yum install -y tesseract
            """)
        elif sys.platform.startswith('darwin'):
            st.code("""
            # For macOS (using Homebrew)
            brew install tesseract
            """)
        elif sys.platform.startswith('win'):
            st.markdown("""
            1. Download the Tesseract installer from:
               https://github.com/UB-Mannheim/tesseract/wiki
            2. Run the installer
            3. Add the Tesseract directory to your system PATH
            """)
        st.info("After installation, restart your application.")
        return False
# Load models once and cache them so Streamlit reruns stay fast
# (st.cache_resource requires Streamlit >= 1.18)
@st.cache_resource
def load_models():
    logging.info("Loading Hugging Face models...")
    # Translation models
    translator_hi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
    # Summarization model
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return translator_hi, translator_ur, summarizer
# Function to preprocess image for better OCR
def preprocess_image(image):
    # Convert PIL Image to OpenCV format (force RGB so PNGs with alpha also work)
    img_np = np.array(image.convert("RGB"))
    # Convert to grayscale
    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
    # Apply Otsu thresholding to binarize the image (dark text on white background)
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # Apply deskewing if needed: collect the coordinates of the text (black) pixels,
    # not the white background, so minAreaRect fits the text block itself
    coords = np.column_stack(np.where(gray == 0)).astype(np.float32)
    # Prevent error if no foreground pixels found
    if coords.size == 0:
        return gray
    angle = cv2.minAreaRect(coords)[-1]
    # cv2.minAreaRect returns angles in the range [-90, 0) on older OpenCV releases
    # (newer versions may use a different convention), so we adjust it to get the
    # rotation from the horizontal axis
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    # Rotate the image to deskew
    (h, w) = gray.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated
# Function to extract text from images
def extract_text_from_image(image):
    logging.info("Extracting text from image...")
    # Preprocess image
    preprocessed_img = preprocess_image(image)
    # Use pytesseract for OCR
    text = pytesseract.image_to_string(preprocessed_img)
    return text.strip()
# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    logging.info("Extracting text from PDF...")
    # Streamlit provides an in-memory file, so open it from its bytes rather than a path
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text
# Function to process text in chunks for better performance
def process_chunks(text, model, chunk_size=500):
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda chunk: model(chunk, max_length=200), chunks))
    return " ".join([result[0]["translation_text"] for result in results])
# Main app logic
def main():
    # Check Tesseract installation first
    if not check_tesseract():
        return
    setup_logging()
    st.title("Advanced Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")
    # Load all models
    translator_hi, translator_ur, summarizer = load_models()
    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
    if file:
        text = ""
        try:
            if file.type in ["image/jpeg", "image/png", "image/jpg"]:
                image = Image.open(file)
                text = extract_text_from_image(image)
            elif file.type == "application/pdf":
                text = extract_text_from_pdf(file)
            elif file.type == "text/plain":
                text = file.read().decode("utf-8")
            if text:
                with st.spinner("Analyzing the report..."):
                    # Generate summary (truncate input that exceeds the model's context length)
                    summary = summarizer(text, max_length=130, min_length=30, truncation=True)[0]["summary_text"]
                    # Generate translations
                    hindi_translation = process_chunks(text, translator_hi)
                    urdu_translation = process_chunks(text, translator_ur)
                    # Display results
                    st.subheader("Original Text:")
                    st.write(text)
                    st.subheader("Analysis Summary (English):")
                    st.write(summary)
                    st.subheader("Hindi Translation:")
                    st.write(hindi_translation)
                    st.subheader("Urdu Translation:")
                    st.write(urdu_translation)
            else:
                st.warning("No text could be extracted. Please check the file and try again.")
        except Exception as e:
            logging.error(f"Error processing the file: {e}")
            st.error(f"An error occurred while processing the file: {e}")
    else:
        st.info("Please upload a file to begin.")

if __name__ == "__main__":
    main()
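# Assumed usage, not stated in the original (the filename is a guess):
#   streamlit run app.py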