mdasad3617's picture
Update app.py
cc10da2 verified
raw
history blame
6.89 kB
import streamlit as st
import logging
from concurrent.futures import ThreadPoolExecutor
import subprocess
import sys
# Attempt to import libraries, with fallback: if any heavy dependency is
# missing, show a visible Streamlit error and halt the script instead of
# dying with a raw ImportError traceback.
try:
    import pytesseract
    import cv2
    import numpy as np
    from PIL import Image
    import fitz  # PyMuPDF for PDF processing
    from transformers import pipeline
except ImportError:
    st.error("Required libraries are missing. Please install them using pip.")
    st.stop()  # stops execution of the Streamlit script here
# Setup logging
def setup_logging():
    """Configure root logging with timestamped, INFO-level output."""
    message_layout = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=message_layout)
# Tesseract installation check and guide
def check_tesseract():
    """Return True if the Tesseract OCR binary can be executed.

    On failure, renders platform-specific installation instructions in the
    Streamlit UI and returns False so the caller can abort.
    """
    try:
        # Running `tesseract --version` both checks that the binary exists
        # and that it is actually executable. (The original captured the
        # output into an unused `version` variable; the fix drops it.)
        subprocess.check_output(['tesseract', '--version'],
                                stderr=subprocess.STDOUT)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Provide installation instructions based on operating system.
        st.error("Tesseract OCR is not installed.")
        st.markdown("### Tesseract Installation Guide:")
        if sys.platform.startswith('linux'):
            st.code("""
# For Ubuntu/Debian
sudo apt-get update
sudo apt-get install -y tesseract-ocr
# For Fedora
sudo dnf install -y tesseract
# For CentOS/RHEL
sudo yum install -y tesseract
""")
        elif sys.platform.startswith('darwin'):
            st.code("""
# For macOS (using Homebrew)
brew install tesseract
""")
        elif sys.platform.startswith('win'):
            st.markdown("""
1. Download Tesseract installer from:
   https://github.com/UB-Mannheim/tesseract/wiki
2. Run the installer
3. Add Tesseract directory to your system PATH
""")
        st.info("After installation, restart your application.")
        return False
# Load models globally for faster performance
@st.cache_resource
def load_models():
    """Load (and cache across reruns) the translation and summarization pipelines.

    Returns a (en->hi translator, en->ur translator, summarizer) tuple.
    """
    logging.info("Loading Hugging Face models...")
    # English -> Hindi and English -> Urdu translation pipelines.
    english_to_hindi = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi")
    english_to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
    # Abstractive summarization pipeline.
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return english_to_hindi, english_to_urdu, bart_summarizer
# Function to preprocess image for better OCR
def preprocess_image(image):
    """Binarize and deskew a PIL image to improve OCR accuracy.

    Returns a single-channel (grayscale) numpy array, rotated so detected
    skew is removed; the thresholded image is returned unrotated when no
    non-zero pixels exist.
    """
    # Convert PIL Image to a numpy array for OpenCV processing (assumes an
    # RGB input image — TODO confirm; a grayscale/RGBA upload would make
    # COLOR_RGB2GRAY fail).
    img_np = np.array(image)
    # Convert to grayscale
    gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
    # Otsu's method chooses the binarization threshold automatically
    # (the explicit 0 threshold is ignored when THRESH_OTSU is set).
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # Estimate skew from the coordinates of all non-zero pixels.
    # NOTE(review): with THRESH_BINARY on a typical dark-text/light-paper
    # scan the non-zero pixels are the *background*, so the fitted
    # rectangle spans nearly the whole page — confirm whether
    # THRESH_BINARY_INV was intended for the deskew estimate.
    coords = np.column_stack(np.where(gray > 0))
    # Prevent error if no foreground pixels found
    if coords.size == 0:
        return gray
    angle = cv2.minAreaRect(coords)[-1]
    # cv2.minAreaRect reports angles in [-90, 0) (OpenCV-version dependent),
    # so normalize to the rotation needed from the horizontal axis.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    # Rotate about the image center to deskew; BORDER_REPLICATE avoids
    # introducing black wedges at the edges.
    (h, w) = gray.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated
# Function to extract text from images
def extract_text_from_image(image):
    """Run OCR on a PIL image and return the recognized text, stripped."""
    logging.info("Extracting text from image...")
    # Clean up the scan (binarize + deskew) before handing it to Tesseract.
    cleaned = preprocess_image(image)
    raw_text = pytesseract.image_to_string(cleaned)
    return raw_text.strip()
# Function to extract text from PDFs
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of an uploaded PDF.

    ``pdf_file`` is a binary file-like object (Streamlit ``UploadedFile``).

    Bug fix: the original called ``fitz.open(pdf_file)``, but a single
    positional argument to ``fitz.open`` is treated as a filesystem path,
    so passing the uploaded file object fails. The bytes must be supplied
    via ``stream=`` with an explicit ``filetype``. The context manager also
    guarantees the document is closed (the original never closed it).
    """
    logging.info("Extracting text from PDF...")
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
# Function to process text in chunks for better performance
def process_chunks(text, model, chunk_size=500):
    """Run a translation ``model`` over fixed-size slices of ``text`` in parallel.

    Parameters:
        text: the input string; split into ``chunk_size``-character slices
            (note: splits on raw character positions, so words may be cut).
        model: a callable pipeline; ``model(chunk, max_length=200)`` must
            return a list whose first element has a ``"translation_text"`` key.
        chunk_size: maximum characters per chunk (default 500).

    Returns:
        The per-chunk translations joined with single spaces
        ("" for empty input).
    """
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    # Fix: the original initialized `results = []` and immediately
    # overwrote it — the dead assignment is removed.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda chunk: model(chunk, max_length=200), chunks))
    return " ".join(result[0]["translation_text"] for result in results)
# Main app logic
def main():
    """Streamlit entry point: upload a lab report (image/PDF/text), OCR or
    parse it, then display an English summary plus Hindi and Urdu
    translations."""
    # Fix: configure logging FIRST so the Tesseract check and the
    # extraction helpers can actually log. (The original only configured
    # logging after check_tesseract() had already run.)
    setup_logging()
    # Abort early if the OCR binary is missing; check_tesseract() renders
    # installation instructions itself.
    if not check_tesseract():
        return
    st.title("Advanced Lab Report Analyzer")
    st.write("Upload a file (Image, PDF, or Text) to analyze and summarize the lab report in English, Hindi, and Urdu.")
    # Load all models (cached across reruns by @st.cache_resource).
    translator_hi, translator_ur, summarizer = load_models()
    file = st.file_uploader("Upload a file (Image, PDF, or Text):", type=["jpg", "png", "jpeg", "pdf", "txt"])
    if not file:
        st.info("Please upload a file to begin.")
        return
    text = ""
    try:
        # Dispatch on the browser-reported MIME type of the upload.
        if file.type in ["image/jpeg", "image/png", "image/jpg"]:
            image = Image.open(file)
            text = extract_text_from_image(image)
        elif file.type == "application/pdf":
            text = extract_text_from_pdf(file)
        elif file.type == "text/plain":
            text = file.read().decode("utf-8")
        if text:
            with st.spinner("Analyzing the report..."):
                # Generate the English summary, then the two translations.
                summary = summarizer(text, max_length=130, min_length=30)[0]["summary_text"]
                hindi_translation = process_chunks(text, translator_hi)
                urdu_translation = process_chunks(text, translator_ur)
                # Display results
                st.subheader("Original Text:")
                st.write(text)
                st.subheader("Analysis Summary (English):")
                st.write(summary)
                st.subheader("Hindi Translation:")
                st.write(hindi_translation)
                st.subheader("Urdu Translation:")
                st.write(urdu_translation)
        else:
            st.warning("No text could be extracted. Please check the file and try again.")
    except Exception as e:
        # Top-level boundary: log the failure and surface it in the UI
        # rather than crashing the Streamlit script.
        logging.error(f"Error processing the file: {e}")
        st.error(f"An error occurred while processing the file: {e}")


if __name__ == "__main__":
    main()