Spaces:

Sompote
/

soil_profile

Sleeping

App Files Files Community

soil_profile / document_processor.py

Sompote

Upload 17 files

2c200f8 verified 2 months ago

raw

history blame contribute delete

4.18 kB

	import PyPDF2
	from PIL import Image
	import base64
	import io
	import streamlit as st

	# pdf2image is an optional dependency: record whether it could be imported so
	# the rest of the module can skip PDF-to-image conversion when it is missing.
	try:
	    from pdf2image import convert_from_path
	except ImportError:
	    PDF2IMAGE_AVAILABLE = False
	    st.warning("⚠️ pdf2image not available. PDF to image conversion will be limited.")
	else:
	    PDF2IMAGE_AVAILABLE = True

	class DocumentProcessor:
	    """Turn uploaded PDFs and images into text, images and base64 payloads.

	    Failures are reported through Streamlit (``st.error`` / ``st.warning``)
	    and signalled to the caller with ``None`` results rather than exceptions.
	    """

	    def __init__(self):
	        pass

	    def extract_text_from_pdf(self, pdf_file):
	        """Extract the text of every page of a PDF.

	        Args:
	            pdf_file: Object handed unchanged to ``PyPDF2.PdfReader``.

	        Returns:
	            The text of all pages, each followed by a newline, or ``None``
	            when reading fails (the error is shown with ``st.error``).
	        """
	        try:
	            pdf_reader = PyPDF2.PdfReader(pdf_file)
	            # `or ""` keeps a page without extractable text from aborting the
	            # whole document should extract_text() yield None for it.
	            return "".join(
	                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
	            )
	        except Exception as e:
	            st.error(f"Error extracting text from PDF: {str(e)}")
	            return None

	    def convert_pdf_to_images(self, pdf_file):
	        """Render the pages of a PDF as images at 200 dpi.

	        Args:
	            pdf_file: Value passed to ``pdf2image.convert_from_path``.

	        Returns:
	            The images produced by ``convert_from_path``, or ``None`` when
	            pdf2image is unavailable or the conversion fails.
	        """
	        if not PDF2IMAGE_AVAILABLE:
	            st.warning("PDF to image conversion not available. Install poppler-utils and pdf2image.")
	            return None

	        try:
	            return convert_from_path(pdf_file, dpi=200)
	        except Exception as e:
	            st.error(f"Error converting PDF to images: {str(e)}")
	            return None

	    def image_to_base64(self, image):
	        """Encode an image as a base64 string.

	        Args:
	            image: Either a file path (``str``), whose bytes are encoded
	                as-is, or an image object with a ``save`` method, which is
	                serialised as PNG before encoding.

	        Returns:
	            The base64 text, or ``None`` when encoding fails (the error is
	            shown with ``st.error``).
	        """
	        try:
	            if isinstance(image, str):
	                with open(image, "rb") as img_file:
	                    raw = img_file.read()
	            else:
	                # PNG cannot store CMYK pixels, so such images (typically
	                # print-oriented JPEGs) are converted to RGB before saving.
	                if getattr(image, "mode", None) == "CMYK":
	                    image = image.convert("RGB")
	                buffered = io.BytesIO()
	                image.save(buffered, format="PNG")
	                raw = buffered.getvalue()
	            return base64.b64encode(raw).decode('utf-8')
	        except Exception as e:
	            st.error(f"Error converting image to base64: {str(e)}")
	            return None

	    def process_uploaded_file(self, uploaded_file):
	        """Process an uploaded file (PDF or image).

	        Args:
	            uploaded_file: Upload object exposing ``type`` (MIME type) and,
	                for PDFs, ``getbuffer()``; may be ``None``.

	        Returns:
	            A ``(text_content, images, image_base64)`` tuple. For PDFs the
	            text is always attempted, while ``images`` and ``image_base64``
	            (first page only) stay ``None`` if pdf2image is unavailable or
	            conversion fails. For images the text is ``None``. Missing or
	            unsupported uploads give ``(None, None, None)``.
	        """
	        if uploaded_file is None:
	            return None, None, None

	        file_type = uploaded_file.type

	        if file_type == "application/pdf":
	            text_content = self.extract_text_from_pdf(uploaded_file)

	            # Visual analysis is optional; both stay None without pdf2image.
	            images = None
	            image_base64 = None

	            if PDF2IMAGE_AVAILABLE:
	                try:
	                    import tempfile
	                    import os

	                    # pdf2image reads from a path, so the upload is spooled
	                    # to a uniquely named temporary file first.
	                    temp_pdf_path = None
	                    try:
	                        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
	                            # Record the path before writing so the cleanup
	                            # below also runs when the write itself fails.
	                            temp_pdf_path = temp_pdf.name
	                            temp_pdf.write(uploaded_file.getbuffer())

	                        images = self.convert_pdf_to_images(temp_pdf_path)

	                        # Only the first page is encoded for LLM analysis.
	                        if images:
	                            image_base64 = self.image_to_base64(images[0])
	                    finally:
	                        if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
	                            os.unlink(temp_pdf_path)

	                except Exception as e:
	                    st.warning(f"PDF to image conversion failed: {str(e)}. Using text analysis only.")

	            return text_content, images, image_base64

	        elif file_type in ["image/jpeg", "image/png", "image/jpg"]:
	            try:
	                image = Image.open(uploaded_file)
	                image_base64 = self.image_to_base64(image)

	                return None, [image], image_base64
	            except Exception as e:
	                st.error(f"Error processing image file: {str(e)}")
	                return None, None, None

	        else:
	            st.error("Unsupported file type. Please upload PDF or image files.")
	            return None, None, None