# soil_profile / document_processor.py
# Sompote's picture
# Upload 17 files
# 2c200f8 verified
import base64
import io
import os
import tempfile

import PyPDF2
import streamlit as st
from PIL import Image
# pdf2image is treated as optional: when it cannot be imported, the failure is
# recorded in PDF2IMAGE_AVAILABLE (checked by DocumentProcessor before any
# PDF-to-image conversion) and a warning is shown in the Streamlit UI.
try:
    from pdf2image import convert_from_path
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    st.warning("⚠️ pdf2image not available. PDF to image conversion will be limited.")
class DocumentProcessor:
    """Turn an uploaded PDF or image into text and/or images for analysis.

    Failures are reported through Streamlit (``st.error`` / ``st.warning``)
    and signalled to the caller with ``None`` values instead of being raised.
    """

    # MIME types handled by the image branch of process_uploaded_file.
    _IMAGE_MIME_TYPES = ("image/jpeg", "image/png", "image/jpg")

    def __init__(self):
        """Create a processor; it keeps no state."""

    def extract_text_from_pdf(self, pdf_file):
        """Extract the text content of every page of a PDF.

        Args:
            pdf_file: The PDF source handed to ``PyPDF2.PdfReader`` (this
                class passes the uploaded file object).

        Returns:
            The text of all pages in page order, each page followed by a
            newline, or ``None`` if the PDF could not be read (the error is
            shown with ``st.error``).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Guard against a page yielding None instead of a string, which
            # would otherwise raise TypeError and discard every other page.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
        except Exception as e:
            st.error(f"Error extracting text from PDF: {str(e)}")
            return None

    def convert_pdf_to_images(self, pdf_file, dpi=200):
        """Convert the pages of a PDF to images.

        Args:
            pdf_file: Passed to ``pdf2image.convert_from_path``, so it is
                expected to be a filesystem path to the PDF.
            dpi: Rendering resolution forwarded to ``convert_from_path``.

        Returns:
            Whatever ``convert_from_path`` returns for the PDF, or ``None``
            when pdf2image is unavailable or the conversion fails (a
            warning/error is shown in the Streamlit UI).
        """
        if not PDF2IMAGE_AVAILABLE:
            st.warning("PDF to image conversion not available. Install poppler-utils and pdf2image.")
            return None

        try:
            return convert_from_path(pdf_file, dpi=dpi)
        except Exception as e:
            st.error(f"Error converting PDF to images: {str(e)}")
            return None

    def image_to_base64(self, image):
        """Convert an image to a base64 string for the API.

        Args:
            image: Either a path (``str`` or ``os.PathLike``) to a file,
                whose raw bytes are encoded unchanged, or an image object
                with PIL's ``save``/``convert`` methods, which is encoded
                as PNG first.

        Returns:
            The base64 text (UTF-8 decoded), or ``None`` if encoding fails
            (the error is shown with ``st.error``).
        """
        try:
            if isinstance(image, (str, os.PathLike)):
                with open(image, "rb") as img_file:
                    raw_bytes = img_file.read()
            else:
                raw_bytes = self._encode_png(image)
            return base64.b64encode(raw_bytes).decode('utf-8')
        except Exception as e:
            st.error(f"Error converting image to base64: {str(e)}")
            return None

    @staticmethod
    def _encode_png(image):
        """Return *image* serialized as PNG bytes, converting to RGB if needed."""
        buffered = io.BytesIO()
        try:
            image.save(buffered, format="PNG")
        except OSError:
            # PNG cannot store every image mode (CMYK JPEGs, for example).
            # Retry once with an RGB copy, using a fresh buffer in case the
            # failed attempt wrote partial data.
            buffered = io.BytesIO()
            image.convert("RGB").save(buffered, format="PNG")
        return buffered.getvalue()

    def process_uploaded_file(self, uploaded_file):
        """Process an uploaded file (PDF or image).

        Args:
            uploaded_file: The upload object, or ``None``. Its ``type``
                attribute (a MIME type string) selects the handling; PDFs
                additionally need ``getbuffer()``.

        Returns:
            A ``(text_content, images, image_base64)`` tuple:

            * PDF: the extracted text, the rendered page images and the
              base64 PNG of the first page. The last two are ``None`` when
              pdf2image is unavailable or rendering fails.
            * JPEG/PNG image: ``(None, [image], image_base64)``.
            * ``None`` input, unsupported type, or an unreadable image:
              ``(None, None, None)``.
        """
        if uploaded_file is None:
            return None, None, None

        file_type = uploaded_file.type

        if file_type == "application/pdf":
            text_content = self.extract_text_from_pdf(uploaded_file)
            # Page images are optional extras for visual analysis.
            images, image_base64 = self._render_pdf_pages(uploaded_file)
            return text_content, images, image_base64

        if file_type in self._IMAGE_MIME_TYPES:
            try:
                image = Image.open(uploaded_file)
                image_base64 = self.image_to_base64(image)
                return None, [image], image_base64
            except Exception as e:
                st.error(f"Error processing image file: {str(e)}")
                return None, None, None

        st.error("Unsupported file type. Please upload PDF or image files.")
        return None, None, None

    def _render_pdf_pages(self, uploaded_file):
        """Render an uploaded PDF to page images plus a base64 of page one.

        Returns ``(images, image_base64)``; both stay ``None`` when
        pdf2image is unavailable or nothing could be rendered.
        """
        images = None
        image_base64 = None
        if not PDF2IMAGE_AVAILABLE:
            return images, image_base64

        try:
            # convert_from_path needs a real file on disk. delete=False lets
            # the file be closed before it is read back by path; removal is
            # then this method's job.
            temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
            temp_pdf_path = temp_pdf.name
            try:
                with temp_pdf:
                    temp_pdf.write(uploaded_file.getbuffer())

                images = self.convert_pdf_to_images(temp_pdf_path)

                # Only the first page is encoded for LLM analysis.
                if images:
                    image_base64 = self.image_to_base64(images[0])
            finally:
                # Runs even when the write above fails, so the temporary
                # file is never left behind.
                if os.path.exists(temp_pdf_path):
                    os.unlink(temp_pdf_path)
        except Exception as e:
            st.warning(f"PDF to image conversion failed: {str(e)}. Using text analysis only.")

        return images, image_base64