import base64
import io
import os
import tempfile

import PyPDF2
import streamlit as st
from PIL import Image

# pdf2image is optional: without it (or its poppler backend) PDFs are still
# processed, but only their text layer is available.
try:
    from pdf2image import convert_from_path
    PDF2IMAGE_AVAILABLE = True
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    st.warning("⚠️ pdf2image not available. PDF to image conversion will be limited.")


class DocumentProcessor:
    """Turn uploaded PDFs or images into text, PIL images and base64 payloads.

    Every public method reports failures through Streamlit (``st.error`` /
    ``st.warning``) and returns ``None`` instead of raising.
    """

    def __init__(self):
        """Create a processor; it keeps no state."""
        pass

    def extract_text_from_pdf(self, pdf_file):
        """Extract the text of every page of a PDF.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts (path or binary
                file-like object).

        Returns:
            The page texts, each followed by a newline, as one string, or
            ``None`` if the PDF could not be read (an error is shown).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # ``or ""`` guards against a page yielding no text object at all,
            # which would otherwise break the concatenation.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
        except Exception as e:
            st.error(f"Error extracting text from PDF: {e}")
            return None

    def convert_pdf_to_images(self, pdf_file):
        """Render the pages of a PDF to images at 200 dpi.

        Args:
            pdf_file: Path of the PDF, passed to ``convert_from_path``.

        Returns:
            The list returned by ``convert_from_path``, or ``None`` when
            pdf2image is unavailable or the conversion fails.
        """
        if not PDF2IMAGE_AVAILABLE:
            st.warning("PDF to image conversion not available. Install poppler-utils and pdf2image.")
            return None

        try:
            return convert_from_path(pdf_file, dpi=200)
        except Exception as e:
            st.error(f"Error converting PDF to images: {e}")
            return None

    def image_to_base64(self, image):
        """Encode an image as a base64 string.

        Args:
            image: Either a filesystem path (``str`` or ``os.PathLike``),
                whose raw bytes are encoded as-is, or a PIL image, which is
                serialised as PNG first.

        Returns:
            The base64 text, or ``None`` on failure (an error is shown).
        """
        try:
            if isinstance(image, (str, os.PathLike)):
                with open(image, "rb") as img_file:
                    return base64.b64encode(img_file.read()).decode('utf-8')

            buffered = io.BytesIO()
            try:
                image.save(buffered, format="PNG")
            except OSError:
                # The PNG writer rejects some modes (e.g. CMYK JPEGs).
                # Retry on a fresh buffer after converting to RGB.
                buffered = io.BytesIO()
                image.convert("RGB").save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        except Exception as e:
            st.error(f"Error converting image to base64: {e}")
            return None

    def _render_pdf_pages(self, uploaded_file):
        """Rasterise an uploaded PDF via a temporary file.

        Returns:
            ``(images, first_page_base64)``; either element is ``None`` when
            pdf2image is unavailable or the conversion fails.
        """
        images = None
        image_base64 = None
        if not PDF2IMAGE_AVAILABLE:
            return images, image_base64

        try:
            temp_pdf_path = None
            try:
                # convert_from_path needs a real path, so spill the upload to
                # disk. delete=False lets the file be reopened by name after
                # the handle is closed.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                    # Record the path before writing so a failed write still
                    # gets cleaned up below.
                    temp_pdf_path = temp_pdf.name
                    temp_pdf.write(uploaded_file.getbuffer())

                images = self.convert_pdf_to_images(temp_pdf_path)

                # Only the first page is encoded for LLM analysis.
                if images:
                    image_base64 = self.image_to_base64(images[0])
            finally:
                if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
                    os.unlink(temp_pdf_path)
        except Exception as e:
            st.warning(f"PDF to image conversion failed: {e}. Using text analysis only.")

        return images, image_base64

    def process_uploaded_file(self, uploaded_file):
        """Process an uploaded file (PDF or image).

        Args:
            uploaded_file: Upload object exposing ``.type`` (MIME string)
                and, for PDFs, ``.getbuffer()``; ``None`` is accepted.

        Returns:
            A ``(text_content, images, image_base64)`` tuple:

            * PDF: extracted text, list of page images, base64 of page one
              (the latter two are ``None`` if rendering is unavailable).
            * JPEG/PNG: ``None``, a one-element image list, its base64.
            * ``None`` input, unsupported type or image error: three
              ``None`` values.
        """
        if uploaded_file is None:
            return None, None, None

        file_type = uploaded_file.type

        if file_type == "application/pdf":
            text_content = self.extract_text_from_pdf(uploaded_file)
            images, image_base64 = self._render_pdf_pages(uploaded_file)
            return text_content, images, image_base64

        if file_type in ("image/jpeg", "image/png", "image/jpg"):
            try:
                image = Image.open(uploaded_file)
                image_base64 = self.image_to_base64(image)
                return None, [image], image_base64
            except Exception as e:
                st.error(f"Error processing image file: {e}")
                return None, None, None

        st.error("Unsupported file type. Please upload PDF or image files.")
        return None, None, None