# Spaces: Sleeping
# (The line above appears to be web-page header residue, not part of the module.)
import base64
import io
import os
import tempfile

import PyPDF2
import streamlit as st
from PIL import Image
# pdf2image is an optional dependency: remember whether it imported so the
# PDF-to-image code paths can be skipped when it is missing.
try:
    from pdf2image import convert_from_path
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    st.warning("⚠️ pdf2image not available. PDF to image conversion will be limited.")
else:
    PDF2IMAGE_AVAILABLE = True
class DocumentProcessor:
    """Turn an uploaded PDF or image into text, page images and a base64 preview.

    Methods report failures through Streamlit (``st.error`` / ``st.warning``)
    and return ``None`` instead of raising, so callers only have to check the
    returned values for ``None``.
    """

    # Image modes Pillow's PNG writer accepts. Anything else (for example a
    # CMYK JPEG) is converted to RGB before being encoded as PNG.
    _PNG_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

    def __init__(self):
        pass

    def extract_text_from_pdf(self, pdf_file):
        """Extract the text of every page of a PDF.

        Args:
            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts (the caller in this
                class passes the uploaded file object).

        Returns:
            The text of all pages in page order, each page followed by a
            newline, or ``None`` if reading fails (the error is shown with
            ``st.error``).
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # ``or ""`` keeps a single page without extractable text from
            # discarding the text of the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            st.error(f"Error extracting text from PDF: {str(e)}")
            return None

    def convert_pdf_to_images(self, pdf_file):
        """Convert the pages of a PDF on disk to images.

        Args:
            pdf_file: Path handed to ``pdf2image.convert_from_path``.

        Returns:
            The result of ``convert_from_path`` at 200 dpi, or ``None`` when
            pdf2image is unavailable or the conversion fails.
        """
        if not PDF2IMAGE_AVAILABLE:
            st.warning("PDF to image conversion not available. Install poppler-utils and pdf2image.")
            return None
        try:
            return convert_from_path(pdf_file, dpi=200)
        except Exception as e:
            st.error(f"Error converting PDF to images: {str(e)}")
            return None

    def image_to_base64(self, image):
        """Encode an image as a base64 string for API use.

        Args:
            image: Either a filesystem path (``str`` or ``os.PathLike``), whose
                bytes are encoded as-is, or an image object exposing
                ``save(fp, format=...)`` (e.g. a PIL image), which is
                serialized as PNG first.

        Returns:
            The base64 text, or ``None`` if encoding fails (the error is shown
            with ``st.error``).
        """
        try:
            if isinstance(image, (str, os.PathLike)):
                with open(image, "rb") as img_file:
                    raw = img_file.read()
            else:
                # PNG cannot store every mode; convert the unsupported ones so
                # the save below succeeds instead of failing the whole call.
                mode = getattr(image, "mode", None)
                if mode is not None and mode not in self._PNG_MODES:
                    image = image.convert("RGB")
                buffered = io.BytesIO()
                image.save(buffered, format="PNG")
                raw = buffered.getvalue()
            return base64.b64encode(raw).decode("utf-8")
        except Exception as e:
            st.error(f"Error converting image to base64: {str(e)}")
            return None

    def _render_pdf_pages(self, uploaded_file):
        """Render an uploaded PDF to page images plus a base64 PNG of page one.

        Returns:
            ``(images, image_base64)``; either element is ``None`` when
            pdf2image is unavailable or the corresponding step failed.
        """
        images = None
        image_base64 = None
        if not PDF2IMAGE_AVAILABLE:
            return images, image_base64

        try:
            temp_pdf_path = None
            try:
                # The converter is given a path, so spool the upload to disk.
                # The file is closed (but kept, delete=False) before conversion
                # and removed in the ``finally`` below, even if the write fails.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                    temp_pdf_path = temp_pdf.name
                    temp_pdf.write(uploaded_file.getbuffer())

                images = self.convert_pdf_to_images(temp_pdf_path)
                # Only the first page is encoded for LLM analysis.
                if images:
                    image_base64 = self.image_to_base64(images[0])
            finally:
                if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
                    os.unlink(temp_pdf_path)
        except Exception as e:
            st.warning(f"PDF to image conversion failed: {str(e)}. Using text analysis only.")
        return images, image_base64

    def process_uploaded_file(self, uploaded_file):
        """Process an uploaded file (PDF or image).

        Args:
            uploaded_file: Object with a ``type`` MIME string; PDFs must also
                provide ``getbuffer()``. ``None`` is accepted.

        Returns:
            A ``(text_content, images, image_base64)`` tuple:

            * PDF: extracted text, rendered page images and the first page as
              base64 (the last two are ``None`` without pdf2image).
            * JPEG/PNG: ``None``, a one-element list with the opened image, and
              its base64 encoding.
            * ``None`` input, unsupported type, or unreadable image:
              ``(None, None, None)``.
        """
        if uploaded_file is None:
            return None, None, None

        file_type = uploaded_file.type

        if file_type == "application/pdf":
            text_content = self.extract_text_from_pdf(uploaded_file)
            images, image_base64 = self._render_pdf_pages(uploaded_file)
            return text_content, images, image_base64

        if file_type in ("image/jpeg", "image/png", "image/jpg"):
            try:
                image = Image.open(uploaded_file)
                image_base64 = self.image_to_base64(image)
                return None, [image], image_base64
            except Exception as e:
                st.error(f"Error processing image file: {str(e)}")
                return None, None, None

        st.error("Unsupported file type. Please upload PDF or image files.")
        return None, None, None