import logging
import os
from typing import Optional, Tuple

import PyPDF2

from app.core.config import settings

logger = logging.getLogger(__name__)


class PDFProcessor:
    """Validate uploaded PDF files and extract their text and metadata.

    The allowed extensions and the maximum file size are read from
    ``settings`` once, when the processor is constructed.
    """

    def __init__(self):
        # NOTE(review): entries are compared against the lower-cased suffix
        # returned by os.path.splitext, so they presumably include the leading
        # dot (e.g. ".pdf") -- confirm against the settings definition.
        self.allowed_extensions = settings.ALLOWED_EXTENSIONS
        # Compared directly against os.path.getsize(), i.e. a size in bytes.
        self.max_file_size = settings.MAX_FILE_SIZE

    @staticmethod
    def _safe_file_size(file_path: str) -> int:
        """Return the size of *file_path* in bytes, or 0 if it cannot be read."""
        try:
            return os.path.getsize(file_path)
        except OSError:
            # Missing file, permission problem, or the file vanished mid-call.
            return 0

    @staticmethod
    def _blank_metadata(num_pages: int, file_size: int) -> dict:
        """Build a metadata dict with every document-info field set to None."""
        return {
            'num_pages': num_pages,
            'file_size': file_size,
            'title': None,
            'author': None,
            'subject': None,
            'creator': None,
        }

    def validate_file(self, file_path: str) -> Tuple[bool, str]:
        """Check that *file_path* is an acceptable upload.

        The path must exist, be a file (not a directory), be no larger than
        ``self.max_file_size`` bytes, and carry an extension listed in
        ``self.allowed_extensions`` (compared case-insensitively).

        Returns:
            ``(True, "File is valid")`` on success, otherwise ``(False, reason)``
            where *reason* is a human-readable explanation.
        """
        if not os.path.exists(file_path):
            return False, "File does not exist"

        # os.path.exists() is also true for directories; reject those here
        # instead of letting them fail later inside the PDF reader.
        if not os.path.isfile(file_path):
            return False, "Path is not a file"

        # Check file size
        file_size = os.path.getsize(file_path)
        if file_size > self.max_file_size:
            return False, f"File size exceeds maximum allowed size of {self.max_file_size} bytes"

        # Check file extension
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self.allowed_extensions:
            return False, f"File type not allowed. Allowed types: {', '.join(self.allowed_extensions)}"

        return True, "File is valid"

    def extract_text(self, file_path: str) -> Optional[str]:
        """Extract the text of every page of the PDF at *file_path*.

        Each page with non-blank text contributes a ``"Page N:\\n<text>"``
        section (N is 1-based); sections are joined by a blank line. Pages
        that fail to extract are logged as warnings and skipped.

        Returns:
            The combined text (an empty string when no page yields text), or
            ``None`` if the file cannot be opened or parsed at all.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text_content = []

                # Index-based on purpose: the page lookup itself stays inside
                # the per-page try, so one bad page does not abort the rest.
                for page_num in range(len(pdf_reader.pages)):
                    try:
                        page = pdf_reader.pages[page_num]
                        # Treat a falsy result as an empty page rather than
                        # failing on .strip().
                        text = (page.extract_text() or "").strip()
                    except Exception as e:
                        logger.warning(
                            "Error extracting text from page %s: %s", page_num + 1, e
                        )
                        continue

                    if text:
                        text_content.append(f"Page {page_num + 1}:\n{text}")

                return "\n\n".join(text_content)

        except Exception as e:
            logger.error("Error processing PDF file %s: %s", file_path, e)
            return None

    def get_metadata(self, file_path: str) -> dict:
        """Read page count, file size and document-info fields from a PDF.

        Returns:
            A dict with the keys ``num_pages``, ``file_size``, ``title``,
            ``author``, ``subject`` and ``creator``. Info fields the document
            does not provide are ``None``. If the PDF cannot be read, the
            failure is logged and a fallback dict is returned with
            ``num_pages`` 0 and ``file_size`` set to the on-disk size (0 when
            that cannot be determined either).
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                metadata = self._blank_metadata(
                    num_pages=len(pdf_reader.pages),
                    file_size=os.path.getsize(file_path),
                )

                doc_info = pdf_reader.metadata
                if doc_info:
                    metadata.update({
                        'title': doc_info.get('/Title'),
                        'author': doc_info.get('/Author'),
                        'subject': doc_info.get('/Subject'),
                        'creator': doc_info.get('/Creator'),
                    })

                return metadata

        except Exception as e:
            logger.error(
                "Error extracting metadata from PDF file %s: %s", file_path, e
            )
            # The error path must never raise, so the size lookup is guarded.
            return self._blank_metadata(
                num_pages=0,
                file_size=self._safe_file_size(file_path),
            )

    def process_pdf(self, file_path: str) -> Tuple[bool, str, dict]:
        """Validate a PDF, then extract its text and metadata.

        Returns:
            ``(True, text, metadata)`` on success. On failure,
            ``(False, message, {})`` where *message* is either the validation
            error or ``"Failed to extract text from PDF"``.
        """
        # Validate file
        is_valid, error_message = self.validate_file(file_path)
        if not is_valid:
            return False, error_message, {}

        # Extract text
        text_content = self.extract_text(file_path)
        if text_content is None:
            return False, "Failed to extract text from PDF", {}

        # Get metadata
        metadata = self.get_metadata(file_path)

        return True, text_content, metadata