Spaces:
Sleeping
Sleeping
| """ | |
| PDF Processing Module | |
| Handles PDF file upload, text extraction, and preprocessing. | |
| """ | |
| import PyPDF2 | |
| import io | |
| import re | |
| from typing import Optional, List | |
| import streamlit as st | |
class PDFProcessor:
    """Validate uploaded PDF files, extract their text and clean it up.

    Problems are reported to the user through Streamlit (``st.error`` /
    ``st.warning``); the public methods signal failure by returning
    ``False`` or ``None``.
    """

    # Default upload limit in bytes (10MB).
    DEFAULT_MAX_FILE_SIZE = 10 * 1024 * 1024
    # Page count above which a "large PDF" warning is shown.
    LARGE_PDF_PAGE_COUNT = 100
    # Failed page numbers are listed individually only below this count.
    MAX_LISTED_FAILED_PAGES = 5
    # Extracted / processed text shorter than these lengths triggers a warning.
    MIN_EXTRACTED_CHARS = 100
    MIN_PROCESSED_CHARS = 50

    # Matches everything except word characters, whitespace and . , ! ? ; : - ( )
    _DISALLOWED_CHARS = re.compile(r'[^\w\s\.\,\!\?\;\:\-\(\)]')

    def __init__(self, max_file_size: int = DEFAULT_MAX_FILE_SIZE):
        """
        Create a processor.

        Args:
            max_file_size: Largest accepted upload, in bytes (default 10MB)
        """
        self.max_file_size = max_file_size

    def validate_pdf(self, uploaded_file) -> bool:
        """
        Validate uploaded PDF file

        Args:
            uploaded_file: Streamlit uploaded file object (or None when
                nothing has been uploaded)

        Returns:
            bool: True if valid, False otherwise
        """
        # Nothing uploaded: reject instead of failing on attribute access.
        if uploaded_file is None:
            st.error("Please upload a valid PDF file")
            return False

        # Check file size; the limit in the message follows the configured
        # value instead of a hard-coded "10MB".
        if uploaded_file.size > self.max_file_size:
            limit_mb = self.max_file_size / 1024 / 1024
            st.error(
                f"File size ({uploaded_file.size / 1024 / 1024:.1f}MB) "
                f"exceeds limit ({limit_mb:g}MB)"
            )
            return False

        # Check file type
        if uploaded_file.type != "application/pdf":
            st.error("Please upload a valid PDF file")
            return False

        return True

    @staticmethod
    def _collect_page_text(pages):
        """
        Extract text from every page, tolerating per-page failures.

        Args:
            pages: Iterable of objects exposing ``extract_text()``

        Returns:
            tuple: (concatenated text with one newline after each non-empty
            page, list of 1-based page numbers whose extraction raised)
        """
        chunks = []
        failed_pages = []
        for page_num, page in enumerate(pages, start=1):
            try:
                # A page with no text may yield None; treat it as empty
                # rather than as an extraction failure.
                page_text = page.extract_text() or ""
            except Exception:
                # Best effort: one unreadable page must not abort the file.
                failed_pages.append(page_num)
                continue
            if page_text.strip():  # Only add non-empty pages
                chunks.append(page_text + "\n")
        return "".join(chunks), failed_pages

    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
        """
        Extract text content from uploaded PDF file

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Extracted text content or None if extraction fails
        """
        try:
            # Reset file pointer so the whole file is read even if it was
            # consumed earlier.
            uploaded_file.seek(0)

            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

            # Check if PDF is encrypted
            if pdf_reader.is_encrypted:
                st.error("❌ Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
                return None

            # Check number of pages
            num_pages = len(pdf_reader.pages)
            if num_pages == 0:
                st.error("❌ PDF file appears to be empty or corrupted.")
                return None
            if num_pages > self.LARGE_PDF_PAGE_COUNT:
                st.warning(f"⚠️ Large PDF detected ({num_pages} pages). Processing may take longer.")

            # Extract text from all pages
            text_content, failed_pages = self._collect_page_text(pdf_reader.pages)

            # Report failed pages
            if failed_pages:
                if len(failed_pages) < self.MAX_LISTED_FAILED_PAGES:
                    st.warning(f"⚠️ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
                else:
                    st.warning(f"⚠️ Could not extract text from {len(failed_pages)} pages")

            stripped_length = len(text_content.strip())
            if stripped_length == 0:
                st.error("❌ No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
                return None

            # Check if extracted text is too short
            if stripped_length < self.MIN_EXTRACTED_CHARS:
                st.warning("⚠️ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

            return text_content

        except PyPDF2.errors.PdfReadError as e:
            st.error(f"❌ Invalid or corrupted PDF file: {str(e)}")
            return None
        except MemoryError:
            st.error("❌ PDF file is too large to process. Please try a smaller file.")
            return None
        except Exception as e:
            # Boundary handler: surface anything unexpected to the user
            # instead of crashing the app.
            st.error(f"❌ Unexpected error processing PDF file: {str(e)}")
            return None

    def preprocess_text(self, text: str) -> str:
        """
        Clean and preprocess extracted text

        Args:
            text: Raw extracted text

        Returns:
            str: Text with disallowed characters replaced by spaces and all
            whitespace runs (including newlines) collapsed to single spaces;
            "" for empty or None input
        """
        if not text:
            return ""

        # Replace special characters that might interfere with processing
        without_specials = self._DISALLOWED_CHARS.sub(' ', text)

        # split()/join collapses every whitespace run and trims both ends,
        # so no separate newline/whitespace regex passes are needed.
        return ' '.join(without_specials.split())

    def process_pdf(self, uploaded_file) -> Optional[str]:
        """
        Complete PDF processing pipeline

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Processed text content or None if processing fails
        """
        if not self.validate_pdf(uploaded_file):
            return None

        # Extract text
        raw_text = self.extract_text_from_pdf(uploaded_file)
        if raw_text is None:
            return None

        # Preprocess text
        processed_text = self.preprocess_text(raw_text)
        if len(processed_text) < self.MIN_PROCESSED_CHARS:
            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")

        return processed_text