# NOTE(review): removed web-scrape artifact ("Spaces: Sleeping Sleeping")
# that was captured with the page and is not part of this module.
| """ | |
| PageChunker.py | |
| A module for page-level document chunking with token counting and preprocessing. | |
| Features: | |
| - Page-based document splitting | |
| - Content validation | |
| - Blank page detection | |
| - Document metadata enrichment | |
| """ | |
| from typing import List, Optional | |
| import logging | |
| from langchain_core.documents import Document | |
| from core.BaseChunker import BaseChunker | |
| logger = logging.getLogger(__name__) | |
class PageChunker(BaseChunker):
    """Handles document chunking at the page level.

    Splits a PDF into one ``Document`` per non-blank page, enriching each
    with character/token/sentence/word statistics produced by the base
    class's ``analyze_text``.
    """

    def __init__(self, model_name: Optional[str] = None, embedding_model=None):
        """
        Initialize page chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        # Per-document log of skipped (blank) pages; reset at the start of
        # every page_process_document() run.
        self.page_stats: List[str] = []

    def _is_blank_page(self, text: str) -> bool:
        """Check if page is blank or contains only whitespace/special characters."""
        # Strip surrounding whitespace and embedded line/tab breaks, then
        # compare what remains against the blank threshold.
        # NOTE(review): BLANK_THRESHOLD is not defined in this class —
        # presumably a constant on BaseChunker; confirm.
        cleaned_text = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_single_page(self, content: str, page_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single page with optional preprocessing and analysis.

        Args:
            content: The page content
            page_number: The page number (1-based)
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if page is blank
        """
        if self._is_blank_page(content):
            self.page_stats.append(f"Page {page_number} is blank.")
            return None

        # Optionally preprocess the text
        if preprocess:
            content = self.preprocess_text(content)

        # Analyze the page and generate metadata
        stats = self.analyze_text(content)
        metadata = {
            "page": page_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            # Stored as strings ("True"/"false") — presumably for
            # metadata-store compatibility; kept as-is for callers.
            "has_ocr": str(stats.get("has_content", True)),
            "is_blank": "false",
        }
        return Document(page_content=content, metadata=metadata)

    def page_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document page by page with analysis and optional preprocessing.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page

        Raises:
            Exception: re-raised after logging if loading or processing fails.
        """
        try:
            self.page_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_pages = []
            # Lazy %-style args: avoids building the message unless emitted.
            logger.info("Processing document with %d pages", len(raw_pages))
            for idx, page in enumerate(raw_pages):
                processed_page = self._process_single_page(page.page_content, idx + 1, preprocess)
                # Fix: explicit None check. The contract is "None == blank";
                # a valid Document must never be dropped on truthiness grounds.
                if processed_page is not None:
                    processed_pages.append(processed_page)
            # Output skipped pages for transparency
            if self.page_stats:
                logger.info("\n".join(self.page_stats))
            logger.info("Processed %d non-blank pages", len(processed_pages))
            return processed_pages
        except Exception as e:
            # logger.exception records the full traceback before re-raising.
            logger.exception("Error in page_process_document: %s", e)
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using page chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        return self.page_process_document(file_path, preprocess)