# AI_Toolkit/src/core/PageChunker.py
"""
PageChunker.py
A module for page-level document chunking with token counting and preprocessing.
Features:
- Page-based document splitting
- Content validation
- Blank page detection
- Document metadata enrichment
"""
from typing import List, Optional
import logging
from langchain_core.documents import Document
from core.BaseChunker import BaseChunker
logger = logging.getLogger(__name__)
class PageChunker(BaseChunker):
    """Handles document chunking at the page level.

    Splits a PDF into one ``Document`` per non-blank page, enriching each
    page with character/token/sentence/word statistics produced by the
    inherited ``analyze_text`` helper.
    """

    # Translation table used to strip newline, carriage-return, and tab
    # characters in one C-level pass when testing for blank pages.
    _BLANK_STRIP_TABLE = str.maketrans("", "", "\n\r\t")

    def __init__(self, model_name=None, embedding_model=None):
        """
        Initialize page chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        # Human-readable notes about skipped (blank) pages for the most
        # recently processed document; reset by page_process_document.
        self.page_stats: List[str] = []

    def _is_blank_page(self, text: str) -> bool:
        """Check if page is blank or contains only whitespace/special characters.

        A page is "blank" when, after stripping surrounding whitespace and
        removing newlines/carriage returns/tabs, fewer than
        ``BLANK_THRESHOLD`` characters remain (threshold is presumably
        defined on BaseChunker — confirm there).
        """
        cleaned_text = text.strip().translate(self._BLANK_STRIP_TABLE)
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_single_page(self, content: str, page_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single page with optional preprocessing and analysis.

        Args:
            content: The page content
            page_number: The page number (1-based)
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if
            the page is blank (a note is appended to self.page_stats).
        """
        if self._is_blank_page(content):
            self.page_stats.append(f"Page {page_number} is blank.")
            return None

        # Optionally normalize the text before analysis so the statistics
        # reflect what is actually stored in the Document.
        if preprocess:
            content = self.preprocess_text(content)

        # Analyze the page and build per-page metadata. Metadata values for
        # flags are stored as strings to match downstream expectations.
        stats = self.analyze_text(content)
        metadata = {
            "page": page_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            # "has_content" may be absent from analyze_text output; default
            # to True so missing data is not mistaken for an empty page.
            "has_ocr": str(stats.get("has_content", True)),
            "is_blank": "false"
        }
        return Document(page_content=content, metadata=metadata)

    def page_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document page by page with analysis and optional preprocessing.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page

        Raises:
            Exception: re-raises any error from loading or page processing
                after logging it with a traceback.
        """
        try:
            self.page_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_pages = []
            # Lazy %-style args avoid formatting cost when INFO is disabled.
            logger.info("Processing document with %d pages", len(raw_pages))
            for idx, page in enumerate(raw_pages):
                processed_page = self._process_single_page(page.page_content, idx + 1, preprocess)
                # Explicit None check: None is the blank-page sentinel, and
                # truthiness of Document objects is not a reliable signal.
                if processed_page is not None:
                    processed_pages.append(processed_page)
            # Output skipped pages for transparency
            if self.page_stats:
                logger.info("\n".join(self.page_stats))
            logger.info("Processed %d non-blank pages", len(processed_pages))
            return processed_pages
        except Exception as e:
            # logger.exception records the full traceback, unlike .error.
            logger.exception(f"Error in page_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using page chunking strategy (implements abstract method).

        Note: this wrapper defaults ``preprocess`` to True, whereas
        ``page_process_document`` defaults it to False.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        return self.page_process_document(file_path, preprocess)