| """ |
| Text processing utilities for chunking and token counting. |
| |
| This module provides utilities for token counting using tiktoken and text preprocessing. |
| """ |
|
|
| import tiktoken |
| import re |
| from typing import List |
|
|
|
|
| |
# Module-level cache for the tiktoken encoder; lazily created by get_encoder()
# so importing this module never triggers encoding-table loading.
_encoder = None
|
|
|
|
def get_encoder():
    """
    Return the shared tiktoken encoder, creating it on first use.

    The cl100k_base encoding is used as an approximation for token
    counting (it is tiktoken's general-purpose encoding; exact counts
    for other model families may differ slightly).

    Returns:
        tiktoken.Encoding: The lazily-initialized cl100k_base encoding.
    """
    global _encoder
    encoder = _encoder
    if encoder is None:
        encoder = _encoder = tiktoken.get_encoding("cl100k_base")
    return encoder
|
|
|
|
def count_tokens(text: str) -> int:
    """
    Count the tokens in *text* with the shared tiktoken encoder.

    Args:
        text: Input text whose tokens are counted.

    Returns:
        int: Number of tokens; 0 for empty or falsy input.
    """
    if text:
        return len(get_encoder().encode(text))
    return 0
|
|
|
|
def clean_text(text: str) -> str:
    """
    Normalize raw text extracted from a PDF.

    Strips null bytes, collapses every run of whitespace (spaces,
    newlines, tabs) into a single space, and trims leading/trailing
    whitespace.

    Args:
        text: Raw text from PDF extraction.

    Returns:
        str: The cleaned text, or "" for empty/falsy input.
    """
    if not text:
        return ""

    # Null bytes sometimes survive PDF extraction; drop them first so
    # they don't interfere with downstream string handling.
    without_nulls = text.replace('\x00', '')

    # Collapse all whitespace runs (including newlines) to single spaces.
    collapsed = re.sub(r'\s+', ' ', without_nulls)

    return collapsed.strip()
|
|
|
|
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Prefers nltk's sentence tokenizer when nltk is importable
    (downloading the 'punkt_tab' model on demand); otherwise falls back
    to a regex split on sentence-ending punctuation.

    Args:
        text: Input text to split.

    Returns:
        List[str]: Non-empty, stripped sentences ([] for empty input).
    """
    if not text:
        return []

    try:
        import nltk

        try:
            return nltk.sent_tokenize(text)
        except LookupError:
            # Tokenizer model not present; attempt a one-time quiet download.
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; any other failure (e.g. offline)
            # falls through to the regex path.
            try:
                nltk.download('punkt_tab', quiet=True)
                return nltk.sent_tokenize(text)
            except Exception:
                pass
    except ImportError:
        # nltk not installed; use the regex fallback below.
        pass

    # Fallback: split after '.', '!' or '?' when followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Drop empty fragments and trim surrounding whitespace.
    return [s.strip() for s in sentences if s.strip()]
|
|
|
|
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
    """
    Truncate text to at most max_length characters.

    The result (suffix included) never exceeds max_length. When
    max_length is too small to fit the suffix, the text is
    hard-truncated without it.

    Args:
        text: Text to truncate.
        max_length: Maximum length of the result in characters.
        suffix: Marker appended when truncation occurs.

    Returns:
        str: The original text, or a truncated version of it.
    """
    if len(text) <= max_length:
        return text

    if max_length <= len(suffix):
        # Guard: the original `text[:max_length - len(suffix)] + suffix`
        # used a negative slice index here and returned a string LONGER
        # than max_length (e.g. max_length=2 -> "abcde...").
        return text[:max_length]

    return text[:max_length - len(suffix)] + suffix
|
|
|
|
def estimate_pages_from_text(text: str, chars_per_page: int = 2000) -> int:
    """
    Estimate the number of pages a text would occupy.

    Args:
        text: Input text.
        chars_per_page: Average characters per page (default: 2000).

    Returns:
        int: Estimated page count (0 for empty input, otherwise >= 1).
    """
    if not text:
        return 0

    # Ceiling division: a partially-filled page still counts as a page.
    # The previous floor division undercounted (2001 chars -> 1 page);
    # ceiling also makes the former max(1, ...) clamp redundant for
    # non-empty text.
    return (len(text) + chars_per_page - 1) // chars_per_page
|
|