"""
utils.py - Helper functions for text processing and chunking

This module contains utility functions for:
- Text cleaning (removing extra whitespace, headers/footers)
- Token counting using tiktoken
- Text chunking with overlap
- Confidence score normalization
"""
| import re | |
| from typing import List, Dict, Tuple | |
| import json | |
# Prefer tiktoken for accurate token counts; degrade gracefully to a
# word-count heuristic when the package is not installed.
try:
    import tiktoken
    TOKENIZER = tiktoken.get_encoding("cl100k_base")
    USE_TIKTOKEN = True
except ImportError:
    USE_TIKTOKEN = False
    print("WARNING: tiktoken not available, using word count approximation")


def count_tokens(text: str) -> int:
    """
    Return the number of tokens in *text*.

    Uses tiktoken's cl100k_base encoding when available; otherwise
    approximates by scaling the whitespace word count by 1.3.

    Args:
        text: Input text string

    Returns:
        Number of tokens (approximate if tiktoken not available)
    """
    if not USE_TIKTOKEN:
        # Heuristic: on average ~1.3 tokens per whitespace-separated word.
        return int(len(text.split()) * 1.3)
    return len(TOKENIZER.encode(text))
def clean_text(text: str) -> str:
    """
    Clean raw PDF-extracted text.

    Collapses runs of spaces/tabs, caps consecutive blank lines, removes
    common page-number artifacts ("Page 12", "- 3 -", lines that contain
    only a number), and trims whitespace from every line.

    Args:
        text: Raw text from PDF extraction

    Returns:
        Cleaned text string
    """
    # (pattern, replacement, flags) applied in order — order matters:
    # whitespace is collapsed before artifact patterns are matched.
    substitutions = [
        (r'[ \t]+', ' ', 0),                  # collapse horizontal whitespace
        (r'\n{3,}', '\n\n', 0),               # at most one blank line in a row
        (r'(?i)page\s*\d+', '', 0),           # "Page 12" style markers
        (r'-\s*\d+\s*-', '', 0),              # "- 12 -" style markers
        (r'^\s*\d+\s*$', '', re.MULTILINE),   # numeric-only lines (heuristic)
    ]
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Trim each individual line, then the whole string.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    return text.strip()
def chunk_text(
    text: str,
    page_number: int,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    source: str = "Ebook-Agentic-AI.pdf"
) -> List[Dict]:
    """
    Split text into overlapping chunks with metadata.

    Splits on sentence boundaries and accumulates sentences until the
    token budget (chunk_size) would be exceeded, carrying up to
    chunk_overlap tokens of trailing sentences into the next chunk for
    context continuity.

    Args:
        text: Text to chunk (from one page)
        page_number: Page number for metadata
        chunk_size: Target size in tokens (default 500)
        chunk_overlap: Overlap between chunks in tokens (default 50)
        source: Source document name

    Returns:
        List of chunk dicts with id, page, text, start_char, end_char
        and source keys.

    Note:
        start_char/end_char are approximate offsets into the
        space-rejoined sentence stream, not exact offsets into the
        original string (the sentence-splitting regex consumes the
        inter-sentence whitespace).
    """
    chunks: List[Dict] = []

    # Short input: emit at most one chunk covering everything.
    # BUG FIX: guard `text` before calling .strip() so a None/empty input
    # returns [] instead of raising AttributeError on None.
    if not text or count_tokens(text) <= chunk_size:
        if text and text.strip():
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_0",
                "page": page_number,
                "text": text.strip(),
                "start_char": 0,
                "end_char": len(text),
                "source": source
            })
        return chunks

    # Simple sentence splitting on ., ! or ? followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    current_chunk: List[str] = []   # sentences accumulated for the next chunk
    current_tokens = 0              # running token count of current_chunk
    current_start = 0               # approximate char offset of current chunk
    chunk_index = 0
    char_position = 0               # approximate char offset in joined text

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # Flush the current chunk before it would exceed chunk_size.
        if current_tokens + sentence_tokens > chunk_size and current_chunk:
            # BUG FIX: renamed from `chunk_text` — the original local
            # variable shadowed this function's own name.
            chunk_str = ' '.join(current_chunk)
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
                "page": page_number,
                "text": chunk_str,
                "start_char": current_start,
                "end_char": current_start + len(chunk_str),
                "source": source
            })
            chunk_index += 1

            # Carry over the trailing sentences that fit within
            # chunk_overlap tokens so consecutive chunks share context.
            overlap_tokens = 0
            overlap_sentences: List[str] = []
            for s in reversed(current_chunk):
                s_tokens = count_tokens(s)
                if overlap_tokens + s_tokens <= chunk_overlap:
                    overlap_sentences.insert(0, s)
                    overlap_tokens += s_tokens
                else:
                    break
            current_chunk = overlap_sentences
            current_tokens = overlap_tokens
            current_start = char_position - sum(len(s) + 1 for s in overlap_sentences)

        current_chunk.append(sentence)
        current_tokens += sentence_tokens
        char_position += len(sentence) + 1  # +1 for the joining space

    # Emit whatever remains as the final chunk.
    if current_chunk:
        chunk_str = ' '.join(current_chunk)
        chunks.append({
            "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
            "page": page_number,
            "text": chunk_str,
            "start_char": current_start,
            "end_char": current_start + len(chunk_str),
            "source": source
        })

    return chunks
def normalize_score(score: float) -> float:
    """
    Map a cosine-similarity score from [-1, 1] onto [0, 1].

    Applies normalized = (score + 1) / 2, then clamps so that
    out-of-range inputs still produce a value inside [0.0, 1.0].

    Args:
        score: Raw similarity score from Pinecone

    Returns:
        Normalized score between 0.0 and 1.0
    """
    shifted = (score + 1.0) / 2.0
    # Explicit clamp as a safety net against out-of-range scores.
    if shifted < 0.0:
        return 0.0
    if shifted > 1.0:
        return 1.0
    return shifted
def compute_confidence(scores: List[float], method: str = "max") -> float:
    """
    Compute a single confidence value from retrieval similarity scores.

    Scores are first normalized to [0, 1], then aggregated.

    Args:
        scores: List of raw similarity scores from retrieval
        method: "max" for maximum score, "mean" for average
                (any unrecognized value falls back to "max")

    Returns:
        Confidence score rounded to 3 decimal places; 0.0 for no scores.
    """
    if not scores:
        return 0.0

    normalized = [normalize_score(s) for s in scores]
    if method == "mean":
        confidence = sum(normalized) / len(normalized)
    else:
        # "max" and any unknown method both use the maximum score.
        confidence = max(normalized)
    return round(confidence, 3)
def save_chunks_to_jsonl(chunks: List[Dict], filepath: str, include_embeddings: bool = False):
    """
    Write chunks to a JSONL file (one JSON object per line) for backup.

    Args:
        chunks: List of chunk dictionaries
        filepath: Output file path
        include_embeddings: Keep the 'embedding' field (makes file large)
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            # Shallow-copy so the caller's dict is never mutated.
            record = dict(chunk)
            if not include_embeddings:
                # Drop embeddings by default to keep the backup small.
                record.pop('embedding', None)
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"Saved {len(chunks)} chunks to {filepath}")
def load_chunks_from_jsonl(filepath: str) -> List[Dict]:
    """
    Load chunk dictionaries back from a JSONL backup file.

    Blank lines are skipped.

    Args:
        filepath: Input file path

    Returns:
        List of chunk dictionaries
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        chunks = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(chunks)} chunks from {filepath}")
    return chunks
def format_chunks_for_llm(chunks: List[Dict]) -> str:
    """
    Render retrieved chunks as one context string for the LLM prompt.

    Each chunk becomes "[Source: <id>, Page <page>]\\n<text>"; chunks are
    separated by "---" dividers. Missing keys fall back to
    'chunk_<index>' / 'unknown' / empty text.

    Args:
        chunks: List of chunk dictionaries with 'text' and 'page' keys

    Returns:
        Formatted string with source markers for the LLM
    """
    sections = []
    for index, chunk in enumerate(chunks):
        header = "[Source: {}, Page {}]".format(
            chunk.get('id', f'chunk_{index}'),
            chunk.get('page', 'unknown'),
        )
        sections.append(header + "\n" + chunk.get('text', ''))
    return "\n\n---\n\n".join(sections)
if __name__ == "__main__":
    # Quick smoke test of the helpers in this module.
    print("Testing utils.py functions...")

    # Token counting
    sample = "This is a test sentence for token counting."
    print(f"Token count for '{sample}': {count_tokens(sample)}")

    # Text cleaning
    messy = " This has extra spaces \n\n\n\nAnd too many newlines Page 123"
    print(f"Cleaned text: '{clean_text(messy)}'")

    # Score normalization across the cosine range
    for raw in (-1.0, 0.0, 0.5, 1.0):
        print(f"Score {raw} -> normalized: {normalize_score(raw)}")

    # Confidence aggregation, both methods
    sims = [0.8, 0.6, 0.7]
    print(f"Confidence (max): {compute_confidence(sims, 'max')}")
    print(f"Confidence (mean): {compute_confidence(sims, 'mean')}")

    print("\nAll tests passed!")