# Author: Kunal Shaw
# Initial commit: RAG Chatbot for Agentic AI eBook with LangGraph, Pinecone, and Groq
# Commit: f9c215a
"""
utils.py - Helper functions for text processing and chunking
This module contains utility functions for:
- Text cleaning (removing extra whitespace, headers/footers)
- Token counting using tiktoken
- Text chunking with overlap
- Confidence score normalization
"""
import re
from typing import List, Dict, Tuple
import json
# Try to use tiktoken for accurate token counting, fallback to word count.
# USE_TIKTOKEN is read by count_tokens() below to pick the counting strategy.
try:
    import tiktoken
    # cl100k_base is the encoding name passed to tiktoken; the resulting
    # encoder is reused for every count_tokens() call.
    TOKENIZER = tiktoken.get_encoding("cl100k_base")
    USE_TIKTOKEN = True
except ImportError:
    # tiktoken is optional; without it count_tokens() uses a word-count heuristic.
    USE_TIKTOKEN = False
    print("WARNING: tiktoken not available, using word count approximation")
def count_tokens(text: str) -> int:
    """
    Count tokens in text using tiktoken or word count fallback.

    Args:
        text: Input text string

    Returns:
        Number of tokens (approximate if tiktoken not available)
    """
    if not USE_TIKTOKEN:
        # Heuristic fallback: English text averages roughly 1.3 tokens per word.
        return int(len(text.split()) * 1.3)
    return len(TOKENIZER.encode(text))
def clean_text(text: str) -> str:
    """
    Clean extracted PDF text by removing extra whitespace and common artifacts.

    Applies, in order: run-of-space collapsing, excess-newline trimming,
    "Page N" / "- N -" page-marker removal, digit-only line removal, then
    per-line and whole-string stripping. The order matters: whitespace is
    collapsed before markers are removed.

    Args:
        text: Raw text from PDF extraction

    Returns:
        Cleaned text string
    """
    # (pattern, replacement, flags) applied sequentially via re.sub.
    substitutions = (
        (r'[ \t]+', ' ', 0),                 # collapse spaces/tabs
        (r'\n{3,}', '\n\n', 0),              # cap blank-line runs at one
        (r'(?i)page\s*\d+', '', 0),          # "Page 12" style markers
        (r'-\s*\d+\s*-', '', 0),             # "- 12 -" style markers
        (r'^\s*\d+\s*$', '', re.MULTILINE),  # lines that are only a number
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    # Trim each line, then the whole string.
    text = '\n'.join(line.strip() for line in text.split('\n'))
    return text.strip()
def chunk_text(
    text: str,
    page_number: int,
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    source: str = "Ebook-Agentic-AI.pdf"
) -> List[Dict]:
    """
    Split text into overlapping chunks with metadata.

    Uses token counting to ensure chunks are approximately chunk_size tokens,
    with overlap for context continuity. Chunks are built from whole
    sentences, so a single sentence longer than chunk_size still becomes
    (part of) one oversized chunk rather than being split mid-sentence.

    Args:
        text: Text to chunk (from one page)
        page_number: Page number for metadata
        chunk_size: Target size in tokens (default 500)
        chunk_overlap: Overlap between chunks in tokens (default 50)
        source: Source document name

    Returns:
        List of chunk dictionaries with id, page, text, start_char, end_char,
        source. NOTE: start_char/end_char are approximate offsets — positions
        are accumulated as len(sentence) + 1, but re.split consumed the real
        inter-sentence whitespace (which may have been longer than one char).
    """
    chunks: List[Dict] = []

    # Short-circuit: empty page, or the whole page fits in a single chunk.
    if not text or count_tokens(text) <= chunk_size:
        if text.strip():
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_0",
                "page": page_number,
                "text": text.strip(),
                "start_char": 0,
                "end_char": len(text),
                "source": source
            })
        return chunks

    # Split into sentences for better chunking.
    # Simple sentence splitting - handles common cases (., !, ? followed by space).
    sentences = re.split(r'(?<=[.!?])\s+', text)

    current_sentences: List[str] = []  # sentences accumulated for the chunk in progress
    current_tokens = 0                 # token count of current_sentences
    current_start = 0                  # approximate char offset of the chunk start
    chunk_index = 0
    char_position = 0                  # approximate char offset after sentences consumed so far

    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)

        # If adding this sentence would exceed chunk_size, flush the current chunk.
        # (Renamed from the original local `chunk_text`, which shadowed this function.)
        if current_tokens + sentence_tokens > chunk_size and current_sentences:
            chunk_body = ' '.join(current_sentences)
            chunks.append({
                "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
                "page": page_number,
                "text": chunk_body,
                "start_char": current_start,
                "end_char": current_start + len(chunk_body),
                "source": source
            })
            chunk_index += 1

            # Overlap: carry over the trailing sentences that fit within
            # chunk_overlap tokens, so consecutive chunks share context.
            overlap_tokens = 0
            overlap_sentences: List[str] = []
            for s in reversed(current_sentences):
                s_tokens = count_tokens(s)
                if overlap_tokens + s_tokens <= chunk_overlap:
                    overlap_sentences.insert(0, s)
                    overlap_tokens += s_tokens
                else:
                    break

            current_sentences = overlap_sentences
            current_tokens = overlap_tokens
            # Rewind the start offset to cover the carried-over sentences.
            current_start = char_position - sum(len(s) + 1 for s in overlap_sentences)

        current_sentences.append(sentence)
        current_tokens += sentence_tokens
        char_position += len(sentence) + 1  # +1 for the (assumed single) separator

    # Flush the final partial chunk.
    if current_sentences:
        chunk_body = ' '.join(current_sentences)
        chunks.append({
            "id": f"pdfpage_{page_number}_chunk_{chunk_index}",
            "page": page_number,
            "text": chunk_body,
            "start_char": current_start,
            "end_char": current_start + len(chunk_body),
            "source": source
        })

    return chunks
def normalize_score(score: float) -> float:
    """
    Normalize similarity score to 0-1 range.

    Pinecone cosine similarity scores typically lie in [-1, 1]; this maps
    them linearly to [0, 1] via (score + 1) / 2 and clamps the result so
    out-of-range inputs cannot escape the valid interval.

    Args:
        score: Raw similarity score from Pinecone

    Returns:
        Normalized score between 0.0 and 1.0
    """
    shifted = (score + 1.0) / 2.0
    # Clamp defensively in case the raw score falls outside [-1, 1].
    if shifted < 0.0:
        return 0.0
    if shifted > 1.0:
        return 1.0
    return shifted
def compute_confidence(scores: List[float], method: str = "max") -> float:
    """
    Compute confidence score from a list of similarity scores.

    Args:
        scores: List of raw similarity scores from retrieval
        method: "max" for maximum score, "mean" for average; any other
            value falls back to "max"

    Returns:
        Confidence score rounded to 3 decimal places (0.0 for empty input)
    """
    if not scores:
        return 0.0
    normalized = [normalize_score(s) for s in scores]
    if method == "mean":
        confidence = sum(normalized) / len(normalized)
    else:
        # "max" and any unrecognized method both take the maximum.
        confidence = max(normalized)
    return round(confidence, 3)
def save_chunks_to_jsonl(chunks: List[Dict], filepath: str, include_embeddings: bool = False):
    """
    Save chunks to a JSONL file for backup.

    Args:
        chunks: List of chunk dictionaries
        filepath: Output file path
        include_embeddings: Whether to include embeddings (makes file large)
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for chunk in chunks:
            # Work on a shallow copy so the caller's dicts are untouched.
            record = dict(chunk)
            if not include_embeddings:
                record.pop('embedding', None)
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"Saved {len(chunks)} chunks to {filepath}")
def load_chunks_from_jsonl(filepath: str) -> List[Dict]:
    """
    Load chunks from a JSONL file.

    Blank lines are skipped; every non-blank line must be a JSON object.

    Args:
        filepath: Input file path

    Returns:
        List of chunk dictionaries
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        chunks = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(chunks)} chunks from {filepath}")
    return chunks
def format_chunks_for_llm(chunks: List[Dict]) -> str:
    """
    Format retrieved chunks into a string for LLM context.

    Each chunk becomes a "[Source: <id>, Page <page>]" header followed by
    its text; sections are separated by a "---" divider.

    Args:
        chunks: List of chunk dictionaries with 'text' and 'page' keys

    Returns:
        Formatted string with markers for the LLM
    """
    sections = []
    for index, chunk in enumerate(chunks):
        # Fall back to positional id / 'unknown' page when metadata is missing.
        cid = chunk.get('id', f'chunk_{index}')
        page = chunk.get('page', 'unknown')
        body = chunk.get('text', '')
        sections.append(f"[Source: {cid}, Page {page}]\n{body}")
    return "\n\n---\n\n".join(sections)
if __name__ == "__main__":
    # Smoke-test the helpers in this module; output is printed, not asserted.
    print("Testing utils.py functions...")

    # Token counting
    sample = "This is a test sentence for token counting."
    print(f"Token count for '{sample}': {count_tokens(sample)}")

    # Text cleaning
    messy = " This has extra spaces \n\n\n\nAnd too many newlines Page 123"
    print(f"Cleaned text: '{clean_text(messy)}'")

    # Score normalization across the cosine range
    for raw in (-1.0, 0.0, 0.5, 1.0):
        print(f"Score {raw} -> normalized: {normalize_score(raw)}")

    # Confidence aggregation
    retrieval_scores = [0.8, 0.6, 0.7]
    print(f"Confidence (max): {compute_confidence(retrieval_scores, 'max')}")
    print(f"Confidence (mean): {compute_confidence(retrieval_scores, 'mean')}")

    print("\nAll tests passed!")