"""Chunk scraped JSON documents and embed them with a sentence-transformers model."""

import json
import os
import pickle
import re
from typing import Any, Dict, List, Optional

from sentence_transformers import SentenceTransformer
from tqdm import tqdm


class DocumentChunker:
    """Chunks scraped documents and builds embeddings for them."""

    def __init__(self, input_dir: str = "data/raw",
                 output_dir: str = "data/processed",
                 embedding_dir: str = "data/embeddings",
                 model_name: str = "BAAI/bge-small-en-v1.5"):
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.embedding_dir = embedding_dir

        # Make sure the output locations exist before any processing runs.
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(embedding_dir, exist_ok=True)

        # Embedding model used by create_embeddings().
        self.model = SentenceTransformer(model_name)

    def load_documents(self) -> List[Dict[str, Any]]:
        """Load all documents from the input directory."""
        documents = []

        # Each scraped document is stored as an individual JSON file.
        for filename in os.listdir(self.input_dir):
            if filename.endswith('.json'):
                filepath = os.path.join(self.input_dir, filename)
                with open(filepath, 'r') as f:
                    documents.append(json.load(f))

        return documents

    def chunk_by_headings(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Split a document into chunks based on its headings."""
        chunks = []

        # Documents without headings become a single chunk.
        if not document.get('headings'):
            chunk = {
                'title': document['title'],
                'content': document['content'],
                'url': document['url'],
                'categories': document.get('categories', []),
                'scraped_at': document['scraped_at'],
                'document_type': document.get('document_type', 'webpage')
            }
            chunks.append(chunk)
            return chunks

        # Process headings in the order they appear in the document; the scan
        # below walks the content top to bottom, so sorting by heading level
        # would split sections incorrectly.
        headings = document['headings']
        content = document['content']

        current_title = document['title']
        current_content = ""
        content_lines = content.split('\n')
        line_index = 0

        for heading in headings:
            heading_text = heading['text']

            # Find the line where this heading starts a new section.
            heading_found = False
            for i in range(line_index, len(content_lines)):
                if heading_text in content_lines[i]:
                    # Close out the previous section before starting a new one.
                    if current_content.strip():
                        chunk = {
                            'title': current_title,
                            'content': current_content.strip(),
                            'url': document['url'],
                            'categories': document.get('categories', []),
                            'scraped_at': document['scraped_at'],
                            'document_type': document.get('document_type', 'webpage')
                        }
                        chunks.append(chunk)

                    current_title = heading_text
                    current_content = ""
                    line_index = i + 1
                    heading_found = True
                    break

            # Headings that never appear in the body are kept as plain text.
            if not heading_found:
                current_content += heading_text + "\n"

            # Accumulate lines until the next heading is reached.
            if line_index < len(content_lines):
                for i in range(line_index, len(content_lines)):
                    if any(h['text'] in content_lines[i] for h in headings if h['text'] != heading_text):
                        break
                    current_content += content_lines[i] + "\n"
                    line_index = i + 1

        # Flush whatever is left after the last heading.
        if current_content.strip():
            chunk = {
                'title': current_title,
                'content': current_content.strip(),
                'url': document['url'],
                'categories': document.get('categories', []),
                'scraped_at': document['scraped_at'],
                'document_type': document.get('document_type', 'webpage')
            }
            chunks.append(chunk)

        return chunks

    def chunk_faqs(self, document: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Extract FAQs as individual chunks."""
        chunks = []

        if not document.get('faqs'):
            return chunks

        for faq in document['faqs']:
            chunk = {
                'title': faq['question'],
                'content': faq['answer'],
                'url': document['url'],
                'categories': document.get('categories', []),
                'scraped_at': document['scraped_at'],
                'document_type': 'faq',
                'question': faq['question']
            }
            chunks.append(chunk)

        return chunks

    def chunk_semantically(self, document: Dict[str, Any],
                           max_chunk_size: int = 1000,
                           overlap: int = 100) -> List[Dict[str, Any]]:
        """Split a document into size-bounded chunks with a small overlap."""
        chunks = []
        content = document['content']

        if not content.strip():
            return chunks

        # Split on blank lines so chunk boundaries fall between paragraphs.
        paragraphs = re.split(r'\n\s*\n', content)

        current_chunk = ""
        current_length = 0

        for para in paragraphs:
            para = para.strip()
            if not para:
                continue

            para_length = len(para)

            # Oversized paragraphs are further split into sentences.
            if para_length > max_chunk_size:
                sentences = re.split(r'(?<=[.!?])\s+', para)
                for sentence in sentences:
                    sentence = sentence.strip()
                    sentence_length = len(sentence)

                    if current_length + sentence_length <= max_chunk_size:
                        current_chunk += sentence + " "
                        current_length += sentence_length + 1
                    else:
                        # Flush the current chunk before it exceeds the limit.
                        if current_chunk:
                            chunk = {
                                'title': document['title'],
                                'content': current_chunk.strip(),
                                'url': document['url'],
                                'categories': document.get('categories', []),
                                'scraped_at': document['scraped_at'],
                                'document_type': document.get('document_type', 'webpage')
                            }
                            chunks.append(chunk)

                        # Start the next chunk with a short tail of the previous
                        # one so neighbouring chunks overlap.
                        tail = current_chunk.strip()[-overlap:] if overlap else ""
                        current_chunk = (tail + " " if tail else "") + sentence + " "
                        current_length = len(current_chunk)

            # Paragraph fits in the current chunk.
            elif current_length + para_length <= max_chunk_size:
                current_chunk += para + "\n\n"
                current_length += para_length + 2

            # Current chunk is full: flush it and start a new one.
            else:
                if current_chunk:
                    chunk = {
                        'title': document['title'],
                        'content': current_chunk.strip(),
                        'url': document['url'],
                        'categories': document.get('categories', []),
                        'scraped_at': document['scraped_at'],
                        'document_type': document.get('document_type', 'webpage')
                    }
                    chunks.append(chunk)

                # Carry a short tail of the previous chunk forward as overlap.
                tail = current_chunk.strip()[-overlap:] if overlap else ""
                current_chunk = (tail + "\n\n" if tail else "") + para + "\n\n"
                current_length = len(current_chunk)

        # Flush the final chunk.
        if current_chunk:
            chunk = {
                'title': document['title'],
                'content': current_chunk.strip(),
                'url': document['url'],
                'categories': document.get('categories', []),
                'scraped_at': document['scraped_at'],
                'document_type': document.get('document_type', 'webpage')
            }
            chunks.append(chunk)

        return chunks

    def create_chunks(self) -> List[Dict[str, Any]]:
        """Process all documents and create chunks."""
        all_chunks = []

        documents = self.load_documents()
        print(f"Loaded {len(documents)} documents")

        for document in tqdm(documents, desc="Chunking documents"):
            # FAQs are always split into question/answer chunks.
            faq_chunks = self.chunk_faqs(document)
            all_chunks.extend(faq_chunks)

            # Prefer heading-based chunks; fall back to size-based chunks
            # when no heading chunks could be produced.
            heading_chunks = self.chunk_by_headings(document)
            all_chunks.extend(heading_chunks)

            if not heading_chunks:
                semantic_chunks = self.chunk_semantically(document)
                all_chunks.extend(semantic_chunks)

        # Persist the chunks so they can be reused without re-chunking.
        with open(os.path.join(self.output_dir, 'chunks.json'), 'w') as f:
            json.dump(all_chunks, f, indent=2)

        print(f"Created {len(all_chunks)} chunks")
        return all_chunks

    def create_embeddings(self, chunks: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
        """Create embeddings for all chunks."""
        if chunks is None:
            # Reuse previously saved chunks when available.
            chunks_path = os.path.join(self.output_dir, 'chunks.json')
            if os.path.exists(chunks_path):
                with open(chunks_path, 'r') as f:
                    chunks = json.load(f)
            else:
                chunks = self.create_chunks()

        texts = []
        for chunk in chunks:
            # Prepend the title (the FAQ question for FAQ chunks) so it
            # contributes to the embedding.
            texts.append(f"{chunk['title']} {chunk['content']}")

        print("Creating embeddings...")
        embeddings = self.model.encode(texts, show_progress_bar=True)

        # Map a stable chunk id to its vector and the original chunk record.
        embedding_map = {}
        for i, chunk in enumerate(chunks):
            chunk_id = f"chunk_{i}"
            embedding_map[chunk_id] = {
                'embedding': embeddings[i],
                'chunk': chunk
            }

        with open(os.path.join(self.embedding_dir, 'embeddings.pkl'), 'wb') as f:
            pickle.dump(embedding_map, f)

        print(f"Created embeddings for {len(chunks)} chunks")
        return embedding_map


if __name__ == "__main__":
    chunker = DocumentChunker()
    chunks = chunker.create_chunks()
    embedding_map = chunker.create_embeddings(chunks)
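
    # Illustrative sketch (not part of the original pipeline): one way the
    # embedding map built above could be queried downstream, assuming cosine
    # similarity over the stored vectors and reusing the same model. The
    # helper name `search_chunks` and the example query are hypothetical.
    import numpy as np

    def search_chunks(query: str, top_k: int = 3):
        """Return the top_k chunks most similar to the query."""
        query_vec = chunker.model.encode([query])[0]
        scored = []
        for entry in embedding_map.values():
            vec = entry['embedding']
            score = float(np.dot(query_vec, vec)
                          / (np.linalg.norm(query_vec) * np.linalg.norm(vec)))
            scored.append((score, entry['chunk']))
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored[:top_k]

    for score, chunk in search_chunks("example question"):
        print(f"{score:.3f}  {chunk['title']}")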