from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import uuid
import os
import shutil


class VectorDatabase:
    def __init__(self):
        # Local sentence-transformers model used to embed text chunks
        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        # Maps each store's unique ID to its Chroma instance and source URL
        self.vector_stores = {}

    def _create_unique_db_path(self, url):
        """Create a unique directory for each scraped URL"""
        unique_id = str(uuid.uuid4())
        db_path = os.path.join("./chroma_db", unique_id)
        os.makedirs(db_path, exist_ok=True)
        return db_path, unique_id

    def process_and_store(self, scraped_data):
        """Process documents and store in vector database"""
        # Evict the oldest stores first so disk usage stays bounded
        self._cleanup_old_vector_stores()

        document = Document(
            page_content=scraped_data['content'],
            metadata={
                "source": scraped_data['url'],
                "domain": scraped_data['domain']
            }
        )

        # Split the page into overlapping chunks before embedding
        chunks = self.text_splitter.split_documents([document])

        db_path, unique_id = self._create_unique_db_path(scraped_data['url'])

        # Embed the chunks and persist them in a per-URL Chroma directory
        vector_store = Chroma.from_documents(
            chunks,
            self.embedding,
            persist_directory=db_path
        )

        self.vector_stores[unique_id] = {
            'store': vector_store,
            'url': scraped_data['url']
        }

        return len(chunks), unique_id

    def search(self, query, url=None, k=3):
        """Search for relevant documents"""
        if not self.vector_stores:
            return []

        if url is None:
            # Default to the most recently added store (dicts preserve
            # insertion order in Python 3.7+)
            vector_store_info = list(self.vector_stores.values())[-1]
            vector_store = vector_store_info['store']
        else:
            # Otherwise look up the store that was built for this URL
            matching_stores = [
                info['store'] for info in self.vector_stores.values()
                if info['url'] == url
            ]
            if not matching_stores:
                return []
            vector_store = matching_stores[0]

        return vector_store.similarity_search(query, k=k)

    def _cleanup_old_vector_stores(self, max_stores=5):
        """Clean up old vector stores to prevent resource exhaustion"""
        if len(self.vector_stores) > max_stores:
            # Insertion order means the first keys are the oldest stores
            oldest_keys = list(self.vector_stores.keys())[:len(self.vector_stores) - max_stores]
            for key in oldest_keys:
                self.vector_stores.pop(key)
                # Remove the persisted Chroma directory for this store
                db_path = os.path.join("./chroma_db", key)
                try:
                    shutil.rmtree(db_path)
                except Exception as e:
                    print(f"Error cleaning up vector store: {e}")