ZySec / modules /database_utils.py
vSiddi
fix files
8e29341
raw
history blame
No virus
2.98 kB
# database_utils.py
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from modules import app_constants, app_logger
app_logger = app_logger.app_logger
def initialize_chroma_db(file_path):
"""
Initializes or creates a new Chroma database.
:param file_path: Path to the Chroma database file
:return: A retriever object if initialization is successful, None otherwise
"""
# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name=app_constants.EMBEDDING_MODEL_NAME)
# Initialize Chroma database
try:
if os.path.exists(file_path):
app_logger.info(f"Using existing Chroma database at {file_path}.")
else:
app_logger.info(f"Chroma database not found at {file_path}. Creating a new one.")
os.makedirs(os.path.dirname(file_path), exist_ok=True)
db = Chroma(persist_directory=file_path, embedding_function=embeddings, client_settings=app_constants.CHROMA_SETTINGS)
except Exception as e:
app_logger.error(f"Failed to initialize Chroma database at {file_path}. Reason: {e}")
return None
# Create a retriever from the Chroma database
#retriever = db.as_retriever()
return db
def get_chroma_db_files(directory):
"""Retrieve files ending with 'chroma_db' from the given directory."""
return [f for f in os.listdir(directory) if f.endswith('chroma_db')]
def format_db_name(db_name):
"""Format the database name to a more readable form."""
return db_name.replace('_', ' ').replace('chroma db', '').title().strip()
def delete_doc_from_chroma_db(db_path, source_doc):
"""
Deletes all items related to a given source document in a Chroma database located at a specific path.
:param db_path: Path to the Chroma database file
:param source_doc: The source document identifier to match
"""
# Initialize embeddings (assuming this step is necessary for your Chroma setup)
embeddings = HuggingFaceEmbeddings(model_name=app_constants.EMBEDDING_MODEL_NAME)
# Initialize Chroma database
if not os.path.exists(db_path):
app_logger.error(f"No Chroma database found at {db_path}.")
return
db = Chroma(persist_directory=db_path, embedding_function=embeddings, client_settings=app_constants.CHROMA_SETTINGS)
ids_to_delete = []
# Iterate over documents in the database
for doc in db:
# Check if the document is related to the source document
if doc.metadata.get('source') == source_doc:
# Add the document's ID to the list of IDs to delete
ids_to_delete.append(doc.id)
# Delete documents with matching IDs
if ids_to_delete:
db.delete(ids=ids_to_delete)
db.persist()
app_logger.error(f"Deleted {len(ids_to_delete)} items related to '{source_doc}'.")
else:
app_logger.error(f"No items found related to '{source_doc}'.")