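"""Document ingestion utilities for a Chroma vector store.

Loads local files (PDF, Markdown, plain text) and web pages, splits them
into chunks, embeds the chunks with a Hugging Face sentence-transformer
model, and stores the results in a persistent Chroma collection.
"""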
import os
from langchain_community.document_loaders import (
    WebBaseLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredMarkdownLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

CHROMA_DB_DIR = "./chroma_db"

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


def load_and_ingest_file(file_path):
    """Load a local file with a loader chosen by its extension, then ingest it."""
    print(f"Loading file: {file_path}")
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        loader = PyPDFLoader(file_path)
    elif ext in [".md", ".markdown"]:
        loader = UnstructuredMarkdownLoader(file_path)
    else:
        loader = TextLoader(file_path)
    docs = loader.load()
    store_embeddings(docs, source_type="file", source_path=file_path)


def load_and_ingest_url(url):
    """Fetch a web page and ingest its contents."""
    loader = WebBaseLoader(url)
    docs = loader.load()
    store_embeddings(docs, source_type="url", source_path=url)


def store_embeddings(docs, source_type="file", source_path=""):
    """Split documents into chunks, tag them with source metadata, and store them."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(docs)

    # Add metadata to each chunk
    for chunk in chunks:
        chunk.metadata["source_type"] = source_type
        chunk.metadata["source_path"] = source_path

    vectordb = Chroma(
        collection_name="docs_collection",
        embedding_function=embeddings,
        persist_directory=CHROMA_DB_DIR,  # persist locally so later deletes/queries see the same data
    )
    vectordb.add_documents(chunks)
    print(f"Stored {len(chunks)} chunks in VectorDB.")
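

# NOTE: retrieval sketch, not part of the original ingestion flow. It shows how
# the persisted collection can be queried with the same embedding function;
# `search_documents` and its defaults are hypothetical, not an established API.
def search_documents(query, k=4):
    """Return the k chunks most similar to `query` from the persisted store."""
    vectordb = Chroma(
        collection_name="docs_collection",
        embedding_function=embeddings,
        persist_directory=CHROMA_DB_DIR,
    )
    return vectordb.similarity_search(query, k=k)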


def delete_embeddings_by_source(source_path):
    """Delete embeddings for a specific source file or URL"""
    try:
        vectordb = Chroma(
            collection_name="docs_collection",
            embedding_function=embeddings,
            persist_directory=CHROMA_DB_DIR,
        )
        # The LangChain wrapper's delete() only accepts ids, so drop down to the
        # underlying chromadb collection for a metadata-filtered delete
        vectordb._collection.delete(where={"source_path": source_path})
        print(f"Deleted embeddings for source: {source_path}")
        return f"Deleted embeddings for: {source_path}"
    except Exception as e:
        print(f"Error deleting embeddings: {str(e)}")
        return f"Error deleting embeddings: {str(e)}"


def clear_database():
    """Clear all documents from the vector database"""
    try:
        vectordb = Chroma(
            collection_name="docs_collection",
            embedding_function=embeddings,
            persist_directory=CHROMA_DB_DIR,
        )
        # chromadb rejects an empty `where` filter, so fetch every id and
        # delete explicitly rather than calling delete(where={})
        ids = vectordb._collection.get()["ids"]
        if ids:
            vectordb._collection.delete(ids=ids)
        print("Database cleared successfully.")
        return "Database cleared successfully."
    except Exception as e:
        print(f"Error clearing database: {str(e)}")
        return f"Error clearing database: {str(e)}"
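

# Usage sketch (assumption: the file path, URL, and query below are hypothetical
# placeholders, not sources used by this project).
if __name__ == "__main__":
    load_and_ingest_file("./docs/example.pdf")
    load_and_ingest_url("https://example.com/article")
    for doc in search_documents("example query", k=2):
        print(doc.metadata["source_path"], doc.page_content[:80])
    delete_embeddings_by_source("./docs/example.pdf")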