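"""Build the Chroma vector index for the configured PDF sources.

Run via the __main__ block to wipe any existing index, re-ingest every
source returned by get_sources(), and sanity-check the result with a
sample similarity query.
"""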
import os
import shutil

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader

from .config import get_sources
from .embeddings import EMBEDDING_MODEL_NAME
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore

# Chunks shorter than this many characters are skipped during ingestion.
MIN_CHUNK_SIZE = 100

def load_data():
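    """Parse every configured source and index its chunks into Chroma.

    Returns the populated Chroma vector store so callers can query it directly.
    """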
    print("Loading data...")
    docs = parse_data()
    print("Documents loaded")
    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    print("Building index...")
    vectorstore = get_vectorstore(embedding_function)

    assert isinstance(vectorstore, Chroma)
    # Chroma.from_documents is a classmethod: calling it on the instance would
    # build a separate collection and discard it. Add the chunks to the
    # existing store instead, then flush to disk (older langchain Chroma
    # clients do not persist automatically).
    vectorstore.add_documents(docs)
    vectorstore.persist()
    print("Index built")
    return vectorstore


def parse_data():
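    """Load each configured PDF and split it into metadata-tagged chunks.

    Each entry returned by get_sources() is expected to carry the keys used
    below: "file_path", "name", "domain", and optionally "url". Illustrative
    shape (only the keys are inferred from this code; the values are made up):

        {"file_path": "data/tao.pdf", "name": "Tao Te Ching", "domain": "philosophy"}

    Chunks shorter than MIN_CHUNK_SIZE characters are dropped.
    """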
    docs = []
    for source in get_sources():
        file_path = source["file_path"]
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()

        # split it into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        doc_chunks = text_splitter.split_documents(pages)

        for chunk in doc_chunks:
            if len(chunk.page_content) < MIN_CHUNK_SIZE:
                continue
            chunk.metadata["name"] = source["name"]
            chunk.metadata["domain"] = source["domain"]
            url = source.get("url")
            if url:
                chunk.metadata["url"] = url
            chunk.metadata["page_number"] = chunk.metadata["page"]
            chunk.metadata["short_name"] = chunk.metadata["name"]
            docs.append(chunk)

    return docs


def clear_index():
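    """Delete everything inside the Chroma persistence directory."""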
    directory_path = PERSIST_DIRECTORY
    if not os.path.isdir(directory_path):
        return  # nothing persisted yet, so nothing to clear
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        try:
            print(f"Deleting {file_path}")
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}. Reason: {e}")


if __name__ == "__main__":
    clear_index()
    db = load_data()
    # Sanity-check the freshly built index with a sample similarity query.
    query = (
        "He who can bear the misfortune of a nation is called the ruler of the world."
    )
    docs = db.similarity_search(query)
    print(docs)