File size: 4,136 Bytes
59fc6ec
815128e
 
 
 
a766494
815128e
 
25ef847
815128e
25ef847
815128e
c118c66
815128e
 
e8c6d72
a766494
815128e
e8c6d72
 
 
 
 
 
 
 
815128e
 
 
 
 
 
 
 
 
 
25ef847
 
 
 
 
 
 
 
 
 
 
 
 
 
815128e
25ef847
 
815128e
 
 
59fc6ec
815128e
 
 
25ef847
 
815128e
e8c6d72
815128e
 
 
 
 
 
 
 
 
 
 
 
 
 
e8c6d72
 
 
815128e
 
e8c6d72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
815128e
 
 
 
 
 
 
 
 
e8c6d72
25ef847
 
 
 
 
e8c6d72
 
 
 
 
 
 
 
 
815128e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# setting device on GPU if available, else CPU
import os
from timeit import default_timer as timer
from typing import List

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.base import VectorStore
from langchain.vectorstores.chroma import Chroma
from langchain.vectorstores.faiss import FAISS

from app_modules.init import *


def load_documents(source_pdfs_path, urls) -> List:
    """Load every PDF under *source_pdfs_path* and tag pages with their URL.

    When *urls* is a non-empty list, each loaded page whose source filename
    matches the tail of one of the URLs gets that URL stored in its metadata
    under the "url" key (first match wins).
    """
    docs = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True).load()
    if urls:
        for document in docs:
            # Filename component of the metadata "source" path.
            basename = document.metadata["source"].split("/")[-1]
            matched = next((u for u in urls if u.endswith(basename)), None)
            if matched is not None:
                document.metadata["url"] = matched
    return docs


def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
    """Split *documents* into chunks of ~*chunk_size* chars with *chunk_overlap* overlap."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)


def generate_index(
    chunks: List, embeddings: HuggingFaceInstructEmbeddings
) -> VectorStore:
    """Build and persist a vector index over *chunks*.

    Uses FAISS when the module-level ``using_faiss`` flag is set, otherwise
    Chroma; either way the index is persisted to the module-level
    ``index_path`` before being returned.
    """
    if not using_faiss:
        store = Chroma.from_documents(
            documents=chunks, embedding=embeddings, persist_directory=index_path
        )
        store.persist()
        return store
    store = FAISS.from_documents(documents=chunks, embedding=embeddings)
    store.save_local(index_path)
    return store


# Constants — configuration is read once from the environment at import time.
device_type, hf_pipeline_device_type = get_device_types()
hf_embeddings_model_name = (
    os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
)
# FAISS takes precedence: if FAISS_INDEX_PATH is set we use FAISS, otherwise
# fall back to ChromaDB's path.
index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
source_pdfs_path = os.environ.get("SOURCE_PDFS_PATH")
source_urls = os.environ.get("SOURCE_URLS")
# Accept the correctly spelled CHUNK_SIZE, but keep honoring the historical
# misspelling CHUNCK_SIZE so existing deployments don't break.
chunk_size = os.environ.get("CHUNK_SIZE") or os.environ.get("CHUNCK_SIZE")
chunk_overlap = os.environ.get("CHUNK_OVERLAP")

start = timer()
# Loading the instructor model is the slow part; time it separately.
embeddings = HuggingFaceInstructEmbeddings(
    model_name=hf_embeddings_model_name, model_kwargs={"device": device_type}
)
end = timer()

print(f"Completed in {end - start:.3f}s")

start = timer()

if index_path is None:
    # Fail fast with a clear message instead of the obscure TypeError that
    # os.path.isdir(None) would raise below.
    raise SystemExit(
        "No index path configured: set FAISS_INDEX_PATH or CHROMADB_INDEX_PATH."
    )

if not os.path.isdir(index_path):
    print(
        f"The index persist directory {index_path} is not present. Creating a new one."
    )
    # makedirs (not mkdir) so missing intermediate directories are created too.
    os.makedirs(index_path, exist_ok=True)

    if source_urls is not None:
        # SOURCE_URLS names a text file with one URL per line. Read it with a
        # context manager so the handle is closed even if reading fails, and
        # strip trailing newlines from each entry.
        with open(source_urls, "r") as url_file:
            source_urls = [line.strip() for line in url_file]

    print(
        f"Loading {'' if source_urls is None else str(len(source_urls)) + ' '}PDF files from {source_pdfs_path}"
    )
    sources = load_documents(source_pdfs_path, source_urls)

    print(f"Splitting {len(sources)} PDF pages in to chunks ...")

    chunks = split_chunks(
        sources, chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
    )
    print(f"Generating index for {len(chunks)} chunks ...")

    index = generate_index(chunks, embeddings)
else:
    # Index already exists on disk — load it and run a smoke-test retrieval.
    print(f"The index persist directory {index_path} is present. Loading index ...")
    index = (
        FAISS.load_local(index_path, embeddings)
        if using_faiss
        else Chroma(embedding_function=embeddings, persist_directory=index_path)
    )
    query = "hi"
    print(f"Load relevant documents for standalone question: {query}")

    start2 = timer()
    docs = index.as_retriever().get_relevant_documents(query)
    end = timer()

    print(f"Completed in {end - start2:.3f}s")
    print(docs)

end = timer()

print(f"Completed in {end - start:.3f}s")