added Faiss support
- .env.example +2 -1
- .gitattributes +2 -0
- data/faiss_1024_512/index.faiss +3 -0
- data/faiss_1024_512/index.pkl +3 -0
- ingest.py +25 -8
.env.example CHANGED

@@ -57,7 +57,8 @@ LLAMACPP_MODEL_PATH="./models/wizardLM-7B.ggmlv3.q4_1.bin"
 LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/wizardLM-7B-GGML/resolve/main/wizardLM-7B.ggmlv3.q4_1.bin

 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512
-CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
+# CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
+FAISS_INDEX_PATH="./data/faiss_1024_512/"

 QUESTIONS_FILE_PATH="./data/questions.txt"
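Note: with both variables defined, the selection logic added to ingest.py (see the hunk further down) gives FAISS_INDEX_PATH precedence and only falls back to CHROMADB_INDEX_PATH when the FAISS path is unset. A minimal sketch of that behaviour, with made-up values purely for illustration:

import os

# Hypothetical values; commenting out FAISS_INDEX_PATH in .env would flip the result.
os.environ["FAISS_INDEX_PATH"] = "./data/faiss_1024_512/"
os.environ["CHROMADB_INDEX_PATH"] = "./data/chromadb_1024_512/"

index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
print(index_path, using_faiss)  # ./data/faiss_1024_512/ True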
.gitattributes CHANGED

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/faiss_1024_512/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/faiss_1024_512/index.pkl filter=lfs diff=lfs merge=lfs -text
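The two new attribute lines route the FAISS index files through Git LFS (they are what a command like git lfs track "data/faiss_1024_512/index.faiss" would append), so the files committed below are small LFS pointer stubs rather than the roughly 79 MB and 27 MB binaries they stand for.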
data/faiss_1024_512/index.faiss ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb86f8f32c953c7d5c99662a27e43d6261da7b7b4342bac638e6d19bf7ee530
+size 78975021
data/faiss_1024_512/index.pkl ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:313a047fb82ef5c43661a12b2424aeae88688d7631e4bdaf7de283a0b0763dc9
+size 26672894
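These two files are what FAISS.save_local() produces: index.faiss holds the raw FAISS vector index, while index.pkl is the pickled document store and id mapping that LangChain uses to turn search hits back into the original chunks.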
ingest.py CHANGED

@@ -6,7 +6,9 @@ from typing import List
 from langchain.document_loaders import PyPDFDirectoryLoader
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS

 from app_modules.utils import *

@@ -24,13 +26,23 @@ def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
     return text_splitter.split_documents(documents)


-def generate_index(
-
-
-
+def generate_index(
+    chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(index_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=index_path
+        )

-
-
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings


 # Constants
@@ -40,7 +52,8 @@ device_type, hf_pipeline_device_type = get_device_types()
 hf_embeddings_model_name = (
     os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
 )
-index_path = os.environ.get("CHROMADB_INDEX_PATH")
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
 source_pdfs_path = os.environ.get("SOURCE_PDFS_PATH")
 chunk_size = os.environ.get("CHUNCK_SIZE")
 chunk_overlap = os.environ.get("CHUNK_OVERLAP")
@@ -71,7 +84,11 @@ if not os.path.isdir(index_path):
     index = generate_index(chunks, embeddings)
 else:
     print("The index persist directory is present. Loading index ...")
-    index =
+    index = (
+        FAISS.load_local(index_path, embeddings)
+        if using_faiss
+        else Chroma(embedding_function=embeddings, persist_directory=index_path)
+    )

 end = timer()
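For a quick sanity check of the generated index from outside ingest.py, here is a minimal, hypothetical sketch of loading the saved FAISS store and querying it. It assumes the same instructor-xl embeddings model and the FAISS_INDEX_PATH default from .env.example; the query string is made up for illustration.

import os

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS

# Same embeddings model ingest.py defaults to when HF_EMBEDDINGS_MODEL_NAME is unset.
embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

# Directory written by generate_index() via save_local(); it contains the
# index.faiss and index.pkl files added in this commit.
index_path = os.environ.get("FAISS_INDEX_PATH", "./data/faiss_1024_512/")

index = FAISS.load_local(index_path, embeddings)

# Hypothetical query against the AI-books corpus, purely for illustration.
docs = index.similarity_search("What is attention in transformers?", k=4)
for doc in docs:
    print(doc.metadata, doc.page_content[:200])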