added Faiss support
- .env.example +2 -1
- .gitattributes +2 -0
- data/faiss_1024_512/index.faiss +3 -0
- data/faiss_1024_512/index.pkl +3 -0
- ingest.py +25 -8
.env.example
CHANGED

@@ -57,7 +57,8 @@ LLAMACPP_MODEL_PATH="./models/wizardLM-7B.ggmlv3.q4_1.bin"
 LLAMACPP_DOWNLOAD_LINK=https://huggingface.co/TheBloke/wizardLM-7B-GGML/resolve/main/wizardLM-7B.ggmlv3.q4_1.bin

 # Index for AI Books PDF files - chunk_size=1024 chunk_overlap=512
-CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
+# CHROMADB_INDEX_PATH="./data/chromadb_1024_512/"
+FAISS_INDEX_PATH="./data/faiss_1024_512/"

 QUESTIONS_FILE_PATH="./data/questions.txt"
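With this change the FAISS path becomes the active index location and the Chroma path is kept only as a commented-out fallback. A minimal sketch of how these two variables are meant to be consumed, mirroring the selection logic added to ingest.py below (the load_dotenv() call is an assumption not visible in this diff):

import os

from dotenv import load_dotenv

load_dotenv()  # read the key=value pairs from .env

# FAISS_INDEX_PATH takes precedence; CHROMADB_INDEX_PATH is only used when it is absent
index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None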
.gitattributes
CHANGED

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/faiss_1024_512/index.faiss filter=lfs diff=lfs merge=lfs -text
+data/faiss_1024_512/index.pkl filter=lfs diff=lfs merge=lfs -text
data/faiss_1024_512/index.faiss
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dcb86f8f32c953c7d5c99662a27e43d6261da7b7b4342bac638e6d19bf7ee530
+size 78975021
data/faiss_1024_512/index.pkl
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:313a047fb82ef5c43661a12b2424aeae88688d7631e4bdaf7de283a0b0763dc9
+size 26672894
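The two ADDED files are Git LFS pointers (hence the .gitattributes entries above); the real payloads are roughly 79 MB and 27 MB. They are the artifacts FAISS.save_local() writes: index.faiss is the serialized vector index and index.pkl the pickled docstore and ID mapping. A quick sanity check after git lfs pull, assuming the same embedding model that built the index (hkunlp/instructor-xl is the default used in ingest.py):

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS

embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
index = FAISS.load_local("./data/faiss_1024_512/", embeddings)

# the wrapped faiss.Index reports how many chunk vectors were stored
print("vectors in index:", index.index.ntotal)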
ingest.py
CHANGED

@@ -6,7 +6,9 @@ from typing import List
 from langchain.document_loaders import PyPDFDirectoryLoader
 from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.base import VectorStore
 from langchain.vectorstores.chroma import Chroma
+from langchain.vectorstores.faiss import FAISS

 from app_modules.utils import *

@@ -24,13 +26,23 @@ def split_chunks(documents: List, chunk_size, chunk_overlap) -> List:
     return text_splitter.split_documents(documents)


-def generate_index(chunks: List, embeddings: HuggingFaceInstructEmbeddings) -> Chroma:
-    chromadb_instructor_embeddings = Chroma.from_documents(
-        documents=chunks, embedding=embeddings, persist_directory=index_path
-    )
+def generate_index(
+    chunks: List, embeddings: HuggingFaceInstructEmbeddings
+) -> VectorStore:
+    if using_faiss:
+        faiss_instructor_embeddings = FAISS.from_documents(
+            documents=chunks, embedding=embeddings
+        )
+
+        faiss_instructor_embeddings.save_local(index_path)
+        return faiss_instructor_embeddings
+    else:
+        chromadb_instructor_embeddings = Chroma.from_documents(
+            documents=chunks, embedding=embeddings, persist_directory=index_path
+        )

-    chromadb_instructor_embeddings.persist()
-    return chromadb_instructor_embeddings
+        chromadb_instructor_embeddings.persist()
+        return chromadb_instructor_embeddings


 # Constants
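generate_index() now returns the shared VectorStore base type, so callers do not need to know which backend produced the index. A small usage sketch, assuming using_faiss and index_path have already been resolved from the environment as in the constants section below; the two Document objects are purely illustrative:

from langchain.docstore.document import Document

chunks = [
    Document(page_content="FAISS builds an in-memory similarity index.", metadata={"source": "example.pdf", "page": 1}),
    Document(page_content="Chroma persists embeddings under a directory.", metadata={"source": "example.pdf", "page": 2}),
]

embeddings = HuggingFaceInstructEmbeddings(model_name=hf_embeddings_model_name)

# builds the index in memory and saves/persists it to index_path as a side effect
index = generate_index(chunks, embeddings)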
@@ -40,7 +52,8 @@ device_type, hf_pipeline_device_type = get_device_types()
 hf_embeddings_model_name = (
     os.environ.get("HF_EMBEDDINGS_MODEL_NAME") or "hkunlp/instructor-xl"
 )
-index_path = os.environ.get("CHROMADB_INDEX_PATH")
+index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
+using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
 source_pdfs_path = os.environ.get("SOURCE_PDFS_PATH")
 chunk_size = os.environ.get("CHUNCK_SIZE")
 chunk_overlap = os.environ.get("CHUNK_OVERLAP")
@@ -71,7 +84,11 @@ if not os.path.isdir(index_path):
     index = generate_index(chunks, embeddings)
 else:
     print("The index persist directory is present. Loading index ...")
-    index = Chroma(embedding_function=embeddings, persist_directory=index_path)
+    index = (
+        FAISS.load_local(index_path, embeddings)
+        if using_faiss
+        else Chroma(embedding_function=embeddings, persist_directory=index_path)
+    )

 end = timer()
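Either branch leaves index holding a VectorStore, so downstream retrieval code is unchanged by the new backend. A short sketch of querying the loaded index; the query string is arbitrary:

query = "What is a vector store?"
docs = index.similarity_search(query, k=4)

for doc in docs:
    # each hit is a Document carrying the source PDF metadata captured at ingest time
    print(doc.metadata.get("source"), "-", doc.page_content[:80])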