Jatinydv committed on
Commit
3bc0a9f
1 Parent(s): 350b4de

Upload ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +28 -0
ingest.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.embeddings import HuggingFaceEmbeddings
2
+ from langchain_community.vectorstores import FAISS
3
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+
6
+ DATA_PATH = 'data/'
7
+ DB_FAISS_PATH = 'vectorstore/db_faiss'
8
+
9
+ # Create vector database
10
def create_vector_db():
    """Build a FAISS vector store from the PDFs under ``DATA_PATH`` and save it.

    Loads the files matching ``*.pdf`` in ``DATA_PATH`` with ``PyPDFLoader``,
    splits the loaded documents with ``RecursiveCharacterTextSplitter``
    (``chunk_size=500``, ``chunk_overlap=50``), embeds the chunks with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model on the CPU, and writes
    the resulting index to ``DB_FAISS_PATH``.

    Raises:
        ValueError: If no documents were loaded, or splitting produced no
            chunks, so there is nothing to index.
    """
    loader = DirectoryLoader(
        DATA_PATH,
        glob='*.pdf',
        loader_cls=PyPDFLoader,
    )
    documents = loader.load()
    if not documents:
        # Fail early with an actionable message instead of letting the index
        # construction crash later on an empty embeddings list.
        raise ValueError(
            f"No PDF documents were loaded from {DATA_PATH!r}; "
            "nothing to index."
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Documents without extractable text can yield zero chunks.
        raise ValueError(
            f"The documents loaded from {DATA_PATH!r} produced no text "
            "chunks; nothing to index."
        )

    # The embedding model is created only after the input is known to be
    # non-empty, so a misconfigured data directory does not trigger a model
    # load first.
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )

    db = FAISS.from_documents(texts, embeddings)
    db.save_local(DB_FAISS_PATH)
25
+
26
+ if __name__ == "__main__":
27
+ create_vector_db()
28
+