divy131 committed on
Commit
8df2836
·
verified ·
1 Parent(s): ea458b4

Upload ingest (1).py

Browse files
Files changed (1) hide show
  1. ingest (1).py +34 -0
ingest (1).py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing Dependencies
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.vectorstores import FAISS
6
+
7
+ # Dataset Directory Path
8
+ DATASET = "dataset/"
9
+
10
+ # Faiss Index Path
11
+ FAISS_INDEX = "vectorstore/"
12
+
13
+ # Create Vector Store and Index
14
def embed_all(dataset_dir=None, index_dir=None, chunk_size: int = 800, chunk_overlap: int = 200) -> None:
    """
    Embed all PDF files in the dataset directory and save a FAISS index.

    The PDFs are loaded with ``PyPDFLoader``, split into overlapping text
    chunks, embedded with ``HuggingFaceEmbeddings`` and stored in a FAISS
    vector store that is written to disk.

    Args:
        dataset_dir: Directory scanned for ``*.pdf`` files. Defaults to the
            module-level ``DATASET`` path when ``None``.
        index_dir: Directory the FAISS index is saved to. Defaults to the
            module-level ``FAISS_INDEX`` path when ``None``.
        chunk_size: Maximum chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.

    Raises:
        ValueError: If no PDF documents are found, or if the loaded
            documents produce no text chunks.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for callers that pass nothing.
    dataset_dir = DATASET if dataset_dir is None else dataset_dir
    index_dir = FAISS_INDEX if index_dir is None else index_dir

    # Create the document loader and load the documents
    loader = DirectoryLoader(dataset_dir, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    if not documents:
        # Fail early with a clear message instead of letting the vector
        # store fail later on an empty input.
        raise ValueError(f"No PDF documents found in {dataset_dir!r}")

    # Split the documents into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(documents)
    if not chunks:
        # Documents without any text yield no chunks, so there is nothing
        # to embed.
        raise ValueError(
            f"Documents in {dataset_dir!r} produced no text chunks to embed"
        )

    # Load the embeddings, build the vector store and save it
    embeddings = HuggingFaceEmbeddings()
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(index_dir)
32
+
33
# Build the index only when this file is executed as a script
if __name__ == "__main__":
    embed_all()