axjh03 committed on
Commit
c742883
1 Parent(s): af4aa80

vector stores

Browse files
Files changed (1) hide show
  1. ingest.py +14 -6
ingest.py CHANGED
@@ -1,19 +1,27 @@
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader
3
- from langchain.embeddings import HuggingFaceBgeEmbeddings
 
 
 
4
  from langchain.vectorstores import FAISS
5
 
6
- DATA_PATH = "/home/user/data"
7
- DB_FAISS_PATH = "/home/user/vectorstores/db_faiss"
8
 
 
9
  def create_vector_db():
10
- loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
 
11
  documents = loader.load()
12
 
13
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
14
  texts = text_splitter.split_documents(documents)
15
 
16
- embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
 
 
17
 
18
  db = FAISS.from_documents(texts, embeddings)
19
  db.save_local(DB_FAISS_PATH)
 
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader # any other unstructured-text loader (e.g. for PPT or XLSX files) could be used instead
3
+
4
+
5
+ from langchain.embeddings import HuggingFaceBgeEmbeddings # we can replace the Hugging Face embeddings with sentence-transformers
6
+
7
  from langchain.vectorstores import FAISS
8
 
9
+ DATA_PATH = "data/"
10
+ DB_FAISS_PATH = "vectorstores/db_faiss"
11
 
12
+ # Create the vector database
13
  def create_vector_db():
14
+ # We can replace .pdf with any other unstructured text format
15
+ loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls = PyPDFLoader)
16
  documents = loader.load()
17
 
18
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
19
+
20
  texts = text_splitter.split_documents(documents)
21
 
22
+ embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}) # change to GPU if you want
23
+
24
+ # CUDA is not supported on my Mac M1, sadly.
25
 
26
  db = FAISS.from_documents(texts, embeddings)
27
  db.save_local(DB_FAISS_PATH)