axjh03 committed on
Commit
658f5f3
1 Parent(s): 618cff3

vector stores

Browse files
Files changed (1) hide show
  1. ingest.py +14 -20
ingest.py CHANGED
@@ -1,34 +1,28 @@
1
- import os
2
- from langchain_community.vectorstores.faiss import FAISS
3
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader
4
- from langchain.embeddings import HuggingFaceBgeEmbeddings
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
6
 
7
- # Define the directory paths
8
- DATA_PATH = "data"
9
- VECTORSTORES_DIR = "vectorstores"
10
- DB_FAISS_PATH = os.path.join(VECTORSTORES_DIR, "db_faiss")
11
 
12
- # Create the vectorstores directory if it doesn't exist
13
- try:
14
- os.makedirs(VECTORSTORES_DIR, exist_ok=True)
15
- except Exception as e:
16
- print(f"Error creating directory: {e}")
17
 
18
- # Create vector database
 
 
 
 
 
19
  def create_vector_db():
20
- # Load documents from the data directory
21
- loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
22
  documents = loader.load()
23
 
24
- # Split text from documents
25
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
26
  texts = text_splitter.split_documents(documents)
27
 
28
- # Initialize embeddings
29
- embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
 
30
 
31
- # Create FAISS vector database
32
  db = FAISS.from_documents(texts, embeddings)
33
  db.save_local(DB_FAISS_PATH)
34
 
 
 
 
 
 
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader # any other unstructured-text loader (e.g. for PPT or XLSX files) could be used instead
3
 
 
 
 
 
4
 
5
+ from langchain.embeddings import HuggingFaceBgeEmbeddings # we can replace huggingface with facetransformers
 
 
 
 
6
 
7
+ from langchain.vectorstores import FAISS
8
+
9
+ DATA_PATH = "$HOME/data/"
10
+ DB_FAISS_PATH = "$HOME/vectorstores/db_faiss"
11
+
12
+ # Create the vector database
13
  def create_vector_db():
14
+ # We can replace .pdf with any other unstructured text format
15
+ loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls = PyPDFLoader)
16
  documents = loader.load()
17
 
 
18
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
19
+
20
  texts = text_splitter.split_documents(documents)
21
 
22
+ embeddings = HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"}) # change the device to a GPU if one is available
23
+
24
+ # CUDA is not supported on my M1 Mac, sadly.
25
 
 
26
  db = FAISS.from_documents(texts, embeddings)
27
  db.save_local(DB_FAISS_PATH)
28