arjunanand13 committed on
Commit
d90465c
1 Parent(s): c62a6a1

Create store_embedding.py

Browse files
Files changed (1) hide show
  1. store_embedding.py +29 -0
store_embedding.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a FAISS vector index from a directory of text files.

Pipeline: load the files under ``SOURCE_DIR`` with ``TextLoader``, split them
into overlapping chunks, embed each chunk with a sentence-transformers model
and save the resulting FAISS index to ``INDEX_DIR``.

Environment variables:
    HF_TOKEN: read into ``HF_TOKEN``; not used by any step in this script.
    EMBEDDING_DEVICE: device handed to the embedding model. Defaults to
        ``"cuda"``; set it to ``"cpu"`` on hosts without a GPU.
"""
import os

from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Hugging Face token taken from the environment. Nothing below passes it on;
# it is kept so existing setups that export it keep working unchanged.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

SOURCE_DIR = 'data2/text/range/0-5000'
INDEX_DIR = 'faiss_index'
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 250
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# Previously hard-coded to "cuda"; the default is unchanged, but the device
# can now be overridden without editing the script.
DEVICE = os.environ.get("EMBEDDING_DEVICE", "cuda")

# Load documents
loader = DirectoryLoader(SOURCE_DIR, loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Stop early with a readable message: with nothing to embed, the index build
# further down would otherwise fail with an unrelated-looking error.
if not documents:
    raise SystemExit(f"No documents found under {SOURCE_DIR!r}; nothing to index.")

# Split documents into chunks
# NOTE(review): CHUNK_SIZE is far larger than what all-mpnet-base-v2 is
# documented to accept per input, so chunk tails are presumably truncated
# before embedding -- confirm against the model card before relying on recall.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Generate embeddings
model_name = MODEL_NAME
model_kwargs = {"device": DEVICE}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local(INDEX_DIR)

print("Embeddings stored successfully!")