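# Spaces: Sleeping  (page-header text captured with the script; apparently the hosting Space's status, not part of the program)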
# Build a FAISS vector index from a directory of text files.
#
# Pipeline: load every file under DATA_DIR, split the documents into
# overlapping chunks, embed each chunk with a sentence-transformers model,
# and save the resulting FAISS index to INDEX_DIR.
import os

from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Input directory with the source text files and output directory for the index.
DATA_DIR = 'data2/text/range/0-5000'
INDEX_DIR = 'faiss_index'


def _select_device():
    """Return "cuda" when a CUDA device is usable, otherwise "cpu".

    The device used to be hard-coded to "cuda", which makes the script fail
    on CPU-only hosts; falling back keeps it runnable everywhere.
    """
    try:
        import torch
    except ImportError:
        return "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


# Set your Hugging Face token
# NOTE: the token is read here but not passed to any call in this script.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Load documents
loader = DirectoryLoader(DATA_DIR, loader_cls=TextLoader)
documents = loader.load()
print('len of documents are', len(documents))

# Split documents into chunks
# NOTE(review): 5000 characters per chunk is probably longer than the
# embedding model's maximum input length, in which case the tail of each
# chunk would be truncated before embedding -- confirm against the model card.
# The value is left unchanged so the produced index stays the same.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=250)
all_splits = text_splitter.split_documents(documents)
print("Length of all_splits:", len(all_splits))

# Fail early with a clear message instead of an opaque error from deep inside
# the vector store when there is nothing to index.
if not all_splits:
    raise SystemExit(f"No text chunks produced from {DATA_DIR!r}; nothing to index.")

# Generate embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": _select_device()}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Store embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)
vectorstore.save_local(INDEX_DIR)
print("Embeddings stored successfully!")