# vectorstore.py
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS


def load_and_split_document(file_path, chunk_size=1000, chunk_overlap=150):
    """
    Load a document from a file and split it into chunks.

    Args:
        file_path: Path to the text file.
        chunk_size: The maximum size of each chunk, in characters.
        chunk_overlap: The number of characters shared between consecutive chunks.

    Returns:
        A list of document chunks.
    """
    loader = TextLoader(
        file_path,
        encoding='utf-8',
        autodetect_encoding=True
    )
    try:
        documents = loader.load()
    except RuntimeError:
        # Fall back to latin-1 (which accepts any byte sequence) if both
        # UTF-8 decoding and encoding autodetection fail.
        loader = TextLoader(
            file_path,
            encoding='latin-1',
            autodetect_encoding=False
        )
        documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_documents(documents)
    return chunks
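
# A minimal usage sketch for load_and_split_document. The path
# "docs/sample.txt" is a hypothetical example file, not part of this repo;
# with the default chunk_size=1000 / chunk_overlap=150, a ~2,500-character
# file would yield roughly three overlapping chunks:
#
#     chunks = load_and_split_document("docs/sample.txt")
#     print(len(chunks), chunks[0].page_content[:80])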


def create_vector_stores(doc_paths, embeddings):
    """
    Create one vector store per document path.

    Args:
        doc_paths: List of paths to document files.
        embeddings: The embeddings model to use.

    Returns:
        A dictionary mapping store names to FAISS vector stores.
    """
    vector_stores = {}
    os.makedirs("vector_stores", exist_ok=True)
    for doc_path in doc_paths:
        # Use splitext so filenames containing dots (e.g. "faq.v2.txt")
        # keep everything before the final extension.
        store_name = os.path.splitext(os.path.basename(doc_path))[0]
        chunks = load_and_split_document(doc_path)
        print(f"Processing {store_name}: {len(chunks)} chunks created")
        vectorstore = FAISS.from_documents(chunks, embeddings)
        vectorstore.save_local(f"vector_stores/{store_name}")
        vector_stores[store_name] = vectorstore
    return vector_stores
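
# Usage sketch (assumes sentence-transformers is installed; the model name
# and file paths below are illustrative, not part of this repo):
#
#     from langchain_community.embeddings import HuggingFaceEmbeddings
#     embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#     stores = create_vector_stores(["docs/a.txt", "docs/b.txt"], embeddings)
#     stores["a"].similarity_search("some query", k=3)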


def create_vector_store_from_folder(folder_path, embeddings):
    """
    Create a single vector store from all text files in a folder.

    Args:
        folder_path: Path to the folder containing .txt files.
        embeddings: The embeddings model to use.

    Returns:
        A dictionary containing the created vector store, keyed by the
        folder's base name.
    """
    vector_stores = {}
    os.makedirs("vector_stores", exist_ok=True)
    all_chunks = []
    file_names = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            chunks = load_and_split_document(file_path)
            all_chunks.extend(chunks)
            file_names.append(filename)
    print(f"Processing {folder_path}: {len(all_chunks)} chunks created from {len(file_names)} files")
    # FAISS.from_documents fails on an empty list, so surface a clear error
    # instead when the folder contains no .txt files.
    if not all_chunks:
        raise ValueError(f"No .txt files found in {folder_path}")
    vectorstore = FAISS.from_documents(all_chunks, embeddings)
    # normpath strips any trailing separator so basename yields the folder name.
    store_name = os.path.basename(os.path.normpath(folder_path))
    vectorstore.save_local(f"vector_stores/{store_name}")
    vector_stores[store_name] = vectorstore
    return vector_stores
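
# Usage sketch (the folder name "knowledge_base" is illustrative):
#
#     stores = create_vector_store_from_folder("knowledge_base", embeddings)
#     # -> {"knowledge_base": <FAISS index over every .txt file in the folder>}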


def load_all_vector_stores(embeddings):
    """
    Load all vector stores from the 'vector_stores' directory.

    Args:
        embeddings: The embeddings model to use.

    Returns:
        A dictionary of loaded vector stores (empty if the directory is missing).
    """
    vector_stores = {}
    store_dir = "vector_stores"
    # os.listdir raises FileNotFoundError if nothing has been indexed yet,
    # so return an empty dict in that case.
    if not os.path.isdir(store_dir):
        return vector_stores
    for store_name in os.listdir(store_dir):
        store_path = os.path.join(store_dir, store_name)
        if os.path.isdir(store_path):
            # load_local unpickles the stored docstore, so deserialization is
            # only safe for index files this application wrote itself.
            vector_stores[store_name] = FAISS.load_local(
                store_path,
                embeddings,
                allow_dangerous_deserialization=True
            )
    return vector_stores
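

if __name__ == "__main__":
    # Smoke-test sketch, not part of the original module's public API.
    # It assumes sentence-transformers is installed and that a "docs/"
    # folder containing .txt files exists; both are illustrative assumptions.
    from langchain_community.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    create_vector_store_from_folder("docs", embeddings)
    stores = load_all_vector_stores(embeddings)
    for name, store in stores.items():
        # store.index is the underlying faiss index; ntotal is its vector count.
        print(name, store.index.ntotal, "vectors")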