ai-codebase-analyst / create_vectorstore.py
arizen-dev's picture
Initial project upload
90f65f7
Raw
History Blame Contribute Delete
1.64 kB
# file: create_vectorstore.py
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Directory that main() scans for .py files to load as documents.
SOURCE_CODE_PATH = "./source_code"
# Path that main() passes to FAISS.save_local() when persisting the index.
VECTORSTORE_PATH = "./vectorstore/db_faiss"
def main() -> None:
    """Create a FAISS vectorstore from the source code documents.

    Pipeline:
      1. Load every file under ``SOURCE_CODE_PATH`` matching ``**/*.py``.
      2. Split the loaded documents into chunks of at most 1000 characters
         with a 100-character overlap.
      3. Embed the chunks with ``sentence-transformers/all-MiniLM-L6-v2``.
      4. Build a FAISS index from the chunks and save it to
         ``VECTORSTORE_PATH``.

    Raises:
        ValueError: If no documents were loaded, or if splitting produced no
            chunks. An index cannot be built from an empty collection, so the
            run is aborted with a clear message before the embedding model
            is created.
    """
    print("--- Starting Vectorstore Creation ---")

    # 1. Load: Ingest all .py files from the source_code directory.
    print(f"Loading documents from {SOURCE_CODE_PATH}...")
    loader = DirectoryLoader(
        SOURCE_CODE_PATH,
        glob="**/*.py",
        loader_cls=TextLoader,
        # Python source is UTF-8 by default; without an explicit encoding
        # TextLoader falls back to the platform's locale encoding.
        loader_kwargs={"encoding": "utf-8"},
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} document(s).")
    if not documents:
        raise ValueError(
            f"No .py files were found under {SOURCE_CODE_PATH!r}; "
            "nothing to index."
        )

    # 2. Split: Break documents into smaller, manageable chunks.
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")
    if not texts:
        # Happens when every loaded file is empty.
        raise ValueError(
            f"The documents under {SOURCE_CODE_PATH!r} produced no text "
            "chunks; nothing to index."
        )

    # 3. Embed: Create the embedding model.
    print("Creating embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )

    # 4. Store: Create the FAISS vectorstore and save it to disk.
    print("Creating FAISS vectorstore...")
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(VECTORSTORE_PATH)

    print("--- Vectorstore Creation Complete ---")
    print(f"Vectorstore saved at: {VECTORSTORE_PATH}")
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()