Spaces:
Runtime error
Runtime error
| # file: create_vectorstore.py | |
| from langchain_community.document_loaders import DirectoryLoader, TextLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
# Root directory that main() scans (recursively) for .py files to index.
SOURCE_CODE_PATH = "./source_code"
# Folder that main() passes to FAISS.save_local() for the finished index.
VECTORSTORE_PATH = "./vectorstore/db_faiss"
def main():
    """Create a FAISS vectorstore from the source code documents.

    Loads every ``.py`` file matched by ``**/*.py`` under
    ``SOURCE_CODE_PATH``, splits the text into overlapping chunks, embeds the
    chunks with the ``sentence-transformers/all-MiniLM-L6-v2`` model, and
    saves the resulting FAISS index to ``VECTORSTORE_PATH``. Progress is
    reported on stdout.

    Raises:
        ValueError: If no documents were loaded, or if splitting produced no
            chunks, because there is nothing to build an index from.
    """
    print("--- Starting Vectorstore Creation ---")

    # 1. Load: Ingest all .py files from the source_code directory.
    print(f"Loading documents from {SOURCE_CODE_PATH}...")
    loader = DirectoryLoader(
        SOURCE_CODE_PATH,
        glob="**/*.py",
        loader_cls=TextLoader,
        # Python source is UTF-8 by default; read it as such instead of
        # depending on whatever the platform's locale encoding happens to be.
        loader_kwargs={"encoding": "utf-8"},
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} document(s).")
    if not documents:
        # Fail early with an actionable message; an empty list would
        # otherwise surface as an opaque error from inside the FAISS build.
        raise ValueError(
            f"No .py files found under {SOURCE_CODE_PATH!r}; "
            "nothing to index."
        )

    # 2. Split: Break documents into smaller, manageable chunks.
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks.")
    if not texts:
        # Reached when every loaded file was empty.
        raise ValueError(
            f"Documents under {SOURCE_CODE_PATH!r} produced no text chunks; "
            "nothing to index."
        )

    # 3. Embed: Create the embedding model.
    print("Creating embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # 4. Store: Create the FAISS vectorstore and save it to disk.
    print("Creating FAISS vectorstore...")
    db = FAISS.from_documents(texts, embeddings)
    db.save_local(VECTORSTORE_PATH)

    print("--- Vectorstore Creation Complete ---")
    print(f"Vectorstore saved at: {VECTORSTORE_PATH}")
# Build the vectorstore only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()