import os import shutil from langchain_community.document_loaders import TextLoader, PyPDFLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_openai import OpenAIEmbeddings from langchain_community.vectorstores import FAISS from langchain_core.documents import Document from getpass import getpass # Set OpenAI API Key os.environ["OPENAI_API_KEY"] = getpass("Provide OpenAI API Key:") # Function to create and save a combined vector store from all summary documents def create_combined_summary_vector_store(): # Directory containing the Markdown summaries directory_path = "CAPS_Summaries" # List all Markdown files in the directory md_files = [f for f in os.listdir(directory_path) if f.endswith('.md')] # Load the Markdown documents documents = [] for file_name in md_files: file_path = os.path.join(directory_path, file_name) with open(file_path, 'r', encoding='utf-8') as file: content = file.read() # Wrap the content in a Document object documents.append(Document(page_content=content)) print(f"Successfully added {file_name} to the combined vector store.") # Split the documents into chunks text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500) splits = text_splitter.split_documents(documents) # Create embeddings and vector store embeddings = OpenAIEmbeddings(model="text-embedding-3-large") vector_store = FAISS.from_documents(documents=splits, embedding=embeddings) # Save the vector store locally vector_store.save_local("Combined_Summary_Vectorstore") print("Combined summary vector store creation complete and saved as 'Combined_Summary_Vectorstore'.") # Function to create and save individual vector stores for summary documents def create_individual_summary_vector_stores(): # Directory containing the Markdown summaries directory_path = "CAPS_Summaries" # Directory to save individual vector stores save_directory = "Individual_Summary_Vectorstores" # Ensure the save directory exists os.makedirs(save_directory, exist_ok=True) # List all Markdown files in the directory md_files = [f for f in os.listdir(directory_path) if f.endswith('.md')] # Process each file individually for file_name in md_files: file_path = os.path.join(directory_path, file_name) with open(file_path, 'r', encoding='utf-8') as file: content = file.read() # Wrap the content in a Document object document = Document(page_content=content) print(f"Successfully loaded {file_name}.") # Split the document into chunks text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500) splits = text_splitter.split_documents([document]) # Create embeddings and vector store for each document embeddings = OpenAIEmbeddings(model="text-embedding-3-large") vector_store = FAISS.from_documents(documents=splits, embedding=embeddings) # Save the vector store locally with a unique name in the specified directory vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore") vector_store.save_local(vector_store_name) print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.") print(f"All Individual Summary Vectorstores created.") # Function to create and save individual vector stores for all documents in CAPS_Summaries and CAPS def create_individual_vector_stores_for_all_documents(): # Directories containing the documents summary_directory = "CAPS_Summaries" caps_directory = "CAPS" # Directory to save individual vector stores save_directory = "Individual_All_Vectorstores" # Ensure the save directory exists os.makedirs(save_directory, exist_ok=True) # List all Markdown files in the summary directory summary_files = [f for f in os.listdir(summary_directory) if f.endswith('.md')] # List all PDF files in the CAPS directory caps_files = [f for f in os.listdir(caps_directory) if f.endswith('.pdf')] # Process each summary file individually by copying existing vector stores for file_name in summary_files: # Source vector store path in Individual_Summary_Vectorstores source_vector_store_name = os.path.join("Individual_Summary_Vectorstores", f"{os.path.splitext(file_name)[0]}_vectorstore") # Destination vector store path in Individual_All_Vectorstores destination_vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore") # Copy the vector store shutil.copytree(source_vector_store_name, destination_vector_store_name, dirs_exist_ok=True) print(f"Copied vector store for {file_name} to '{destination_vector_store_name}'.") # Process each CAPS file individually for file_name in caps_files: file_path = os.path.join(caps_directory, file_name) loader = PyPDFLoader(file_path) documents = loader.load() print(f"Successfully loaded {file_name} from CAPS.") # Split the document into chunks text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500) splits = text_splitter.split_documents(documents) # Create embeddings and vector store for each document embeddings = OpenAIEmbeddings(model="text-embedding-3-large") vector_store = FAISS.from_documents(documents=splits, embedding=embeddings) # Save the vector store locally with a unique name in the specified directory vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore") vector_store.save_local(vector_store_name) print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.") print(f"All Individual Vectorstores for complete and summary plans created.") # Run the functions to create and save the vector stores if __name__ == "__main__": create_combined_summary_vector_store() create_individual_summary_vector_stores() create_individual_vector_stores_for_all_documents()