# Build FAISS vector stores (one combined, plus per-document stores) from the
# CAPS_Summaries Markdown files and the CAPS PDF documents.
import os
import shutil
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from getpass import getpass
# Prompt interactively for the OpenAI API key and expose it via the environment
# so OpenAIEmbeddings below can authenticate.
# NOTE(review): this runs at import time, so merely importing this module blocks
# on stdin — confirm that is intended before reusing these functions elsewhere.
os.environ["OPENAI_API_KEY"] = getpass("Provide OpenAI API Key:")
# Function to create and save a combined vector store from all summary documents
def create_combined_summary_vector_store():
    """Build one FAISS vector store covering every Markdown summary.

    Reads each ``*.md`` file in ``CAPS_Summaries``, chunks the text, embeds the
    chunks with OpenAI ``text-embedding-3-large``, and saves the resulting index
    locally as ``Combined_Summary_Vectorstore``.

    Raises:
        FileNotFoundError: if the ``CAPS_Summaries`` directory does not exist.
        ValueError: if the directory contains no Markdown files.
    """
    # Directory containing the Markdown summaries
    directory_path = "CAPS_Summaries"
    # List all Markdown files in the directory
    md_files = [f for f in os.listdir(directory_path) if f.endswith('.md')]
    if not md_files:
        # FAISS.from_documents fails on an empty corpus; fail early and clearly.
        raise ValueError(f"No Markdown summaries found in '{directory_path}'.")
    # Load the Markdown documents
    documents = []
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Record the originating file so retrieval results can be attributed.
        documents.append(Document(page_content=content, metadata={"source": file_name}))
        print(f"Successfully added {file_name} to the combined vector store.")
    # Split the documents into overlapping chunks so each embedding covers a
    # bounded span of text.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)
    # Create embeddings and vector store
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)
    # Save the vector store locally
    vector_store.save_local("Combined_Summary_Vectorstore")
    print("Combined summary vector store creation complete and saved as 'Combined_Summary_Vectorstore'.")
# Function to create and save individual vector stores for summary documents
def create_individual_summary_vector_stores():
    """Build one FAISS vector store per Markdown summary.

    Each ``*.md`` file in ``CAPS_Summaries`` is chunked, embedded with OpenAI
    ``text-embedding-3-large``, and saved under
    ``Individual_Summary_Vectorstores/<stem>_vectorstore``.

    Raises:
        FileNotFoundError: if the ``CAPS_Summaries`` directory does not exist.
    """
    # Directory containing the Markdown summaries
    directory_path = "CAPS_Summaries"
    # Directory to save individual vector stores
    save_directory = "Individual_Summary_Vectorstores"
    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)
    # List all Markdown files in the directory
    md_files = [f for f in os.listdir(directory_path) if f.endswith('.md')]
    # The splitter and the embedding client are loop-invariant: construct them
    # once instead of once per file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    # Process each file individually
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Record the originating file so retrieval results can be attributed.
        document = Document(page_content=content, metadata={"source": file_name})
        print(f"Successfully loaded {file_name}.")
        # Split the document into chunks and embed them into a dedicated store.
        splits = text_splitter.split_documents([document])
        vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)
        # Save the vector store locally with a unique name in the specified directory
        vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
        vector_store.save_local(vector_store_name)
        print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")
    print("All Individual Summary Vectorstores created.")
# Function to create and save individual vector stores for all documents in CAPS_Summaries and CAPS
def create_individual_vector_stores_for_all_documents():
    """Collect per-document vector stores for both summaries and full PDFs.

    For each summary, the pre-built store from
    ``Individual_Summary_Vectorstores`` is copied into
    ``Individual_All_Vectorstores`` (missing stores are skipped with a warning
    rather than crashing). Each PDF in ``CAPS`` is loaded, chunked, embedded
    with OpenAI ``text-embedding-3-large``, and saved alongside them.

    Raises:
        FileNotFoundError: if ``CAPS_Summaries`` or ``CAPS`` does not exist.
    """
    # Directories containing the documents
    summary_directory = "CAPS_Summaries"
    caps_directory = "CAPS"
    # Directory to save individual vector stores
    save_directory = "Individual_All_Vectorstores"
    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)
    # List all Markdown files in the summary directory
    summary_files = [f for f in os.listdir(summary_directory) if f.endswith('.md')]
    # List all PDF files in the CAPS directory
    caps_files = [f for f in os.listdir(caps_directory) if f.endswith('.pdf')]
    # Process each summary file individually by copying existing vector stores
    for file_name in summary_files:
        stem = os.path.splitext(file_name)[0]
        # Source vector store path in Individual_Summary_Vectorstores
        source_vector_store_name = os.path.join("Individual_Summary_Vectorstores", f"{stem}_vectorstore")
        # Destination vector store path in Individual_All_Vectorstores
        destination_vector_store_name = os.path.join(save_directory, f"{stem}_vectorstore")
        if not os.path.isdir(source_vector_store_name):
            # copytree raises if the per-summary store was never built; skip
            # with a warning instead of aborting the whole run.
            print(f"Skipping {file_name}: no vector store at '{source_vector_store_name}'. "
                  "Run create_individual_summary_vector_stores() first.")
            continue
        # Copy the vector store
        shutil.copytree(source_vector_store_name, destination_vector_store_name, dirs_exist_ok=True)
        print(f"Copied vector store for {file_name} to '{destination_vector_store_name}'.")
    # The splitter and the embedding client are loop-invariant: construct them
    # once instead of once per file.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    # Process each CAPS file individually
    for file_name in caps_files:
        file_path = os.path.join(caps_directory, file_name)
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        print(f"Successfully loaded {file_name} from CAPS.")
        # Split the document into chunks and embed them into a dedicated store.
        splits = text_splitter.split_documents(documents)
        vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)
        # Save the vector store locally with a unique name in the specified directory
        vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
        vector_store.save_local(vector_store_name)
        print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")
    print("All Individual Vectorstores for complete and summary plans created.")
# Script entry point: build every vector store variant in sequence.
if __name__ == "__main__":
    for build_store in (
        create_combined_summary_vector_store,
        create_individual_summary_vector_stores,
        create_individual_vector_stores_for_all_documents,
    ):
        build_store()
|