File size: 6,264 Bytes
98a33c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import shutil
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from getpass import getpass

# Set OpenAI API Key.
# NOTE: this runs at import time and prompts interactively; getpass keeps the
# key from being echoed to the terminal. Downstream OpenAIEmbeddings reads it
# from the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = getpass("Provide OpenAI API Key:")

# Function to create and save a combined vector store from all summary documents
def create_combined_summary_vector_store():
    """Build one FAISS vector store from every Markdown summary and save it.

    Reads all ``*.md`` files in ``CAPS_Summaries``, chunks them, embeds the
    chunks with OpenAI ``text-embedding-3-large``, and persists the resulting
    index locally as ``Combined_Summary_Vectorstore``.

    Each file is wrapped in a ``Document`` tagged with a ``source`` metadata
    entry so retrieved chunks can be traced back to the summary they came from.

    Raises:
        FileNotFoundError: if the ``CAPS_Summaries`` directory does not exist.
    """
    # Directory containing the Markdown summaries
    directory_path = "CAPS_Summaries"

    # Sort for a deterministic build order — os.listdir order is arbitrary.
    md_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.md'))

    # Load the Markdown documents, tagging each with its source filename.
    documents = []
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        documents.append(Document(page_content=content, metadata={"source": file_name}))
        print(f"Successfully added {file_name} to the combined vector store.")

    # Split the documents into overlapping chunks sized for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)

    # Embed the chunks and build the FAISS index.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

    # Persist the index to disk.
    vector_store.save_local("Combined_Summary_Vectorstore")
    print("Combined summary vector store creation complete and saved as 'Combined_Summary_Vectorstore'.")

# Function to create and save individual vector stores for summary documents
def create_individual_summary_vector_stores():
    """Build one FAISS vector store per Markdown summary and save each locally.

    Reads all ``*.md`` files in ``CAPS_Summaries`` and, for each, writes a
    store named ``<stem>_vectorstore`` under ``Individual_Summary_Vectorstores``.

    Raises:
        FileNotFoundError: if the ``CAPS_Summaries`` directory does not exist.
    """
    # Directory containing the Markdown summaries
    directory_path = "CAPS_Summaries"
    # Directory to save individual vector stores
    save_directory = "Individual_Summary_Vectorstores"

    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)

    # Sort for a deterministic build order — os.listdir order is arbitrary.
    md_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.md'))

    # Splitter and embedding client are loop-invariant; construct them once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    # Process each file individually
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Wrap the content in a Document object, tagged with its source file.
        document = Document(page_content=content, metadata={"source": file_name})
        print(f"Successfully loaded {file_name}.")

        # Split the document into chunks
        splits = text_splitter.split_documents([document])

        # Create a vector store for this document's chunks.
        vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

        # Save the vector store locally with a unique name in the specified directory
        vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
        vector_store.save_local(vector_store_name)
        print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")
    print("All Individual Summary Vectorstores created.")

# Function to create and save individual vector stores for all documents in CAPS_Summaries and CAPS
def create_individual_vector_stores_for_all_documents():
    """Populate ``Individual_All_Vectorstores`` with a store per document.

    Summary (``*.md``) stores are copied from the previously built
    ``Individual_Summary_Vectorstores`` directory; full-plan PDFs in ``CAPS``
    are loaded, chunked, embedded, and indexed from scratch.

    A summary whose individual store has not been built yet is skipped with a
    warning instead of aborting the whole run.

    Raises:
        FileNotFoundError: if the ``CAPS_Summaries`` or ``CAPS`` directory
            does not exist.
    """
    # Directories containing the documents
    summary_directory = "CAPS_Summaries"
    caps_directory = "CAPS"
    # Directory to save individual vector stores
    save_directory = "Individual_All_Vectorstores"

    # Ensure the save directory exists
    os.makedirs(save_directory, exist_ok=True)

    # Sort for a deterministic processing order — os.listdir order is arbitrary.
    summary_files = sorted(f for f in os.listdir(summary_directory) if f.endswith('.md'))
    caps_files = sorted(f for f in os.listdir(caps_directory) if f.endswith('.pdf'))

    # Process each summary file individually by copying existing vector stores
    for file_name in summary_files:
        stem = os.path.splitext(file_name)[0]
        # Source vector store path in Individual_Summary_Vectorstores
        source_vector_store_name = os.path.join("Individual_Summary_Vectorstores", f"{stem}_vectorstore")
        # Destination vector store path in Individual_All_Vectorstores
        destination_vector_store_name = os.path.join(save_directory, f"{stem}_vectorstore")
        # copytree raises FileNotFoundError on a missing source; skip with a
        # warning so one missing store does not abort the remaining copies.
        if not os.path.isdir(source_vector_store_name):
            print(f"Warning: no vector store found for {file_name} at '{source_vector_store_name}'; skipping.")
            continue
        shutil.copytree(source_vector_store_name, destination_vector_store_name, dirs_exist_ok=True)
        print(f"Copied vector store for {file_name} to '{destination_vector_store_name}'.")

    # Splitter and embedding client are loop-invariant; construct them once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

    # Process each CAPS file individually
    for file_name in caps_files:
        file_path = os.path.join(caps_directory, file_name)
        # PyPDFLoader yields one Document per page, with source metadata.
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        print(f"Successfully loaded {file_name} from CAPS.")

        # Split the document into chunks
        splits = text_splitter.split_documents(documents)

        # Create a vector store for this document's chunks.
        vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

        # Save the vector store locally with a unique name in the specified directory
        vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
        vector_store.save_local(vector_store_name)
        print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")
    print("All Individual Vectorstores for complete and summary plans created.")

# Run the functions to create and save the vector stores.
# Order matters: create_individual_vector_stores_for_all_documents copies the
# per-summary stores produced by create_individual_summary_vector_stores.
if __name__ == "__main__":
    create_combined_summary_vector_store()
    create_individual_summary_vector_stores()
    create_individual_vector_stores_for_all_documents()