File size: 6,745 Bytes
4495c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import shutil
import argparse
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Function to create and save a combined vector store from all summary documents
def create_combined_summary_vector_store(api_key):
    """Build one combined FAISS vector store from every Markdown summary.

    Reads all ``*.md`` files in ``./CAPS_Summaries``, chunks them, embeds the
    chunks with OpenAI's ``text-embedding-3-large`` model, and saves the
    resulting FAISS index to ``Combined_Summary_Vectorstore``.

    Args:
        api_key: OpenAI API key; exported to the environment for langchain.

    Side effects:
        Sets ``OPENAI_API_KEY`` in ``os.environ``; creates directories and
        writes index files on disk; prints status messages.
    """
    # Directory containing the Markdown summaries (input directory)
    directory_path = "./CAPS_Summaries"
    os.environ["OPENAI_API_KEY"] = api_key

    # Create the input directory and bail out if it did not exist yet —
    # there is nothing to index on the first run.
    if not os.path.exists(directory_path):
        os.makedirs(directory_path, exist_ok=True)
        print(f"Input directory '{directory_path}' did not exist and has been created. Please add your summary files.")
        return

    # List all Markdown files in the directory
    md_files = [f for f in os.listdir(directory_path) if f.endswith('.md')]

    # Guard: FAISS.from_documents raises on an empty document list, so stop
    # early (with a clear message) when there are no summaries to embed.
    if not md_files:
        print(f"No Markdown summary files found in '{directory_path}'. Nothing to index.")
        return

    # Load each Markdown file and wrap its content in a Document object.
    documents = []
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(Document(page_content=file.read()))

    # Split the documents into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)

    # Create embeddings and the combined vector store (network-bound call).
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

    # Define and create the output directory, then persist the index.
    combined_vector_store_dir = "Combined_Summary_Vectorstore"
    os.makedirs(combined_vector_store_dir, exist_ok=True)
    vector_store.save_local(combined_vector_store_dir)
    print(f"Combined summary vector store creation complete and saved as '{combined_vector_store_dir}'.")


# Function to create and save individual vector store for a summary document
def create_individual_summary_vector_stores(api_key, summary_file_name):
    """Build and save a FAISS vector store for one Markdown summary file.

    Loads ``summary_file_name`` from ``./CAPS_Summaries``, chunks and embeds
    it, and saves the index under
    ``./Individual_Summary_Vectorstores/<stem>_vectorstore``.

    Args:
        api_key: OpenAI API key; exported to the environment for langchain.
        summary_file_name: File name (with extension) of the summary to index.

    Side effects:
        Sets ``OPENAI_API_KEY``; creates directories and writes index files;
        prints status messages.
    """
    # Directory containing the Markdown summaries (input directory)
    directory_path = "./CAPS_Summaries"
    os.environ["OPENAI_API_KEY"] = api_key

    # Directory to save individual vector stores
    save_directory = "./Individual_Summary_Vectorstores"
    os.makedirs(save_directory, exist_ok=True)

    file_path = os.path.join(directory_path, summary_file_name)
    if not os.path.exists(file_path):
        print(f"Summary file {summary_file_name} not found in {directory_path}.")
        return

    # Read the file up front and release the handle immediately: the
    # embedding step below makes slow network calls and must not keep the
    # file open for their duration.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Guard: an empty file yields no chunks, and FAISS.from_documents
    # raises on an empty list.
    if not content.strip():
        print(f"Summary file {summary_file_name} is empty; skipping.")
        return

    # Wrap the content in a Document object and split it into chunks.
    document = Document(page_content=content)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents([document])

    # Create embeddings and the vector store for this document.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

    # Save the vector store locally with a unique, file-derived name
    # (save_local creates the destination directory itself).
    vector_store_name = os.path.join(save_directory, f"{os.path.splitext(summary_file_name)[0]}_vectorstore")
    vector_store.save_local(vector_store_name)
    print(f"Vector store for {summary_file_name} created and saved as '{vector_store_name}'.")


# Function to create and save individual vector stores for all documents in CAPS_Summaries and CAPS
def create_individual_vector_stores_for_all_documents(api_key, file_name, summary_file_name):
    """Copy a summary vector store and build a new store for a CAPS PDF.

    First copies the previously built summary vector store for
    ``summary_file_name`` from ``./Individual_Summary_Vectorstores`` into
    ``./Individual_All_Vectorstores`` (if it exists), then loads
    ``file_name`` (a PDF) from ``./CAPS``, chunks and embeds it, and saves
    its FAISS index alongside the copy.

    Args:
        api_key: OpenAI API key; exported to the environment for langchain.
        file_name: PDF file name (with extension) inside ``./CAPS``.
        summary_file_name: Summary file name whose vector store is copied.

    Side effects:
        Sets ``OPENAI_API_KEY``; copies and writes directories on disk;
        prints status messages.
    """
    # Directory containing the source PDF documents
    caps_directory = "./CAPS"
    os.environ["OPENAI_API_KEY"] = api_key

    # Directory to save individual vector stores
    save_directory = "./Individual_All_Vectorstores"
    os.makedirs(save_directory, exist_ok=True)

    # Source vector store path in Individual_Summary_Vectorstores
    source_vector_store_name = os.path.join("./Individual_Summary_Vectorstores", f"{os.path.splitext(summary_file_name)[0]}_vectorstore")
    # Destination vector store path in Individual_All_Vectorstores
    destination_vector_store_name = os.path.join(save_directory, f"{os.path.splitext(summary_file_name)[0]}_vectorstore")

    # Guard: copytree raises FileNotFoundError if the source is missing —
    # report it in the same friendly style as the other checks and carry on
    # with the PDF indexing, which does not depend on the copy.
    if os.path.isdir(source_vector_store_name):
        # dirs_exist_ok allows re-runs to overwrite an existing copy.
        shutil.copytree(source_vector_store_name, destination_vector_store_name, dirs_exist_ok=True)
        print(f"Copied vector store for {file_name} to '{destination_vector_store_name}'.")
    else:
        print(f"Summary vector store '{source_vector_store_name}' not found; skipping copy.")

    file_path = os.path.join(caps_directory, file_name)
    if not os.path.exists(file_path):
        print(f"File {file_name} not found in {caps_directory}.")
        return

    # Load the PDF into page-level Document objects.
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    print(f"Successfully loaded {file_name} from {caps_directory}.")

    # Split the document into overlapping chunks for embedding.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)

    # Create embeddings and the vector store (network-bound call).
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)

    # Save the vector store locally with a unique, file-derived name
    # (save_local creates the destination directory itself).
    vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
    vector_store.save_local(vector_store_name)
    print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")


# Run the functions to create and save the vector stores
# Script entry point: parse the CLI arguments and run the three
# vector-store builders in order (combined, individual summary, all docs).
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="Process vector store creation.")
    for arg_name, arg_help in (
        ("api_key", "OpenAI API Key"),
        ("file_name", "Name of the file"),
        ("summary_file_name", "Name of the summary file"),
    ):
        arg_parser.add_argument(arg_name, type=str, help=arg_help)

    cli_args = arg_parser.parse_args()

    # Build the combined store first, then the per-file stores.
    create_combined_summary_vector_store(cli_args.api_key)
    create_individual_summary_vector_stores(cli_args.api_key, cli_args.summary_file_name)
    create_individual_vector_stores_for_all_documents(cli_args.api_key, cli_args.file_name, cli_args.summary_file_name)