Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 6,745 Bytes
4495c4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
import os
import shutil
import argparse
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
# Function to create and save a combined vector store from all summary documents
def create_combined_summary_vector_store(api_key):
    """Build one FAISS vector store from every Markdown summary in ./CAPS_Summaries.

    Reads all ``.md`` files in the input directory, splits them into
    overlapping chunks, embeds the chunks with OpenAI, and saves the
    resulting FAISS index to ``Combined_Summary_Vectorstore``.

    Args:
        api_key: OpenAI API key; exported via the OPENAI_API_KEY environment
            variable so the langchain OpenAI client can pick it up.
    """
    # Directory containing the Markdown summaries (input directory)
    directory_path = "./CAPS_Summaries"
    os.environ["OPENAI_API_KEY"] = api_key
    # Create the input directory on first run and bail out — there is
    # nothing to index until the user adds summary files.
    if not os.path.exists(directory_path):
        os.makedirs(directory_path, exist_ok=True)
        print(f"Input directory '{directory_path}' did not exist and has been created. Please add your summary files.")
        return
    # List all Markdown files in the directory
    md_files = [f for f in os.listdir(directory_path) if f.endswith('.md')]
    # BUG FIX: FAISS.from_documents raises on an empty document list, so
    # stop early when the directory holds no Markdown summaries.
    if not md_files:
        print(f"No Markdown summaries found in '{directory_path}'; nothing to index.")
        return
    # Load the Markdown documents
    documents = []
    for file_name in md_files:
        file_path = os.path.join(directory_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Wrap the raw text in a Document object so the splitter can process it
        documents.append(Document(page_content=content))
    # Split the documents into chunks; the overlap preserves context
    # across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)
    # Create embeddings and vector store
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)
    # Define and create the output directory for the combined vector store
    combined_vector_store_dir = "Combined_Summary_Vectorstore"
    os.makedirs(combined_vector_store_dir, exist_ok=True)
    # Save the vector store locally
    vector_store.save_local(combined_vector_store_dir)
    print(f"Combined summary vector store creation complete and saved as '{combined_vector_store_dir}'.")
# Function to create and save individual vector store for a summary document
def create_individual_summary_vector_stores(api_key, summary_file_name):
    """Embed a single Markdown summary and persist it as its own FAISS store.

    The summary is read from ``./CAPS_Summaries``, split into overlapping
    chunks, embedded with OpenAI, and written to
    ``./Individual_Summary_Vectorstores/<stem>_vectorstore``.

    Args:
        api_key: OpenAI API key, exported via OPENAI_API_KEY.
        summary_file_name: File name of the Markdown summary to embed.
    """
    os.environ["OPENAI_API_KEY"] = api_key
    # Input directory holding the Markdown summaries
    directory_path = "./CAPS_Summaries"
    # Output directory for the per-summary vector stores
    save_directory = "./Individual_Summary_Vectorstores"
    os.makedirs(save_directory, exist_ok=True)
    file_path = os.path.join(directory_path, summary_file_name)
    # Guard clause: nothing to do when the requested summary is absent.
    if not os.path.exists(file_path):
        print(f"Summary file {summary_file_name} not found in {directory_path}.")
        return
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Wrap the raw text in a Document so the splitter can process it.
        document = Document(page_content=handle.read())
    # Split into overlapping chunks to preserve context across boundaries.
    splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    chunks = splitter.split_documents([document])
    # Embed the chunks and build the FAISS index in one step.
    store = FAISS.from_documents(
        documents=chunks,
        embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
    )
    # Derive a unique store path from the summary's file stem.
    stem = os.path.splitext(summary_file_name)[0]
    vector_store_name = os.path.join(save_directory, f"{stem}_vectorstore")
    os.makedirs(vector_store_name, exist_ok=True)  # Create destination directory if it doesn't exist
    store.save_local(vector_store_name)
    print(f"Vector store for {summary_file_name} created and saved as '{vector_store_name}'.")
# Function to create and save individual vector stores for all documents in CAPS_Summaries and CAPS
def create_individual_vector_stores_for_all_documents(api_key, file_name, summary_file_name):
    """Copy a summary's vector store and build a vector store for one PDF.

    Two steps:
      1. Copy ``./Individual_Summary_Vectorstores/<summary stem>_vectorstore``
         into ``./Individual_All_Vectorstores`` (skipped with a warning when
         the source store is missing, so the PDF is still processed).
      2. Load ``file_name`` from ``./CAPS`` with PyPDFLoader, chunk it, embed
         the chunks with OpenAI, and save the FAISS index as
         ``./Individual_All_Vectorstores/<file stem>_vectorstore``.

    Args:
        api_key: OpenAI API key, exported via OPENAI_API_KEY.
        file_name: Name of the PDF document inside ./CAPS.
        summary_file_name: Name of the summary file whose existing vector
            store should be copied alongside the new one.
    """
    # Directory containing the source PDF documents
    caps_directory = "./CAPS"
    os.environ["OPENAI_API_KEY"] = api_key
    # Directory to save individual vector stores
    save_directory = "./Individual_All_Vectorstores"
    os.makedirs(save_directory, exist_ok=True)
    summary_stem = os.path.splitext(summary_file_name)[0]
    # Source vector store path in Individual_Summary_Vectorstores
    source_vector_store_name = os.path.join("./Individual_Summary_Vectorstores", f"{summary_stem}_vectorstore")
    # Destination vector store path in Individual_All_Vectorstores
    destination_vector_store_name = os.path.join(save_directory, f"{summary_stem}_vectorstore")
    # BUG FIX: copytree raised FileNotFoundError when the summary vector
    # store had not been created yet, aborting before the PDF was indexed.
    # Guard the copy and continue with the PDF either way.
    if os.path.isdir(source_vector_store_name):
        shutil.copytree(source_vector_store_name, destination_vector_store_name, dirs_exist_ok=True)
        print(f"Copied vector store for {file_name} to '{destination_vector_store_name}'.")
    else:
        print(f"Source vector store '{source_vector_store_name}' not found; skipping copy.")
    file_path = os.path.join(caps_directory, file_name)
    if not os.path.exists(file_path):
        print(f"File {file_name} not found in {caps_directory}.")
        return
    loader = PyPDFLoader(file_path)
    documents = loader.load()
    print(f"Successfully loaded {file_name} from {caps_directory}.")
    # Split into overlapping chunks to preserve context across boundaries.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
    splits = text_splitter.split_documents(documents)
    # Create embeddings and vector store for the document
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vector_store = FAISS.from_documents(documents=splits, embedding=embeddings)
    # Save the vector store locally with a unique name in the specified directory
    vector_store_name = os.path.join(save_directory, f"{os.path.splitext(file_name)[0]}_vectorstore")
    os.makedirs(vector_store_name, exist_ok=True)  # Create destination directory if it doesn't exist
    vector_store.save_local(vector_store_name)
    print(f"Vector store for {file_name} created and saved as '{vector_store_name}'.")
# Run the functions to create and save the vector stores
if __name__ == "__main__":
    # Command-line entry point: the API key, the PDF file name, and the
    # Markdown summary file name are all required positional arguments.
    arg_parser = argparse.ArgumentParser(description="Process vector store creation.")
    for arg_name, help_text in (
        ("api_key", "OpenAI API Key"),
        ("file_name", "Name of the file"),
        ("summary_file_name", "Name of the summary file"),
    ):
        arg_parser.add_argument(arg_name, type=str, help=help_text)
    cli_args = arg_parser.parse_args()
    # Build the combined store first, then the per-summary store, then the
    # per-document store (which also copies the per-summary store).
    create_combined_summary_vector_store(cli_args.api_key)
    create_individual_summary_vector_stores(cli_args.api_key, cli_args.summary_file_name)
    create_individual_vector_stores_for_all_documents(cli_args.api_key, cli_args.file_name, cli_args.summary_file_name)
|