# mvp/step_1_index_documents.py
import json
import os
from typing import cast

import streamlit as st
from llama_index.core import (Settings, SimpleDirectoryReader, StorageContext,
                              VectorStoreIndex, load_index_from_storage)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.smart_pdf_loader import SmartPDFLoader

# Public llmsherpa parsing endpoint (an assumption; point at your own
# deployment if you run llmsherpa yourself).
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"

# One embedding model for every index built in this module.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

def index_documents(ground_truth_files, proposal_files, st):
    """Step 1: build in-memory vector indexes for the uploaded PDF sets."""
    ground_truth_documents = []
    proposals_documents = []
    if ground_truth_files:
        for file in ground_truth_files:
            ground_truth_documents.extend(read_pdf_from_upload(file))
    if proposal_files:
        for file in proposal_files:
            proposals_documents.extend(read_pdf_from_upload(file))
    st.session_state.state.index_ground_truth = VectorStoreIndex.from_documents(
        ground_truth_documents
    )
    st.session_state.state.index_proposals = VectorStoreIndex.from_documents(
        proposals_documents
    )
    # Store details as a human-readable string for display in the UI.
    details = f"""
Ground Truth Files: {', '.join(file.name for file in ground_truth_files)}\n
Proposal Files: {', '.join(file.name for file in proposal_files)}\n
---
Index for ground truth: {st.session_state.state.index_ground_truth}\n
Index for proposals: {st.session_state.state.index_proposals}\n
"""
    return "Step 1: Documents indexed successfully", details

def read_pdf_from_upload(uploaded_file):
    """Write an uploaded PDF to a temp file so SimpleDirectoryReader can parse it."""
    if uploaded_file is None:
        return []
    temp_file_path = uploaded_file.name
    try:
        pdf_bytes: bytes = uploaded_file.read()
        with open(temp_file_path, "wb") as temp_pdf:
            temp_pdf.write(pdf_bytes)
        reader = SimpleDirectoryReader(input_files=[temp_file_path])
        return reader.load_data()
    except Exception as e:
        st.error(f"Error processing PDF: {e}")
        return []
    finally:
        # Remove the temp file whether or not parsing succeeded.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

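# find_pdf_in_folder is used by the 'smart' branch of index_documents_fn below
# but was not defined in this file; a minimal sketch, assuming the first PDF
# (by name) in the folder is the one to index.
def find_pdf_in_folder(folder_path):
    """Return the path of the first PDF in folder_path, or None if none exists."""
    for name in sorted(os.listdir(folder_path)):
        if name.lower().endswith(".pdf"):
            return os.path.join(folder_path, name)
    return None
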
def index_documents_fn(folder_path, index_name, reader='simple'):
    """Create a persistent index for folder_path, or reload it from disk if one exists."""
    print(f"- Creating or loading index for: {folder_path} using the reader type '{reader}'")
    PERSIST_DIR = f"./{index_name}/RAGFiles"
    METADATA_FILE = f"./{index_name}/metadata.json"
    total_pages = 0
    try:
        if not os.path.exists(PERSIST_DIR):
            print("  - Creating new index")
            if reader == 'simple':
                dir_reader = SimpleDirectoryReader(folder_path)
                documents = dir_reader.load_data()
                # SimpleDirectoryReader yields one Document per PDF page.
                total_pages = len(documents)
            elif reader == 'smart':
                pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
                pdf_url = find_pdf_in_folder(folder_path)
                if not pdf_url:
                    raise FileNotFoundError("No PDF file found in the folder.")
                print(f"PDF URL: {pdf_url}")
                documents = pdf_loader.load_data(pdf_url)
                # SmartPDFLoader does not report a page count; use the number
                # of layout chunks it returns as a proxy.
                total_pages = len(documents)
            else:
                raise ValueError(f"Unknown reader type: {reader}")
            index = VectorStoreIndex.from_documents(documents)
            index.storage_context.persist(persist_dir=PERSIST_DIR)
            # Save metadata, including total pages, next to the persisted index.
            metadata = {"total_pages": total_pages}
            with open(METADATA_FILE, 'w') as f:
                json.dump(metadata, f)
        else:
            print("  - Reusing old index")
            storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
            index = cast(VectorStoreIndex, load_index_from_storage(storage_context))
            # Load total pages from the metadata file written at creation time.
            if os.path.exists(METADATA_FILE):
                with open(METADATA_FILE, 'r') as f:
                    metadata = json.load(f)
                total_pages = metadata.get("total_pages", 0)
            else:
                print("Warning: Metadata file not found")
        return index, total_pages
    except Exception as e:
        print(f"Error in document indexing: {e}")
        # Match the (index, pages) shape of the success path.
        return None, 0
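
# Minimal usage sketch (hypothetical folder and index names; assumes
# OPENAI_API_KEY is set): build or reload a persistent index, then query it.
if __name__ == "__main__":
    index, pages = index_documents_fn("./data/ground_truth", "ground_truth_index")
    if index is not None:
        print(f"Indexed {pages} pages")
        query_engine = index.as_query_engine()
        print(query_engine.query("Summarize the indexed documents."))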