|
|
|
import json
import os
import tempfile
import time
from typing import List, cast

import streamlit as st

from llama_index.core import (
    Document,
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.embeddings.openai import OpenAIEmbedding
|
|
|
# Configure the process-wide llama_index embedding model; every
# VectorStoreIndex built in this module embeds with this OpenAI model.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
|
|
|
def index_documents(ground_truth_files, proposal_files, st):
    """Index uploaded ground-truth and proposal PDFs into two vector indexes.

    Stores the resulting indexes on ``st.session_state.state`` as
    ``index_ground_truth`` and ``index_proposals``.

    Args:
        ground_truth_files: iterable of uploaded files (or None) for the
            ground-truth corpus.
        proposal_files: iterable of uploaded files (or None) for the
            proposals corpus.
        st: the streamlit module/proxy holding ``session_state``.
            NOTE(review): this parameter shadows the module-level
            ``import streamlit as st`` — kept, since callers pass it.

    Returns:
        (status_message, details) tuple of two strings.
    """
    # Normalize None to [] once. The original guarded only the indexing
    # loops, then unconditionally iterated both inputs when building
    # `details`, raising TypeError whenever either argument was None.
    ground_truth_files = ground_truth_files or []
    proposal_files = proposal_files or []

    ground_truth_documents = []
    for file in ground_truth_files:
        ground_truth_documents.extend(read_pdf_from_upload(file))

    proposals_documents = []
    for file in proposal_files:
        proposals_documents.extend(read_pdf_from_upload(file))

    st.session_state.state.index_ground_truth = VectorStoreIndex.from_documents(
        ground_truth_documents
    )
    st.session_state.state.index_proposals = VectorStoreIndex.from_documents(
        proposals_documents
    )

    details = f"""
Ground Truth Files: {', '.join(file.name for file in ground_truth_files)}\n
Proposal Files: {', '.join(file.name for file in proposal_files)}\n
---
index for ground truth: {st.session_state.state.index_ground_truth}\n
index for proposals: {st.session_state.state.index_proposals}\n
"""
    return "Step 1: Documents indexed successfully", details
|
|
|
def read_pdf_from_upload(uploaded_file):
    """Load one uploaded PDF into a list of llama_index Documents.

    Args:
        uploaded_file: a streamlit UploadedFile-like object exposing
            ``.read()`` (and ``.name``), or None.

    Returns:
        list of Documents parsed from the PDF; [] when the input is None
        or parsing fails (the error is surfaced via ``st.error``).
    """
    if uploaded_file is None:
        return []

    temp_file_path = None
    try:
        pdf_bytes: bytes = uploaded_file.read()
        # Spool to a unique temp file rather than the upload's own name in
        # the CWD: the original collided between concurrent sessions and
        # trusted a client-supplied filename as a local path.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
            temp_pdf.write(pdf_bytes)
            temp_file_path = temp_pdf.name

        reader = SimpleDirectoryReader(input_files=[temp_file_path])
        return reader.load_data()
    except Exception as e:
        st.error(f"Error processing PDF: {e}")
        return []
    finally:
        # Always remove the temp file — the original only deleted it on the
        # success path, leaking the file whenever load_data raised.
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
|
|
|
def index_documents_fn(folder_path, index_name, reader='simple'):
    """Create (and persist) or reload a VectorStoreIndex for a folder.

    The index is persisted under ``./{index_name}/RAGFiles`` and the page
    count under ``./{index_name}/metadata.json``; subsequent calls reuse
    the persisted index instead of re-embedding.

    Args:
        folder_path: directory containing the documents to index.
        index_name: name of the directory holding the persisted index.
        reader: 'simple' (SimpleDirectoryReader) or 'smart' (SmartPDFLoader).

    Returns:
        (index, total_pages) on success; (None, 0) on failure.
    """
    print(f"- Creating or loading index for: {folder_path} using the reader type '{reader}'")

    persist_dir = f"./{index_name}/RAGFiles"
    metadata_file = f"./{index_name}/metadata.json"
    total_pages = 0

    try:
        if not os.path.exists(persist_dir):
            print(f" - Creating new index")
            # Use a distinct name for the loaded documents; the original
            # rebound the `reader` parameter to the reader object.
            documents = None
            if reader == 'simple':
                documents = SimpleDirectoryReader(folder_path).load_data()
                total_pages = len(documents)
            elif reader == 'smart':
                # NOTE(review): SmartPDFLoader, llmsherpa_api_url and
                # find_pdf_in_folder are not defined/imported in this file —
                # confirm they are provided elsewhere before using 'smart'.
                pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
                pdf_url = find_pdf_in_folder(folder_path)
                if pdf_url:
                    print(f"PDF URL: {pdf_url}")
                    documents = pdf_loader.load_data(pdf_url)
                    total_pages = pdf_loader.get_number_of_pages(pdf_url)
                else:
                    print("No PDF file found in the folder.")

            # Bug fix: the original fell through to from_documents() with
            # `documents` unbound (NameError) for unknown reader types or
            # when no PDF was found; fail with an explicit error instead.
            if documents is None:
                raise ValueError(f"No documents loaded (reader='{reader}')")

            index = VectorStoreIndex.from_documents(documents)
            index.storage_context.persist(persist_dir=persist_dir)

            # Ensure the parent directory exists before writing metadata.
            os.makedirs(os.path.dirname(metadata_file), exist_ok=True)
            with open(metadata_file, 'w') as f:
                json.dump({"total_pages": total_pages}, f)
        else:
            print(f" - Reusing old index")
            storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
            index = cast(VectorStoreIndex, load_index_from_storage(storage_context))

            if os.path.exists(metadata_file):
                with open(metadata_file, 'r') as f:
                    total_pages = json.load(f).get("total_pages", 0)
            else:
                print("Warning: Metadata file not found")

        return index, total_pages

    except Exception as e:
        print(f"Error in document indexing: {str(e)}")
        # Bug fix: return the same 2-tuple shape as the success path — the
        # original returned a dict here, which silently unpacked to the
        # strings "index"/"pages" in `idx, pages = ...` callers.
        return None, 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|