# NOTE: the lines below are residue from the file-viewer page this script was
# scraped from (Space status, file size, commit hashes); kept as comments so
# the module remains valid Python.
# Spaces: Sleeping
# File size: 5,294 Bytes
# Commits: a7aaec4 904f39b
import streamlit as st
import os
import tempfile
import pickle
import faiss
import numpy as np
from helper import extract_text_from_pdf, chunk_text, embedding_function, embedding_model, query_llm_with_context
import logging
# --- Logging ---------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Page configuration (must be the first Streamlit call) -----------------
st.set_page_config(
    page_title="PDF RAG System",
    page_icon="π",
    layout="wide",
)

# --- Title and description -------------------------------------------------
st.title("π PDF RAG System")
st.markdown("""
This application allows you to upload a PDF file, ask questions about its content, and get AI-generated answers based on the document.
""")
# --- File upload section ---------------------------------------------------
st.header("1. Upload PDF")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf", key="pdf_uploader")

# Seed session state with defaults on first run only; existing values
# survive Streamlit's script reruns.
_SESSION_DEFAULTS = {
    "pdf_processed": False,
    "index": None,
    "chunks": None,
    "pdf_path": None,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
# Process the uploaded PDF exactly once per upload (guarded by the session flag).
if uploaded_file is not None and not st.session_state.pdf_processed:
    with st.spinner("Processing PDF..."):
        # Persist the upload to a temporary file so the PDF parser can read
        # it from a filesystem path. delete=False because the path is used
        # after the context manager closes; it is unlinked by the reset button.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(uploaded_file.getvalue())
            st.session_state.pdf_path = tmp_file.name

        # Extract text from the PDF and split it into overlapping chunks.
        pdf_text = extract_text_from_pdf(st.session_state.pdf_path)
        chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)
        st.session_state.chunks = chunks

        # Guard against a PDF with no extractable text: an empty embedding
        # matrix has no second dimension and would crash below.
        if not chunks:
            st.error("No text could be extracted from the PDF. Please try another file.")
            st.stop()

        # Create embeddings. FAISS requires a float32 matrix, so coerce
        # unconditionally: the original only converted non-ndarray inputs,
        # which broke when the embedding function returned a float64 ndarray.
        embeddings = np.asarray(embedding_function(chunks), dtype='float32')

        # Build a flat L2 index sized to the embedding dimensionality.
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)

        # Persist the index and chunks so they can be reloaded later.
        faiss.write_index(index, "./faiss_index")
        with open("./document_chunks.pkl", 'wb') as f:
            pickle.dump(chunks, f)

        # Mark processing complete so reruns skip this branch.
        st.session_state.index = index
        st.session_state.pdf_processed = True

    st.success(f"PDF processed successfully! {len(chunks)} chunks created.")
# --- Query section ---------------------------------------------------------
st.header("2. Ask a Question")
query = st.text_input("Enter your question about the PDF content:", key="query_input")

# Answer only when the user submits a non-empty query and a PDF is indexed.
if st.button("Get Answer", key="get_answer_button") and query and st.session_state.pdf_processed:
    with st.spinner("Retrieving relevant information and generating answer..."):
        try:
            # Embed the query with the same model used for the document chunks.
            query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')

            # Never request more neighbours than stored vectors: FAISS pads
            # missing slots with index -1, which would silently alias
            # chunks[-1] (the last chunk) in the lookup below.
            n_results = min(5, len(st.session_state.chunks))
            distances, indices = st.session_state.index.search(query_embedding, n_results)

            # Keep only valid hits (defensively drop any -1 padding).
            hits = [(i, d) for i, d in zip(indices[0], distances[0]) if i >= 0]
            documents = [st.session_state.chunks[i] for i, _ in hits]

            # Convert L2 distances (lower is better) to [0, 1] similarity
            # scores where 1 is most similar. Guard the all-zero case to
            # avoid division by zero when every hit is an exact match.
            max_distance = max((d for _, d in hits), default=0.0)
            if max_distance > 0:
                similarity_scores = [1 - (d / max_distance) for _, d in hits]
            else:
                similarity_scores = [1.0] * len(hits)

            # Query the LLM with the retrieved context.
            context = (documents, similarity_scores)
            answer = query_llm_with_context(query, context, top_n=3)

            # Display the answer.
            st.header("3. Answer")
            st.write(answer)

            # Show the retrieved chunks with their relevance scores.
            with st.expander("View Retrieved Documents", expanded=False):
                for i, (doc, score) in enumerate(zip(documents, similarity_scores)):
                    st.markdown(f"**Document {i+1}** (Relevance: {score:.4f})")
                    st.text(doc[:500] + "..." if len(doc) > 500 else doc)
                    st.markdown("---")
        except Exception as e:
            # Surface the failure to the user; keep the full traceback in logs.
            st.error(f"An error occurred: {str(e)}")
            logger.exception("Error during query processing")
# --- Reset button ----------------------------------------------------------
if st.button("Reset and Upload New PDF", key="reset_button"):
    # Clean up the temporary PDF saved during processing.
    if st.session_state.pdf_path and os.path.exists(st.session_state.pdf_path):
        os.unlink(st.session_state.pdf_path)

    # Clear session state so the next upload is processed from scratch.
    st.session_state.pdf_processed = False
    st.session_state.index = None
    st.session_state.chunks = None
    st.session_state.pdf_path = None

    # st.experimental_rerun was removed in modern Streamlit (replaced by
    # st.rerun in 1.27); resolve whichever exists so the app works on both
    # old and new versions. The `or` keeps the fallback lookup lazy, so no
    # AttributeError is raised on versions that only have st.rerun.
    _rerun = getattr(st, "rerun", None) or st.experimental_rerun
    _rerun()

# --- Footer ----------------------------------------------------------------
st.markdown("---")
st.markdown("Built with Streamlit, FAISS, and Hugging Face API")