File size: 5,294 Bytes
a7aaec4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904f39b
a7aaec4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904f39b
a7aaec4
 
904f39b
a7aaec4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904f39b
a7aaec4
 
 
 
 
 
 
 
 
 
904f39b
a7aaec4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
904f39b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Streamlit UI for a simple PDF RAG (retrieval-augmented generation) pipeline:
# upload a PDF, embed its chunks into a FAISS index, then answer questions
# using chunks retrieved for the query.
import streamlit as st
import os
import tempfile
import pickle
import faiss
import numpy as np
# Project-local helpers: PDF text extraction, chunking, embedding, and the
# LLM call that consumes (documents, scores) context — see helper.py.
from helper import extract_text_from_pdf, chunk_text, embedding_function, embedding_model, query_llm_with_context
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Page setup -------------------------------------------------------------
# set_page_config must be the first Streamlit rendering call in the script.
st.set_page_config(page_title="PDF RAG System", page_icon="πŸ“š", layout="wide")

# Header plus a short usage blurb.
st.title("πŸ“š PDF RAG System")
_INTRO_MD = """
This application allows you to upload a PDF file, ask questions about its content, and get AI-generated answers based on the document.
"""
st.markdown(_INTRO_MD)

# --- Step 1: PDF upload -----------------------------------------------------
st.header("1. Upload PDF")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf", key="pdf_uploader")

# Seed session-state defaults once so later code can read them unconditionally.
_session_defaults = {
    'pdf_processed': False,  # True once the current upload has been indexed
    'index': None,           # in-memory FAISS index
    'chunks': None,          # list of text chunks parallel to the index rows
    'pdf_path': None,        # temp-file path of the uploaded PDF
}
for _key, _default in _session_defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# --- Process the uploaded PDF into a FAISS index ----------------------------
# Runs once per upload: extract text, chunk it, embed the chunks, and build an
# in-memory flat L2 index (also persisted to disk alongside the pickled chunks).
if uploaded_file is not None and not st.session_state.pdf_processed:
    with st.spinner("Processing PDF..."):
        try:
            # Persist the upload to a temp file so the PDF reader can open it
            # by path; the path is kept in session state for later cleanup.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                st.session_state.pdf_path = tmp_file.name

            # Extract and chunk the document text.
            pdf_text = extract_text_from_pdf(st.session_state.pdf_path)
            chunks = chunk_text(pdf_text, chunk_size=1000, chunk_overlap=100)

            if not chunks:
                # Scanned / image-only PDFs can yield no extractable text;
                # fail visibly instead of crashing on an empty embedding array.
                st.error("No text could be extracted from this PDF.")
            else:
                st.session_state.chunks = chunks

                # FAISS requires a contiguous float32 matrix. np.asarray with an
                # explicit dtype covers both list and ndarray inputs (a bare
                # isinstance check would skip conversion for float64 arrays).
                embeddings = np.asarray(embedding_function(chunks), dtype='float32')

                # Build a flat L2 index sized to the embedding dimension and
                # add all chunk vectors (row i of the index == chunks[i]).
                index = faiss.IndexFlatL2(embeddings.shape[1])
                index.add(embeddings)

                # Persist the index and chunks for potential reuse across runs.
                faiss.write_index(index, "./faiss_index")
                with open("./document_chunks.pkl", 'wb') as f:
                    pickle.dump(chunks, f)

                st.session_state.index = index
                st.session_state.pdf_processed = True
                st.success(f"PDF processed successfully! {len(chunks)} chunks created.")
        except Exception as e:
            # Mirror the query section's error style: show the message in the
            # UI and keep the full traceback in the logs.
            st.error(f"An error occurred: {str(e)}")
            logger.exception("Error during PDF processing")

# --- Step 2: question answering over the indexed PDF ------------------------
st.header("2. Ask a Question")
query = st.text_input("Enter your question about the PDF content:", key="query_input")

# Answer only when a question exists AND a PDF has been indexed.
if st.button("Get Answer", key="get_answer_button") and query and st.session_state.pdf_processed:
    with st.spinner("Retrieving relevant information and generating answer..."):
        try:
            # Embed the query with the same model used for the document chunks.
            query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')

            # Never ask FAISS for more neighbours than the index holds: it pads
            # missing slots with index -1, which would otherwise make
            # chunks[-1] silently select the LAST chunk below.
            n_results = min(5, len(st.session_state.chunks))
            distances, indices = st.session_state.index.search(query_embedding, n_results)

            # Keep only valid hits (defensive: drop any -1 padding).
            hits = [(i, d) for i, d in zip(indices[0], distances[0]) if i >= 0]
            documents = [st.session_state.chunks[i] for i, _ in hits]

            # Convert L2 distances (lower is better) to [0, 1] similarity
            # scores, where 1 is most similar. Guard the division: when every
            # retrieved distance is 0 (exact matches) the original formula
            # divided by zero.
            max_distance = max((d for _, d in hits), default=0.0)
            if max_distance > 0:
                similarity_scores = [1 - (d / max_distance) for _, d in hits]
            else:
                similarity_scores = [1.0 for _ in hits]

            # helper.query_llm_with_context expects (documents, scores).
            context = (documents, similarity_scores)
            answer = query_llm_with_context(query, context, top_n=3)

            st.header("3. Answer")
            st.write(answer)

            # Let the user inspect what was retrieved and how relevant it was.
            with st.expander("View Retrieved Documents", expanded=False):
                for i, (doc, score) in enumerate(zip(documents, similarity_scores)):
                    st.markdown(f"**Document {i+1}** (Relevance: {score:.4f})")
                    st.text(doc[:500] + "..." if len(doc) > 500 else doc)
                    st.markdown("---")

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            logger.exception("Error during query processing")

# --- Reset: clear state and allow a fresh upload ----------------------------
if st.button("Reset and Upload New PDF", key="reset_button"):
    # Remove the temporary PDF copy from disk.
    if st.session_state.pdf_path and os.path.exists(st.session_state.pdf_path):
        os.unlink(st.session_state.pdf_path)

    # Clear all pipeline state so the upload section runs again.
    st.session_state.pdf_processed = False
    st.session_state.index = None
    st.session_state.chunks = None
    st.session_state.pdf_path = None

    # st.experimental_rerun was deprecated and removed in modern Streamlit
    # (>= 1.30); prefer st.rerun, falling back for older installations.
    if hasattr(st, "rerun"):
        st.rerun()
    else:
        st.experimental_rerun()

# --- Footer -----------------------------------------------------------------
# Horizontal rule followed by the attribution line.
for _footer_md in ("---", "Built with Streamlit, FAISS, and Hugging Face API"):
    st.markdown(_footer_md)