Spaces:

anamjafar6
/

RAg_mentor

Running

App Files Community

anamjafar6 committed on Sep 27

Commit

91d79b3

verified ·

1 Parent(s): 0216ffc

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -216

app.py CHANGED Viewed

@@ -12,11 +12,17 @@ try:
 except ImportError:
     Groq = None
 # -----------------------------
 # Utility Functions
 # -----------------------------
 def load_api_key() -> str:
-    """Load the GROQ API key from Hugging Face secrets or env vars."""
     api_key = os.environ.get("GROQ_API_KEY")
     if not api_key:
         try:
@@ -68,7 +74,8 @@ def pdf_to_chunks(uploaded_file, chunk_size: int = 500, overlap: int = 50) -> Li
             continue
         words = text.split()
-        for i in range(0, len(words), chunk_size - overlap):
             chunk_text = " ".join(words[i:i + chunk_size])
             if chunk_text.strip():
                 chunks.append({
@@ -117,11 +124,13 @@ def create_vector_database(chunks: List[Dict], embedding_model: SentenceTransfor
     # Store only the collection name (not object) in session_state
     st.session_state.collection_name = collection_name
     return collection_name
 def query_vector_database(query: str, embedding_model: SentenceTransformer,
-                          top_k: int = 5) -> List[Dict]:
     """Query ChromaDB for relevant chunks."""
     if "collection_name" not in st.session_state:
         st.error("No active collection found. Upload and process a PDF first.")
@@ -163,7 +172,10 @@ def query_vector_database(query: str, embedding_model: SentenceTransformer,
         elif isinstance(distance, (int, float)) and distance <= 1:
             similarity = max(0, 1 - distance)
         else:
-            similarity = float(distance)
         relevant_chunks.append({
             "text": doc,
@@ -207,6 +219,7 @@ Answer:"""
                 max_tokens=500
             )
         else:
             chat_resp = client.create(prompt=prompt, max_tokens=500)
         if hasattr(chat_resp, "choices"):
@@ -223,16 +236,17 @@ Answer:"""
         return f"Error generating answer: {e}"
 # STREAMLIT UI
 def main():
     """Main Streamlit application."""
     # Page configuration with wide layout for centered design
     st.set_page_config(
-        page_title="PageMentor",  # Browser tab title
-        page_icon="📚",  # Browser tab icon
-        layout="wide"  # Wide layout allows for centered container
     )
     # Custom CSS for professional styling and centered layout
@@ -244,101 +258,18 @@ def main():
             margin: 0 auto;
             padding: 2rem 1rem;
         }
-        /* Professional light theme with soft colors */
-        .stApp {
-            background-color: #f8f9fa;
-        }
-        /* Styled header section */
-        .header-container {
-            text-align: center;
-            padding: 2rem 0;
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            border-radius: 15px;
-            margin-bottom: 2rem;
-            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-        }
-        .header-title {
-            color: white;
-            font-size: 2.5rem;
-            font-weight: 700;
-            margin-bottom: 0.5rem;
-        }
-        .header-subtitle {
-            color: rgba(255, 255, 255, 0.9);
-            font-size: 1.1rem;
-        }
-        /* Chat bubble style for answers */
-        .answer-box {
-            background-color: white;
-            border-radius: 15px;
-            padding: 1.5rem;
-            margin: 1rem 0;
-            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
-            border-left: 4px solid #667eea;
-        }
-        /* Source cards styling */
-        .source-card {
-            background-color: #f0f2f6;
-            border-radius: 10px;
-            padding: 1rem;
-            margin: 0.5rem 0;
-            border-left: 3px solid #764ba2;
-        }
-        /* Button styling */
-        .stButton > button {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            color: white;
-            border: none;
-            border-radius: 8px;
-            padding: 0.5rem 2rem;
-            font-weight: 600;
-            transition: transform 0.2s;
-        }
-        .stButton > button:hover {
-            transform: translateY(-2px);
-            box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
-        }
-        /* File uploader styling */
-        .uploadedFile {
-            background-color: white;
-            border-radius: 10px;
-            padding: 1rem;
-        }
-        /* Text input styling */
-        .stTextInput > div > div > input {
-            border-radius: 8px;
-            border: 2px solid #e0e0e0;
-            padding: 0.75rem;
-        }
-        .stTextInput > div > div > input:focus {
-            border-color: #667eea;
-            box-shadow: 0 0 0 2px rgba(102, 126, 234, 0.1);
-        }
-        /* Footer styling */
-        .footer {
-            text-align: center;
-            padding: 2rem 0;
-            margin-top: 3rem;
-            border-top: 1px solid #e0e0e0;
-            color: #666;
-        }
-        /* Success/Error message styling */
-        .stSuccess, .stInfo, .stWarning, .stError {
-            border-radius: 8px;
-        }
         </style>
     """, unsafe_allow_html=True)
@@ -350,152 +281,145 @@ def main():
         </div>
     """, unsafe_allow_html=True)
-    # Horizontal divider after header
     st.markdown("---")
-    # Initialize session state for storing data
-    if 'vector_db' not in st.session_state:  # Check if database exists in session
-        st.session_state.vector_db = None  # Initialize as None
-    if 'embedding_model' not in st.session_state:  # Check if model exists in session
-        st.session_state.embedding_model = None  # Initialize as None
-    if 'processed_file' not in st.session_state:  # Track processed file name
-        st.session_state.processed_file = None  # Initialize as None
-    # Load embedding model
-    if st.session_state.embedding_model is None:  # If model not loaded
-        with st.spinner("🔄 Loading AI models..."):  # Show loading spinner with icon
-            st.session_state.embedding_model = load_embedding_model()  # Load model
-    # Main content area with two columns for better layout
-    col1, col2 = st.columns([2, 1])  # Create two columns with 2:1 ratio
-    with col1:  # Main content column
-        # File upload section with styled container
-        with st.container():  # Container for grouped elements
-            st.markdown("### 📄 Upload Your Document")  # Section header with icon
-            st.markdown("*Select a PDF file to start learning*")  # Helper text in italics
             uploaded_file = st.file_uploader(
                 "Choose a PDF file",
-                type="pdf",  # Only allow PDF files
-                help="Upload any PDF document - textbooks, etc.",  # Expanded help text
-                label_visibility="collapsed"  # Hide redundant label
             )
-            # Process uploaded file with enhanced feedback
-            if uploaded_file is not None:  # If user uploaded a file
-                # Show file info in a nice format
-                file_info = st.container()  # Container for file information
-                with file_info:
-                    st.info(f"📎 **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")  # Display file details
-                if st.button("🚀 Process Document", use_container_width=True):  # Full width button
-                    with st.spinner("📖 Reading and analyzing your document..."):  # Processing message
-                        # Extract text from PDF
-                        with st.spinner("📖 Reading and analyzing your document..."):
-                            chunks = pdf_to_chunks(uploaded_file)  # your pdf_to_chunks function
-                            if chunks:
-                                total_pages = len({c['page_number'] for c in chunks})
-                                st.success(f"✅ Successfully processed **{total_pages} pages**")
-                                st.info(f"📝 Created **{len(chunks)}** searchable text segments")
-                            else:
-                                st.error("❌ Failed to extract any text from the uploaded PDF.")
-                                return  # stop further processing
                             # Create vector database
-                            if st.session_state.embedding_model:  # If embedding model is loaded
-                                with st.spinner("🧠 Building knowledge base..."):  # Database creation message
-                                    st.session_state.vector_db = create_vector_database(
-                                        chunks, st.session_state.embedding_model
-                                    )
-                                    if st.session_state.vector_db:  # If database created successfully
-                                        st.success("✅ **Ready to answer your questions!**")  # Final success message
-                                        st.session_state.processed_file = uploaded_file.name  # Store processed file name
-                                        st.balloons()  # Celebration animation
                                     else:
-                                        st.error("❌ Failed to create knowledge base")  # Error message
                             else:
-                                st.error("❌ AI model not available")  # Model error
-                        else:
-                            st.error(f"❌ Failed to process PDF: {pdf_result['error']}")  # Extraction error
     # Question answering section
-    if st.session_state.vector_db is not None:  # If database is ready
-        st.markdown("---")  # Visual separator
-        st.markdown("### 💬 Ask Your Questions")  # Section header with icon
-        # Show which document is loaded
-        if st.session_state.processed_file:  # If we have a processed file name
-            st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***")  # Display current document
-        # Create a form for better UX
-        with st.form(key="question_form"):  # Form container for question input
             question = st.text_input(
                 "What would you like to know?",
-                placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.",  # Multiple examples
-                help="Ask any question about the content of your document",  # Help text
-                label_visibility="collapsed"  # Hide redundant label
             )
-            # Submit button inside form
             submit_button = st.form_submit_button(
                 "🔍 Get Answer",
-                use_container_width=True  # Full width button
             )
-        # Process question when form is submitted
-        if submit_button and question.strip():  # If form submitted with non-empty question
-            with st.spinner("🤔 Thinking..."):  # Processing message
                 # Query vector database
                 relevant_chunks = query_vector_database(
                     question,
-                    st.session_state.embedding_model,
-                    top_k=5
                 )
                 # Filter by similarity threshold
-                SIMILARITY_THRESHOLD = 0.20  # put this at the top of file if not already
                 relevant_chunks = [c for c in relevant_chunks if c.get('similarity', 0) >= SIMILARITY_THRESHOLD]
-            # Check results after spinner block
             if not relevant_chunks:
                 st.warning("❌ No sufficiently relevant passages found (increase threshold or rephrase question).")
             else:
-            # Generate answer
-            client = setup_groq()
-            if not client:
-                st.error("❌ LLM not configured. Check GROQ_API_KEY and that 'groq' is installed.")
-            else:
-                answer = generate_answer_with_groq(client, question, relevant_chunks)
                     # Display answer in chat bubble style
-                    st.markdown("#### 🎯 Answer")  # Answer header
-                    st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True)  # Styled answer box
                     # Display sources in a clean format
-                    st.markdown("#### 📚 Top Sources")  # Sources header
-                    st.markdown("*Most relevant passages from your document:*")  # Sources description
-                    for i, chunk in enumerate(relevant_chunks, 1):  # Loop through sources with numbering
-                        # Create expandable source cards
                         with st.expander(
                             f"**Source {i}** | 📄 Page {chunk['page_number']} | "
-                            f"🎯 Relevance: {chunk['similarity']*100:.0f}%"  # Convert to percentage
                         ):
-                            st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>',
-                                      unsafe_allow_html=True)  # Display truncated text in styled card
-                else:
-                    st.warning("❌ No relevant information found for your question. Try rephrasing or asking about topics covered in the document.")  # Enhanced warning
     else:
-        # Welcome message when no document is loaded
         st.markdown("""
         <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
             <h3>👋 Welcome to PageMentor!</h3>
@@ -504,8 +428,7 @@ def main():
         </div>
         """, unsafe_allow_html=True)
-    # Footer - centered at bottom
     st.markdown("""
     <div class="footer">
         <p>Built with ❤️ using Streamlit | Powered by Hugging Face | © 2025 PageMentor</p>
@@ -513,7 +436,6 @@ def main():
     </div>
     """, unsafe_allow_html=True)
-# RUN THE APPLICATION
-if __name__ == "__main__":  # Only run if this file is executed directly
-    main()  # Start the Streamlit app

 except ImportError:
     Groq = None
+# -----------------------------
+# Config
+# -----------------------------
+SIMILARITY_THRESHOLD = 0.20
+TOP_K = 5
 # -----------------------------
 # Utility Functions
 # -----------------------------
 def load_api_key() -> str:
+    """Load the GROQ API key from environment or Hugging Face token fallback."""
     api_key = os.environ.get("GROQ_API_KEY")
     if not api_key:
         try:
             continue
         words = text.split()
+        step = max(1, chunk_size - overlap)
+        for i in range(0, len(words), step):
             chunk_text = " ".join(words[i:i + chunk_size])
             if chunk_text.strip():
                 chunks.append({
     # Store only the collection name (not object) in session_state
     st.session_state.collection_name = collection_name
+    # Also store a simple flag in vector_db for UI readiness
+    st.session_state.vector_db = collection_name
     return collection_name
 def query_vector_database(query: str, embedding_model: SentenceTransformer,
+                          top_k: int = TOP_K) -> List[Dict]:
     """Query ChromaDB for relevant chunks."""
     if "collection_name" not in st.session_state:
         st.error("No active collection found. Upload and process a PDF first.")
         elif isinstance(distance, (int, float)) and distance <= 1:
             similarity = max(0, 1 - distance)
         else:
+            try:
+                similarity = float(distance)
+            except Exception:
+                similarity = 0.0
         relevant_chunks.append({
             "text": doc,
                 max_tokens=500
             )
         else:
+            # Fallback generic call
             chat_resp = client.create(prompt=prompt, max_tokens=500)
         if hasattr(chat_resp, "choices"):
         return f"Error generating answer: {e}"
+# --------------------------------
 # STREAMLIT UI
+# --------------------------------
 def main():
     """Main Streamlit application."""
     # Page configuration with wide layout for centered design
     st.set_page_config(
+        page_title="PageMentor",
+        page_icon="📚",
+        layout="wide"
     )
     # Custom CSS for professional styling and centered layout
             margin: 0 auto;
             padding: 2rem 1rem;
         }
+        .stApp { background-color: #f8f9fa; }
+        .header-container { text-align: center; padding: 2rem 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
+        .header-title { color: white; font-size: 2.5rem; font-weight: 700; margin-bottom: 0.5rem; }
+        .header-subtitle { color: rgba(255,255,255,0.9); font-size: 1.1rem; }
+        .answer-box { background-color: white; border-radius: 15px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 8px rgba(0,0,0,0.08); border-left: 4px solid #667eea; }
+        .source-card { background-color: #f0f2f6; border-radius: 10px; padding: 1rem; margin: 0.5rem 0; border-left: 3px solid #764ba2; }
+        .stButton > button { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border: none; border-radius: 8px; padding: 0.5rem 2rem; font-weight: 600; transition: transform 0.2s; }
+        .stButton > button:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(102,126,234,0.4); }
+        .uploadedFile { background-color: white; border-radius: 10px; padding: 1rem; }
+        .stTextInput > div > div > input { border-radius: 8px; border: 2px solid #e0e0e0; padding: 0.75rem; }
+        .stTextInput > div > div > input:focus { border-color: #667eea; box-shadow: 0 0 0 2px rgba(102,126,234,0.1); }
+        .footer { text-align: center; padding: 2rem 0; margin-top: 3rem; border-top: 1px solid #e0e0e0; color: #666; }
         </style>
     """, unsafe_allow_html=True)
         </div>
     """, unsafe_allow_html=True)
     st.markdown("---")
+    # Session state init
+    if 'vector_db' not in st.session_state:
+        st.session_state.vector_db = None
+    if 'embedding_model' not in st.session_state:
+        st.session_state.embedding_model = None
+    if 'processed_file' not in st.session_state:
+        st.session_state.processed_file = None
+    if 'collection_name' not in st.session_state:
+        st.session_state.collection_name = None
+    # Load embedding model if not loaded
+    if st.session_state.embedding_model is None:
+        with st.spinner("🔄 Loading AI models..."):
+            st.session_state.embedding_model = load_embedding_model()
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        with st.container():
+            st.markdown("### 📄 Upload Your Document")
+            st.markdown("*Select a PDF file to start learning*")
             uploaded_file = st.file_uploader(
                 "Choose a PDF file",
+                type="pdf",
+                help="Upload any PDF document - textbooks, research papers, articles, etc.",
+                label_visibility="collapsed"
             )
+            if uploaded_file is not None:
+                st.info(f"📎 **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")
+                if st.button("🚀 Process Document", use_container_width=True):
+                    # attempt best-effort cleanup of prior collection
+                    try:
+                        old_name = st.session_state.get("collection_name")
+                        if old_name:
+                            client_tmp = chromadb.Client()
+                            if hasattr(client_tmp, "delete_collection"):
+                                try:
+                                    client_tmp.delete_collection(old_name)
+                                except Exception:
+                                    pass
+                    except Exception:
+                        pass
+                    # reset state
+                    st.session_state.vector_db = None
+                    st.session_state.collection_name = None
+                    st.session_state.processed_file = None
+                    # process file
+                    with st.spinner("📖 Reading and analyzing your document..."):
+                        chunks = pdf_to_chunks(uploaded_file)
+                        if not chunks:
+                            st.error("❌ Failed to extract any text from the uploaded PDF.")
+                        else:
+                            total_pages = len({c['page_number'] for c in chunks})
+                            st.success(f"✅ Successfully processed **{total_pages} pages**")
+                            st.info(f"📝 Created **{len(chunks)}** searchable text segments")
                             # Create vector database
+                            if st.session_state.embedding_model:
+                                with st.spinner("🧠 Building knowledge base..."):
+                                    collection_name = create_vector_database(chunks, st.session_state.embedding_model)
+                                    if collection_name:
+                                        st.session_state.processed_file = uploaded_file.name
+                                        st.success("✅ **Ready to answer your questions!**")
+                                        st.balloons()
                                     else:
+                                        st.error("❌ Failed to create knowledge base")
                             else:
+                                st.error("❌ AI model not available")
     # Question answering section
+    if st.session_state.vector_db is not None:
+        st.markdown("---")
+        st.markdown("### 💬 Ask Your Questions")
+        if st.session_state.processed_file:
+            st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***")
+        with st.form(key="question_form"):
             question = st.text_input(
                 "What would you like to know?",
+                placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.",
+                help="Ask any question about the content of your document",
+                label_visibility="collapsed"
             )
             submit_button = st.form_submit_button(
                 "🔍 Get Answer",
+                use_container_width=True
             )
+        if submit_button and question.strip():
+            with st.spinner("🤔 Thinking..."):
                 # Query vector database
+                embedding_model = st.session_state.embedding_model
                 relevant_chunks = query_vector_database(
                     question,
+                    embedding_model,
+                    top_k=TOP_K
                 )
                 # Filter by similarity threshold
                 relevant_chunks = [c for c in relevant_chunks if c.get('similarity', 0) >= SIMILARITY_THRESHOLD]
+            # After spinner
             if not relevant_chunks:
                 st.warning("❌ No sufficiently relevant passages found (increase threshold or rephrase question).")
             else:
+                # Generate answer
+                client = setup_groq()
+                if not client:
+                    st.error("❌ LLM not configured. Check GROQ_API_KEY and that 'groq' is installed.")
+                else:
+                    answer = generate_answer_with_groq(client, question, relevant_chunks)
                     # Display answer in chat bubble style
+                    st.markdown("#### 🎯 Answer")
+                    st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True)
                     # Display sources in a clean format
+                    st.markdown("#### 📚 Top Sources")
+                    st.markdown("*Most relevant passages from your document:*")
+                    for i, chunk in enumerate(relevant_chunks, 1):
                         with st.expander(
                             f"**Source {i}** | 📄 Page {chunk['page_number']} | "
+                            f"🎯 Relevance: {chunk['similarity']*100:.0f}%"
                         ):
+                            st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>',
+                                        unsafe_allow_html=True)
     else:
         st.markdown("""
         <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
             <h3>👋 Welcome to PageMentor!</h3>
         </div>
         """, unsafe_allow_html=True)
+    # Footer
     st.markdown("""
     <div class="footer">
         <p>Built with ❤️ using Streamlit | Powered by Hugging Face | © 2025 PageMentor</p>
     </div>
     """, unsafe_allow_html=True)
+if __name__ == "__main__":
+    main()