anamjafar6 committed on
Commit
91d79b3
·
verified ·
1 Parent(s): 0216ffc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -216
app.py CHANGED
@@ -12,11 +12,17 @@ try:
12
  except ImportError:
13
  Groq = None
14
 
 
 
 
 
 
 
15
  # -----------------------------
16
  # Utility Functions
17
  # -----------------------------
18
  def load_api_key() -> str:
19
- """Load the GROQ API key from Hugging Face secrets or env vars."""
20
  api_key = os.environ.get("GROQ_API_KEY")
21
  if not api_key:
22
  try:
@@ -68,7 +74,8 @@ def pdf_to_chunks(uploaded_file, chunk_size: int = 500, overlap: int = 50) -> Li
68
  continue
69
 
70
  words = text.split()
71
- for i in range(0, len(words), chunk_size - overlap):
 
72
  chunk_text = " ".join(words[i:i + chunk_size])
73
  if chunk_text.strip():
74
  chunks.append({
@@ -117,11 +124,13 @@ def create_vector_database(chunks: List[Dict], embedding_model: SentenceTransfor
117
 
118
  # Store only the collection name (not object) in session_state
119
  st.session_state.collection_name = collection_name
 
 
120
  return collection_name
121
 
122
 
123
  def query_vector_database(query: str, embedding_model: SentenceTransformer,
124
- top_k: int = 5) -> List[Dict]:
125
  """Query ChromaDB for relevant chunks."""
126
  if "collection_name" not in st.session_state:
127
  st.error("No active collection found. Upload and process a PDF first.")
@@ -163,7 +172,10 @@ def query_vector_database(query: str, embedding_model: SentenceTransformer,
163
  elif isinstance(distance, (int, float)) and distance <= 1:
164
  similarity = max(0, 1 - distance)
165
  else:
166
- similarity = float(distance)
 
 
 
167
 
168
  relevant_chunks.append({
169
  "text": doc,
@@ -207,6 +219,7 @@ Answer:"""
207
  max_tokens=500
208
  )
209
  else:
 
210
  chat_resp = client.create(prompt=prompt, max_tokens=500)
211
 
212
  if hasattr(chat_resp, "choices"):
@@ -223,16 +236,17 @@ Answer:"""
223
  return f"Error generating answer: {e}"
224
 
225
 
 
226
  # STREAMLIT UI
227
-
228
  def main():
229
  """Main Streamlit application."""
230
 
231
  # Page configuration with wide layout for centered design
232
  st.set_page_config(
233
- page_title="PageMentor", # Browser tab title
234
- page_icon="πŸ“š", # Browser tab icon
235
- layout="wide" # Wide layout allows for centered container
236
  )
237
 
238
  # Custom CSS for professional styling and centered layout
@@ -244,101 +258,18 @@ def main():
244
  margin: 0 auto;
245
  padding: 2rem 1rem;
246
  }
247
-
248
- /* Professional light theme with soft colors */
249
- .stApp {
250
- background-color: #f8f9fa;
251
- }
252
-
253
- /* Styled header section */
254
- .header-container {
255
- text-align: center;
256
- padding: 2rem 0;
257
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
258
- border-radius: 15px;
259
- margin-bottom: 2rem;
260
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
261
- }
262
-
263
- .header-title {
264
- color: white;
265
- font-size: 2.5rem;
266
- font-weight: 700;
267
- margin-bottom: 0.5rem;
268
- }
269
-
270
- .header-subtitle {
271
- color: rgba(255, 255, 255, 0.9);
272
- font-size: 1.1rem;
273
- }
274
-
275
- /* Chat bubble style for answers */
276
- .answer-box {
277
- background-color: white;
278
- border-radius: 15px;
279
- padding: 1.5rem;
280
- margin: 1rem 0;
281
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
282
- border-left: 4px solid #667eea;
283
- }
284
-
285
- /* Source cards styling */
286
- .source-card {
287
- background-color: #f0f2f6;
288
- border-radius: 10px;
289
- padding: 1rem;
290
- margin: 0.5rem 0;
291
- border-left: 3px solid #764ba2;
292
- }
293
-
294
- /* Button styling */
295
- .stButton > button {
296
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
297
- color: white;
298
- border: none;
299
- border-radius: 8px;
300
- padding: 0.5rem 2rem;
301
- font-weight: 600;
302
- transition: transform 0.2s;
303
- }
304
-
305
- .stButton > button:hover {
306
- transform: translateY(-2px);
307
- box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
308
- }
309
-
310
- /* File uploader styling */
311
- .uploadedFile {
312
- background-color: white;
313
- border-radius: 10px;
314
- padding: 1rem;
315
- }
316
-
317
- /* Text input styling */
318
- .stTextInput > div > div > input {
319
- border-radius: 8px;
320
- border: 2px solid #e0e0e0;
321
- padding: 0.75rem;
322
- }
323
-
324
- .stTextInput > div > div > input:focus {
325
- border-color: #667eea;
326
- box-shadow: 0 0 0 2px rgba(102, 126, 234, 0.1);
327
- }
328
-
329
- /* Footer styling */
330
- .footer {
331
- text-align: center;
332
- padding: 2rem 0;
333
- margin-top: 3rem;
334
- border-top: 1px solid #e0e0e0;
335
- color: #666;
336
- }
337
-
338
- /* Success/Error message styling */
339
- .stSuccess, .stInfo, .stWarning, .stError {
340
- border-radius: 8px;
341
- }
342
  </style>
343
  """, unsafe_allow_html=True)
344
 
@@ -350,152 +281,145 @@ def main():
350
  </div>
351
  """, unsafe_allow_html=True)
352
 
353
- # Horizontal divider after header
354
  st.markdown("---")
355
 
356
- # Initialize session state for storing data
357
- if 'vector_db' not in st.session_state: # Check if database exists in session
358
- st.session_state.vector_db = None # Initialize as None
359
- if 'embedding_model' not in st.session_state: # Check if model exists in session
360
- st.session_state.embedding_model = None # Initialize as None
361
- if 'processed_file' not in st.session_state: # Track processed file name
362
- st.session_state.processed_file = None # Initialize as None
363
-
364
- # Load embedding model
365
- if st.session_state.embedding_model is None: # If model not loaded
366
- with st.spinner("πŸ”„ Loading AI models..."): # Show loading spinner with icon
367
- st.session_state.embedding_model = load_embedding_model() # Load model
368
-
369
- # Main content area with two columns for better layout
370
- col1, col2 = st.columns([2, 1]) # Create two columns with 2:1 ratio
371
-
372
- with col1: # Main content column
373
- # File upload section with styled container
374
- with st.container(): # Container for grouped elements
375
- st.markdown("### πŸ“„ Upload Your Document") # Section header with icon
376
- st.markdown("*Select a PDF file to start learning*") # Helper text in italics
377
-
378
  uploaded_file = st.file_uploader(
379
  "Choose a PDF file",
380
- type="pdf", # Only allow PDF files
381
- help="Upload any PDF document - textbooks, etc.", # Expanded help text
382
- label_visibility="collapsed" # Hide redundant label
383
  )
384
 
385
- # Process uploaded file with enhanced feedback
386
- if uploaded_file is not None: # If user uploaded a file
387
- # Show file info in a nice format
388
- file_info = st.container() # Container for file information
389
- with file_info:
390
- st.info(f"πŸ“Ž **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)") # Display file details
391
-
392
- if st.button("πŸš€ Process Document", use_container_width=True): # Full width button
393
- with st.spinner("πŸ“– Reading and analyzing your document..."): # Processing message
394
- # Extract text from PDF
395
- with st.spinner("πŸ“– Reading and analyzing your document..."):
396
- chunks = pdf_to_chunks(uploaded_file) # your pdf_to_chunks function
397
-
398
- if chunks:
399
- total_pages = len({c['page_number'] for c in chunks})
400
- st.success(f"βœ… Successfully processed **{total_pages} pages**")
401
- st.info(f"πŸ“ Created **{len(chunks)}** searchable text segments")
402
- else:
403
- st.error("❌ Failed to extract any text from the uploaded PDF.")
404
- return # stop further processing
405
-
 
 
 
 
 
 
 
 
 
 
 
406
 
407
  # Create vector database
408
- if st.session_state.embedding_model: # If embedding model is loaded
409
- with st.spinner("🧠 Building knowledge base..."): # Database creation message
410
- st.session_state.vector_db = create_vector_database(
411
- chunks, st.session_state.embedding_model
412
- )
413
-
414
- if st.session_state.vector_db: # If database created successfully
415
- st.success("βœ… **Ready to answer your questions!**") # Final success message
416
- st.session_state.processed_file = uploaded_file.name # Store processed file name
417
- st.balloons() # Celebration animation
418
  else:
419
- st.error("❌ Failed to create knowledge base") # Error message
420
  else:
421
- st.error("❌ AI model not available") # Model error
422
-
423
- else:
424
- st.error(f"❌ Failed to process PDF: {pdf_result['error']}") # Extraction error
425
 
426
  # Question answering section
427
- if st.session_state.vector_db is not None: # If database is ready
428
- st.markdown("---") # Visual separator
429
- st.markdown("### πŸ’¬ Ask Your Questions") # Section header with icon
430
-
431
- # Show which document is loaded
432
- if st.session_state.processed_file: # If we have a processed file name
433
- st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***") # Display current document
434
-
435
- # Create a form for better UX
436
- with st.form(key="question_form"): # Form container for question input
437
  question = st.text_input(
438
  "What would you like to know?",
439
- placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.", # Multiple examples
440
- help="Ask any question about the content of your document", # Help text
441
- label_visibility="collapsed" # Hide redundant label
442
  )
443
-
444
- # Submit button inside form
445
  submit_button = st.form_submit_button(
446
  "πŸ” Get Answer",
447
- use_container_width=True # Full width button
448
  )
449
 
450
- # Process question when form is submitted
451
- if submit_button and question.strip(): # If form submitted with non-empty question
452
- with st.spinner("πŸ€” Thinking..."): # Processing message
453
  # Query vector database
 
454
  relevant_chunks = query_vector_database(
455
  question,
456
- st.session_state.embedding_model,
457
- top_k=5
458
  )
459
 
460
  # Filter by similarity threshold
461
- SIMILARITY_THRESHOLD = 0.20 # put this at the top of file if not already
462
  relevant_chunks = [c for c in relevant_chunks if c.get('similarity', 0) >= SIMILARITY_THRESHOLD]
463
 
464
- # Check results after spinner block
465
  if not relevant_chunks:
466
  st.warning("❌ No sufficiently relevant passages found (increase threshold or rephrase question).")
467
  else:
468
- # Generate answer
469
- client = setup_groq()
470
- if not client:
471
- st.error("❌ LLM not configured. Check GROQ_API_KEY and that 'groq' is installed.")
472
- else:
473
- answer = generate_answer_with_groq(client, question, relevant_chunks)
474
-
475
-
476
 
477
  # Display answer in chat bubble style
478
- st.markdown("#### 🎯 Answer") # Answer header
479
- st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True) # Styled answer box
480
-
481
  # Display sources in a clean format
482
- st.markdown("#### πŸ“š Top Sources") # Sources header
483
- st.markdown("*Most relevant passages from your document:*") # Sources description
484
-
485
- for i, chunk in enumerate(relevant_chunks, 1): # Loop through sources with numbering
486
- # Create expandable source cards
487
  with st.expander(
488
  f"**Source {i}** | πŸ“„ Page {chunk['page_number']} | "
489
- f"🎯 Relevance: {chunk['similarity']*100:.0f}%" # Convert to percentage
490
  ):
491
- st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>',
492
- unsafe_allow_html=True) # Display truncated text in styled card
493
-
494
- else:
495
- st.warning("❌ No relevant information found for your question. Try rephrasing or asking about topics covered in the document.") # Enhanced warning
496
 
497
  else:
498
- # Welcome message when no document is loaded
499
  st.markdown("""
500
  <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
501
  <h3>πŸ‘‹ Welcome to PageMentor!</h3>
@@ -504,8 +428,7 @@ def main():
504
  </div>
505
  """, unsafe_allow_html=True)
506
 
507
-
508
- # Footer - centered at bottom
509
  st.markdown("""
510
  <div class="footer">
511
  <p>Built with ❀️ using Streamlit | Powered by Hugging Face | Β© 2025 PageMentor</p>
@@ -513,7 +436,6 @@ def main():
513
  </div>
514
  """, unsafe_allow_html=True)
515
 
516
- # RUN THE APPLICATION
517
 
518
- if __name__ == "__main__": # Only run if this file is executed directly
519
- main() # Start the Streamlit app
 
12
  except ImportError:
13
  Groq = None
14
 
15
+ # -----------------------------
16
+ # Config
17
+ # -----------------------------
18
+ SIMILARITY_THRESHOLD = 0.20
19
+ TOP_K = 5
20
+
21
  # -----------------------------
22
  # Utility Functions
23
  # -----------------------------
24
  def load_api_key() -> str:
25
+ """Load the GROQ API key from environment or Hugging Face token fallback."""
26
  api_key = os.environ.get("GROQ_API_KEY")
27
  if not api_key:
28
  try:
 
74
  continue
75
 
76
  words = text.split()
77
+ step = max(1, chunk_size - overlap)
78
+ for i in range(0, len(words), step):
79
  chunk_text = " ".join(words[i:i + chunk_size])
80
  if chunk_text.strip():
81
  chunks.append({
 
124
 
125
  # Store only the collection name (not object) in session_state
126
  st.session_state.collection_name = collection_name
127
+ # Also store a simple flag in vector_db for UI readiness
128
+ st.session_state.vector_db = collection_name
129
  return collection_name
130
 
131
 
132
  def query_vector_database(query: str, embedding_model: SentenceTransformer,
133
+ top_k: int = TOP_K) -> List[Dict]:
134
  """Query ChromaDB for relevant chunks."""
135
  if "collection_name" not in st.session_state:
136
  st.error("No active collection found. Upload and process a PDF first.")
 
172
  elif isinstance(distance, (int, float)) and distance <= 1:
173
  similarity = max(0, 1 - distance)
174
  else:
175
+ try:
176
+ similarity = float(distance)
177
+ except Exception:
178
+ similarity = 0.0
179
 
180
  relevant_chunks.append({
181
  "text": doc,
 
219
  max_tokens=500
220
  )
221
  else:
222
+ # Fallback generic call
223
  chat_resp = client.create(prompt=prompt, max_tokens=500)
224
 
225
  if hasattr(chat_resp, "choices"):
 
236
  return f"Error generating answer: {e}"
237
 
238
 
239
+ # --------------------------------
240
  # STREAMLIT UI
241
+ # --------------------------------
242
  def main():
243
  """Main Streamlit application."""
244
 
245
  # Page configuration with wide layout for centered design
246
  st.set_page_config(
247
+ page_title="PageMentor",
248
+ page_icon="πŸ“š",
249
+ layout="wide"
250
  )
251
 
252
  # Custom CSS for professional styling and centered layout
 
258
  margin: 0 auto;
259
  padding: 2rem 1rem;
260
  }
261
+ .stApp { background-color: #f8f9fa; }
262
+ .header-container { text-align: center; padding: 2rem 0; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 15px; margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
263
+ .header-title { color: white; font-size: 2.5rem; font-weight: 700; margin-bottom: 0.5rem; }
264
+ .header-subtitle { color: rgba(255,255,255,0.9); font-size: 1.1rem; }
265
+ .answer-box { background-color: white; border-radius: 15px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 8px rgba(0,0,0,0.08); border-left: 4px solid #667eea; }
266
+ .source-card { background-color: #f0f2f6; border-radius: 10px; padding: 1rem; margin: 0.5rem 0; border-left: 3px solid #764ba2; }
267
+ .stButton > button { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border: none; border-radius: 8px; padding: 0.5rem 2rem; font-weight: 600; transition: transform 0.2s; }
268
+ .stButton > button:hover { transform: translateY(-2px); box-shadow: 0 4px 12px rgba(102,126,234,0.4); }
269
+ .uploadedFile { background-color: white; border-radius: 10px; padding: 1rem; }
270
+ .stTextInput > div > div > input { border-radius: 8px; border: 2px solid #e0e0e0; padding: 0.75rem; }
271
+ .stTextInput > div > div > input:focus { border-color: #667eea; box-shadow: 0 0 0 2px rgba(102,126,234,0.1); }
272
+ .footer { text-align: center; padding: 2rem 0; margin-top: 3rem; border-top: 1px solid #e0e0e0; color: #666; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  </style>
274
  """, unsafe_allow_html=True)
275
 
 
281
  </div>
282
  """, unsafe_allow_html=True)
283
 
 
284
  st.markdown("---")
285
 
286
+ # Session state init
287
+ if 'vector_db' not in st.session_state:
288
+ st.session_state.vector_db = None
289
+ if 'embedding_model' not in st.session_state:
290
+ st.session_state.embedding_model = None
291
+ if 'processed_file' not in st.session_state:
292
+ st.session_state.processed_file = None
293
+ if 'collection_name' not in st.session_state:
294
+ st.session_state.collection_name = None
295
+
296
+ # Load embedding model if not loaded
297
+ if st.session_state.embedding_model is None:
298
+ with st.spinner("πŸ”„ Loading AI models..."):
299
+ st.session_state.embedding_model = load_embedding_model()
300
+
301
+ col1, col2 = st.columns([2, 1])
302
+
303
+ with col1:
304
+ with st.container():
305
+ st.markdown("### πŸ“„ Upload Your Document")
306
+ st.markdown("*Select a PDF file to start learning*")
307
+
308
  uploaded_file = st.file_uploader(
309
  "Choose a PDF file",
310
+ type="pdf",
311
+ help="Upload any PDF document - textbooks, research papers, articles, etc.",
312
+ label_visibility="collapsed"
313
  )
314
 
315
+ if uploaded_file is not None:
316
+ st.info(f"πŸ“Ž **File:** {uploaded_file.name} ({uploaded_file.size / 1024:.1f} KB)")
317
+
318
+ if st.button("πŸš€ Process Document", use_container_width=True):
319
+ # attempt best-effort cleanup of prior collection
320
+ try:
321
+ old_name = st.session_state.get("collection_name")
322
+ if old_name:
323
+ client_tmp = chromadb.Client()
324
+ if hasattr(client_tmp, "delete_collection"):
325
+ try:
326
+ client_tmp.delete_collection(old_name)
327
+ except Exception:
328
+ pass
329
+ except Exception:
330
+ pass
331
+
332
+ # reset state
333
+ st.session_state.vector_db = None
334
+ st.session_state.collection_name = None
335
+ st.session_state.processed_file = None
336
+
337
+ # process file
338
+ with st.spinner("πŸ“– Reading and analyzing your document..."):
339
+ chunks = pdf_to_chunks(uploaded_file)
340
+
341
+ if not chunks:
342
+ st.error("❌ Failed to extract any text from the uploaded PDF.")
343
+ else:
344
+ total_pages = len({c['page_number'] for c in chunks})
345
+ st.success(f"βœ… Successfully processed **{total_pages} pages**")
346
+ st.info(f"πŸ“ Created **{len(chunks)}** searchable text segments")
347
 
348
  # Create vector database
349
+ if st.session_state.embedding_model:
350
+ with st.spinner("🧠 Building knowledge base..."):
351
+ collection_name = create_vector_database(chunks, st.session_state.embedding_model)
352
+ if collection_name:
353
+ st.session_state.processed_file = uploaded_file.name
354
+ st.success("βœ… **Ready to answer your questions!**")
355
+ st.balloons()
 
 
 
356
  else:
357
+ st.error("❌ Failed to create knowledge base")
358
  else:
359
+ st.error("❌ AI model not available")
 
 
 
360
 
361
  # Question answering section
362
+ if st.session_state.vector_db is not None:
363
+ st.markdown("---")
364
+ st.markdown("### πŸ’¬ Ask Your Questions")
365
+
366
+ if st.session_state.processed_file:
367
+ st.markdown(f"*Currently learning from: **{st.session_state.processed_file}***")
368
+
369
+ with st.form(key="question_form"):
 
 
370
  question = st.text_input(
371
  "What would you like to know?",
372
+ placeholder="e.g., What is the main topic? Summarize chapter 3. Explain the key concepts.",
373
+ help="Ask any question about the content of your document",
374
+ label_visibility="collapsed"
375
  )
376
+
 
377
  submit_button = st.form_submit_button(
378
  "πŸ” Get Answer",
379
+ use_container_width=True
380
  )
381
 
382
+ if submit_button and question.strip():
383
+ with st.spinner("πŸ€” Thinking..."):
 
384
  # Query vector database
385
+ embedding_model = st.session_state.embedding_model
386
  relevant_chunks = query_vector_database(
387
  question,
388
+ embedding_model,
389
+ top_k=TOP_K
390
  )
391
 
392
  # Filter by similarity threshold
 
393
  relevant_chunks = [c for c in relevant_chunks if c.get('similarity', 0) >= SIMILARITY_THRESHOLD]
394
 
395
+ # After spinner
396
  if not relevant_chunks:
397
  st.warning("❌ No sufficiently relevant passages found (increase threshold or rephrase question).")
398
  else:
399
+ # Generate answer
400
+ client = setup_groq()
401
+ if not client:
402
+ st.error("❌ LLM not configured. Check GROQ_API_KEY and that 'groq' is installed.")
403
+ else:
404
+ answer = generate_answer_with_groq(client, question, relevant_chunks)
 
 
405
 
406
  # Display answer in chat bubble style
407
+ st.markdown("#### 🎯 Answer")
408
+ st.markdown(f'<div class="answer-box">{answer}</div>', unsafe_allow_html=True)
409
+
410
  # Display sources in a clean format
411
+ st.markdown("#### πŸ“š Top Sources")
412
+ st.markdown("*Most relevant passages from your document:*")
413
+
414
+ for i, chunk in enumerate(relevant_chunks, 1):
 
415
  with st.expander(
416
  f"**Source {i}** | πŸ“„ Page {chunk['page_number']} | "
417
+ f"🎯 Relevance: {chunk['similarity']*100:.0f}%"
418
  ):
419
+ st.markdown(f'<div class="source-card">{chunk["text"][:500]}...</div>',
420
+ unsafe_allow_html=True)
 
 
 
421
 
422
  else:
 
423
  st.markdown("""
424
  <div style='text-align: center; padding: 3rem; background-color: white; border-radius: 15px; margin: 2rem 0;'>
425
  <h3>πŸ‘‹ Welcome to PageMentor!</h3>
 
428
  </div>
429
  """, unsafe_allow_html=True)
430
 
431
+ # Footer
 
432
  st.markdown("""
433
  <div class="footer">
434
  <p>Built with ❀️ using Streamlit | Powered by Hugging Face | Β© 2025 PageMentor</p>
 
436
  </div>
437
  """, unsafe_allow_html=True)
438
 
 
439
 
440
+ if __name__ == "__main__":
441
+ main()