Shreyas094 committed on
Commit
5a71f95
·
verified ·
1 Parent(s): 1e878de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -10
app.py CHANGED
@@ -25,6 +25,8 @@ import requests
25
  import random
26
  import datetime
27
  from groq import Groq
 
 
28
 
29
  # Automatically get the current year
30
  current_year = datetime.datetime.now().year
@@ -56,6 +58,9 @@ groq_client = Groq(api_key=GROQ_API_KEY)
56
  # Initialize the similarity model
57
  similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
58
 
 
 
 
59
 
60
  # Set up a session with retry mechanism
61
  def requests_retry_session(
@@ -418,6 +423,46 @@ Your response should be detailed, informative, accurate, and directly relevant t
418
  logger.error(f"Error in LLM summarization: {e}")
419
  return "Error: Unable to generate a summary. Please try again."
420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
422
  engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
423
  try:
@@ -566,12 +611,17 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
566
 
567
  logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
568
 
569
- # Step 5: Scrape full content for top documents (up to num_results)
570
- for doc in reranked_docs[:num_results]:
571
- full_content = scrape_full_content(doc['url'], max_chars)
572
- doc['full_content'] = full_content
573
-
574
- # Prepare JSON for LLM
 
 
 
 
 
575
  llm_input = {
576
  "query": query,
577
  "documents": [
@@ -581,10 +631,17 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
581
  "summary": doc['summary'],
582
  "full_content": doc['full_content']
583
  } for doc in reranked_docs[:num_results]
 
 
 
 
 
 
 
584
  ]
585
  }
586
 
587
- # Step 6: LLM Summarization
588
  llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
589
 
590
  return llm_summary
@@ -593,7 +650,6 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
593
  logger.error(f"Unexpected error in search_and_scrape: {e}")
594
  return f"An unexpected error occurred during the search and scrape process: {e}"
595
 
596
-
597
  def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
598
  chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
599
 
@@ -611,8 +667,8 @@ def chat_function(message, history, num_results, max_chars, time_range, language
611
  llm_temperature=llm_temperature,
612
  model=model
613
  )
614
-
615
- yield response
616
 
617
  iface = gr.ChatInterface(
618
  chat_function,
 
25
  import random
26
  import datetime
27
  from groq import Groq
28
+ import faiss
29
+ import numpy as np
30
 
31
  # Automatically get the current year
32
  current_year = datetime.datetime.now().year
 
58
  # Initialize the similarity model
59
  similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
60
 
61
+ # Global FAISS index and the parallel list of documents it indexes (vector i <-> document_store[i])
62
+ faiss_index = None
63
+ document_store = []
64
 
65
  # Set up a session with retry mechanism
66
  def requests_retry_session(
 
423
  logger.error(f"Error in LLM summarization: {e}")
424
  return "Error: Unable to generate a summary. Please try again."
425
 
426
def create_or_reset_faiss_index(dimension=384):
    """Replace the module-level FAISS index with a new, empty flat L2 index.

    Args:
        dimension: Length of the vectors the index will hold. The default,
            384, is the embedding size noted for the 'all-MiniLM-L6-v2'
            model.
    """
    global faiss_index
    empty_index = faiss.IndexFlatL2(dimension)
    faiss_index = empty_index
429
+
430
def add_documents_to_faiss(documents):
    """Embed *documents* and load them into the module-level FAISS index.

    Any previously stored batch is discarded first, from both
    ``document_store`` and the index, so that vector position ``i`` in the
    index always corresponds to ``document_store[i]``.

    Args:
        documents: Iterable of dicts. The ``title`` and ``content`` entries
            are used to build the text that gets embedded; a missing or
            empty entry is treated as an empty string.

    Returns:
        None. The results are the updated ``faiss_index`` and
        ``document_store`` globals.
    """
    global faiss_index

    # Clear the store and the index together; clearing only the store would
    # leave stale vectors whose positions no longer map to any document.
    document_store.clear()
    if faiss_index is not None:
        faiss_index.reset()

    documents = list(documents or [])
    if not documents:
        # Nothing to embed; an empty array would not be a valid 2-D batch.
        return

    # Title plus the first 500 characters of content keeps embedding cheap.
    texts = [
        f"{doc.get('title', '')} {(doc.get('content') or '')[:500]}"
        for doc in documents
    ]

    # Encode the whole batch in one call with the model this file
    # initialises (`similarity_model`); FAISS needs contiguous float32 rows.
    embeddings_array = np.ascontiguousarray(
        similarity_model.encode(texts), dtype=np.float32
    )

    if faiss_index is None:
        # No index yet: size it from the embeddings actually produced.
        faiss_index = faiss.IndexFlatL2(embeddings_array.shape[1])

    faiss_index.add(embeddings_array)
    document_store.extend(documents)
450
+
451
def search_similar_documents(query, k=5):
    """Return the stored documents whose embeddings are closest to *query*.

    Args:
        query: Text to embed and look up in the module-level FAISS index.
        k: Maximum number of documents to return.

    Returns:
        A list of at most ``k`` entries from ``document_store``, in the
        order returned by the index search. The list is empty when no index
        or no documents are available, or when ``k`` is not positive.
    """
    if faiss_index is None or not document_store:
        return []

    # FAISS pads its result with -1 when asked for more neighbours than it
    # holds; indexing the store with -1 would silently return the last
    # document, so never ask for more than is stored.
    top_k = min(k, faiss_index.ntotal, len(document_store))
    if top_k <= 0:
        return []

    # Embed with the model this file initialises (`similarity_model`) and
    # shape the vector as a single-row float32 batch for FAISS.
    query_vector = np.asarray(
        similarity_model.encode(query), dtype=np.float32
    ).reshape(1, -1)

    _distances, indices = faiss_index.search(query_vector, top_k)

    # Keep only positions that map to a stored document.
    return [
        document_store[int(i)]
        for i in indices[0]
        if 0 <= i < len(document_store)
    ]
465
+
466
  def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
467
  engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
468
  try:
 
611
 
612
  logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, finance-related documents.")
613
 
614
+ # After Step 5: Scrape full content for top documents
615
+ # Create or reset FAISS index
616
+ create_or_reset_faiss_index()
617
+
618
+ # Add documents to FAISS index
619
+ add_documents_to_faiss(reranked_docs[:num_results])
620
+
621
+ # Search for similar documents in the vector DB
622
+ similar_docs = search_similar_documents(query, k=num_results)
623
+
624
+ # Prepare JSON for LLM, now including similar documents from vector DB
625
  llm_input = {
626
  "query": query,
627
  "documents": [
 
631
  "summary": doc['summary'],
632
  "full_content": doc['full_content']
633
  } for doc in reranked_docs[:num_results]
634
+ ],
635
+ "similar_documents": [
636
+ {
637
+ "title": doc['title'],
638
+ "url": doc['url'],
639
+ "content": doc['content'][:500] # Limit content for brevity
640
+ } for doc in similar_docs
641
  ]
642
  }
643
 
644
+ # Step 6: LLM Summarization (keep as is)
645
  llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
646
 
647
  return llm_summary
 
650
  logger.error(f"Unexpected error in search_and_scrape: {e}")
651
  return f"An unexpected error occurred during the search and scrape process: {e}"
652
 
 
653
  def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
654
  chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
655
 
 
667
  llm_temperature=llm_temperature,
668
  model=model
669
  )
670
+
671
+ yield response
672
 
673
  iface = gr.ChatInterface(
674
  chat_function,