Shreyas094 committed on
Commit
27f1192
·
verified ·
1 Parent(s): d0bc86a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -9
app.py CHANGED
@@ -28,6 +28,7 @@ from groq import Groq
28
  import os
29
  from mistralai import Mistral
30
  from dotenv import load_dotenv
 
31
 
32
  # Automatically get the current year
33
  current_year = datetime.datetime.now().year
@@ -222,7 +223,13 @@ Rephrased query:
222
  logger.error(f"Error rephrasing query with LLM: {e}")
223
  return query # Fallback to original query if rephrasing fails
224
 
225
- def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5):
 
 
 
 
 
 
226
  try:
227
  # Step 1: Encode the query and document summaries
228
  query_embedding = similarity_model.encode(query, convert_to_tensor=True)
@@ -240,8 +247,8 @@ def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5)
240
  # Combine documents and cosine scores
241
  scored_documents = list(zip(documents, cosine_scores))
242
 
243
- # Step 3: Sort documents by cosine similarity score
244
- scored_documents.sort(key=lambda x: x[1], reverse=True)
245
 
246
  # Step 4: Filter out similar documents
247
  filtered_docs = []
@@ -428,7 +435,11 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
428
  logger.info("No need to perform search based on the rephrased query.")
429
  return "No search needed for the provided input."
430
 
431
- # Step 2: Perform search
 
 
 
 
432
  # Search query parameters
433
  params = {
434
  'q': rephrased_query,
@@ -534,7 +545,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
534
 
535
  logger.info(f"Successfully scraped {len(scraped_content)} documents.")
536
 
537
- # Step 3: Assess relevance, summarize, and check for uniqueness
538
  relevant_documents = []
539
  unique_summaries = []
540
  for doc in scraped_content:
@@ -545,11 +556,14 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
545
  summary_text = summary.replace("Summary: ", "").strip()
546
 
547
  if is_content_unique(summary_text, unique_summaries):
 
 
548
  relevant_documents.append({
549
  "title": doc['title'],
550
  "url": doc['url'],
551
  "summary": summary_text,
552
- "scraper": doc['scraper']
 
553
  })
554
  unique_summaries.append(summary_text)
555
  else:
@@ -559,8 +573,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
559
  logger.warning("No relevant and unique documents found.")
560
  return "No relevant and unique financial news found for the given query."
561
 
562
- # Step 4: Rerank documents based on similarity to query
563
- reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
564
 
565
  if not reranked_docs:
566
  logger.warning("No documents remained after reranking.")
@@ -595,7 +609,6 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
595
  logger.error(f"Unexpected error in search_and_scrape: {e}")
596
  return f"An unexpected error occurred during the search and scrape process: {e}"
597
 
598
-
599
  def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model, use_pydf2):
600
  chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
601
 
 
28
  import os
29
  from mistralai import Mistral
30
  from dotenv import load_dotenv
31
+ import re
32
 
33
  # Automatically get the current year
34
  current_year = datetime.datetime.now().year
 
223
  logger.error(f"Error rephrasing query with LLM: {e}")
224
  return query # Fallback to original query if rephrasing fails
225
 
226
def extract_entity_domain(query):
    """Extract the first domain name mentioned in *query*.

    Strips an optional ``http(s)://`` scheme and a leading ``www.`` so the
    result is a bare domain such as ``example.com``.

    The final label must be an alphabetic TLD of at least two characters,
    so numeric tokens like version numbers ("3.5") are not misdetected as
    domains (the original pattern accepted any dotted alphanumeric run).

    NOTE(review): because ``www.`` is stripped here, a later equality check
    against ``urlparse(url).netloc`` (which keeps ``www.``) may never match
    for www-prefixed URLs — confirm the caller normalizes both sides.

    Args:
        query: Free-text search query that may contain a URL or domain.

    Returns:
        The first matched domain string, or ``None`` if no domain is found.
    """
    # Optional scheme and www. prefix are matched but excluded from the
    # capture group; the TLD must be purely alphabetic, length >= 2.
    domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,})\b'
    matches = re.findall(domain_pattern, query)
    return matches[0] if matches else None
231
+
232
+ def rerank_documents_with_priority(query, documents, entity_domain, similarity_threshold=0.95, max_results=5):
233
  try:
234
  # Step 1: Encode the query and document summaries
235
  query_embedding = similarity_model.encode(query, convert_to_tensor=True)
 
247
  # Combine documents and cosine scores
248
  scored_documents = list(zip(documents, cosine_scores))
249
 
250
+ # Step 3: Sort documents by cosine similarity score and prioritize entity domain
251
+ scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)
252
 
253
  # Step 4: Filter out similar documents
254
  filtered_docs = []
 
435
  logger.info("No need to perform search based on the rephrased query.")
436
  return "No search needed for the provided input."
437
 
438
+ # Step 2: Extract entity domain
439
+ entity_domain = extract_entity_domain(rephrased_query)
440
+ logger.info(f"Extracted entity domain: {entity_domain}")
441
+
442
+ # Step 3: Perform search
443
  # Search query parameters
444
  params = {
445
  'q': rephrased_query,
 
545
 
546
  logger.info(f"Successfully scraped {len(scraped_content)} documents.")
547
 
548
+ # Step 4: Assess relevance, summarize, and check for uniqueness
549
  relevant_documents = []
550
  unique_summaries = []
551
  for doc in scraped_content:
 
556
  summary_text = summary.replace("Summary: ", "").strip()
557
 
558
  if is_content_unique(summary_text, unique_summaries):
559
+ doc_domain = urlparse(doc['url']).netloc
560
+ is_entity_domain = doc_domain == entity_domain
561
  relevant_documents.append({
562
  "title": doc['title'],
563
  "url": doc['url'],
564
  "summary": summary_text,
565
+ "scraper": doc['scraper'],
566
+ "is_entity_domain": is_entity_domain
567
  })
568
  unique_summaries.append(summary_text)
569
  else:
 
573
  logger.warning("No relevant and unique documents found.")
574
  return "No relevant and unique financial news found for the given query."
575
 
576
+ # Step 5: Rerank documents based on similarity to query and prioritize entity domain
577
+ reranked_docs = rerank_documents_with_priority(rephrased_query, relevant_documents, entity_domain, similarity_threshold=0.95, max_results=num_results)
578
 
579
  if not reranked_docs:
580
  logger.warning("No documents remained after reranking.")
 
609
  logger.error(f"Unexpected error in search_and_scrape: {e}")
610
  return f"An unexpected error occurred during the search and scrape process: {e}"
611
 
 
612
  def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model, use_pydf2):
613
  chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
614