Shreyas094
committed
Update app.py
app.py CHANGED
@@ -28,6 +28,7 @@ from groq import Groq
 import os
 from mistralai import Mistral
 from dotenv import load_dotenv
+import re
 
 # Automatically get the current year
 current_year = datetime.datetime.now().year
@@ -222,7 +223,13 @@ Rephrased query:
         logger.error(f"Error rephrasing query with LLM: {e}")
         return query  # Fallback to original query if rephrasing fails
 
-def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5):
+def extract_entity_domain(query):
+    # Use a simple regex pattern to extract domain names from the query
+    domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b'
+    matches = re.findall(domain_pattern, query)
+    return matches[0] if matches else None
+
+def rerank_documents_with_priority(query, documents, entity_domain, similarity_threshold=0.95, max_results=5):
     try:
         # Step 1: Encode the query and document summaries
         query_embedding = similarity_model.encode(query, convert_to_tensor=True)
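Note: a quick way to sanity-check the new extract_entity_domain is to run its regex against a few queries. This is a sketch; the pattern is copied verbatim from the hunk above, and the sample queries are invented for illustration:

import re

# Pattern from extract_entity_domain; group 1 captures the bare domain
# (the scheme and "www." prefix are matched but not captured).
domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b'

for query in [
    "latest earnings coverage on www.reuters.com",   # hypothetical query
    "https://bloomberg.com markets wrap today",      # hypothetical query
    "What did the Fed announce this week?",          # no domain present
]:
    matches = re.findall(domain_pattern, query)
    print(matches[0] if matches else None)
# Prints: reuters.com, bloomberg.com, None

Since re.findall returns every capture, the function silently keeps only the first domain when a query mentions several.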
@@ -240,8 +247,8 @@ def rerank_documents(query, documents, similarity_threshold=0.95, max_results=5)
         # Combine documents and cosine scores
         scored_documents = list(zip(documents, cosine_scores))
 
-        # Step 3: Sort documents by cosine similarity score
-        scored_documents.sort(key=lambda x: x[1], reverse=True)
+        # Step 3: Sort documents by cosine similarity score and prioritize entity domain
+        scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)
 
         # Step 4: Filter out similar documents
         filtered_docs = []
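Note: the new sort key does two jobs at once. Python sorts tuples element-wise, so (not x[0]['is_entity_domain'], -x[1]) puts entity-domain documents first (False sorts before True) and orders each group by descending similarity; reverse=False is then correct where the old single-key sort needed reverse=True. A toy illustration with plain floats standing in for the cosine-score tensors (all data hypothetical):

# (document, score) pairs; the real code zips documents with tensor scores.
scored_documents = [
    ({'title': 'A', 'is_entity_domain': False}, 0.91),
    ({'title': 'B', 'is_entity_domain': True},  0.72),
    ({'title': 'C', 'is_entity_domain': True},  0.88),
    ({'title': 'D', 'is_entity_domain': False}, 0.95),
]

# Same key as the hunk above: entity-domain docs first, then best score first.
scored_documents.sort(key=lambda x: (not x[0]['is_entity_domain'], -x[1]), reverse=False)

print([doc['title'] for doc, _ in scored_documents])  # ['C', 'B', 'D', 'A']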
@@ -428,7 +435,11 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             logger.info("No need to perform search based on the rephrased query.")
             return "No search needed for the provided input."
 
-        # Step 2: Perform search
+        # Step 2: Extract entity domain
+        entity_domain = extract_entity_domain(rephrased_query)
+        logger.info(f"Extracted entity domain: {entity_domain}")
+
+        # Step 3: Perform search
         # Search query parameters
         params = {
             'q': rephrased_query,
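Note: extract_entity_domain returns None when the query names no site, and this hunk logs and carries that None forward rather than branching on it. That works because the later netloc comparison simply evaluates to False against None, so the pipeline degrades to plain similarity ranking. A minimal sketch of that path (query text hypothetical, pattern copied from the diff):

import re

domain_pattern = r'\b(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)\b'
matches = re.findall(domain_pattern, "top semiconductor stocks this quarter")
entity_domain = matches[0] if matches else None

print(entity_domain)                  # None
print("nvidia.com" == entity_domain)  # False: nothing gets flagged, ranking stays score-only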
@@ -534,7 +545,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
 
         logger.info(f"Successfully scraped {len(scraped_content)} documents.")
 
-
+        # Step 4: Assess relevance, summarize, and check for uniqueness
         relevant_documents = []
         unique_summaries = []
         for doc in scraped_content:
@@ -545,11 +556,14 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             summary_text = summary.replace("Summary: ", "").strip()
 
             if is_content_unique(summary_text, unique_summaries):
+                doc_domain = urlparse(doc['url']).netloc
+                is_entity_domain = doc_domain == entity_domain
                 relevant_documents.append({
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": summary_text,
-                    "scraper": doc['scraper']
+                    "scraper": doc['scraper'],
+                    "is_entity_domain": is_entity_domain
                 })
                 unique_summaries.append(summary_text)
             else:
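Note: one caveat in the new comparison: urlparse(...).netloc preserves the "www." prefix (and any port), while extract_entity_domain's capture group strips "www.", so the exact == can miss pages served from the www host. A minimal check (URLs hypothetical):

from urllib.parse import urlparse

entity_domain = "reuters.com"  # what extract_entity_domain returns: "www." stripped

for url in ["https://www.reuters.com/markets/", "https://reuters.com/business/"]:
    doc_domain = urlparse(url).netloc
    print(doc_domain, doc_domain == entity_domain)
# www.reuters.com False  <- same site, but the exact match misses it
# reuters.com True

If that matters in practice, a suffix test such as doc_domain == entity_domain or doc_domain.endswith('.' + entity_domain) would accept both forms; the diff as written uses the strict equality.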
@@ -559,8 +573,8 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             logger.warning("No relevant and unique documents found.")
             return "No relevant and unique financial news found for the given query."
 
-        # Step 3: Rerank documents based on similarity to query
-        reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
+        # Step 5: Rerank documents based on similarity to query and prioritize entity domain
+        reranked_docs = rerank_documents_with_priority(rephrased_query, relevant_documents, entity_domain, similarity_threshold=0.95, max_results=num_results)
 
         if not reranked_docs:
             logger.warning("No documents remained after reranking.")
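Note: to see the end-to-end effect of the renamed call, here is a compressed, self-contained stand-in for rerank_documents_with_priority. Toy scores replace the sentence-transformer cosine similarities and the Step 4 near-duplicate filter is omitted, but the ordering and max_results trimming follow the diff (all data hypothetical):

def rerank_with_priority(documents, scores, max_results=5):
    # Mirrors the diff's key: entity-domain docs first, then descending score.
    scored = sorted(zip(documents, scores),
                    key=lambda x: (not x[0]['is_entity_domain'], -x[1]))
    return [doc for doc, _ in scored][:max_results]

docs = [
    {"url": "https://reuters.com/a", "is_entity_domain": True},
    {"url": "https://blog.example.net/b", "is_entity_domain": False},
    {"url": "https://reuters.com/c", "is_entity_domain": True},
]
print(rerank_with_priority(docs, [0.61, 0.97, 0.84], max_results=2))
# Both reuters.com documents outrank the higher-scoring off-domain one.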
@@ -595,7 +609,6 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
-
 def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model, use_pydf2):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 