Shreyas094 committed: Update app.py

app.py CHANGED
@@ -327,13 +327,17 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url, max_chars=3000, timeout=5):
+def scrape_full_content(url, max_chars=3000, timeout=5, use_pydf2=True):
     try:
         logger.info(f"Scraping full content from: {url}")

         # Check if the URL ends with .pdf
         if url.lower().endswith('.pdf'):
-            return scrape_pdf_content(url, max_chars, timeout)
+            if use_pydf2:
+                return scrape_pdf_content(url, max_chars, timeout)
+            else:
+                logger.info(f"Skipping PDF document: {url}")
+                return None

         # Use Newspaper3k for non-PDF content
         content = scrape_with_newspaper(url)
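The branch above calls scrape_pdf_content(url, max_chars, timeout), whose implementation is not part of this diff; the new checkbox label later in the change suggests it is backed by PyPDF2. Below is a minimal, hypothetical sketch of what such a helper could look like, assuming it downloads the PDF with requests and truncates the extracted text to max_chars; the real helper in app.py may differ.

# Hypothetical sketch of the scrape_pdf_content helper referenced above
# (not part of this diff); assumes requests and PyPDF2 are available.
import io
import logging

import requests
from PyPDF2 import PdfReader

logger = logging.getLogger(__name__)

def scrape_pdf_content(url, max_chars=3000, timeout=5):
    try:
        # Download the PDF into memory, respecting the caller's timeout
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Extract text page by page until the character budget is reached
        reader = PdfReader(io.BytesIO(response.content))
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
            if len(text) >= max_chars:
                break

        return text[:max_chars]
    except Exception as e:
        logger.error(f"Error scraping PDF content from {url}: {e}")
        return ""

Returning an empty string on failure (rather than None) keeps the caller's existing "if not content:" check working, while None is reserved for the new "PDF deliberately skipped" case.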
@@ -399,7 +403,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
     return "Error: Unable to generate a summary. Please try again."

 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface", use_pydf2=True):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
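With the new use_pydf2 keyword in place, callers outside chat_function can opt out of PDF parsing explicitly. An illustrative direct call, assuming app.py's search_and_scrape is in scope; all argument values here are placeholders rather than values taken from the app.

# Illustrative call only; argument values are placeholders, not from app.py.
response = search_and_scrape(
    query="latest quarterly results for ACME Corp",
    chat_history="",
    num_results=5,
    max_chars=3000,
    engines=["google", "bing"],
    llm_temperature=0.2,
    model="huggingface",
    use_pydf2=False,  # skip PDF results instead of parsing them with PyPDF2
)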
@@ -472,7 +476,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             logger.warning(f"No more results returned from SearXNG on page {page}.")
             break

-
+        for result in results:
             if len(scraped_content) >= num_results:
                 break

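For context, the restored "for result in results:" line iterates over one page of SearXNG results, and the surrounding logic stops once num_results items have been scraped. A rough, self-contained sketch of that pagination pattern, assuming a SearXNG instance exposing its JSON API (format=json); SEARXNG_URL, fetch_results, and the exact request parameters are assumptions, while scraped_content, num_results, and page mirror names visible in the diff.

# Rough sketch of the pagination pattern around the restored "for result in results:"
# line; SEARXNG_URL and the request parameters are assumptions, not app.py code.
import requests

SEARXNG_URL = "https://searx.example.org/search"  # placeholder instance

def fetch_results(query, page, method="GET", timeout=5):
    """Fetch one page of results from a SearXNG instance via its JSON API."""
    params = {"q": query, "format": "json", "pageno": page}
    if method == "GET":
        resp = requests.get(SEARXNG_URL, params=params, timeout=timeout)
    else:
        resp = requests.post(SEARXNG_URL, data=params, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("results", [])

def collect_results(query, num_results=5):
    """Page through results until num_results items are collected or results run out."""
    scraped_content = []
    page = 1
    while len(scraped_content) < num_results:
        results = fetch_results(query, page)
        if not results:
            break  # no more results from SearXNG on this page
        for result in results:
            if len(scraped_content) >= num_results:
                break
            scraped_content.append(result)  # app.py scrapes result["url"] here instead
        page += 1
    return scraped_content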
@@ -486,7 +490,10 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             try:
                 logger.info(f"Processing content from: {url}")

-                content = scrape_full_content(url, max_chars, timeout)
+                content = scrape_full_content(url, max_chars, timeout, use_pydf2)
+
+                if content is None:  # This means it's a PDF and use_pydf2 is False
+                    continue

                 if not content:
                     logger.warning(f"Failed to scrape content from {url}")
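Note that scrape_full_content can now signal two different "no content" outcomes: None means a PDF was deliberately skipped because use_pydf2 is False, while an empty string still means scraping failed. Since None is also falsy, the "is None" check has to run before "if not content:", otherwise skipped PDFs would be logged as failures. A tiny, self-contained illustration of that ordering (the classify helper is purely illustrative, not part of app.py):

# Illustrative only: shows why "is None" must be tested before "not content".
def classify(content):
    if content is None:
        return "skipped-pdf"     # use_pydf2 was False, not an error
    if not content:
        return "scrape-failed"   # empty string: scraping genuinely failed
    return "ok"

assert classify(None) == "skipped-pdf"
assert classify("") == "scrape-failed"
assert classify("extracted article text") == "ok"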
@@ -574,7 +581,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         return f"An unexpected error occurred during the search and scrape process: {e}"


-def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
+def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model, use_pydf2):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])

     response = search_and_scrape(
@@ -589,7 +596,8 @@ def chat_function(message, history, num_results, max_chars, time_range, language
         safesearch=safesearch,
         method=method,
         llm_temperature=llm_temperature,
-        model=model
+        model=model,
+        use_pydf2=use_pydf2
     )

     yield response
@@ -615,6 +623,7 @@ iface = gr.ChatInterface(
         gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
         gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
         gr.Dropdown(["huggingface", "groq"], value="huggingface", label="LLM Model"),
+        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
     ],
     additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
     retry_btn="Retry",
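The new gr.Checkbox is appended as the last entry of additional_inputs, and gr.ChatInterface passes additional inputs to the chat function positionally after message and history, which is why use_pydf2 was added at the end of chat_function's signature. A stripped-down sketch of that wiring with only the new checkbox (the app's other inputs and parameters are omitted here):

# Stripped-down sketch: the checkbox value arrives as the last positional
# argument after (message, history); omits the app's other inputs.
import gradio as gr

def chat_function(message, history, use_pydf2):
    yield f"use_pydf2 is {use_pydf2} for query: {message!r}"

demo = gr.ChatInterface(
    fn=chat_function,
    additional_inputs=[
        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
    ],
    additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
)

if __name__ == "__main__":
    demo.launch()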
|