Shreyas094 committed (verified)
Commit eaf3dee · 1 Parent(s): 9b298f8

Update app.py

Files changed (1):
  1. app.py +16 -7
app.py CHANGED
@@ -327,13 +327,17 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"
 
-def scrape_full_content(url, max_chars=3000, timeout=5):
+def scrape_full_content(url, max_chars=3000, timeout=5, use_pydf2=True):
     try:
         logger.info(f"Scraping full content from: {url}")
 
         # Check if the URL ends with .pdf
         if url.lower().endswith('.pdf'):
-            return scrape_pdf_content(url, max_chars, timeout)
+            if use_pydf2:
+                return scrape_pdf_content(url, max_chars, timeout)
+            else:
+                logger.info(f"Skipping PDF document: {url}")
+                return None
 
         # Use Newspaper3k for non-PDF content
         content = scrape_with_newspaper(url)
@@ -399,7 +403,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
         return "Error: Unable to generate a summary. Please try again."
 
 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface", use_pydf2=True):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -472,7 +476,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
                 logger.warning(f"No more results returned from SearXNG on page {page}.")
                 break
 
-            for result in results:
+            for result in results:
                 if len(scraped_content) >= num_results:
                     break
 
@@ -486,7 +490,10 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             try:
                 logger.info(f"Processing content from: {url}")
 
-                content = scrape_full_content(url, max_chars, timeout)
+                content = scrape_full_content(url, max_chars, timeout, use_pydf2)
+
+                if content is None:  # This means it's a PDF and use_pydf2 is False
+                    continue
 
                 if not content:
                     logger.warning(f"Failed to scrape content from {url}")
@@ -574,7 +581,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
 
-def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
+def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model, use_pydf2):
    chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])
 
    response = search_and_scrape(
@@ -589,7 +596,8 @@ def chat_function(message, history, num_results, max_chars, time_range, language
        safesearch=safesearch,
        method=method,
        llm_temperature=llm_temperature,
-       model=model
+       model=model,
+       use_pydf2=use_pydf2
    )
 
    yield response
@@ -615,6 +623,7 @@ iface = gr.ChatInterface(
        gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
        gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
        gr.Dropdown(["huggingface", "groq"], value="huggingface", label="LLM Model"),
+       gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
    ],
    additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
    retry_btn="Retry",
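
Note: the diff only threads the new use_pydf2 flag through scrape_full_content, search_and_scrape, chat_function, and the Gradio UI; the body of scrape_pdf_content itself is not shown here. For orientation, below is a minimal sketch of a PyPDF2-based helper matching the call site scrape_pdf_content(url, max_chars, timeout). It is an assumption about the helper's shape, not the repository's actual implementation; PyPDF2 is implied only by the new "Use PyPDF2 for PDF scraping" checkbox.

# Minimal sketch (assumed, not the repository's code) of a PyPDF2-based
# scraper matching the call site scrape_pdf_content(url, max_chars, timeout).
import io
import logging

import requests
from PyPDF2 import PdfReader  # assumed dependency, implied by the checkbox label

logger = logging.getLogger(__name__)

def scrape_pdf_content(url, max_chars=3000, timeout=5):
    """Download a PDF and return up to max_chars of extracted text, or None on failure."""
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        reader = PdfReader(io.BytesIO(response.content))
        text_parts = []
        total = 0
        for page in reader.pages:
            page_text = page.extract_text() or ""
            text_parts.append(page_text)
            total += len(page_text)
            if total >= max_chars:
                break  # stop once enough text has been collected

        return "".join(text_parts)[:max_chars]
    except Exception as e:
        logger.error(f"Error scraping PDF content from {url}: {e}")
        return None

With the flag wired through as in the diff, calling search_and_scrape(..., use_pydf2=False) makes scrape_full_content return None for PDF URLs, and the new "if content is None: continue" check skips those results instead of treating them as scrape failures. Gradio passes additional_inputs to the chat function positionally, so the new gr.Checkbox must stay last in the list to line up with the trailing use_pydf2 parameter of chat_function.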