Shreyas094 committed: Update app.py

app.py CHANGED
@@ -327,13 +327,17 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"

-def scrape_full_content(url, max_chars=3000, timeout=5):
+def scrape_full_content(url, max_chars=3000, timeout=5, use_pydf2=True):
     try:
         logger.info(f"Scraping full content from: {url}")

         # Check if the URL ends with .pdf
         if url.lower().endswith('.pdf'):
-            return scrape_pdf_content(url, max_chars, timeout)
+            if use_pydf2:
+                return scrape_pdf_content(url, max_chars, timeout)
+            else:
+                logger.info(f"Skipping PDF document: {url}")
+                return None

         # Use Newspaper3k for non-PDF content
         content = scrape_with_newspaper(url)
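The branch above calls scrape_pdf_content(url, max_chars, timeout), whose implementation is not part of this diff; the new checkbox label later in the change suggests it is backed by PyPDF2. Below is a minimal, hypothetical sketch of what such a helper could look like, assuming it downloads the PDF with requests and truncates the extracted text to max_chars; the real helper in app.py may differ.

# Hypothetical sketch of the scrape_pdf_content helper referenced above
# (not part of this diff); assumes requests and PyPDF2 are available.
import io
import logging

import requests
from PyPDF2 import PdfReader

logger = logging.getLogger(__name__)

def scrape_pdf_content(url, max_chars=3000, timeout=5):
    try:
        # Download the PDF into memory, respecting the caller's timeout
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Extract text page by page until the character budget is reached
        reader = PdfReader(io.BytesIO(response.content))
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""
            if len(text) >= max_chars:
                break

        return text[:max_chars]
    except Exception as e:
        logger.error(f"Error scraping PDF content from {url}: {e}")
        return ""

Returning an empty string on failure (rather than None) keeps the caller's existing "if not content:" check working, while None is reserved for the new "PDF deliberately skipped" case.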
@@ -399,7 +403,7 @@ Your response should be detailed, informative, accurate, and directly relevant t
     return "Error: Unable to generate a summary. Please try again."

 def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface"):
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, model="huggingface", use_pydf2=True):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
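With the new use_pydf2 keyword in place, callers outside chat_function can opt out of PDF parsing explicitly. An illustrative direct call, assuming app.py's search_and_scrape is in scope; all argument values here are placeholders rather than values taken from the app.

# Illustrative call only; argument values are placeholders, not from app.py.
response = search_and_scrape(
    query="latest quarterly results for ACME Corp",
    chat_history="",
    num_results=5,
    max_chars=3000,
    engines=["google", "bing"],
    llm_temperature=0.2,
    model="huggingface",
    use_pydf2=False,  # skip PDF results instead of parsing them with PyPDF2
)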
@@ -472,7 +476,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             logger.warning(f"No more results returned from SearXNG on page {page}.")
             break

-
+        for result in results:
             if len(scraped_content) >= num_results:
                 break

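For context, the restored "for result in results:" line iterates over one page of SearXNG results, and the surrounding logic stops once num_results items have been scraped. A rough, self-contained sketch of that pagination pattern, assuming a SearXNG instance exposing its JSON API (format=json); SEARXNG_URL, fetch_results, and the exact request parameters are assumptions, while scraped_content, num_results, and page mirror names visible in the diff.

# Rough sketch of the pagination pattern around the restored "for result in results:"
# line; SEARXNG_URL and the request parameters are assumptions, not app.py code.
import requests

SEARXNG_URL = "https://searx.example.org/search"  # placeholder instance

def fetch_results(query, page, method="GET", timeout=5):
    """Fetch one page of results from a SearXNG instance via its JSON API."""
    params = {"q": query, "format": "json", "pageno": page}
    if method == "GET":
        resp = requests.get(SEARXNG_URL, params=params, timeout=timeout)
    else:
        resp = requests.post(SEARXNG_URL, data=params, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("results", [])

def collect_results(query, num_results=5):
    """Page through results until num_results items are collected or results run out."""
    scraped_content = []
    page = 1
    while len(scraped_content) < num_results:
        results = fetch_results(query, page)
        if not results:
            break  # no more results from SearXNG on this page
        for result in results:
            if len(scraped_content) >= num_results:
                break
            scraped_content.append(result)  # app.py scrapes result["url"] here instead
        page += 1
    return scraped_content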
@@ -486,7 +490,10 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
             try:
                 logger.info(f"Processing content from: {url}")

-                content = scrape_full_content(url, max_chars, timeout)
+                content = scrape_full_content(url, max_chars, timeout, use_pydf2)
+
+                if content is None:  # This means it's a PDF and use_pydf2 is False
+                    continue

                 if not content:
                     logger.warning(f"Failed to scrape content from {url}")
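Note that scrape_full_content can now signal two different "no content" outcomes: None means a PDF was deliberately skipped because use_pydf2 is False, while an empty string still means scraping failed. Since None is also falsy, the "is None" check has to run before "if not content:", otherwise skipped PDFs would be logged as failures. A tiny, self-contained illustration of that ordering (the classify helper is purely illustrative, not part of app.py):

# Illustrative only: shows why "is None" must be tested before "not content".
def classify(content):
    if content is None:
        return "skipped-pdf"     # use_pydf2 was False, not an error
    if not content:
        return "scrape-failed"   # empty string: scraping genuinely failed
    return "ok"

assert classify(None) == "skipped-pdf"
assert classify("") == "scrape-failed"
assert classify("extracted article text") == "ok"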
@@ -574,7 +581,7 @@ def search_and_scrape(query, chat_history, num_results=5, max_chars=3000, time_r
         return f"An unexpected error occurred during the search and scrape process: {e}"


-def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model):
+def chat_function(message, history, num_results, max_chars, time_range, language, category, engines, safesearch, method, llm_temperature, model, use_pydf2):
     chat_history = "\n".join([f"{role}: {msg}" for role, msg in history])

     response = search_and_scrape(
@@ -589,7 +596,8 @@ def chat_function(message, history, num_results, max_chars, time_range, language
         safesearch=safesearch,
         method=method,
         llm_temperature=llm_temperature,
-        model=model
+        model=model,
+        use_pydf2=use_pydf2
     )

     yield response
@@ -615,6 +623,7 @@ iface = gr.ChatInterface(
         gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
         gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
         gr.Dropdown(["huggingface", "groq"], value="huggingface", label="LLM Model"),
+        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
     ],
     additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
     retry_btn="Retry",
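The new gr.Checkbox is appended as the last entry of additional_inputs, and gr.ChatInterface passes additional inputs to the chat function positionally after message and history, which is why use_pydf2 was added at the end of chat_function's signature. A stripped-down sketch of that wiring with only the new checkbox (the app's other inputs and parameters are omitted here):

# Stripped-down sketch: the checkbox value arrives as the last positional
# argument after (message, history); omits the app's other inputs.
import gradio as gr

def chat_function(message, history, use_pydf2):
    yield f"use_pydf2 is {use_pydf2} for query: {message!r}"

demo = gr.ChatInterface(
    fn=chat_function,
    additional_inputs=[
        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
    ],
    additional_inputs_accordion=gr.Accordion("⚙️ Advanced Parameters", open=True),
)

if __name__ == "__main__":
    demo.launch()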
|