Shreyas094 committed on
Commit
b864c9d
·
verified ·
1 Parent(s): 5239e89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -56
app.py CHANGED
@@ -475,9 +475,9 @@ Your response should be detailed, informative, accurate, and directly relevant t
475
  return "Error: Unable to generate a summary. Please try again."
476
 
477
  def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
478
- engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5):
479
 
480
- try:
481
  # Step 1: Rephrase the Query
482
  rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
483
  logger.info(f"Rephrased Query: {rephrased_query}")
@@ -489,74 +489,76 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
489
  # Step 2: Perform search
490
  if use_duckduckgo:
491
  search_results = duckduckgo_search(rephrased_query, num_results, time_range, language, safesearch)
 
492
  else:
493
- # Search query parameters
494
- params = {
495
- 'q': rephrased_query,
496
- 'format': 'json',
497
- 'time_range': time_range,
498
- 'language': language,
499
- 'category': category,
500
- 'engines': ','.join(engines),
501
- 'safesearch': safesearch
502
- }
 
 
 
503
 
504
- # Remove empty parameters
505
- params = {k: v for k, v in params.items() if v != ""}
506
-
507
- # If no engines are specified, set default engines
508
- if 'engines' not in params:
509
- params['engines'] = 'google' # Default to 'google' or any preferred engine
510
- logger.info("No engines specified. Defaulting to 'google'.")
511
-
512
- # Headers for SearXNG request
513
- headers = {
514
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
515
- 'Accept': 'application/json, text/javascript, */*; q=0.01',
516
- 'Accept-Language': 'en-US,en;q=0.5',
517
- 'Origin': 'https://shreyas094-searxng-local.hf.space',
518
- 'Referer': 'https://shreyas094-searxng-local.hf.space/',
519
- 'DNT': '1',
520
- 'Connection': 'keep-alive',
521
- 'Sec-Fetch-Dest': 'empty',
522
- 'Sec-Fetch-Mode': 'cors',
523
- 'Sec-Fetch-Site': 'same-origin',
524
- }
525
 
526
  scraped_content = []
527
  page = 1
528
  while len(scraped_content) < num_results:
529
- # Update params with current page
530
- params['pageno'] = page
 
531
 
532
- # Send request to SearXNG
533
- logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
534
- session = requests_retry_session()
535
 
536
- try:
537
- if method.upper() == "GET":
538
- response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
539
- else: # POST
540
- response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
541
-
542
- response.raise_for_status()
543
- except requests.exceptions.RequestException as e:
544
- logger.error(f"Error during SearXNG request: {e}")
545
- return f"An error occurred during the search request: {e}"
546
 
547
- search_results = response.json()
548
- logger.debug(f"SearXNG Response: {search_results}")
549
 
550
- results = search_results.get('results', [])
551
- if not results:
552
- logger.warning(f"No more results returned from SearXNG on page {page}.")
553
- break
554
 
555
  for result in results:
556
  if len(scraped_content) >= num_results:
557
  break
558
 
559
- url = result.get('url', '')
560
  title = result.get('title', 'No title')
561
 
562
  if not is_valid_url(url):
@@ -584,7 +586,10 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_cha
584
  except Exception as e:
585
  logger.error(f"Unexpected error while scraping {url}: {e}")
586
 
587
- page += 1
 
 
 
588
 
589
  if not scraped_content:
590
  logger.warning("No content scraped from search results.")
 
475
  return "Error: Unable to generate a summary. Please try again."
476
 
477
  def search_and_scrape(query, chat_history, num_results=5, scraper="bs4", max_chars=3000, time_range="", language="all", category="",
478
+ engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=5, use_duckduckgo=False):
479
 
480
+ try:
481
  # Step 1: Rephrase the Query
482
  rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
483
  logger.info(f"Rephrased Query: {rephrased_query}")
 
489
  # Step 2: Perform search
490
  if use_duckduckgo:
491
  search_results = duckduckgo_search(rephrased_query, num_results, time_range, language, safesearch)
492
+ results = search_results # Assign DuckDuckGo results directly
493
  else:
494
+ # Search query parameters
495
+ params = {
496
+ 'q': rephrased_query,
497
+ 'format': 'json',
498
+ 'time_range': time_range,
499
+ 'language': language,
500
+ 'category': category,
501
+ 'engines': ','.join(engines),
502
+ 'safesearch': safesearch
503
+ }
504
+
505
+ # Remove empty parameters
506
+ params = {k: v for k, v in params.items() if v != ""}
507
 
508
+ # If no engines are specified, set default engines
509
+ if 'engines' not in params:
510
+ params['engines'] = 'google' # Default to 'google' or any preferred engine
511
+ logger.info("No engines specified. Defaulting to 'google'.")
512
+
513
+ # Headers for SearXNG request
514
+ headers = {
515
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
516
+ 'Accept': 'application/json, text/javascript, */*; q=0.01',
517
+ 'Accept-Language': 'en-US,en;q=0.5',
518
+ 'Origin': 'https://shreyas094-searxng-local.hf.space',
519
+ 'Referer': 'https://shreyas094-searxng-local.hf.space/',
520
+ 'DNT': '1',
521
+ 'Connection': 'keep-alive',
522
+ 'Sec-Fetch-Dest': 'empty',
523
+ 'Sec-Fetch-Mode': 'cors',
524
+ 'Sec-Fetch-Site': 'same-origin',
525
+ }
 
 
 
526
 
527
  scraped_content = []
528
  page = 1
529
  while len(scraped_content) < num_results:
530
+ if not use_duckduckgo:
531
+ # Update params with current page
532
+ params['pageno'] = page
533
 
534
+ # Send request to SearXNG
535
+ logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
536
+ session = requests_retry_session()
537
 
538
+ try:
539
+ if method.upper() == "GET":
540
+ response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
541
+ else: # POST
542
+ response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
543
+
544
+ response.raise_for_status()
545
+ except requests.exceptions.RequestException as e:
546
+ logger.error(f"Error during SearXNG request: {e}")
547
+ return f"An error occurred during the search request: {e}"
548
 
549
+ search_results = response.json()
550
+ logger.debug(f"SearXNG Response: {search_results}")
551
 
552
+ results = search_results.get('results', [])
553
+ if not results:
554
+ logger.warning(f"No more results returned from SearXNG on page {page}.")
555
+ break
556
 
557
  for result in results:
558
  if len(scraped_content) >= num_results:
559
  break
560
 
561
+ url = result.get('url', '') if not use_duckduckgo else result.get('href', '')
562
  title = result.get('title', 'No title')
563
 
564
  if not is_valid_url(url):
 
586
  except Exception as e:
587
  logger.error(f"Unexpected error while scraping {url}: {e}")
588
 
589
+ if use_duckduckgo:
590
+ break # DuckDuckGo search doesn't support pagination in this implementation
591
+ else:
592
+ page += 1
593
 
594
  if not scraped_content:
595
  logger.warning("No content scraped from search results.")