SearXNG-WebSearch-Agent

Running

App Files Files Community

Shreyas094 committed on Oct 2, 2024

Commit

07efc76

•

1 Parent(s): 1a81bf1

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -62

app.py CHANGED Viewed

@@ -371,71 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
                 break
             for result in results:
-            if len(scraped_content) >= num_results:
-                break
-            url = result.get('url', '')
-            title = result.get('title', 'No title')
-            if not is_valid_url(url):
-                logger.warning(f"Invalid URL: {url}")
-                continue
-            try:
-                logger.info(f"Scraping content from: {url}")
-                # Implement a retry mechanism with different user agents
-                user_agents = [
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
-                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-                ]
-                content = ""
-                for ua in user_agents:
-                    try:
-                        if scraper == "bs4":
-                            session.headers.update({'User-Agent': ua})
-                            content = scrape_with_bs4(url, session, max_chars)
-                        else:  # trafilatura
-                            # Use urllib to handle custom headers for trafilatura
-                            req = Request(url, headers={'User-Agent': ua})
-                            with urlopen(req) as response:
-                                downloaded = response.read()
-                            # Configure trafilatura to use a specific user agent
-                            config = use_config()
-                            config.set("DEFAULT", "USER_AGENT", ua)
-                            content = scrape_with_trafilatura(url, max_chars)
-                        if content:
-                            break
-                    except requests.exceptions.HTTPError as e:
-                        if e.response.status_code == 403:
-                            logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
                             continue
-                        else:
-                            raise
-                    except Exception as e:
-                        logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                         continue
-                if not content:
-                    logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
-                    continue
-                scraped_content.append({
-                    "title": title,
-                    "url": url,
-                    "content": content,  # No need to slice here as it's already limited
-                    "scraper": scraper
-                })
-                logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
-            except requests.exceptions.RequestException as e:
-                logger.error(f"Error scraping {url}: {e}")
-            except Exception as e:
-                logger.error(f"Unexpected error while scraping {url}: {e}")
             page += 1

                 break
             for result in results:
+                if len(scraped_content) >= num_results:
+                    break
+                url = result.get('url', '')
+                title = result.get('title', 'No title')
+                if not is_valid_url(url):
+                    logger.warning(f"Invalid URL: {url}")
+                    continue
+                try:
+                    logger.info(f"Scraping content from: {url}")
+                    # Implement a retry mechanism with different user agents
+                    user_agents = [
+                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
+                        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                    ]
+                    content = ""
+                    for ua in user_agents:
+                        try:
+                            if scraper == "bs4":
+                                session.headers.update({'User-Agent': ua})
+                                content = scrape_with_bs4(url, session, max_chars)
+                            else:  # trafilatura
+                                # Use urllib to handle custom headers for trafilatura
+                                req = Request(url, headers={'User-Agent': ua})
+                                with urlopen(req) as response:
+                                    downloaded = response.read()
+                                # Configure trafilatura to use a specific user agent
+                                config = use_config()
+                                config.set("DEFAULT", "USER_AGENT", ua)
+                                content = scrape_with_trafilatura(url, max_chars)
+                            if content:
+                                break
+                        except requests.exceptions.HTTPError as e:
+                            if e.response.status_code == 403:
+                                logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
+                                continue
+                            else:
+                                raise
+                        except Exception as e:
+                            logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
                             continue
+                    if not content:
+                        logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
                         continue
+                    scraped_content.append({
+                        "title": title,
+                        "url": url,
+                        "content": content,  # No need to slice here as it's already limited
+                        "scraper": scraper
+                    })
+                    logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
+                except requests.exceptions.RequestException as e:
+                    logger.error(f"Error scraping {url}: {e}")
+                except Exception as e:
+                    logger.error(f"Unexpected error while scraping {url}: {e}")
             page += 1