Spaces:
Build error
Build error
| import requests | |
| import undetected_chromedriver as uc | |
| from langchain.tools import tool | |
| from bs4 import BeautifulSoup | |
| from duckduckgo_search import DDGS | |
class WebScapeAdv_UC:
    """Namespace for scraping helpers backed by undetected-chromedriver."""

    @staticmethod
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        The browser is started headless, the page is loaded, and the text of
        the resulting HTML is extracted with BeautifulSoup. Any failure is
        reported through the return value rather than raised.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or a message starting with
            "Failed to fetch content with error:" if anything went wrong.
        """
        # Broad catch is deliberate: callers receive the failure as text.
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            try:
                driver.get(url)
                html = driver.page_source
            finally:
                # Always quit, even when loading the page fails, so the
                # browser process is not left running.
                driver.quit()
            return BeautifulSoup(html, 'html.parser').get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"
| from bs4 import BeautifulSoup | |
| import requests | |
| import undetected_chromedriver as uc | |
def scrape_with_fallback(
    url: str,
    *,
    timeout: float = 10.0,
    min_text_length: int = 100,
) -> str:
    """
    Attempts to scrape webpage content using requests and BeautifulSoup first,
    then falls back to Selenium with undetectable Chrome driver if needed.

    The fallback is used when the plain HTTP request fails, returns a status
    other than 200, or yields no more than ``min_text_length`` characters of
    text once surrounding whitespace is stripped.

    :param url: The URL of the webpage to scrape.
    :param timeout: Seconds to wait for the plain HTTP request before giving
        up and falling back to the browser.
    :param min_text_length: The stripped text must be longer than this many
        characters for the plain HTTP result to be accepted.
    :return: The text content of the webpage, or a message starting with
        "Failed to fetch content with error:" if the browser fallback failed.
    """
    # Try scraping with requests and BeautifulSoup. A network error here is
    # not fatal: it simply means the browser fallback gets its turn.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        text = BeautifulSoup(response.content, 'html.parser').get_text()
        if len(text.strip()) > min_text_length:
            return text

    # If the first attempt fails, fallback to Selenium with undetectable
    # Chrome driver. Broad catch is deliberate: callers receive the failure
    # as text instead of an exception.
    try:
        options = uc.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        # Initialize undetectable Chrome driver
        driver = uc.Chrome(options=options)
        try:
            driver.get(url)
            html = driver.page_source
        finally:
            # Always quit, even when loading the page fails, so the browser
            # process is not left running.
            driver.quit()
        return BeautifulSoup(html, 'html.parser').get_text()
    except Exception as e:
        return f"Failed to fetch content with error: {e}"