```python
import asyncio

from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from playwright.async_api import async_playwright
from playwright_stealth import Stealth


class StealthScrapeTool(BaseTool):
    name: str = "Stealth Web Scraper"
    description: str = (
        "A tool for stealthily scraping content from a given URL "
        "using Playwright and a CSS selector."
    )

    async def _arun(
        self,
        website_url: str,
        css_element: str = "body",
        wait_for_selectors: list[str] | None = None,
    ) -> str:
        try:
            async with Stealth().use_async(async_playwright()) as p:
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()
                print(f"StealthScrapeTool: Starting scraping for {website_url}...")
                print(f"StealthScrapeTool: Navigating to {website_url}")
                await page.goto(website_url, timeout=120000)
                await asyncio.sleep(5)

                # Scroll to the bottom of the page repeatedly to load all dynamic content.
                print("StealthScrapeTool: Scrolling through the page to load dynamic content...")
                print("StealthScrapeTool: Getting initial scrollHeight...")
                last_height = await page.evaluate("document.body.scrollHeight")
                print(f"StealthScrapeTool: Initial scrollHeight: {last_height}")
                scroll_attempts = 0
                max_scroll_attempts = 10
                while scroll_attempts < max_scroll_attempts:
                    print(f"StealthScrapeTool: Scroll attempt {scroll_attempts + 1}")
                    print("StealthScrapeTool: Scrolling to bottom...")
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    print("StealthScrapeTool: Scrolled. Waiting for content to load...")
                    await asyncio.sleep(5)
                    print("StealthScrapeTool: Getting new scrollHeight...")
                    new_height = await page.evaluate("document.body.scrollHeight")
                    print(f"StealthScrapeTool: New scrollHeight: {new_height}")
                    if new_height == last_height:
                        print("StealthScrapeTool: ScrollHeight unchanged. Breaking scroll loop.")
                        break
                    last_height = new_height
                    scroll_attempts += 1
                print("StealthScrapeTool: Finished scrolling.")
                print(f"StealthScrapeTool: Page loaded. Attempting to find element with selector '{css_element}'")

                # Wait for the target element (plus any extra selectors) to be attached to the DOM.
                selectors_to_wait_for = []
                if wait_for_selectors:
                    print("StealthScrapeTool: Additional selectors to wait for provided.")
                    selectors_to_wait_for.extend(wait_for_selectors)
                # Always include css_element in the list of selectors to wait for.
                selectors_to_wait_for.append(css_element)
                combined_selector = ", ".join(selectors_to_wait_for)
                print(f"StealthScrapeTool: Waiting for selectors: {combined_selector}")
                await page.wait_for_selector(combined_selector, timeout=60000, state="attached")
                print("StealthScrapeTool: Required elements found. Extracting content...")

                html_content = await page.content()
                await browser.close()
                soup = BeautifulSoup(html_content, "html.parser")

                # Debug print to confirm the waited-for element made it into the scraped content.
                if soup.select_one("#all-reviews"):
                    print("StealthScrapeTool: #all-reviews found in scraped content.")
                else:
                    print("StealthScrapeTool: #all-reviews NOT found in scraped content.")

                target_element = soup.select_one(css_element)
                if target_element is None:
                    return f"Error: Could not find element with selector '{css_element}' on the page."

                # Clean the HTML: drop heavyweight tags the downstream agent does not need.
                print(f"Successfully found element with selector '{css_element}'. Cleaning content...")
                for tag in target_element.find_all(["script", "style", "img", "svg", "iframe", "source"]):
                    tag.decompose()
                # Remove inline style attributes from all remaining tags.
                for tag in target_element.find_all(True):
                    if "style" in tag.attrs:
                        del tag["style"]
                return target_element.prettify()
        except Exception as e:
            return f"Error during stealth web scraping: {e}"

    def _run(
        self,
        website_url: str,
        css_element: str = "body",
        wait_for_selectors: list[str] | None = None,
    ) -> str:
        # Synchronous entry point. CrewAI typically calls _arun for async tools,
        # so this wrapper simply drives the coroutine to completion. Note that
        # asyncio.run() will raise if an event loop is already running.
        return asyncio.run(self._arun(website_url, css_element, wait_for_selectors))
```
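To try the tool on its own, outside a crew, you can await `_arun` directly. This is a minimal sketch assuming the class above is defined in the same module; the URL and selectors here are placeholders, not values from the original project:

```python
# Standalone smoke test for StealthScrapeTool (hypothetical URL and selectors).
import asyncio


async def main() -> None:
    tool = StealthScrapeTool()
    html = await tool._arun(
        "https://example.com/product/123",   # placeholder URL
        css_element="#all-reviews",          # placeholder target selector
        wait_for_selectors=["footer"],       # optional extra selectors to wait for
    )
    print(html[:500])  # preview the cleaned, prettified HTML


asyncio.run(main())
```

Calling `_arun` directly avoids the `asyncio.run()` call inside `_run`, which would fail in environments that already have a running event loop (Jupyter, for example).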