import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

# Add project root to Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pytest
from playwright.async_api import async_playwright

from webqa_agent.crawler.crawl import CrawlHandler
from webqa_agent.crawler.deep_crawler import DeepCrawler

# pytest tests/test_crawler.py::TestCrawler::test_highlight_crawl -v -s --url https://google.com
# pytest tests/test_crawler.py -v -s --url https://google.com
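# NOTE (assumptions, not verified against the repo): `--url` above is a custom pytest
# option, so a conftest.py is expected to register it, roughly like:
#
#     def pytest_addoption(parser):
#         parser.addoption('--url', action='store', default=None,
#                          help='Target URL for crawler tests')
#
# The async test methods below likewise assume an asyncio-capable pytest setup,
# e.g. pytest-asyncio with `asyncio_mode = auto` in pytest.ini or pyproject.toml.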
class TestCrawler:
    """Test suite for web crawling functionality with different parameters."""

    # Default test URL (can be overridden via --url)
    DEFAULT_TEST_URLS = 'https://google.com'

    # Different crawl parameter combinations to test
    CRAWL_PARAMS = [
        {'name': 'highlight_crawl', 'highlight': True, 'highlight_text': False, 'viewport_only': True},
        {'name': 'text_highlight_crawl', 'highlight': True, 'highlight_text': True, 'viewport_only': True},
        {'name': 'viewport_highlight_crawl', 'highlight': True, 'highlight_text': False, 'viewport_only': True},
    ]

    # Directories (class attributes; accessible via self)
    test_results_dir = Path(__file__).parent / 'crawler_test_results'
    screenshots_dir = test_results_dir / 'screenshots'
    id_maps_dir = test_results_dir / 'id_maps'
    crawl_data_dir = test_results_dir / 'crawl_data'
    clean_id_maps_dir = test_results_dir / 'clean_id_maps'
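
    # Resulting on-disk layout (derived from the attributes above):
    #   crawler_test_results/
    #       screenshots/        before/after PNG captures per test
    #       id_maps/            raw element ID maps (JSON)
    #       clean_id_maps/      cleaned element ID maps (JSON)
    #       crawl_data/         crawl / element-tree dumps (JSON)
    #       test_summary_*.json aggregated run summaries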
    async def setup_method(self):
        """Setup method called before each test."""
        # Ensure output directories exist
        self.test_results_dir.mkdir(parents=True, exist_ok=True)
        self.screenshots_dir.mkdir(parents=True, exist_ok=True)
        self.id_maps_dir.mkdir(parents=True, exist_ok=True)
        self.crawl_data_dir.mkdir(parents=True, exist_ok=True)
        self.clean_id_maps_dir.mkdir(parents=True, exist_ok=True)

        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=False,
            args=[
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-gpu',
                '--force-device-scale-factor=1',
            ],
        )
        self.context = await self.browser.new_context(
            viewport={'width': 1280, 'height': 720},
        )
        self.page = await self.context.new_page()

        # Set default timeouts (milliseconds)
        self.page.set_default_navigation_timeout(30000)
        self.page.set_default_timeout(30000)

    async def teardown_method(self):
        """Teardown method called after each test."""
        if self.context:
            await self.context.close()
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    def get_timestamp(self) -> str:
        """Get timestamp for file naming."""
        return datetime.now().strftime('%Y%m%d_%H%M%S')
    async def take_before_screenshot(self, url: str, param_name: str) -> str:
        """Take screenshot before crawling."""
        timestamp = self.get_timestamp()
        safe_url = url.replace('://', '_').replace('/', '_')
        screenshot_path = self.screenshots_dir / f'{param_name}_{safe_url}_before_{timestamp}.png'
        await self.page.screenshot(path=str(screenshot_path), full_page=True)
        return str(screenshot_path)

    async def take_after_screenshot(self, url: str, param_name: str) -> str:
        """Take screenshot after crawling (with possible highlights)."""
        timestamp = self.get_timestamp()
        screenshot_path = (
            self.screenshots_dir / f"{param_name}_{url.replace('://', '_').replace('/', '_')}_after_{timestamp}.png"
        )
        await self.page.screenshot(path=str(screenshot_path), full_page=True)
        return str(screenshot_path)
    def save_id_map(self, url: str, param_name: str, id_map: Dict[str, Any]) -> str:
        """Save ID map to JSON file."""
        timestamp = self.get_timestamp()
        id_map_path = (
            self.id_maps_dir / f"{param_name}_{url.replace('://', '_').replace('/', '_')}_id_map_{timestamp}.json"
        )
        with open(id_map_path, 'w', encoding='utf-8') as f:
            json.dump(id_map, f, ensure_ascii=False, indent=2)
        return str(id_map_path)

    def save_clean_id_map(self, url: str, param_name: str, clean_id_map: Dict[str, Any]) -> str:
        """Save clean ID map to JSON file."""
        timestamp = self.get_timestamp()
        clean_id_map_path = (
            self.clean_id_maps_dir
            / f"{param_name}_{url.replace('://', '_').replace('/', '_')}_clean_id_map_{timestamp}.json"
        )
        with open(clean_id_map_path, 'w', encoding='utf-8') as f:
            json.dump(clean_id_map, f, ensure_ascii=False, indent=2)
        return str(clean_id_map_path)

    def save_crawl_data(self, url: str, param_name: str, crawl_data: Dict[str, Any]) -> str:
        """Save crawl data to JSON file."""
        timestamp = self.get_timestamp()
        crawl_data_path = (
            self.crawl_data_dir
            / f"{param_name}_{url.replace('://', '_').replace('/', '_')}_crawl_data_{timestamp}.json"
        )
        with open(crawl_data_path, 'w', encoding='utf-8') as f:
            json.dump(crawl_data, f, ensure_ascii=False, indent=2)
        return str(crawl_data_path)

    def save_test_summary(self, test_results: List[Dict[str, Any]]) -> str:
        """Save test summary to JSON file."""
        timestamp = self.get_timestamp()
        summary_path = self.test_results_dir / f'test_summary_{timestamp}.json'
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(test_results, f, ensure_ascii=False, indent=2)
        return str(summary_path)
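
    # Note: save_test_summary is not called by the tests in this file; it expects a
    # list of per-URL result dicts, presumably shaped like the return value of
    # crawl_single_url below.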
    async def crawl_single_url(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """Crawl a single URL with specified parameters using the current
        page/context."""
        await self.page.goto(url, wait_until='networkidle')

        # Take before screenshot
        before_screenshot = await self.take_before_screenshot(url, params['name'])

        # Initialize crawler and perform crawling
        crawler = DeepCrawler(self.page)
        crawl_result = await crawler.crawl(
            page=self.page,
            highlight=params['highlight'],
            highlight_text=params['highlight_text'],
            viewport_only=params['viewport_only'],
        )
        crawl_data = crawl_result.element_tree
        id_map = crawl_result.raw_dict()
        clean_id_map = crawl_result.clean_dict()

        # Take after screenshot
        after_screenshot = await self.take_after_screenshot(url, params['name'])

        # Save results
        id_map_path = self.save_id_map(url, params['name'], id_map)
        clean_id_map_path = self.save_clean_id_map(url, params['name'], clean_id_map)
        crawl_data_path = self.save_crawl_data(url, params['name'], crawl_data)

        # Remove markers if highlights were added
        if params['highlight']:
            await crawler.remove_marker(self.page)

        return {
            'url': url,
            'parameters': params,
            'results': {
                'before_screenshot': before_screenshot,
                'after_screenshot': after_screenshot,
                'id_map_path': id_map_path,
                'clean_id_map_path': clean_id_map_path,
                'crawl_data_path': crawl_data_path,
                'success': True,
            },
        }
    async def test_crawl_link(self, request):
        """Test integration with CrawlHandler for link extraction."""
        await self.setup_method()
        try:
            # Resolve URL from the --url CLI option or the default
            test_url = request.config.getoption('--url') or self.DEFAULT_TEST_URLS

            # Navigate to the test URL
            await self.page.goto(test_url, wait_until='networkidle')

            # Take before screenshot
            before_screenshot = await self.take_before_screenshot(test_url, 'crawl_handler')

            # Initialize crawl handler
            crawl_handler = CrawlHandler(test_url)

            # Extract links
            links = await crawl_handler.extract_links(self.page)
            print(f'Found {len(links)} links')

            # Get clickable elements using crawl handler
            clickable_elements = await crawl_handler.clickable_elements_detection(self.page)
            print(f'Found {len(clickable_elements)} clickable elements')

            # Take after screenshot
            after_screenshot = await self.take_after_screenshot(test_url, 'crawl_handler')

            # Save results
            results = {
                'url': test_url,
                'links': links,
                'clickable_elements': clickable_elements,
                'links_count': len(links),
                'clickable_elements_count': len(clickable_elements),
            }
            results_path = self.save_crawl_data(test_url, 'crawl_handler', results)

            # Assertions
            assert isinstance(links, list)
            assert isinstance(clickable_elements, list)
            assert os.path.exists(before_screenshot)
            assert os.path.exists(after_screenshot)
            assert os.path.exists(results_path)
            print('CrawlHandler integration test passed')
        finally:
            await self.teardown_method()
    async def test_highlight_crawl(self, request):
        """Test highlighted crawl parameters."""
        await self.setup_method()
        try:
            test_url = request.config.getoption('--url') or self.DEFAULT_TEST_URLS
            params = self.CRAWL_PARAMS[0]  # highlight_crawl
            result = await self.crawl_single_url(test_url, params)

            assert result['results']['success']
            assert os.path.exists(result['results']['before_screenshot'])
            assert os.path.exists(result['results']['after_screenshot'])
            assert os.path.exists(result['results']['id_map_path'])
            assert os.path.exists(result['results']['crawl_data_path'])
        finally:
            await self.teardown_method()
    async def test_text_highlight_crawl(self, request):
        """Test text highlight crawl parameters."""
        await self.setup_method()
        try:
            test_url = request.config.getoption('--url') or self.DEFAULT_TEST_URLS
            params = self.CRAWL_PARAMS[1]  # text_highlight_crawl
            result = await self.crawl_single_url(test_url, params)

            assert result['results']['success']
            assert os.path.exists(result['results']['before_screenshot'])
            assert os.path.exists(result['results']['after_screenshot'])
            assert os.path.exists(result['results']['id_map_path'])
            assert os.path.exists(result['results']['crawl_data_path'])
        finally:
            await self.teardown_method()
    async def test_viewport_highlight_crawl(self, request):
        """Test viewport highlight crawl parameters."""
        await self.setup_method()
        try:
            test_url = request.config.getoption('--url') or self.DEFAULT_TEST_URLS
            params = self.CRAWL_PARAMS[2]  # viewport_highlight_crawl
            result = await self.crawl_single_url(test_url, params)

            assert result['results']['success']
            assert os.path.exists(result['results']['before_screenshot'])
            assert os.path.exists(result['results']['after_screenshot'])
            assert os.path.exists(result['results']['id_map_path'])
            assert os.path.exists(result['results']['crawl_data_path'])
        finally:
            await self.teardown_method()