""" | |
Crawl4AI Demo Application | |
==================================================== | |
This is a modified version of the Crawl4AI demo application specifically designed | |
for deployment on Hugging Face Spaces. | |
Features: | |
--------- | |
- Web interface built with Gradio for interactive use | |
- Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS) | |
- Configurable word count threshold | |
- Markdown output with metadata | |
- Sub-page crawling capabilities | |
- Lazy loading support | |
Usage: | |
------ | |
This version is specifically designed for Hugging Face Spaces deployment. | |
Simply upload this file to your Space and it will automatically run. | |
Dependencies: | |
------------ | |
- gradio | |
- crawl4ai>=0.4.3b0 | |
- python-dotenv>=1.0.0 | |
- pydantic>=2.5.0 | |
""" | |

import asyncio
import urllib.parse
from enum import Enum
from typing import Any, Dict, List, Optional, Set

import gradio as gr
from pydantic import BaseModel

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


class CrawlerType(str, Enum):
    """Enumeration of supported crawler types."""
    BASIC = "basic"
    LLM = "llm"
    COSINE = "cosine"
    JSON_CSS = "json_css"


class ExtractionType(str, Enum):
    """Enumeration of supported extraction strategies."""
    DEFAULT = "default"
    CSS = "css"
    XPATH = "xpath"
    LLM = "llm"
    COMBINED = "combined"


class CrawlRequest(BaseModel):
    """Request model for crawling operations."""
    url: str
    crawler_type: CrawlerType = CrawlerType.BASIC
    extraction_type: ExtractionType = ExtractionType.DEFAULT
    word_count_threshold: int = 100
    css_selector: Optional[str] = None
    xpath_query: Optional[str] = None
    excluded_tags: Optional[List[str]] = None
    scan_full_page: bool = False
    scroll_delay: float = 0.5
    crawl_subpages: bool = False
    max_depth: int = 1
    exclude_external_links: bool = True
    max_pages: int = 10
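
# Illustrative request (mirrors the second Gradio example at the bottom of this
# file): crawl a blog index with a CSS schema, follow internal links two levels
# deep, and stop after five pages.
#
#   CrawlRequest(url="https://example.com/blog",
#                extraction_type=ExtractionType.CSS, css_selector="article.post",
#                scan_full_page=True, crawl_subpages=True, max_depth=2, max_pages=5)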


def create_extraction_strategy(
    extraction_type: ExtractionType,
    css_selector: Optional[str] = None,
    xpath_query: Optional[str] = None,
) -> Any:
    """Create an extraction strategy based on the specified type."""
    if extraction_type == ExtractionType.CSS and css_selector:
        schema = {
            "name": "Content",
            "baseSelector": css_selector,
            "fields": [
                {"name": "title", "selector": "h1,h2", "type": "text"},
                {"name": "text", "selector": "p", "type": "text"},
                {"name": "links", "selector": "a", "type": "attribute", "attribute": "href"},
            ],
        }
        return JsonCssExtractionStrategy(schema)
    # Only CSS extraction is implemented here; the other extraction types
    # (including XPath, for which xpath_query is accepted but currently unused)
    # fall back to the crawler's default markdown pipeline.
    return None
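
# With ExtractionType.CSS and a selector such as "article.post", the strategy
# built above emits one record per element matched by the base selector, roughly:
#
#   {"title": "Post title", "text": "First paragraph ...", "links": "https://..."}
#
# (Field names come from the schema in create_extraction_strategy; the sample
# values are illustrative only.)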


async def crawl_with_subpages(
    request: CrawlRequest,
    base_url: str,
    current_depth: int = 1,
    visited: Optional[Set[str]] = None,
) -> Dict:
    """Recursively crawl pages including sub-pages up to the specified depth."""
    if visited is None:
        visited = set()

    if current_depth > request.max_depth or len(visited) >= request.max_pages:
        return None

    # Normalize the current URL (drop fragments and trailing slashes) to avoid
    # crawling the same page twice.
    normalized_url = urllib.parse.urldefrag(request.url)[0].rstrip('/')
    if normalized_url in visited:
        return None

    # Create run configuration for the current page
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=True,
        word_count_threshold=request.word_count_threshold,
        css_selector=request.css_selector,
        excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
        exclude_external_links=request.exclude_external_links,
        wait_for=f"css:{request.css_selector}" if request.css_selector else None,
        wait_for_images=True,
        page_timeout=30000,
        scan_full_page=request.scan_full_page,
        scroll_delay=request.scroll_delay,
        extraction_strategy=create_extraction_strategy(
            request.extraction_type,
            request.css_selector,
            request.xpath_query,
        ),
    )

    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
    )

    results = {
        "pages": [],
        "total_links": 0,
        "visited_pages": len(visited),
    }

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=request.url, config=run_config)
            if not result.success:
                print(f"Failed to crawl {request.url}: {result.error_message}")
                return None

            # Add the current page's result
            page_result = {
                "url": request.url,
                "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
                "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
                "depth": current_depth,
            }
            results["pages"].append(page_result)
            visited.add(normalized_url)

            # Process sub-pages if enabled
            if request.crawl_subpages and hasattr(result, 'links'):
                internal_links = result.links.get("internal", [])
                if internal_links:
                    results["total_links"] += len(internal_links)
                    for link in internal_links:
                        if len(visited) >= request.max_pages:
                            break
                        try:
                            # Internal links may be plain URL strings or dicts with
                            # an 'href' key, depending on the crawl4ai version.
                            href = link.get("href") if isinstance(link, dict) else link
                            if not href:
                                continue
                            normalized_link = urllib.parse.urljoin(request.url, href)
                            link_domain = urllib.parse.urlparse(normalized_link).netloc
                            if normalized_link in visited or (request.exclude_external_links and link_domain != base_url):
                                continue
                            sub_request = CrawlRequest(
                                **{**request.model_dump(), "url": normalized_link}
                            )
                            sub_result = await crawl_with_subpages(
                                sub_request,
                                base_url,
                                current_depth + 1,
                                visited,
                            )
                            if sub_result:
                                results["pages"].extend(sub_result["pages"])
                                results["total_links"] += sub_result["total_links"]
                                results["visited_pages"] = len(visited)
                        except Exception as e:
                            print(f"Error processing link {link}: {str(e)}")
                            continue

            return results
    except Exception as e:
        print(f"Error crawling {request.url}: {str(e)}")
        return None
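
# Bounds worked example: with max_depth=2 and max_pages=5, the starting URL is
# crawled at depth 1 and counts toward the page budget, so at most four of its
# internal links are crawled at depth 2 before len(visited) >= max_pages stops
# the recursion.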


async def crawl_url(request: CrawlRequest) -> Dict:
    """Crawl a URL and return the extracted content."""
    try:
        base_url = urllib.parse.urlparse(request.url).netloc

        if request.crawl_subpages:
            results = await crawl_with_subpages(request, base_url)
            if not results or not results["pages"]:
                raise Exception(f"Failed to crawl pages starting from {request.url}")

            combined_markdown = "\n\n---\n\n".join(
                f"## Page: {page['url']}\n{page['markdown']}"
                for page in results["pages"]
            )
            return {
                "markdown": combined_markdown,
                "metadata": {
                    "url": request.url,
                    "crawler_type": request.crawler_type.value,
                    "extraction_type": request.extraction_type.value,
                    "word_count_threshold": request.word_count_threshold,
                    "css_selector": request.css_selector,
                    "xpath_query": request.xpath_query,
                    "scan_full_page": request.scan_full_page,
                    "scroll_delay": request.scroll_delay,
                    "total_pages_crawled": results["visited_pages"],
                    "total_links_found": results["total_links"],
                    "max_depth_reached": min(request.max_depth, max(page["depth"] for page in results["pages"])),
                },
                "pages": results["pages"],
            }
        else:
            wait_condition = f"css:{request.css_selector}" if request.css_selector else None
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=request.word_count_threshold,
                css_selector=request.css_selector,
                excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
                wait_for=wait_condition,
                wait_for_images=True,
                page_timeout=30000,
                scan_full_page=request.scan_full_page,
                scroll_delay=request.scroll_delay,
                extraction_strategy=create_extraction_strategy(
                    request.extraction_type,
                    request.css_selector,
                    request.xpath_query,
                ),
            )
            browser_config = BrowserConfig(
                headless=True,
                viewport_width=1920,
                viewport_height=1080,
            )

            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(url=request.url, config=run_config)
                if not result.success:
                    raise Exception(result.error_message)

                # Summarize up to five of the images the crawler found
                images = result.media.get("images", []) if hasattr(result, 'media') else []
                image_info = "\n### Images Found\n" if images else ""
                for i, img in enumerate(images[:5]):
                    image_info += f"- Image {i+1}: {img.get('src', 'N/A')}\n"
                    if img.get('alt'):
                        image_info += f"  Alt: {img['alt']}\n"
                    if img.get('score'):
                        image_info += f"  Score: {img['score']}\n"

                return {
                    "markdown": result.markdown_v2 if hasattr(result, 'markdown_v2') else "",
                    "metadata": {
                        "url": request.url,
                        "crawler_type": request.crawler_type.value,
                        "extraction_type": request.extraction_type.value,
                        "word_count_threshold": request.word_count_threshold,
                        "css_selector": request.css_selector,
                        "xpath_query": request.xpath_query,
                        "scan_full_page": request.scan_full_page,
                        "scroll_delay": request.scroll_delay,
                        "wait_condition": wait_condition,
                    },
                    "extracted_content": result.extracted_content if hasattr(result, 'extracted_content') else None,
                    "image_info": image_info,
                }
    except Exception as e:
        raise Exception(str(e))
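
# Direct (non-Gradio) usage sketch, e.g. for local debugging of this module
# (assumes the dependencies above are installed and a browser is available):
#
#   result = asyncio.run(crawl_url(CrawlRequest(url="https://example.com")))
#   print(result["metadata"]["url"])
#   print(result["markdown"])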


async def gradio_crawl(
    url: str,
    crawler_type: str,
    extraction_type: str,
    word_count_threshold: int,
    css_selector: str,
    xpath_query: str,
    scan_full_page: bool,
    scroll_delay: float,
    crawl_subpages: bool,
    max_depth: int,
    max_pages: int,
    exclude_external_links: bool,
) -> tuple[str, str]:
    """Handle crawling requests from the Gradio interface."""
    try:
        request = CrawlRequest(
            url=url,
            # Map UI labels such as "JSON/CSS" onto enum values such as "json_css"
            crawler_type=CrawlerType(crawler_type.lower().replace("/", "_")),
            extraction_type=ExtractionType(extraction_type.lower()),
            word_count_threshold=word_count_threshold,
            css_selector=css_selector if css_selector else None,
            xpath_query=xpath_query if xpath_query else None,
            scan_full_page=scan_full_page,
            scroll_delay=scroll_delay,
            crawl_subpages=crawl_subpages,
            max_depth=max_depth,
            max_pages=max_pages,
            exclude_external_links=exclude_external_links,
        )
        result = await crawl_url(request)

        markdown_content = str(result["markdown"]) if result.get("markdown") else ""
        metadata_str = f"""### Metadata
- URL: {result['metadata']['url']}
- Crawler Type: {result['metadata']['crawler_type']}
- Extraction Type: {result['metadata']['extraction_type']}
- Word Count Threshold: {result['metadata']['word_count_threshold']}
- CSS Selector: {result['metadata']['css_selector'] or 'None'}
- XPath Query: {result['metadata']['xpath_query'] or 'None'}
- Full Page Scan: {result['metadata']['scan_full_page']}
- Scroll Delay: {result['metadata']['scroll_delay']}s"""

        if crawl_subpages:
            metadata_str += f"""
- Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
- Total Links Found: {result['metadata'].get('total_links_found', 0)}
- Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""

        if result.get('image_info'):
            metadata_str += f"\n\n{result['image_info']}"

        if result.get("extracted_content"):
            metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"

        return markdown_content, metadata_str
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        return error_msg, "Error occurred while crawling"


# Create Gradio interface
demo = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(
            label="URL",
            placeholder="Enter URL to crawl",
            info="The webpage URL to extract content from"
        ),
        gr.Dropdown(
            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
            label="Crawler Type",
            value="Basic",
            info="Select the content extraction strategy"
        ),
        gr.Dropdown(
            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
            label="Extraction Type",
            value="Default",
            info="Choose how to extract content from the page"
        ),
        gr.Slider(
            minimum=50,
            maximum=500,
            value=100,
            step=50,
            label="Word Count Threshold",
            info="Minimum number of words required for content extraction"
        ),
        gr.Textbox(
            label="CSS Selector",
            placeholder="e.g., article.content, main.post",
            info="CSS selector to target specific content (used with CSS extraction type)"
        ),
        gr.Textbox(
            label="XPath Query",
            placeholder="e.g., //article[@class='content']",
            info="XPath query to target specific content (used with XPath extraction type)"
        ),
        gr.Checkbox(
            label="Scan Full Page",
            value=False,
            info="Enable to scroll through the entire page to load lazy content"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.5,
            step=0.1,
            label="Scroll Delay",
            info="Delay between scroll steps in seconds when scanning the full page"
        ),
        gr.Checkbox(
            label="Crawl Sub-pages",
            value=False,
            info="Enable to crawl links found on the page"
        ),
        gr.Slider(
            minimum=1,
            maximum=5,
            value=1,
            step=1,
            label="Max Crawl Depth",
            info="Maximum depth for recursive crawling (1 = only the starting page)"
        ),
        gr.Slider(
            minimum=1,
            maximum=50,
            value=10,
            step=5,
            label="Max Pages",
            info="Maximum number of pages to crawl"
        ),
        gr.Checkbox(
            label="Exclude External Links",
            value=True,
            info="Only crawl links within the same domain"
        )
    ],
    outputs=[
        gr.Markdown(label="Generated Markdown"),
        gr.Markdown(label="Metadata & Extraction Results")
    ],
    title="Crawl4AI Demo",
    description="""
This demo allows you to extract content from web pages using different crawling and extraction strategies.

1. Enter a URL to crawl
2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
4. Configure additional options:
   - Word count threshold for content filtering
   - CSS selectors for targeting specific content
   - XPath queries for precise extraction
   - Full page scanning for lazy-loaded content
   - Scroll delay for controlling page scanning speed
   - Sub-page crawling with depth control
   - Maximum number of pages to crawl
   - External link filtering

The extracted content will be displayed in markdown format along with metadata and extraction results.
When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
""",
    examples=[
        ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
        ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
    ]
)


# For Hugging Face Spaces, we launch just the Gradio interface
if __name__ == "__main__":
    demo.launch()
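
# Outside Spaces (e.g. for local testing), this file can also be run directly
# with Python; standard Gradio launch options such as
# demo.launch(server_name="0.0.0.0", server_port=7860) are a common alternative
# when running in a container. On Spaces, the plain demo.launch() above is enough.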