| """ | |
| Crawl4AI Demo Application | |
| ==================================================== | |
| This is a modified version of the Crawl4AI demo application specifically designed | |
| for deployment on Hugging Face Spaces. | |
| Features: | |
| --------- | |
| - Web interface built with Gradio for interactive use | |
| - Support for multiple crawler types (Basic, LLM, Cosine, JSON/CSS) | |
| - Configurable word count threshold | |
| - Markdown output with metadata | |
| - Sub-page crawling capabilities | |
| - Lazy loading support | |
| Usage: | |
| ------ | |
| This version is specifically designed for Hugging Face Spaces deployment. | |
| Simply upload this file to your Space and it will automatically run. | |
| Dependencies: | |
| ------------ | |
| - gradio | |
| - crawl4ai>=0.4.3b0 | |
| - python-dotenv>=1.0.0 | |
| - pydantic>=2.5.0 | |
| """ | |
import asyncio
import urllib.parse
from enum import Enum
from typing import Any, Dict, List, Optional, Set

import gradio as gr
from pydantic import BaseModel

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

class CrawlerType(str, Enum):
    """Enumeration of supported crawler types."""
    BASIC = "basic"
    LLM = "llm"
    COSINE = "cosine"
    JSON_CSS = "json_css"


class ExtractionType(str, Enum):
    """Enumeration of supported extraction strategies."""
    DEFAULT = "default"
    CSS = "css"
    XPATH = "xpath"
    LLM = "llm"
    COMBINED = "combined"

class CrawlRequest(BaseModel):
    """Request model for crawling operations."""
    url: str
    crawler_type: CrawlerType = CrawlerType.BASIC
    extraction_type: ExtractionType = ExtractionType.DEFAULT
    word_count_threshold: int = 100
    css_selector: Optional[str] = None
    xpath_query: Optional[str] = None
    excluded_tags: Optional[List[str]] = None
    scan_full_page: bool = False
    scroll_delay: float = 0.5
    crawl_subpages: bool = False
    max_depth: int = 1
    exclude_external_links: bool = True
    max_pages: int = 10

def create_extraction_strategy(
    extraction_type: ExtractionType,
    css_selector: Optional[str] = None,
    xpath_query: Optional[str] = None,
) -> Any:
    """Create an extraction strategy based on the specified type.

    Note: in this demo only the CSS strategy is wired up; the other extraction
    types (XPath, LLM, Combined) fall back to the default markdown pipeline,
    which is why None is returned for them. The xpath_query argument is kept
    for interface compatibility but is currently unused.
    """
    if extraction_type == ExtractionType.CSS and css_selector:
        schema = {
            "name": "Content",
            "baseSelector": css_selector,
            "fields": [
                {"name": "title", "selector": "h1,h2", "type": "text"},
                {"name": "text", "selector": "p", "type": "text"},
                {"name": "links", "selector": "a", "type": "attribute", "attribute": "href"},
            ],
        }
        return JsonCssExtractionStrategy(schema)
    return None

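# Example (illustrative, not from the source): with css_selector="article.post",
# the strategy above typically yields a JSON list of objects whose keys match the
# schema fields, e.g.
#   [{"title": "Post title", "text": "First paragraph ...", "links": "/next-page"}]
# The exact output depends on the structure of the crawled page.
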
async def crawl_with_subpages(
    request: CrawlRequest,
    base_url: str,
    current_depth: int = 1,
    visited: Optional[Set[str]] = None,
) -> Optional[Dict]:
    """Recursively crawl pages, including sub-pages, up to the specified depth."""
    if visited is None:
        visited = set()

    if current_depth > request.max_depth or len(visited) >= request.max_pages:
        return None

    # Normalize the URL (drop fragment and trailing slash) to avoid crawling duplicates
    normalized_url = urllib.parse.urldefrag(request.url)[0].rstrip("/")
    if normalized_url in visited:
        return None

    # Create run configuration for the current page
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=True,
        word_count_threshold=request.word_count_threshold,
        css_selector=request.css_selector,
        excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
        exclude_external_links=request.exclude_external_links,
        wait_for=f"css:{request.css_selector}" if request.css_selector else None,
        wait_for_images=True,
        page_timeout=30000,
        scan_full_page=request.scan_full_page,
        scroll_delay=request.scroll_delay,
        extraction_strategy=create_extraction_strategy(
            request.extraction_type,
            request.css_selector,
            request.xpath_query,
        ),
    )

    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1920,
        viewport_height=1080,
    )

    results = {
        "pages": [],
        "total_links": 0,
        "visited_pages": len(visited),
    }
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=request.url, config=run_config)
            if not result.success:
                print(f"Failed to crawl {request.url}: {result.error_message}")
                return None

            # Add the result for the current page. Prefer markdown_v2 when the
            # installed crawl4ai version provides it, otherwise fall back to markdown.
            page_result = {
                "url": request.url,
                "markdown": getattr(result, "markdown_v2", None) or getattr(result, "markdown", ""),
                "extracted_content": getattr(result, "extracted_content", None),
                "depth": current_depth,
            }
            results["pages"].append(page_result)
            visited.add(normalized_url)

            # Process sub-pages if enabled
            if request.crawl_subpages and hasattr(result, "links"):
                internal_links = result.links.get("internal", [])
                if internal_links:
                    results["total_links"] += len(internal_links)
                    for link in internal_links:
                        if len(visited) >= request.max_pages:
                            break
                        try:
                            # Depending on the crawl4ai version, links may be plain
                            # strings or dicts with an "href" key.
                            href = link.get("href", "") if isinstance(link, dict) else link
                            if not href:
                                continue
                            normalized_link = urllib.parse.urldefrag(
                                urllib.parse.urljoin(request.url, href)
                            )[0].rstrip("/")
                            link_domain = urllib.parse.urlparse(normalized_link).netloc
                            if normalized_link in visited or (
                                request.exclude_external_links and link_domain != base_url
                            ):
                                continue

                            sub_request = CrawlRequest(
                                **{**request.model_dump(), "url": normalized_link}
                            )
                            sub_result = await crawl_with_subpages(
                                sub_request,
                                base_url,
                                current_depth + 1,
                                visited,
                            )
                            if sub_result:
                                results["pages"].extend(sub_result["pages"])
                                results["total_links"] += sub_result["total_links"]
                                results["visited_pages"] = len(visited)
                        except Exception as e:
                            print(f"Error processing link {link}: {e}")
                            continue

        return results
    except Exception as e:
        print(f"Error crawling {request.url}: {e}")
        return None

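# Note (illustrative): deduplication relies on the normalized form of each URL, so
# e.g. "https://example.com/docs/" and "https://example.com/docs#intro" both map to
# "https://example.com/docs" and are crawled only once.
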
async def crawl_url(request: CrawlRequest) -> Dict:
    """Crawl a URL and return the extracted content."""
    try:
        base_url = urllib.parse.urlparse(request.url).netloc

        if request.crawl_subpages:
            results = await crawl_with_subpages(request, base_url)
            if not results or not results["pages"]:
                raise Exception(f"Failed to crawl pages starting from {request.url}")

            combined_markdown = "\n\n---\n\n".join(
                f"## Page: {page['url']}\n{page['markdown']}"
                for page in results["pages"]
            )

            return {
                "markdown": combined_markdown,
                "metadata": {
                    "url": request.url,
                    "crawler_type": request.crawler_type.value,
                    "extraction_type": request.extraction_type.value,
                    "word_count_threshold": request.word_count_threshold,
                    "css_selector": request.css_selector,
                    "xpath_query": request.xpath_query,
                    "scan_full_page": request.scan_full_page,
                    "scroll_delay": request.scroll_delay,
                    "total_pages_crawled": results["visited_pages"],
                    "total_links_found": results["total_links"],
                    "max_depth_reached": min(
                        request.max_depth,
                        max(page["depth"] for page in results["pages"]),
                    ),
                },
                "pages": results["pages"],
            }
        else:
            wait_condition = f"css:{request.css_selector}" if request.css_selector else None
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                word_count_threshold=request.word_count_threshold,
                css_selector=request.css_selector,
                excluded_tags=request.excluded_tags or ["nav", "footer", "header"],
                wait_for=wait_condition,
                wait_for_images=True,
                page_timeout=30000,
                scan_full_page=request.scan_full_page,
                scroll_delay=request.scroll_delay,
                extraction_strategy=create_extraction_strategy(
                    request.extraction_type,
                    request.css_selector,
                    request.xpath_query,
                ),
            )
            browser_config = BrowserConfig(
                headless=True,
                viewport_width=1920,
                viewport_height=1080,
            )

            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun(url=request.url, config=run_config)
                if not result.success:
                    raise Exception(result.error_message)

                # Summarize up to five of the images reported by the crawler
                images = result.media.get("images", []) if hasattr(result, "media") else []
                image_info = "\n### Images Found\n" if images else ""
                for i, img in enumerate(images[:5]):
                    image_info += f"- Image {i + 1}: {img.get('src', 'N/A')}\n"
                    if img.get("alt"):
                        image_info += f"  Alt: {img['alt']}\n"
                    if img.get("score"):
                        image_info += f"  Score: {img['score']}\n"

                return {
                    # Prefer markdown_v2 when available, otherwise fall back to markdown.
                    "markdown": getattr(result, "markdown_v2", None) or getattr(result, "markdown", ""),
                    "metadata": {
                        "url": request.url,
                        "crawler_type": request.crawler_type.value,
                        "extraction_type": request.extraction_type.value,
                        "word_count_threshold": request.word_count_threshold,
                        "css_selector": request.css_selector,
                        "xpath_query": request.xpath_query,
                        "scan_full_page": request.scan_full_page,
                        "scroll_delay": request.scroll_delay,
                        "wait_condition": wait_condition,
                    },
                    "extracted_content": getattr(result, "extracted_content", None),
                    "image_info": image_info,
                }
    except Exception as e:
        raise Exception(f"Failed to crawl {request.url}: {e}") from e

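# Example (illustrative): crawl_url can also be driven programmatically, outside the
# Gradio UI, e.g.
#   request = CrawlRequest(url="https://example.com", crawl_subpages=True, max_pages=5)
#   result = asyncio.run(crawl_url(request))
#   print(result["metadata"])
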
async def gradio_crawl(
    url: str,
    crawler_type: str,
    extraction_type: str,
    word_count_threshold: int,
    css_selector: str,
    xpath_query: str,
    scan_full_page: bool,
    scroll_delay: float,
    crawl_subpages: bool,
    max_depth: int,
    max_pages: int,
    exclude_external_links: bool,
) -> tuple[str, str]:
    """Handle crawling requests from the Gradio interface."""
    try:
        request = CrawlRequest(
            url=url,
            # Map dropdown labels (e.g. "JSON/CSS") onto enum values (e.g. "json_css")
            crawler_type=CrawlerType(crawler_type.lower().replace("/", "_")),
            extraction_type=ExtractionType(extraction_type.lower()),
            word_count_threshold=word_count_threshold,
            css_selector=css_selector if css_selector else None,
            xpath_query=xpath_query if xpath_query else None,
            scan_full_page=scan_full_page,
            scroll_delay=scroll_delay,
            crawl_subpages=crawl_subpages,
            max_depth=max_depth,
            max_pages=max_pages,
            exclude_external_links=exclude_external_links,
        )
        result = await crawl_url(request)

        markdown_content = str(result["markdown"]) if result.get("markdown") else ""
        metadata_str = f"""### Metadata
- URL: {result['metadata']['url']}
- Crawler Type: {result['metadata']['crawler_type']}
- Extraction Type: {result['metadata']['extraction_type']}
- Word Count Threshold: {result['metadata']['word_count_threshold']}
- CSS Selector: {result['metadata']['css_selector'] or 'None'}
- XPath Query: {result['metadata']['xpath_query'] or 'None'}
- Full Page Scan: {result['metadata']['scan_full_page']}
- Scroll Delay: {result['metadata']['scroll_delay']}s"""

        if crawl_subpages:
            metadata_str += f"""
- Total Pages Crawled: {result['metadata'].get('total_pages_crawled', 0)}
- Total Links Found: {result['metadata'].get('total_links_found', 0)}
- Max Depth Reached: {result['metadata'].get('max_depth_reached', 1)}"""

        if result.get("image_info"):
            metadata_str += f"\n\n{result['image_info']}"

        if result.get("extracted_content"):
            metadata_str += f"\n\n### Extracted Content\n```json\n{result['extracted_content']}\n```"

        return markdown_content, metadata_str
    except Exception as e:
        return f"Error: {e}", "Error occurred while crawling"

# Create Gradio interface
demo = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(
            label="URL",
            placeholder="Enter URL to crawl",
            info="The webpage URL to extract content from",
        ),
        gr.Dropdown(
            choices=["Basic", "LLM", "Cosine", "JSON/CSS"],
            label="Crawler Type",
            value="Basic",
            info="Select the content extraction strategy",
        ),
        gr.Dropdown(
            choices=["Default", "CSS", "XPath", "LLM", "Combined"],
            label="Extraction Type",
            value="Default",
            info="Choose how to extract content from the page",
        ),
        gr.Slider(
            minimum=50,
            maximum=500,
            value=100,
            step=50,
            label="Word Count Threshold",
            info="Minimum number of words required for content extraction",
        ),
        gr.Textbox(
            label="CSS Selector",
            placeholder="e.g., article.content, main.post",
            info="CSS selector to target specific content (used with CSS extraction type)",
        ),
        gr.Textbox(
            label="XPath Query",
            placeholder="e.g., //article[@class='content']",
            info="XPath query to target specific content (used with XPath extraction type)",
        ),
        gr.Checkbox(
            label="Scan Full Page",
            value=False,
            info="Enable to scroll through the entire page to load lazy content",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.5,
            step=0.1,
            label="Scroll Delay",
            info="Delay between scroll steps in seconds when scanning the full page",
        ),
        gr.Checkbox(
            label="Crawl Sub-pages",
            value=False,
            info="Enable to crawl links found on the page",
        ),
        gr.Slider(
            minimum=1,
            maximum=5,
            value=1,
            step=1,
            label="Max Crawl Depth",
            info="Maximum depth for recursive crawling (1 = starting page only, 2 = include directly linked pages)",
        ),
        gr.Slider(
            minimum=1,
            maximum=50,
            value=10,
            step=5,
            label="Max Pages",
            info="Maximum number of pages to crawl",
        ),
        gr.Checkbox(
            label="Exclude External Links",
            value=True,
            info="Only crawl links within the same domain",
        ),
    ],
    outputs=[
        gr.Markdown(label="Generated Markdown"),
        gr.Markdown(label="Metadata & Extraction Results"),
    ],
    title="Crawl4AI Demo",
    description="""
This demo allows you to extract content from web pages using different crawling and extraction strategies.

1. Enter a URL to crawl
2. Select a crawler type (Basic, LLM, Cosine, JSON/CSS)
3. Choose an extraction strategy (Default, CSS, XPath, LLM, Combined)
4. Configure additional options:
   - Word count threshold for content filtering
   - CSS selectors for targeting specific content
   - XPath queries for precise extraction
   - Full page scanning for lazy-loaded content
   - Scroll delay for controlling page scanning speed
   - Sub-page crawling with depth control
   - Maximum number of pages to crawl
   - External link filtering

The extracted content will be displayed in markdown format along with metadata and extraction results.
When sub-page crawling is enabled, content from all crawled pages will be combined in the output.
""",
    examples=[
        ["https://example.com", "Basic", "Default", 100, "", "", False, 0.5, False, 1, 10, True],
        ["https://example.com/blog", "Basic", "CSS", 100, "article.post", "", True, 0.5, True, 2, 5, True],
    ],
)

# For Hugging Face Spaces, we launch just the Gradio interface
if __name__ == "__main__":
    demo.launch()
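# Note: Hugging Face Spaces starts the app automatically, so demo.launch() needs no
# arguments here. For local testing you could, for example, bind explicitly with
# demo.launch(server_name="0.0.0.0", server_port=7860); these values are illustrative,
# not part of the original configuration.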