"""
This example demonstrates optimal browser usage patterns in Crawl4AI:
1. Sequential crawling with session reuse
2. Parallel crawling with browser instance reuse
3. Performance optimization settings
"""

import asyncio
import os
from typing import List

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


async def crawl_sequential(urls: List[str]):
    """
    Sequential crawling using session reuse - most efficient for moderate workloads.
    """
    print("\n=== Sequential Crawling with Session Reuse ===")

    # Minimal, fast browser profile: headless, GPU disabled, small viewport
    browser_config = BrowserConfig(
        headless=True,
        browser_args=[
            "--disable-gpu",
            "--disable-dev-shm-usage",
            "--no-sandbox",
        ],
        viewport={
            "width": 800,
            "height": 600,
        },
    )
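
    # Optional performance tweak (addition, not part of the original example):
    # depending on your Crawl4AI version, BrowserConfig also accepts `text_mode=True`
    # and `light_mode=True`, which disable images and some background features for
    # faster page loads - hedged sketch, check your installed version before using:
    #
    #     browser_config = BrowserConfig(headless=True, text_mode=True, light_mode=True)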

    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(),  # enable if you need fit_markdown
        ),
    )

    # Create the crawler once (this opens the browser) and reuse it for every URL
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        session_id = "session1"  # Reuse the same session (tab) for every URL
        for url in urls:
            result = await crawler.arun(
                url=url,
                config=crawl_config,
                session_id=session_id,
            )
            if result.success:
                print(f"Successfully crawled {url}")
                print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
            else:
                print(f"Failed to crawl {url}: {result.error_message}")
    finally:
        await crawler.close()
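
    # Note (addition, not part of the original example): to release the tab before
    # shutting down the whole browser, recent Crawl4AI versions let you kill a named
    # session explicitly - hedged sketch, verify against your installed version:
    #
    #     await crawler.crawler_strategy.kill_session(session_id)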


async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
    """
    Parallel crawling while reusing the browser instance - best for large workloads.
    """
    print("\n=== Parallel Crawling with Browser Reuse ===")

    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        viewport={"width": 800, "height": 600},
    )

    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # content_filter=PruningContentFilter(),  # enable if you need fit_markdown
        ),
    )

    # One browser instance shared across every batch
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        # Process the URLs in chunks of `max_concurrent`
        for i in range(0, len(urls), max_concurrent):
            batch = urls[i : i + max_concurrent]
            tasks = []

            for j, url in enumerate(batch):
                # One session per concurrent slot, so pages in a batch don't collide
                session_id = f"parallel_session_{j}"
                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
                tasks.append(task)

            # Run the whole batch concurrently; capture exceptions instead of raising
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Report the outcome for each URL in the batch
            for url, result in zip(batch, results):
                if isinstance(result, Exception):
                    print(f"Error crawling {url}: {str(result)}")
                elif result.success:
                    print(f"Successfully crawled {url}")
                    print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
                else:
                    print(f"Failed to crawl {url}: {result.error_message}")
    finally:
        await crawler.close()
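

# Alternative sketch (addition, not part of the original example): recent Crawl4AI
# releases also expose `AsyncWebCrawler.arun_many()`, which handles batching and
# concurrency internally. Hedged: check that your installed version provides
# `arun_many` and the async context-manager form before relying on this.
async def crawl_with_arun_many(urls: List[str]):
    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
    )
    crawl_config = CrawlerRunConfig()

    # The context manager starts the browser and closes it when the block exits
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results = await crawler.arun_many(urls, config=crawl_config)
        for result in results:
            if result.success:
                print(f"Successfully crawled {result.url}")
            else:
                print(f"Failed to crawl {result.url}: {result.error_message}")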


async def main():
    # Example URLs - replace these with the pages you want to crawl
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
        "https://example.com/page4",
    ]

    # Sequential crawling with a single reused session
    await crawl_sequential(urls)

    # Parallel crawling in batches of two
    await crawl_parallel(urls, max_concurrent=2)


if __name__ == "__main__":
    asyncio.run(main())
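
# Optional (addition, not part of the original example): to compare the two
# approaches on your own URL list, time each call with the standard library
# inside main(), e.g.:
#
#     import time
#     start = time.perf_counter()
#     await crawl_sequential(urls)
#     print(f"Sequential: {time.perf_counter() - start:.1f}s")
#
#     start = time.perf_counter()
#     await crawl_parallel(urls, max_concurrent=2)
#     print(f"Parallel:   {time.perf_counter() - start:.1f}s")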