| | |
import os
import sys

# Make the repository root (two directory levels above this example's folder)
# importable, so `import crawl4ai` resolves to the local checkout when the
# script is run directly.
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.append(parent_dir)
| |
|
| | import asyncio |
| | from crawl4ai import AsyncWebCrawler |
| |
|
async def main():
    """Crawl a batch of sites concurrently and print a summary for each URL."""
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Pages fetched together in one concurrent run.
        urls = [
            "https://example.com",
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
            "https://news.ycombinator.com",
        ]

        # Minimum word count a content block must have to be kept by the crawler.
        word_count_threshold = 100

        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True,
        )

        for result in results:
            _summarize(result)


def _summarize(result):
    """Print a short success/failure report for a single crawl result."""
    if not result.success:
        print(f"Failed to crawl: {result.url}")
        print(f"Error: {result.error_message}")
        print("---")
        return

    internal = result.links.get('internal', [])
    external = result.links.get('external', [])
    print(f"Successfully crawled: {result.url}")
    print(f"Title: {result.metadata.get('title', 'N/A')}")
    print(f"Word count: {len(result.markdown.split())}")
    print(f"Number of links: {len(internal) + len(external)}")
    print(f"Number of images: {len(result.media.get('images', []))}")
    print("---")
| |
|
# Script entry point: start a fresh event loop and run the crawl demo.
if __name__ == "__main__":
    asyncio.run(main())