#!/usr/bin/env python3
"""
Command-line interface for the web crawler.

Usage:
    crawl.py start [--workers=<num>] [--async] [--seed=<url>...]
    crawl.py stop
    crawl.py pause
    crawl.py resume
    crawl.py stats
    crawl.py clean [--days=<days>]
    crawl.py export [--format=<format>] [--output=<file>]
    crawl.py set-max-depth <depth>
    crawl.py add-seed <url>...
    crawl.py (-h | --help)
    crawl.py --version

Options:
    -h --help            Show this help message
    --version            Show version
    --workers=<num>      Number of worker threads [default: 4]
    --async              Use asynchronous mode
    --seed=<url>         Seed URL(s) to start crawling
    --days=<days>        Days threshold for data cleaning [default: 90]
    --format=<format>    Export format (json, csv) [default: json]
    --output=<file>      Output file path [default: crawl_data.json]
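
Examples (illustrative):
    crawl.py start --workers=8 --seed=https://example.com --seed=https://example.org
    crawl.py export --format=csv --output=crawl_data.csv
    crawl.py clean --days=30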
""" | |

import os
import sys
import time
import json
import signal
import logging
import csv
import datetime
import traceback
from typing import List

from docopt import docopt

from crawler import Crawler
import config

# Configure logging
logging.basicConfig(
    level=getattr(logging, config.LOG_LEVEL),
    format=config.LOG_FORMAT,
)
logger = logging.getLogger(__name__)

# Global crawler instance
crawler = None


def initialize_crawler() -> Crawler:
    """Initialize the crawler instance."""
    global crawler
    if crawler is None:
        crawler = Crawler()
    return crawler
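
# NOTE: initialize_crawler() acts as a lazy singleton: the first call creates
# the Crawler and subsequent calls reuse it, so every command below shares one
# instance within the current process.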


def start_crawler(workers: int, async_mode: bool, seed_urls: List[str]) -> None:
    """
    Start the crawler.

    Args:
        workers: Number of worker threads
        async_mode: Whether to use async mode
        seed_urls: List of seed URLs to add
    """
    crawler = initialize_crawler()

    # Add seed URLs if provided
    if seed_urls:
        num_added = crawler.add_seed_urls(seed_urls)
        logger.info(f"Added {num_added} seed URLs")

    # Start crawler
    try:
        crawler.start(num_workers=workers, async_mode=async_mode)
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
        crawler.stop()
    except Exception as e:
        logger.error(f"Error starting crawler: {e}")
        logger.error(traceback.format_exc())
        crawler.stop()
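
# Example (illustrative values):
#   start_crawler(workers=8, async_mode=True, seed_urls=["https://example.com"])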


def stop_crawler() -> None:
    """Stop the crawler."""
    if crawler is None:
        logger.error("Crawler is not running")
        return
    crawler.stop()
    logger.info("Crawler stopped")


def pause_crawler() -> None:
    """Pause the crawler."""
    if crawler is None:
        logger.error("Crawler is not running")
        return
    crawler.pause()
    logger.info("Crawler paused")


def resume_crawler() -> None:
    """Resume the crawler."""
    if crawler is None:
        logger.error("Crawler is not running")
        return
    crawler.resume()
    logger.info("Crawler resumed")


def show_stats() -> None:
    """Show crawler statistics."""
    if crawler is None:
        logger.error("Crawler is not running")
        return

    # Get crawler stats
    stats = crawler.stats

    # Calculate elapsed time
    elapsed = time.time() - stats['start_time']
    elapsed_str = str(datetime.timedelta(seconds=int(elapsed)))

    # Format statistics
    print("\n=== Crawler Statistics ===")
    print(f"Running time: {elapsed_str}")
    print(f"Pages crawled: {stats['pages_crawled']}")
    print(f"Pages failed: {stats['pages_failed']}")
    print(f"URLs discovered: {stats['urls_discovered']}")
    print(f"URLs filtered: {stats['urls_filtered']}")

    # Calculate pages per second
    pages_per_second = stats['pages_crawled'] / elapsed if elapsed > 0 else 0
    print(f"Crawl rate: {pages_per_second:.2f} pages/second")

    # Domain statistics
    domains = len(stats['domains_crawled'])
    print(f"Domains crawled: {domains}")

    # Status code statistics
    print("\n--- HTTP Status Codes ---")
    for status, count in sorted(stats['status_codes'].items()):
        print(f"  {status}: {count}")

    # Content type statistics (top 10 by count)
    print("\n--- Content Types ---")
    for content_type, count in sorted(stats['content_types'].items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {content_type}: {count}")

    # Frontier size
    print(f"\nFrontier size: {crawler.frontier.size()}")

    # DNS cache statistics
    dns_stats = crawler.dns_resolver.get_stats()
    print(f"\nDNS cache: {dns_stats['hit_count']} hits, {dns_stats['miss_count']} misses, {dns_stats['size']} entries")

    print("\n=========================\n")


def clean_data(days: int) -> None:
    """
    Clean old data.

    Args:
        days: Days threshold for data cleaning
    """
    try:
        if crawler is None:
            initialize_crawler()

        # Get MongoDB connection
        storage = crawler.mongo_client

        # Clean old pages
        old_pages = storage.clean_old_pages(days)

        # Clean failed URLs
        failed_urls = storage.clean_failed_urls()

        logger.info(f"Cleaned {old_pages} old pages and {failed_urls} failed URLs")
        print(f"Cleaned {old_pages} old pages and {failed_urls} failed URLs")
    except Exception as e:
        logger.error(f"Error cleaning data: {e}")
        print(f"Error cleaning data: {e}")


def export_data(export_format: str, output_file: str) -> None:
    """
    Export crawler data.

    Args:
        export_format: Format to export (json, csv)
        output_file: Output file path
    """
    try:
        if crawler is None:
            initialize_crawler()

        # Get MongoDB connection
        db = crawler.db

        # Get data (exclude MongoDB's internal _id field)
        pages = list(db.pages_collection.find({}, {'_id': 0}))
        urls = list(db.urls_collection.find({}, {'_id': 0}))
        stats = list(db.stats_collection.find({}, {'_id': 0}))

        # Prepare export payload
        payload = {
            'metadata': {
                'exported_at': datetime.datetime.now().isoformat(),
                'pages_count': len(pages),
                'urls_count': len(urls),
                'stats_count': len(stats),
            },
            'pages': pages,
            'urls': urls,
            'stats': stats,
        }

        # Round-trip through json with default=str to turn datetimes and any
        # other non-JSON-serializable values into strings
        payload = json.loads(json.dumps(payload, default=str))

        # Export based on format
        if export_format.lower() == 'json':
            with open(output_file, 'w') as f:
                json.dump(payload, f, indent=2)
            logger.info(f"Data exported to {output_file} in JSON format")
            print(f"Data exported to {output_file} in JSON format")
        elif export_format.lower() == 'csv':
            # Split export into multiple CSV files, one per collection
            base_name = os.path.splitext(output_file)[0]
            for name, rows in (('pages', pages), ('urls', urls), ('stats', stats)):
                if not rows:
                    continue
                # Documents may have heterogeneous keys; use their union so
                # DictWriter never raises on an unexpected field
                fieldnames = sorted({key for row in rows for key in row})
                with open(f"{base_name}_{name}.csv", 'w', newline='') as f:
                    writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
                    writer.writeheader()
                    writer.writerows(rows)
            logger.info(f"Data exported to {base_name}_*.csv files in CSV format")
            print(f"Data exported to {base_name}_*.csv files in CSV format")
        else:
            logger.error(f"Unsupported export format: {export_format}")
            print(f"Unsupported export format: {export_format}")
    except Exception as e:
        logger.error(f"Error exporting data: {e}")
        print(f"Error exporting data: {e}")


def set_max_depth(depth: str) -> None:
    """
    Set maximum crawl depth.

    Args:
        depth: Maximum crawl depth (string from the CLI, parsed as int)
    """
    try:
        depth = int(depth)
        if depth < 0:
            logger.error("Depth must be a non-negative integer")
            print("Depth must be a non-negative integer")
            return

        # Update configuration
        config.MAX_DEPTH = depth
        logger.info(f"Maximum crawl depth set to {depth}")
        print(f"Maximum crawl depth set to {depth}")
    except ValueError:
        logger.error("Depth must be a valid integer")
        print("Depth must be a valid integer")


def add_seed_urls(urls: List[str]) -> None:
    """
    Add seed URLs to the crawler.

    Args:
        urls: List of URLs to add
    """
    if crawler is None:
        initialize_crawler()
    num_added = crawler.add_seed_urls(urls)
    logger.info(f"Added {num_added} seed URLs")
    print(f"Added {num_added} seed URLs")


def handle_signal(sig, frame):
    """Handle signal interrupts."""
    if sig in (signal.SIGINT, signal.SIGTERM):
        logger.info(f"Received {signal.Signals(sig).name}, stopping crawler")
        stop_crawler()
        sys.exit(0)


def main():
    """Main entry point."""
    # Register signal handlers
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Parse arguments
    args = docopt(__doc__, version='Web Crawler 1.0')
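    # docopt maps commands to booleans and options to their values, e.g. for
    # `crawl.py start --workers=8 --seed=https://example.com` it returns
    # {'start': True, '--workers': '8', '--seed': ['https://example.com'], ...}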

    # Handle commands
    if args['start']:
        workers = int(args['--workers'])
        async_mode = args['--async']
        seed_urls = args['--seed'] or []
        start_crawler(workers, async_mode, seed_urls)
    elif args['stop']:
        stop_crawler()
    elif args['pause']:
        pause_crawler()
    elif args['resume']:
        resume_crawler()
    elif args['stats']:
        show_stats()
    elif args['clean']:
        days = int(args['--days'])
        clean_data(days)
    elif args['export']:
        export_format = args['--format']
        output_file = args['--output']
        export_data(export_format, output_file)
    elif args['set-max-depth']:
        depth = args['<depth>']
        set_max_depth(depth)
    elif args['add-seed']:
        urls = args['<url>']
        add_seed_urls(urls)
    else:
        print(__doc__)


if __name__ == '__main__':
    main()