#!/usr/bin/env python3
"""
Main script to run the web crawler with command line arguments
"""
import os
import sys
import time
import logging
import argparse
import signal
from urllib.parse import urlparse

# Add the current directory to path if needed
script_dir = os.path.dirname(os.path.abspath(__file__))
if script_dir not in sys.path:
    sys.path.insert(0, script_dir)

# Configure logging - do this first
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(name)s] %(levelname)s: %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(os.path.join(script_dir, 'crawler.log'))
    ]
)
logger = logging.getLogger("run_crawler")

# Now import the crawler components
logger.info("Importing crawler modules...")
try:
    from crawler import Crawler
    from models import Priority
    logger.info("Successfully imported crawler modules")
except Exception as e:
    logger.error(f"Error importing crawler modules: {e}", exc_info=True)
    sys.exit(1)
def parse_arguments():
    """Parse command line arguments"""
    parser = argparse.ArgumentParser(description='Run the web crawler with custom settings')
    parser.add_argument('--seed', nargs='+', metavar='URL',
                        help='One or more seed URLs to start crawling from')
    parser.add_argument('--depth', type=int, default=None,
                        help='Maximum crawl depth')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of worker threads')
    parser.add_argument('--delay', type=float, default=None,
                        help='Delay between requests to the same domain (in seconds)')
    parser.add_argument('--respect-robots', dest='respect_robots', action='store_true',
                        help='Respect robots.txt rules')
    parser.add_argument('--ignore-robots', dest='respect_robots', action='store_false',
                        help='Ignore robots.txt rules')
    # Default to None so the config value is only overridden when one of the
    # two robots flags is passed explicitly
    parser.set_defaults(respect_robots=None)
    parser.add_argument('--user-agent', type=str, default=None,
                        help='User agent to use for requests')
    parser.add_argument('--async', dest='async_mode', action='store_true',
                        help='Use async mode for requests')
    parser.add_argument('--domain-filter', type=str, default=None,
                        help='Only crawl URLs that match this domain')
    parser.add_argument('--reset-db', action='store_true',
                        help='Reset MongoDB and flush Redis data before starting')
    parser.add_argument('--verbose', action='store_true',
                        help='Enable verbose logging')
    args = parser.parse_args()

    # Set log level based on the verbose flag
    if args.verbose:
        logger.setLevel(logging.DEBUG)
        logger.debug("Verbose logging enabled")
    return args
def reset_databases():
    """Reset MongoDB and flush Redis data"""
    success = True

    # Reset MongoDB
    try:
        logger.info("Starting MongoDB cleanup...")
        from mongo_cleanup import cleanup_mongodb
        mongo_success = cleanup_mongodb()
        if not mongo_success:
            logger.warning("MongoDB cleanup may not have been completely successful")
            success = False
        else:
            logger.info("MongoDB cleanup completed successfully")
    except Exception as e:
        logger.error(f"Error cleaning up MongoDB: {e}", exc_info=True)
        success = False

    # Flush Redis
    try:
        logger.info("Starting Redis flush...")
        import redis
        logger.debug("Connecting to Redis to flush data...")
        # Set a timeout for the Redis connection
        r = redis.Redis(host='localhost', port=6379, db=0, socket_timeout=5)
        # Check if Redis is available
        try:
            logger.debug("Testing Redis connection...")
            ping_result = r.ping()
            logger.debug(f"Redis ping result: {ping_result}")
            # If the connection works, flush all data
            logger.info("Flushing all Redis data...")
            result = r.flushall()
            logger.info(f"Redis flush result: {result}")
        except redis.ConnectionError as e:
            logger.error(f"Redis connection error: {e}")
            success = False
    except Exception as e:
        logger.error(f"Error flushing Redis: {e}", exc_info=True)
        success = False

    return success
def setup_signal_handlers(crawler_instance):
    """Set up signal handlers for graceful shutdown"""
    def signal_handler(sig, frame):
        logger.info(f"Received signal {sig}, shutting down gracefully...")
        if crawler_instance and crawler_instance.running:
            logger.info("Stopping crawler...")
            crawler_instance.stop()
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
def run_crawler():
    """Run the crawler with command-line arguments"""
    args = parse_arguments()
    crawler = None
    try:
        logger.info("Starting the web crawler...")

        # Reset databases if requested
        if args.reset_db:
            logger.info("Resetting MongoDB and flushing Redis data...")
            if not reset_databases():
                logger.warning("Database reset was not completely successful")

        # Create crawler instance
        logger.info("Creating crawler instance...")
        crawler = Crawler()
        logger.info("Crawler instance created successfully")

        # Setup signal handlers
        setup_signal_handlers(crawler)

        # Override settings from the command line if provided
        import config
        if args.depth is not None:
            config.MAX_DEPTH = args.depth
            logger.info(f"Setting maximum depth to {args.depth}")
        if args.delay is not None:
            config.DELAY_BETWEEN_REQUESTS = args.delay
            logger.info(f"Setting delay between requests to {args.delay} seconds")
        if args.respect_robots is not None:
            config.RESPECT_ROBOTS_TXT = args.respect_robots
            logger.info(f"Respect robots.txt: {args.respect_robots}")
        if args.user_agent is not None:
            config.USER_AGENT = args.user_agent
            logger.info(f"Using user agent: {args.user_agent}")

        # Add seed URLs if provided
        if args.seed:
            logger.info(f"Adding {len(args.seed)} seed URLs")
            seed_urls = []
            for url in args.seed:
                # Assume HTTPS when no scheme is given
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                seed_urls.append(url)
                logger.debug(f"Added seed URL: {url}")

            # Add the URLs to the frontier
            logger.info("Adding seed URLs to frontier...")
            added = crawler.add_seed_urls(seed_urls, Priority.VERY_HIGH)
            logger.info(f"Successfully added {added} seed URLs to the frontier")

        # Apply the domain filter if provided
        if args.domain_filter:
            # Allow both domain.com and http://domain.com formats
            domain = args.domain_filter
            if domain.startswith(('http://', 'https://')):
                domain = urlparse(domain).netloc
            config.ALLOWED_DOMAINS = [domain]
            logger.info(f"Filtering to domain: {domain}")

        # Start the crawler
        num_workers = args.workers if args.workers is not None else 4
        logger.info(f"Starting crawler with {num_workers} workers...")
        crawler.start(num_workers=num_workers, async_mode=args.async_mode)

        # If we get here, the crawler has finished or was stopped
        logger.info("Crawler finished")
    except KeyboardInterrupt:
        logger.info("Crawler interrupted by user")
        if crawler and crawler.running:
            logger.info("Stopping crawler...")
            crawler.stop()
    except Exception as e:
        logger.error(f"Error running crawler: {e}", exc_info=True)
        if crawler and crawler.running:
            try:
                logger.info("Attempting to stop crawler after error...")
                crawler.stop()
            except Exception:
                pass


if __name__ == "__main__":
    run_crawler()
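# Programmatic use, for reference (a minimal sketch; assumes the Crawler class
# exposes the same add_seed_urls/start/stop interface called above):
#     from crawler import Crawler
#     from models import Priority
#     c = Crawler()
#     c.add_seed_urls(["https://example.com"], Priority.VERY_HIGH)
#     c.start(num_workers=2, async_mode=False)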