File size: 5,950 Bytes
6f509ec
 
 
 
 
 
 
 
 
 
13e0903
6f509ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13e0903
6f509ec
 
 
 
 
 
13e0903
6f509ec
 
 
 
 
 
 
13e0903
 
 
6f509ec
 
 
 
 
 
 
 
 
 
13e0903
 
 
6f509ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13e0903
 
 
6f509ec
 
 
 
 
 
 
 
 
 
 
 
13e0903
 
 
6f509ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Data models for the web crawler
"""

import time
import hashlib
import tldextract
from urllib.parse import urlparse, urljoin, urlunparse
from datetime import datetime
from typing import Dict, List, Any, Optional, Set, Tuple
from pydantic import BaseModel, Field, HttpUrl, field_validator
from enum import Enum
import logging

logger = logging.getLogger(__name__)


class URLStatus(str, Enum):
    """Status of a URL in the crawl process"""
    PENDING = "pending"  # Not yet processed
    IN_PROGRESS = "in_progress"  # Currently being processed
    COMPLETED = "completed"  # Successfully processed
    FAILED = "failed"  # Failed to process
    FILTERED = "filtered"  # Filtered out based on rules
    ROBOTSTXT_EXCLUDED = "robotstxt_excluded"  # Excluded by robots.txt


class Priority(int, Enum):
    """Priority levels for URLs"""
    VERY_HIGH = 1
    HIGH = 2
    MEDIUM = 3
    LOW = 4
    VERY_LOW = 5


class URL(BaseModel):
    """URL model with metadata for crawling"""
    url: str
    normalized_url: str = ""  # Normalized version of the URL
    domain: str = ""  # Domain extracted from the URL
    depth: int = 0  # Depth from seed URL
    discovered_at: datetime = Field(default_factory=datetime.now)
    last_crawled: Optional[datetime] = None
    completed_at: Optional[datetime] = None  # When the URL was completed/failed
    status: URLStatus = URLStatus.PENDING
    priority: Priority = Priority.MEDIUM
    parent_url: Optional[str] = None  # URL that led to this URL
    retries: int = 0  # Number of times retried
    error: Optional[str] = None  # Error message if failed
    metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata

    @field_validator("normalized_url", mode="before")
    def set_normalized_url(cls, v, values):
        """Normalize the URL if not already set"""
        if not v and "url" in values:
            return normalize_url(values["url"])
        return v

    @field_validator("domain", mode="before")
    def set_domain(cls, v, values):
        """Extract domain from URL if not already set"""
        if not v and "url" in values:
            parsed = tldextract.extract(values["url"])
            return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
        return v

    class Config:
        arbitrary_types_allowed = True


class RobotsInfo(BaseModel):
    """Information from robots.txt for a domain"""
    domain: str
    allowed: bool = True  # Whether crawling is allowed
    crawl_delay: Optional[float] = None  # Crawl delay in seconds
    last_fetched: datetime = Field(default_factory=datetime.now)
    user_agents: Dict[str, Dict[str, Any]] = Field(default_factory=dict)  # Info per user agent
    status_code: Optional[int] = None  # HTTP status code when fetching robots.txt

    class Config:
        arbitrary_types_allowed = True


class Page(BaseModel):
    """Web page model with content and metadata"""
    url: str
    status_code: int
    content: str  # HTML content
    content_type: str
    content_length: int
    content_hash: str  # Hash of the content for duplicate detection
    headers: Dict[str, str] = Field(default_factory=dict)
    links: List[str] = Field(default_factory=list)  # Links extracted from the page
    crawled_at: datetime = Field(default_factory=datetime.now)
    redirect_url: Optional[str] = None  # URL after redirects
    elapsed_time: float = 0.0  # Time taken to fetch the page
    is_duplicate: bool = False  # Whether this is duplicate content
    metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata

    class Config:
        arbitrary_types_allowed = True


class DomainStats(BaseModel):
    """Statistics for a domain"""
    domain: str
    pages_crawled: int = 0
    successful_crawls: int = 0
    failed_crawls: int = 0
    last_crawled: Optional[datetime] = None
    robots_info: Optional[RobotsInfo] = None
    crawl_times: List[float] = Field(default_factory=list)  # Recent crawl times
    errors: Dict[int, int] = Field(default_factory=dict)  # Status code counts for errors

    class Config:
        arbitrary_types_allowed = True


def normalize_url(url: str) -> str:
    """
    Normalize a URL by:
    1. Converting to lowercase
    2. Removing fragments
    3. Removing default ports
    4. Sorting query parameters
    5. Removing trailing slashes
    6. Adding scheme if missing
    """
    try:
        # Parse URL
        parsed = urlparse(url)
        
        # Add scheme if missing
        if not parsed.scheme:
            url = 'http://' + url
            parsed = urlparse(url)
        
        # Get domain and path
        domain = parsed.netloc.lower()
        path = parsed.path
        
        # Remove default ports
        if ':' in domain:
            domain_parts = domain.split(':')
            if (parsed.scheme == 'http' and domain_parts[1] == '80') or \
               (parsed.scheme == 'https' and domain_parts[1] == '443'):
                domain = domain_parts[0]
        
        # Sort query parameters
        query = parsed.query
        if query:
            query_params = sorted(query.split('&'))
            query = '&'.join(query_params)
        
        # Remove trailing slashes from path
        while path.endswith('/') and len(path) > 1:
            path = path[:-1]
            
        # Add leading slash if missing
        if not path:
            path = '/'
        
        # Reconstruct URL
        normalized = f"{parsed.scheme}://{domain}{path}"
        if query:
            normalized += f"?{query}"
            
        logger.debug(f"Normalized URL: {url} -> {normalized}")
        return normalized
        
    except Exception as e:
        logger.error(f"Error normalizing URL {url}: {e}")
        return url


def calculate_content_hash(content: str) -> str:
    """Calculate hash of content for duplicate detection"""
    return hashlib.md5(content.encode('utf-8')).hexdigest()