File size: 1,577 Bytes
d9f1916
 
 
 
 
 
 
 
 
d316383
 
 
 
 
 
 
 
 
 
c5e53a5
 
 
 
 
 
d9f1916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5e53a5
 
d9f1916
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
from urllib.parse import urljoin, urlparse, urlunparse

import httpx

def extract_urls(text: str):
    """Return every URL-like substring of ``text``, in order of appearance.

    A match consists of an optional ``http://``, ``https://`` or ``www.``
    prefix, a dotted host name ending in at least two ASCII letters, and an
    optional path that runs up to the next whitespace, angle bracket, or
    quote character. Bare host names such as ``example.com`` match too.
    """
    matcher = re.compile(
        r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    )
    # The pattern has no capturing groups, so each hit is the full match text.
    return [hit.group(0) for hit in matcher.finditer(text)]

def extract_domain_from_url(url: str) -> str:
    """
    Resolves a URL and returns the network location of the result.

    The URL is first passed through ``resolve_short_url``; the returned URL
    is then parsed and its ``netloc`` component is reported. Both steps are
    traced to stdout with ``print``.

    Parameters:
        url (str): The URL to inspect.

    Returns:
        str: The ``netloc`` of the resolved URL (e.g., 'example.com').
    """
    final_url = resolve_short_url(url)
    print(f"redirect: {url} -> {final_url}")
    # netloc is taken verbatim from urlparse, so it is returned unmodified.
    netloc = urlparse(final_url).netloc
    print(f"domain: {final_url} -> {netloc}")
    return netloc

def normalize_url(url: str) -> str:
    """Return ``url`` as a URL string with a scheme, defaulting to ``http``.

    URLs that already parse with a network location are re-serialized as
    they are (gaining the ``http`` scheme if they had none). Anything else is
    re-parsed with an explicit ``http://`` prefix.
    """
    components = urlparse(url, scheme="http")
    if components.netloc:
        return urlunparse(components)
    # Without a leading "//" urlparse does not fill in netloc, so prepend an
    # explicit "http://" and parse again.
    return urlunparse(urlparse("http://" + url))

def resolve_short_url(url: str, *, max_redirects: int = 10) -> str:
    """Follow HTTP redirects with HEAD requests and return the final URL.

    The URL is first passed through ``normalize_url``. Each hop is a HEAD
    request with automatic redirect following disabled; when the response
    status is 301, 302, 303, 307 or 308 and carries a ``Location`` header,
    that location (resolved against the current URL, so relative redirects
    work) becomes the next URL to probe.

    Parameters:
        url (str): The URL to resolve.
        max_redirects (int): Maximum number of redirects to follow before
            giving up, which bounds redirect loops.

    Returns:
        str: The last URL reached. If a request fails, a redirect lacks a
        ``Location`` header, or the redirect limit is hit, the most recent
        URL is returned instead of raising.
    """
    current = normalize_url(url)
    try:
        # One client for the whole chain instead of a new one per hop.
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            for _ in range(max_redirects):
                response = client.head(current, headers={"User-Agent": "Mozilla/5.0"})
                if response.status_code not in {301, 302, 303, 307, 308}:
                    return current  # No redirect
                location = response.headers.get("location")
                if not location:
                    # Redirect status without a target: nothing to follow.
                    return current
                # Location may be relative; resolve it against the current URL.
                current = urljoin(current, location)
    except (httpx.RequestError, httpx.InvalidURL) as e:
        # Best effort: report the failure and return the furthest URL reached.
        print(f"Error: {e}")
        return current
    print(f"Error: stopped after {max_redirects} redirects at {current}")
    return current