phishing-detector-api / url_tools.py
kokluch's picture
Fix resolving shortened URLs so that redirects are followed to the last redirection.
c5e53a5
import re
from urllib.parse import urljoin, urlparse, urlunparse

import httpx
def extract_urls(text: str):
    """
    Find every URL-like substring in raw text.

    A match may begin with 'http://', 'https://' or 'www.' (all optional),
    must contain a dotted host name ending in at least two ASCII letters,
    and may carry a path that runs until whitespace, an angle bracket or a
    quote character.

    Parameters:
        text (str): The text to scan.

    Returns:
        list: The matched substrings, in order of appearance.
    """
    matcher = re.compile(
        r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    )
    # The pattern has no capturing groups, so each hit is the whole match.
    return [hit.group(0) for hit in matcher.finditer(text)]
def extract_domain_from_url(url: str) -> str:
    """
    Return the network location of the URL that `url` resolves to.

    The URL is first passed through resolve_short_url(), and the netloc
    component of the result is returned. Two trace lines are printed to
    stdout: the redirect mapping and the extracted domain.

    Parameters:
        url (str): The URL to inspect.

    Returns:
        str: The netloc of the resolved URL (e.g., 'example.com').
    """
    final_url = resolve_short_url(url)
    print(f"redirect: {url} -> {final_url}")

    host = urlparse(final_url).netloc
    print(f"domain: {final_url} -> {host}")
    return host
def normalize_url(url: str) -> str:
    """
    Return `url` with a scheme and network location, defaulting to http.

    The URL is parsed with 'http' as the fallback scheme. If that parse
    yields no netloc (e.g. a bare 'example.com/path'), 'http://' is
    prepended to the original string and it is parsed again.

    Parameters:
        url (str): The URL to normalize; the scheme may be missing.

    Returns:
        str: The URL reassembled from its parsed components.
    """
    parts = urlparse(url, scheme="http")
    if parts.netloc:
        return urlunparse(parts)
    # Without a scheme the host lands in the path, so re-parse with one.
    return urlunparse(urlparse("http://" + url))
def resolve_short_url(url: str, max_redirects: int = 10) -> str:
    """
    Follow HTTP redirects with HEAD requests and return the last URL reached.

    The URL is normalized, then probed with HEAD requests (automatic
    redirect following disabled) so each hop can be inspected. Following
    stops at the first response that is not a 301/302/303/307/308, when a
    redirect carries no usable Location header, when the target is not an
    http(s) URL, when a URL repeats (redirect loop), or after
    `max_redirects` hops.

    Parameters:
        url (str): The URL to resolve; a missing scheme defaults to http.
        max_redirects (int): Upper bound on the number of hops followed.

    Returns:
        str: The last URL reached. If a request fails, the error is printed
        and the URL that was being requested is returned.
    """
    current = normalize_url(url)
    seen = {current}
    try:
        # One client for the whole chain so connections can be reused.
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            for _ in range(max_redirects):
                response = client.head(current, headers={"User-Agent": "Mozilla/5.0"})
                if response.status_code not in {301, 302, 303, 307, 308}:
                    break  # No redirect: `current` is the final URL.
                location = response.headers.get("location")
                if not location:
                    break  # Redirect status without a target; cannot go further.
                # Location may be relative; resolve it against the current URL.
                next_url = urljoin(current, location)
                if urlparse(next_url).scheme not in {"http", "https"}:
                    break  # e.g. mailto: or app links cannot be probed over HTTP.
                if next_url in seen:
                    break  # Redirect loop.
                seen.add(next_url)
                current = next_url
    except (httpx.RequestError, httpx.InvalidURL) as e:
        # InvalidURL is not a RequestError subclass; catch it explicitly so a
        # malformed URL is reported instead of crashing the caller.
        print(f"Error: {e}")
    return current