Spaces:
Running
Running
import re | |
from urllib.parse import urlparse, urlunparse | |
import httpx | |
def extract_urls(text: str):
    """Return every URL-like substring of ``text``, in order of appearance.

    A match is an optional ``http://``/``https://``/``www.`` prefix, a dotted
    host name ending in at least two letters, and an optional path that runs
    until the next whitespace, angle bracket, or quote character.
    """
    matcher = re.compile(
        r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    )
    # The pattern has no capturing groups, so group(0) is the whole match.
    return [hit.group(0) for hit in matcher.finditer(text)]
def extract_domain_from_url(url: str) -> str:
    """
    Resolve redirects for a URL and return the network location of the result.

    The URL is first passed through ``resolve_short_url`` (which issues HTTP
    requests), then parsed; both intermediate results are printed for tracing.

    Parameters:
        url (str): The URL to inspect; may be a shortened link.
    Returns:
        str: The ``netloc`` component of the resolved URL
            (e.g., 'example.com').
    """
    redirect_url = resolve_short_url(url)
    print(f"redirect: {url} -> {redirect_url}")

    # netloc is taken exactly as urlparse reports it for the resolved URL.
    domain = urlparse(redirect_url).netloc
    print(f"domain: {redirect_url} -> {domain}")
    return domain
def normalize_url(url: str) -> str:
    """Return ``url`` with a scheme, defaulting to ``http``.

    Input that already parses with a network location is re-serialised as-is
    (with ``http`` filled in when the scheme is missing). Anything else, such
    as a bare ``example.com/path``, is re-parsed with ``http://`` prepended.
    """
    first_pass = urlparse(url, scheme="http")
    if first_pass.netloc:
        return urlunparse(first_pass)

    # Without a leading "//" the host lands in the path (or is mistaken for a
    # scheme), so prepend an explicit scheme and parse again.
    return urlunparse(urlparse("http://" + url))
def resolve_short_url(url: str, max_redirects: int = 10) -> str:
    """Follow HTTP redirects hop by hop with HEAD requests and return the final URL.

    Automatic redirect following is disabled; each ``Location`` header is
    read and followed manually so the chain can be bounded.

    Parameters:
        url (str): The URL to resolve; a missing scheme is filled in by
            ``normalize_url``.
        max_redirects (int): Maximum number of redirect hops to follow before
            giving up and returning the last URL reached.
    Returns:
        str: The last URL reached. This is the normalized input when there is
            no redirect, and the URL of the failing hop when a request error
            occurs (best effort, never raises for network failures).
    """
    # Local import: the file-level import only brings in urlparse/urlunparse.
    from urllib.parse import urljoin

    redirect_statuses = {301, 302, 303, 307, 308}
    url = normalize_url(url)
    try:
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            # A bounded loop instead of recursion, so a redirect cycle
            # (A -> B -> A) terminates instead of exhausting the stack.
            for _ in range(max_redirects):
                response = client.head(url, headers={"User-Agent": "Mozilla/5.0"})
                if response.status_code not in redirect_statuses:
                    break  # No redirect
                location = response.headers.get("location")
                if not location:
                    break  # Redirect status without a target: stop here
                # Location may be relative (e.g. "/path"); resolve it against
                # the URL that produced it before normalizing.
                url = normalize_url(urljoin(url, location))
    except (httpx.RequestError, httpx.InvalidURL) as e:
        # InvalidURL is not a RequestError subclass; catch it too so malformed
        # input keeps the same best-effort behaviour as network failures.
        print(f"Error: {e}")
    return url