Spaces:
Running
Running
File size: 1,577 Bytes
d9f1916 d316383 c5e53a5 d9f1916 c5e53a5 d9f1916 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import re
from urllib.parse import urljoin, urlparse, urlunparse

import httpx
def extract_urls(text: str):
    """Return every URL-like substring found in ``text``, in order.

    A match consists of an optional ``http://``, ``https://`` or ``www.``
    prefix, a dotted host name whose last label is two or more letters,
    and an optional path that runs up to the next whitespace character,
    angle bracket, or quote.

    Parameters:
    text (str): Raw text to scan.

    Returns:
    list: The matched substrings; empty when nothing matches.
    """
    matcher = re.compile(
        r"""(?:(?:https?:\/\/|www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})(?:\/[^\s<>"']*)?"""
    )
    # The pattern has no capturing groups, so group(0) of each match is
    # exactly what re.findall would return.
    return [hit.group(0) for hit in matcher.finditer(text)]
def extract_domain_from_url(url: str) -> str:
    """
    Resolve a URL and return the network location of the result.

    The URL is first passed through ``resolve_short_url`` (which may issue
    HTTP requests), and the ``netloc`` component of the resolved URL is
    returned. Two diagnostic lines are printed along the way: the
    original-to-resolved mapping and the resolved-to-domain mapping.

    Parameters:
    url (str): The full (possibly shortened) URL.

    Returns:
    str: The netloc of the resolved URL (e.g., 'example.com').
    """
    resolved = resolve_short_url(url)
    print(f"redirect: {url} -> {resolved}")

    netloc = urlparse(resolved).netloc
    print(f"domain: {resolved} -> {netloc}")
    return netloc
def normalize_url(url: str) -> str:
    """Return ``url`` with a scheme, defaulting to ``http``.

    The URL is parsed with ``http`` as the fallback scheme. If that parse
    finds a network location, the result is reassembled as-is. Otherwise
    (e.g. a bare ``example.com/path``) ``http://`` is prepended to the
    original string and it is parsed again before reassembly.

    Parameters:
    url (str): The URL to normalize, with or without a scheme.

    Returns:
    str: The reassembled URL string.
    """
    candidate = urlparse(url, scheme="http")
    if candidate.netloc:
        return urlunparse(candidate)
    # No host was recognised: treat the whole input as "host[/path]".
    return urlunparse(urlparse("http://" + url))
def resolve_short_url(url: str, max_redirects: int = 10) -> str:
    """Follow a URL's redirect chain with HEAD requests and return the final URL.

    The URL is normalized, then requested with HEAD (automatic redirect
    following disabled). While the response status is one of 301, 302,
    303, 307 or 308 and carries a ``Location`` header, that target is
    requested next. The walk stops at the first non-redirect response, at
    a redirect without a ``Location`` header, when a target has already
    been visited (redirect loop), or after ``max_redirects`` hops.

    Parameters:
    url (str): The URL to resolve, with or without a scheme.
    max_redirects (int): Upper bound on the number of redirects followed.

    Returns:
    str: The last URL reached. If a request fails with
    ``httpx.RequestError``, the error is printed and the URL reached so
    far is returned.
    """
    current = normalize_url(url)
    visited = {current}
    try:
        # One client is reused for the whole chain instead of one per hop.
        with httpx.Client(follow_redirects=False, timeout=5) as client:
            for _ in range(max_redirects):
                response = client.head(current, headers={"User-Agent": "Mozilla/5.0"})
                if response.status_code not in {301, 302, 303, 307, 308}:
                    break  # No redirect
                location = response.headers.get("location")
                if not location:
                    # Redirect status without a target: nothing to follow.
                    break
                # Location may be relative; resolve it against the URL
                # that produced it.
                target = urljoin(current, location)
                if target in visited:
                    # Redirect loop: stop instead of cycling forever.
                    break
                visited.add(target)
                current = target
    except httpx.RequestError as e:
        print(f"Error: {e}")
    return current