| """ |
| Date Parsing and Normalization Utility |
| FAANG-Level Quality Control for Published Dates |
| |
| Ensures all dates are in strict ISO-8601 UTC format for reliable sorting. |
| """ |
|
|
| from typing import Optional |
| from datetime import datetime, timezone |
| import re |
| from dateutil import parser as dateutil_parser |
| from dateutil.tz import tzutc |
|
|
|
|
def parse_date_to_iso(date_str: str) -> str:
    """
    Parse a date string and convert it to strict ISO-8601 UTC.

    Handles:
    - ISO-8601: "2026-01-22T05:58:33Z"
    - RFC-822: "Mon, 22 Jan 2026 05:58:33 GMT"
    - Unix timestamps: "1737525513" (seconds, 9-10 digits) or
      "1737525513000" (milliseconds, 13 digits)
    - Any other format accepted by ``dateutil.parser.parse``

    Not handled: relative phrases such as "2 hours ago" or "yesterday".
    They are not interpreted as offsets from now; when parsing fails they
    take the fallback path described below.

    Timezone-aware inputs are converted to UTC. Naive inputs (no timezone
    information) are assumed to already be in UTC.

    Args:
        date_str: Raw date text. An empty or None value means "unknown".

    Returns:
        A string shaped like "2026-01-22T05:58:33.000Z": always UTC and
        always millisecond precision. If the input is empty or cannot be
        parsed, the current UTC time is returned instead (a warning is
        printed in the unparseable case).
    """

    def to_utc_iso(moment: datetime) -> str:
        # Fixed millisecond precision keeps every value the same width, so
        # plain string sorting matches chronological order and the result
        # always satisfies the "YYYY-MM-DDTHH:MM:SS.sssZ" shape.
        return moment.isoformat(timespec='milliseconds').replace('+00:00', 'Z')

    if not date_str:
        return to_utc_iso(datetime.now(timezone.utc))

    try:
        # Only str input is inspected for epoch values; anything else is
        # handed to dateutil untouched, exactly as before.
        epoch_text = date_str.strip() if isinstance(date_str, str) else ''

        if re.fullmatch(r'[0-9]{9,10}', epoch_text):
            # Epoch seconds. dateutil rejects digit runs of this length, so
            # they are decoded here rather than falling back to "now".
            parsed_date = datetime.fromtimestamp(int(epoch_text), tz=timezone.utc)
        elif re.fullmatch(r'[0-9]{13}', epoch_text):
            # Epoch milliseconds. Split with integer arithmetic to avoid
            # float rounding in the millisecond digits.
            seconds, millis = divmod(int(epoch_text), 1000)
            parsed_date = datetime.fromtimestamp(seconds, tz=timezone.utc)
            parsed_date = parsed_date.replace(microsecond=millis * 1000)
        else:
            parsed_date = dateutil_parser.parse(date_str)

        if parsed_date.tzinfo is not None:
            parsed_date = parsed_date.astimezone(timezone.utc)
        else:
            # No timezone in the input: treat the wall-clock value as UTC.
            parsed_date = parsed_date.replace(tzinfo=timezone.utc)

        return to_utc_iso(parsed_date)

    except Exception as e:
        # Deliberate best-effort boundary: a bad date must never abort the
        # caller, so any failure degrades to the current time.
        print(f"⚠️ Date parsing failed for '{date_str}': {e}. Using current time.")
        return to_utc_iso(datetime.now(timezone.utc))
|
|
|
|
def normalize_article_date(article):
    """
    Normalize the publishedAt field in an article.

    HOTFIX (2026-01-23): Now handles both Pydantic Article models AND dicts

    The input is converted to a dict first: objects exposing ``model_dump()``
    or ``dict()`` (e.g. Pydantic models) are dumped, plain dicts are
    shallow-copied, and anything else is passed to ``dict()``. A dict passed
    in is therefore never mutated.

    The date is read from 'publishedAt', falling back to 'published_at':
    - datetime: converted to UTC; a naive datetime is assumed to already be
      in UTC (the same rule parse_date_to_iso applies to naive strings).
    - non-empty str: delegated to parse_date_to_iso.
    - missing, empty, or any other type: replaced by the current UTC time.

    Args:
        article: Article model or dict

    Returns:
        dict with both 'publishedAt' and 'published_at' set to the same
        normalized ISO-8601 UTC string
    """
    if hasattr(article, 'model_dump'):
        article_dict = article.model_dump()
    elif hasattr(article, 'dict'):
        article_dict = article.dict()
    elif isinstance(article, dict):
        article_dict = article.copy()
    else:
        article_dict = dict(article)

    published_at = article_dict.get('publishedAt') or article_dict.get('published_at')

    if isinstance(published_at, str) and published_at:
        iso_date = parse_date_to_iso(published_at)
    else:
        if isinstance(published_at, datetime):
            if published_at.utcoffset() is None:
                # astimezone() would interpret a naive value as machine-local
                # time, making the result depend on the host's timezone.
                moment = published_at.replace(tzinfo=timezone.utc)
            else:
                moment = published_at.astimezone(timezone.utc)
        else:
            # Missing, empty, or unsupported type: fall back to "now".
            moment = datetime.now(timezone.utc)
        # Fixed millisecond precision so the value always has the
        # "YYYY-MM-DDTHH:MM:SS.sssZ" shape.
        iso_date = moment.isoformat(timespec='milliseconds').replace('+00:00', 'Z')

    # Write both spellings so camelCase and snake_case consumers agree.
    article_dict['publishedAt'] = iso_date
    article_dict['published_at'] = iso_date

    return article_dict
|
|
|
|
def validate_date_format(date_str: str) -> bool:
    """
    Validate that a date string is in strict ISO-8601 UTC format.

    Expected format: "YYYY-MM-DDTHH:MM:SS.sssZ" or "YYYY-MM-DDTHH:MM:SSZ"

    The whole string must match (no trailing newline or other characters),
    only ASCII digits are accepted, and the fields must form a real calendar
    date and time (e.g. month 13 or hour 99 is rejected).

    Args:
        date_str: The candidate string. Empty, None, and non-string values
            are treated as invalid.

    Returns: True if valid, False otherwise
    """
    if not date_str or not isinstance(date_str, str):
        return False

    iso_pattern = r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{3})?Z'

    # fullmatch: '$' with re.match would still accept a trailing "\n".
    # re.ASCII: keeps \d from matching non-ASCII Unicode digits.
    if re.fullmatch(iso_pattern, date_str, flags=re.ASCII) is None:
        return False

    try:
        # The regex only checks the shape; parsing the first 19 characters
        # ("YYYY-MM-DDTHH:MM:SS") confirms the values are in range.
        datetime.strptime(date_str[:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        return False

    return True
|
|
|
|
| |
# Names exported by a star-import of this module.
__all__ = ["parse_date_to_iso", "normalize_article_date", "validate_date_format"]
|
|