# Spaces: Running
import logging | |
import re | |
from typing import Dict, Any, Optional | |
from bs4 import BeautifulSoup | |
from datetime import datetime, date | |
from .horoscope_scraper import HoroscopeScraper | |
logger = logging.getLogger(__name__) | |
class HoroscopeComScraper(HoroscopeScraper):
    """Scraper for Horoscope.com daily horoscopes.

    Supplies horoscope.com-specific URL building, prediction extraction and
    date extraction. Prediction extraction falls back to the base-class
    implementation when none of the site-specific selectors yield text.
    """

    # Numeric sign ids used in horoscope.com's ``sign`` query parameter.
    _SIGN_IDS = {
        "aries": 1, "taurus": 2, "gemini": 3, "cancer": 4,
        "leo": 5, "virgo": 6, "libra": 7, "scorpio": 8,
        "sagittarius": 9, "capricorn": 10, "aquarius": 11, "pisces": 12,
    }
    # Sign id used when the requested sign is not recognized (Aries).
    _DEFAULT_SIGN_ID = 1

    # English month names as they appear in page headings.
    _MONTH_NUMBERS = {
        'january': 1, 'february': 2, 'march': 3, 'april': 4,
        'may': 5, 'june': 6, 'july': 7, 'august': 8,
        'september': 9, 'october': 10, 'november': 11, 'december': 12,
    }

    # ``laDate=YYYYMMDD`` query parameter produced by ``_format_url``.
    _URL_DATE_RE = re.compile(r'laDate=(\d{8})')
    # "<word> <day>, <year>", e.g. "May 13, 2025" (comma optional).
    _HEADING_DATE_RE = re.compile(r'(\w+)\s+(\d{1,2}),?\s+(\d{4})')

    def __init__(self, timeout: int = 30):
        """Initialize the scraper.

        Args:
            timeout: Forwarded unchanged to ``HoroscopeScraper.__init__``.
        """
        super().__init__(timeout)
        self.source_name = "Horoscope.com"
        self.base_url = "https://www.horoscope.com/us/horoscopes/general/horoscope-general-daily-today.aspx"

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """Build the horoscope.com URL for a sign and optional date.

        Args:
            base_url: Accepted for interface compatibility. The URL is always
                built from ``self.base_url``; this argument is not used.
            sign: Zodiac sign name (case-insensitive, surrounding whitespace
                ignored). An unrecognized sign is logged and replaced by the
                default sign id.
            date_str: Optional date in ``YYYY-MM-DD`` format. If missing or
                unparseable, the URL carries no ``laDate`` parameter.

        Returns:
            The request URL with a ``sign`` parameter and, when a valid date
            was given, a ``laDate=YYYYMMDD`` parameter.
        """
        sign_id = self._SIGN_IDS.get(sign.strip().lower())
        if sign_id is None:
            # Keep the historical fallback, but make it visible in the logs
            # instead of silently serving another sign's horoscope.
            logger.warning(
                "Unknown zodiac sign %r; defaulting to sign id %d",
                sign, self._DEFAULT_SIGN_ID,
            )
            sign_id = self._DEFAULT_SIGN_ID

        url = f"{self.base_url}?sign={sign_id}"

        if date_str:
            try:
                # Convert YYYY-MM-DD to the format the site expects (YYYYMMDD).
                formatted_date = datetime.strptime(date_str, '%Y-%m-%d').strftime('%Y%m%d')
            except (ValueError, TypeError) as exc:
                logger.error("Error formatting date: %s", exc)
            else:
                return f"{url}&laDate={formatted_date}"

        # No usable date: the URL without ``laDate`` is used.
        return url

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """Extract the horoscope prediction text from a horoscope.com page.

        Tries, in order: paragraphs under ``.main-horoscope``, the
        ``#textline`` element, then the base-class extraction. A selector that
        matches only empty elements is treated as a miss so the next strategy
        still gets a chance.

        Args:
            soup: Parsed page.
            text_content: Passed through to the base-class fallback.

        Returns:
            The prediction text.
        """
        paragraphs = (p.get_text().strip() for p in soup.select('.main-horoscope p'))
        prediction = ' '.join(text for text in paragraphs if text)
        if prediction:
            return prediction

        # Alternative selector.
        for node in soup.select('#textline'):
            text = node.get_text().strip()
            if text:
                return text

        # Fallback to generic extraction.
        return super()._extract_prediction(soup, text_content)

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """Determine the date the horoscope applies to.

        Tries, in order: the ``laDate`` parameter in ``url``, a
        "Month D, YYYY" date in any ``h1``/``h2`` under ``.main-horoscope``,
        then today's date.

        Args:
            soup: Parsed page.
            url: The URL the page was fetched from.

        Returns:
            The date as an ISO ``YYYY-MM-DD`` string.
        """
        url_match = self._URL_DATE_RE.search(url)
        if url_match:
            try:
                return datetime.strptime(url_match.group(1), '%Y%m%d').date().isoformat()
            except ValueError:
                # Eight digits that are not a real calendar date; try the page.
                pass

        for heading in soup.select('.main-horoscope h1, .main-horoscope h2'):
            parsed = self._parse_heading_date(heading.get_text().strip())
            if parsed is not None:
                return parsed

        # Default to today's date if no date found.
        return date.today().isoformat()

    @classmethod
    def _parse_heading_date(cls, heading_text: str) -> Optional[str]:
        """Return the first valid "Month D, YYYY" date in ``heading_text``.

        Candidates whose month word is not an English month name, or whose
        day/year do not form a real calendar date, are skipped rather than
        coerced into a made-up date.

        Returns:
            The date as ``YYYY-MM-DD``, or ``None`` if no valid date is found.
        """
        for match in cls._HEADING_DATE_RE.finditer(heading_text):
            month_name, day, year = match.groups()
            month_num = cls._MONTH_NUMBERS.get(month_name.lower())
            if month_num is None:
                continue
            try:
                return date(int(year), month_num, int(day)).isoformat()
            except ValueError:
                continue
        return None