# MT564AITraining/scrapers/horoscope_com_scraper.py
# pareshmishra
# Add full project source files for MT564 AI
# 2c72e40
import logging
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
from datetime import datetime, date
from .horoscope_scraper import HoroscopeScraper
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class HoroscopeComScraper(HoroscopeScraper):
    """Scraper for Horoscope.com daily horoscopes.

    Subclass of ``HoroscopeScraper`` that provides the Horoscope.com URL
    scheme (``_format_url``) and site-specific extraction of the prediction
    text (``_extract_prediction``) and the horoscope date (``_extract_date``).
    """

    # Numeric ids horoscope.com uses in the ``sign`` query parameter.
    _SIGN_IDS = {
        "aries": 1, "taurus": 2, "gemini": 3, "cancer": 4,
        "leo": 5, "virgo": 6, "libra": 7, "scorpio": 8,
        "sagittarius": 9, "capricorn": 10, "aquarius": 11, "pisces": 12,
    }

    # English month names in calendar order; the index + 1 is the month number.
    _MONTH_NAMES = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    # ``laDate=YYYYMMDD`` query parameter, as produced by ``_format_url``.
    _URL_DATE_RE = re.compile(r'laDate=(\d{8})')

    # "<word> <day>[,] <year>", e.g. "May 13, 2025".
    _HEADING_DATE_RE = re.compile(r'(\w+)\s+(\d{1,2}),?\s+(\d{4})')

    def __init__(self, timeout: int = 30):
        """Initialise the scraper.

        Args:
            timeout: Value forwarded unchanged to ``HoroscopeScraper.__init__``.
        """
        super().__init__(timeout)
        self.source_name = "Horoscope.com"
        self.base_url = "https://www.horoscope.com/us/horoscopes/general/horoscope-general-daily-today.aspx"

    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
        """Build the horoscope.com URL for a sign and optional date.

        Args:
            base_url: Accepted for interface compatibility; the URL is always
                built from ``self.base_url``.
            sign: Zodiac sign name (case-insensitive, surrounding whitespace
                ignored). Unknown names fall back to Aries (id 1) and a
                warning is logged.
            date_str: Optional date in ``YYYY-MM-DD`` form. When it cannot be
                parsed the error is logged and the undated URL is returned.

        Returns:
            The URL with a ``sign`` parameter and, when ``date_str`` is valid,
            a ``laDate=YYYYMMDD`` parameter.
        """
        sign_id = self._SIGN_IDS.get(sign.strip().lower())
        if sign_id is None:
            logger.warning("Unknown zodiac sign %r; defaulting to Aries", sign)
            sign_id = 1

        if date_str:
            try:
                # Convert YYYY-MM-DD to the format needed (YYYYMMDD)
                date_obj = datetime.strptime(date_str, '%Y-%m-%d')
            except (ValueError, TypeError) as e:
                logger.error("Error formatting date: %s", e)
            else:
                formatted_date = date_obj.strftime('%Y%m%d')
                return f"{self.base_url}?sign={sign_id}&laDate={formatted_date}"

        # No (usable) date: the undated URL is requested instead.
        return f"{self.base_url}?sign={sign_id}"

    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
        """Extract the horoscope prediction text from a horoscope.com page.

        Tries, in order: the paragraphs under ``.main-horoscope``, the
        ``#textline`` element, and finally the parent class implementation.
        A selector that matches but yields only whitespace is treated as a
        miss so the next strategy still gets a chance.

        Args:
            soup: Parsed page.
            text_content: Passed through to the parent implementation.

        Returns:
            The prediction text.
        """
        # Join the non-empty paragraphs of the main horoscope container.
        paragraphs = (p.get_text().strip() for p in soup.select('.main-horoscope p'))
        prediction = ' '.join(text for text in paragraphs if text)
        if prediction:
            return prediction

        # Alternative selector
        alt_div = soup.select('#textline')
        if alt_div:
            alt_text = alt_div[0].get_text().strip()
            if alt_text:
                return alt_text

        # Fallback to generic extraction
        return super()._extract_prediction(soup, text_content)

    @classmethod
    def _month_number(cls, token: str) -> Optional[int]:
        """Return the month number for a full or abbreviated English month name.

        Abbreviations must be at least three letters long (``"Dec"``,
        ``"Sept"``); anything else yields ``None`` rather than a guessed month.
        """
        token = token.lower()
        if len(token) < 3:
            return None
        for number, name in enumerate(cls._MONTH_NAMES, start=1):
            if name.startswith(token):
                return number
        return None

    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
        """Extract the horoscope date as ``YYYY-MM-DD``.

        Tries, in order: the ``laDate`` parameter of ``url``, a
        "Month day, year" date in any ``h1``/``h2`` under ``.main-horoscope``,
        and finally today's date.

        Args:
            soup: Parsed page.
            url: URL the page was requested from.

        Returns:
            The date in ISO ``YYYY-MM-DD`` form.
        """
        # Try to get date from URL first (in the laDate parameter)
        url_match = self._URL_DATE_RE.search(url)
        if url_match:
            try:
                parsed_date = datetime.strptime(url_match.group(1), '%Y%m%d')
                return parsed_date.strftime('%Y-%m-%d')
            except ValueError:
                # Eight digits that are not a real calendar date; keep looking.
                pass

        # Headings typically read like "Taurus Daily Horoscope for May 13, 2025".
        for heading in soup.select('.main-horoscope h1, .main-horoscope h2'):
            heading_text = heading.get_text().strip()
            for match in self._HEADING_DATE_RE.finditer(heading_text):
                month, day, year = match.groups()
                month_num = self._month_number(month)
                if month_num is None:
                    # The word before the digits is not a month; do not guess.
                    continue
                try:
                    parsed_date = datetime(int(year), month_num, int(day))
                except ValueError:
                    # Impossible date such as "February 31"; try the next match.
                    continue
                return parsed_date.strftime('%Y-%m-%d')

        # Default to today's date if no date found
        return date.today().isoformat()