Spaces:

pareshmishra
/

MT564AITraining

Running

MT564AITraining / scrapers /horoscope_com_scraper.py

pareshmishra

Add full project source files for MT564 AI

2c72e40 27 days ago

4.03 kB

	import logging
	import re
	from typing import Dict, Any, Optional
	from bs4 import BeautifulSoup
	from datetime import datetime, date
	from .horoscope_scraper import HoroscopeScraper

	logger = logging.getLogger(__name__)

	class HoroscopeComScraper(HoroscopeScraper):
	    """Scraper for Horoscope.com daily horoscopes.

	    Overrides the URL-formatting, prediction-extraction and date-extraction
	    hooks of ``HoroscopeScraper`` with horoscope.com specific logic.
	    """

	    # Numeric ids horoscope.com uses for each zodiac sign in the ``sign``
	    # query parameter. Built once instead of on every ``_format_url`` call.
	    _SIGN_IDS: Dict[str, int] = {
	        "aries": 1, "taurus": 2, "gemini": 3, "cancer": 4,
	        "leo": 5, "virgo": 6, "libra": 7, "scorpio": 8,
	        "sagittarius": 9, "capricorn": 10, "aquarius": 11, "pisces": 12,
	    }

	    # Sign id used when the requested sign is not recognised (Aries).
	    _DEFAULT_SIGN_ID = 1

	    # Full English month names (lower-case) mapped to their month number.
	    _MONTH_NUMBERS: Dict[str, int] = {
	        'january': 1, 'february': 2, 'march': 3, 'april': 4,
	        'may': 5, 'june': 6, 'july': 7, 'august': 8,
	        'september': 9, 'october': 10, 'november': 11, 'december': 12,
	    }

	    # ``laDate=YYYYMMDD`` query parameter as produced by ``_format_url``.
	    _URL_DATE_RE = re.compile(r'laDate=(\d{8})')

	    # "<word> <day>[,] <year>", e.g. "May 13, 2025".
	    _HEADING_DATE_RE = re.compile(r'(\w+)\s+(\d{1,2}),?\s+(\d{4})')

	    def __init__(self, timeout: int = 30):
	        """Initialise the scraper.

	        Args:
	            timeout: Forwarded unchanged to ``HoroscopeScraper.__init__``.
	        """
	        super().__init__(timeout)
	        self.source_name = "Horoscope.com"
	        self.base_url = "https://www.horoscope.com/us/horoscopes/general/horoscope-general-daily-today.aspx"

	    def _format_url(self, base_url: str, sign: str, date_str: Optional[str] = None) -> str:
	        """Format URL for horoscope.com.

	        Args:
	            base_url: Accepted for interface compatibility.
	                NOTE(review): this argument is not used; the URL is always
	                built from ``self.base_url``.
	            sign: Zodiac sign name, matched case-insensitively. An
	                unrecognised sign falls back to Aries and logs a warning.
	            date_str: Optional date in ``YYYY-MM-DD`` form. When missing or
	                unparsable, the URL is built without a ``laDate`` parameter.

	        Returns:
	            The page URL with a ``sign`` query parameter and, when a valid
	            date was given, a ``laDate=YYYYMMDD`` parameter.
	        """
	        sign_id = self._SIGN_IDS.get(sign.strip().lower())
	        if sign_id is None:
	            # Keep the historical Aries default, but make the fallback visible.
	            logger.warning("Unknown zodiac sign %r; defaulting to Aries", sign)
	            sign_id = self._DEFAULT_SIGN_ID

	        if date_str:
	            try:
	                # Convert YYYY-MM-DD to the format needed (YYYYMMDD)
	                formatted_date = datetime.strptime(date_str, '%Y-%m-%d').strftime('%Y%m%d')
	            except (TypeError, ValueError) as e:
	                # Bad input is logged and then handled like "no date given".
	                logger.error("Error formatting date: %s", e)
	            else:
	                return f"{self.base_url}?sign={sign_id}&laDate={formatted_date}"

	        # No (usable) date: request the page without a laDate parameter
	        return f"{self.base_url}?sign={sign_id}"

	    def _extract_prediction(self, soup: BeautifulSoup, text_content: str) -> str:
	        """Extract horoscope prediction from horoscope.com.

	        Tries, in order: the paragraphs under ``.main-horoscope``, the
	        ``#textline`` element, and finally the parent class implementation.
	        A selector that matches but yields only whitespace is treated as a
	        miss so that the next strategy gets a chance.

	        Args:
	            soup: Parsed page.
	            text_content: Passed through to the parent implementation.

	        Returns:
	            The prediction text.
	        """
	        # Horoscope.com typically has the horoscope in a div with class 'main-horoscope'
	        paragraph_texts = (p.get_text().strip() for p in soup.select('.main-horoscope p'))
	        prediction = ' '.join(text for text in paragraph_texts if text)
	        if prediction:
	            return prediction

	        # Alternative selector
	        alt_nodes = soup.select('#textline')
	        if alt_nodes:
	            alt_text = alt_nodes[0].get_text().strip()
	            if alt_text:
	                return alt_text

	        # Fallback to generic extraction
	        return super()._extract_prediction(soup, text_content)

	    def _extract_date(self, soup: BeautifulSoup, url: str) -> str:
	        """Extract horoscope date from horoscope.com.

	        Tries, in order: the ``laDate=YYYYMMDD`` parameter of ``url``, then a
	        "Month D, YYYY" date in the ``h1``/``h2`` headings under
	        ``.main-horoscope``, and finally today's date.

	        Args:
	            soup: Parsed page.
	            url: URL the page was fetched from.

	        Returns:
	            The date as an ISO ``YYYY-MM-DD`` string.
	        """
	        # Try to get date from URL first (in the laDate parameter)
	        url_match = self._URL_DATE_RE.search(url)
	        if url_match:
	            try:
	                return datetime.strptime(url_match.group(1), '%Y%m%d').strftime('%Y-%m-%d')
	            except ValueError:
	                # Eight digits that are not a real calendar date; try the page.
	                pass

	        # Look for date in typical location
	        # (format typically like "Taurus Daily Horoscope for May 13, 2025")
	        for heading in soup.select('.main-horoscope h1, .main-horoscope h2'):
	            heading_text = heading.get_text().strip()
	            for match in self._HEADING_DATE_RE.finditer(heading_text):
	                month_name, day, year = match.groups()
	                month_num = self._MONTH_NUMBERS.get(month_name.lower())
	                if month_num is None:
	                    # The word before the day is not a month name; do not
	                    # guess a month, keep looking instead.
	                    continue
	                try:
	                    return datetime(int(year), month_num, int(day)).strftime('%Y-%m-%d')
	                except ValueError:
	                    # Impossible date such as "February 30"; keep looking.
	                    continue

	        # Default to today's date if no date found
	        return date.today().isoformat()