Upload 27 files
Browse files- api/routes/__pycache__/hotel_routes.cpython-310.pyc +0 -0
- api/routes/__pycache__/system_routes.cpython-310.pyc +0 -0
- api/routes/hotel_routes.py +42 -0
- api/routes/system_routes.py +25 -0
- core/__pycache__/scraper.cpython-310.pyc +0 -0
- core/__pycache__/utils.cpython-310.pyc +0 -0
- core/scraper.py +46 -0
- core/utils.py +59 -0
- main.py +57 -0
- models/__pycache__/requests.cpython-310.pyc +0 -0
- models/__pycache__/responses.cpython-310.pyc +0 -0
- models/requests.py +9 -0
- models/responses.py +26 -0
- requirements.txt +9 -0
- services/__pycache__/booking_service.cpython-310.pyc +0 -0
- services/booking_service.py +324 -0
- services/utils/__init__.py +13 -0
- services/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- services/utils/__pycache__/google_search_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/html_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/http_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/image_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/selector_manager.cpython-310.pyc +0 -0
- services/utils/google_search_utils.py +78 -0
- services/utils/html_utils.py +51 -0
- services/utils/http_utils.py +20 -0
- services/utils/image_utils.py +22 -0
api/routes/__pycache__/hotel_routes.cpython-310.pyc
ADDED
|
Binary file (1.53 kB). View file
|
|
|
api/routes/__pycache__/system_routes.cpython-310.pyc
ADDED
|
Binary file (1.21 kB). View file
|
|
|
api/routes/hotel_routes.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException, Query
from models.requests import HotelSearchRequest
from models.responses import HotelSearchResponse, HotelResponse
from core.scraper import HotelScraper
import logging
from typing import Optional

router = APIRouter(tags=["hotels"], prefix="/api")
logger = logging.getLogger(__name__)


@router.post("/hotels", response_model=HotelSearchResponse)
async def search_hotels(
    request: HotelSearchRequest,
    token: Optional[str] = Query(None, description="API Access Token")
):
    """
    Search for multiple hotels with room images.

    Input format:
    ```json
    {
        "hotels": [
            {"hotel_name": "....", "destination": "...."},
            {"hotel_name": "....", "destination": "...."}
        ]
    }
    ```

    Requires authentication via token (either in query parameter or
    X-API-Token header). The `token` parameter here only surfaces the
    query parameter in the OpenAPI docs; actual verification happens in
    the router-level dependency wired up in main.py.
    """
    try:
        # One scraper instance per request; it fans the queries out concurrently.
        scraper = HotelScraper()
        found = await scraper.scrape_hotels(request.hotels)
        return {
            "results": found,
            "status": "success",
            "count": len(found),
        }
    except Exception as e:
        # Per-hotel failures are already folded into the result list, so this
        # only fires on unexpected, request-level errors.
        logger.error(f"Error processing hotel search request: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
api/routes/system_routes.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Depends
from datetime import datetime, timezone

# NOTE(review): circular import — main.py imports this module. It only works
# because main.py defines verify_token *before* importing the routers; moving
# verify_token into a dedicated auth module would be more robust.
from main import verify_token

router = APIRouter(tags=["system"])


@router.get("/")
async def root():
    """Root endpoint for uptime monitoring"""
    return {"status": "online", "service": "hotel-image-api"}


@router.get("/health")
async def health_check():
    """Health check endpoint for uptime monitoring"""
    # Timezone-aware UTC timestamp. datetime.utcnow() returns a *naive*
    # datetime and is deprecated since Python 3.12; datetime.now(timezone.utc)
    # yields an unambiguous ISO-8601 string with an explicit +00:00 offset.
    return {"status": "healthy", "timestamp": datetime.now(timezone.utc).isoformat()}


@router.get("/status")
async def status():
    """Status check without token verification"""
    return {"status": "running"}


@router.get("/token-test", dependencies=[Depends(verify_token)])
async def token_test():
    """Actually tests if token authentication is working"""
    return {"status": "success", "message": "Token authentication successful"}
|
core/__pycache__/scraper.cpython-310.pyc
ADDED
|
Binary file (1.71 kB). View file
|
|
|
core/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
core/scraper.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import aiohttp  # type: ignore
import logging
from typing import List, Dict, Any
from services.booking_service import BookingService
from models.requests import HotelQuery

logger = logging.getLogger(__name__)


class HotelScraper:
    """Coordinates concurrent scraping of many hotels via BookingService."""

    def __init__(self):
        self.booking_service = BookingService()

    async def scrape_hotels(self, hotel_queries: List[HotelQuery]) -> List[Dict[str, Any]]:
        """Scrape every query concurrently; failures become per-hotel error dicts."""
        logger.info(f"Starting to scrape {len(hotel_queries)} hotels")

        async with aiohttp.ClientSession() as session:
            # Fan out one search coroutine per query and run them together.
            outcomes = await asyncio.gather(
                *(
                    self.booking_service.search_hotel(
                        session=session,
                        destination=q.destination,
                        hotel_name=q.hotel_name,
                    )
                    for q in hotel_queries
                ),
                return_exceptions=True,
            )

            # Normalise: exceptions are converted to error records so one bad
            # hotel never sinks the whole batch.
            processed: List[Dict[str, Any]] = []
            for query, outcome in zip(hotel_queries, outcomes):
                if isinstance(outcome, Exception):
                    logger.error(f"Error scraping hotel {query.hotel_name}: {outcome}")
                    processed.append({
                        "destination": query.destination,
                        "hotel_name": query.hotel_name,
                        "error": f"Scraping failed: {str(outcome)}",
                    })
                else:
                    processed.append(outcome)

            return processed
|
core/utils.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import logging
|
| 3 |
+
import random
|
| 4 |
+
from typing import Optional, Dict, Any, List
|
| 5 |
+
from urllib.parse import urljoin, quote
|
| 6 |
+
|
| 7 |
+
logging.basicConfig(
|
| 8 |
+
level=logging.INFO,
|
| 9 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 10 |
+
)
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
def get_clean_text(element) -> str:
    """Return the whitespace-stripped text of an HTML element ("" for falsy input)."""
    return element.text.strip() if element else ""
|
| 18 |
+
|
| 19 |
+
def clean_url(base_url: str, href: str) -> str:
    """Resolve *href* against *base_url*; an empty/missing href yields ""."""
    if href:
        return urljoin(base_url, href)
    return ""
|
| 24 |
+
|
| 25 |
+
def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Extract the first decimal number from *text* (dot or comma separator).

    Returns *default* when *text* is empty, contains no number, or the
    matched token cannot be parsed as a float.
    """
    if not text:
        return default

    match = re.search(r'(\d+[\.,]?\d*)', text)
    if not match:
        return default

    try:
        # Normalise European decimal commas before parsing.
        return float(match.group(1).replace(',', '.'))
    except ValueError:
        return default
|
| 37 |
+
|
| 38 |
+
def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination, optionally scoped to a hotel."""
    if hotel_name:
        query = f"{hotel_name} {destination}"
    else:
        query = destination
    return "https://www.booking.com/search.html?ss=" + quote(query)
|
| 42 |
+
|
| 43 |
+
def is_valid_image_url(url: str) -> bool:
    """Heuristic filter: True only for absolute http(s) URLs that don't look like logos/icons."""
    # Empty values and inline data: URIs are never usable.
    if not url or url.startswith("data:"):
        return False

    # URLs hinting at branding assets or tiny thumbnails are rejected.
    for marker in ("icon", "logo", "badge", "thumb"):
        if marker in url:
            return False

    # Only fully-qualified http(s) URLs survive.
    return url.startswith(("http://", "https://"))
|
main.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Depends, HTTPException, status
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from fastapi.security import APIKeyHeader
|
| 4 |
+
import os
|
| 5 |
+
import uvicorn
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
API_TOKEN = os.getenv("API_ACCESS_TOKEN")
|
| 12 |
+
if not API_TOKEN:
|
| 13 |
+
print("WARNING: API_ACCESS_TOKEN not set. Generating a random token.")
|
| 14 |
+
import secrets
|
| 15 |
+
API_TOKEN = secrets.token_urlsafe(32)
|
| 16 |
+
print(f"Generated token: {API_TOKEN}")
|
| 17 |
+
|
| 18 |
+
api_key_header = APIKeyHeader(name="X-API-Token", auto_error=False)
|
| 19 |
+
api_key_query = "token"
|
| 20 |
+
|
| 21 |
+
async def verify_token(
    api_key_header: str = Depends(api_key_header),
    token: Optional[str] = None,
) -> bool:
    """FastAPI dependency: allow the request only with a valid API token.

    The token may arrive via the X-API-Token header or the ?token= query
    parameter (FastAPI binds the bare `token` parameter from the query
    string). Raises HTTP 401 when neither matches.
    """
    # Local import: the module-level `import secrets` only runs in the
    # missing-token fallback branch, so it may not be in scope here.
    import secrets

    # secrets.compare_digest performs a constant-time comparison, preventing
    # timing side-channels from leaking token prefixes (plain == short-circuits
    # at the first differing character).
    if api_key_header is not None and secrets.compare_digest(api_key_header, API_TOKEN):
        return True

    if token is not None and secrets.compare_digest(token, API_TOKEN):
        return True

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid or missing API token",
        headers={"WWW-Authenticate": "APIKey"},
    )
|
| 36 |
+
|
| 37 |
+
# FastAPI application instance; interactive docs served at /docs by default.
app = FastAPI(
    title="Hotel Image API",
    description="API for retrieving hotel images with Google Search fallback"
)

# Fully permissive CORS. NOTE(review): browsers reject allow_origins=["*"]
# combined with allow_credentials=True for credentialed requests — confirm
# whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Routers are imported *after* verify_token is defined: system_routes does
# `from main import verify_token`, so this late import is what keeps the
# circular import from failing. Do not move these to the top of the file.
from api.routes.hotel_routes import router as hotel_router
from api.routes.system_routes import router as system_router

# System endpoints (/, /health, /status) are public; every hotel endpoint is
# gated behind the token check.
app.include_router(system_router)
app.include_router(hotel_router, dependencies=[Depends(verify_token)])

if __name__ == "__main__":
    # 0.0.0.0:7860 — presumably targeting Hugging Face Spaces' default port.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
models/__pycache__/requests.cpython-310.pyc
ADDED
|
Binary file (699 Bytes). View file
|
|
|
models/__pycache__/responses.cpython-310.pyc
ADDED
|
Binary file (1.41 kB). View file
|
|
|
models/requests.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
from pydantic import BaseModel, Field


class HotelQuery(BaseModel):
    """A single hotel lookup: which hotel, and in which destination."""
    hotel_name: str
    destination: str


class HotelSearchRequest(BaseModel):
    """Request body for POST /api/hotels: one or more hotel queries."""
    # At least one query is required. Pydantic v2 (requirements.txt pins
    # pydantic==2.4.2) deprecates Field(min_items=...) in favour of
    # min_length, which applies to list lengths in v2.
    hotels: List[HotelQuery] = Field(..., min_length=1)
|
models/responses.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional
from pydantic import BaseModel, Field

# Response schemas for the hotel-search API.

class HotelImage(BaseModel):
    # URL of a single hotel/room photo.
    url: str

class HotelAmenity(BaseModel):
    # Human-readable facility name.
    name: str

class HotelData(BaseModel):
    # Scraped details for one matched hotel.
    name: str
    rating: Optional[float] = None  # guest review score; None when not found
    images: List[str] = []  # image URLs (pydantic copies the default per instance)
    amenities: List[str] = []  # facility names scraped from the detail page
    booking_link: Optional[str] = None  # Booking.com detail-page URL

class HotelResponse(BaseModel):
    # Per-query result: either `data` is populated or `error` explains the failure.
    destination: str
    hotel_name: str
    data: Optional[HotelData] = None
    error: Optional[str] = None

class HotelSearchResponse(BaseModel):
    # Envelope returned by POST /api/hotels.
    results: List[HotelResponse]
    status: str = "success"
    count: int  # number of entries in `results`
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
uvicorn==0.23.2
beautifulsoup4==4.12.2
requests==2.31.0
pydantic==2.4.2
# NOTE: removed "asyncio==3.4.3" — asyncio is part of the Python standard
# library; the ancient PyPI backport of that name can shadow and break the
# stdlib module on Python 3.
aiohttp==3.8.6
python-multipart==0.0.6
python-dotenv
|
services/__pycache__/booking_service.cpython-310.pyc
ADDED
|
Binary file (9.51 kB). View file
|
|
|
services/booking_service.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import random
|
| 3 |
+
import re
|
| 4 |
+
import aiohttp # type: ignore
|
| 5 |
+
from bs4 import BeautifulSoup # type: ignore
|
| 6 |
+
from typing import Dict, Any, List, Optional, Tuple
|
| 7 |
+
import logging
|
| 8 |
+
from urllib.parse import urljoin, quote
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from .utils.http_utils import fetch_page
|
| 13 |
+
from .utils.image_utils import filter_logo_images, is_logo_image
|
| 14 |
+
from .utils.html_utils import extract_rating_from_element, extract_images_from_soup
|
| 15 |
+
from .utils.google_search_utils import fetch_hotel_images_from_google
|
| 16 |
+
|
| 17 |
+
# Load environment variables
|
| 18 |
+
load_dotenv()
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class BookingService:
|
| 23 |
+
"""Service for scraping hotel data from Booking.com"""
|
| 24 |
+
|
| 25 |
+
def __init__(self):
|
| 26 |
+
# List of diverse user agents for rotation
|
| 27 |
+
self.user_agents = [
|
| 28 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
| 29 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
|
| 30 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
|
| 31 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
| 32 |
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
|
| 33 |
+
"Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",
|
| 34 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0 Safari/537.36",
|
| 35 |
+
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
# Base headers - User-Agent will be overridden in get_page
|
| 39 |
+
self.headers = {
|
| 40 |
+
"Accept-Language": "en-US,en;q=0.9",
|
| 41 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
| 42 |
+
"Connection": "keep-alive",
|
| 43 |
+
"Upgrade-Insecure-Requests": "1",
|
| 44 |
+
"sec-ch-ua": '"Google Chrome";v="123", "Not:A-Brand";v="99"',
|
| 45 |
+
"sec-ch-ua-mobile": "?0",
|
| 46 |
+
"sec-ch-ua-platform": '"Windows"',
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
# Check if Google Search API credentials are available
|
| 50 |
+
self.google_api_available = bool(os.getenv("GOOGLE_SEARCH_API_KEY") and os.getenv("GOOGLE_SEARCH_ENGINE_ID"))
|
| 51 |
+
|
| 52 |
+
logger.info(f"BookingService initialized at 2025-05-21 15:22:38 by Garvit-Nagok")
|
| 53 |
+
if self.google_api_available:
|
| 54 |
+
logger.info("Google Custom Search API configured as fallback for hotel images")
|
| 55 |
+
else:
|
| 56 |
+
logger.warning("Google Custom Search API credentials not found - fallback will not be available")
|
| 57 |
+
|
| 58 |
+
# [Keep all existing methods unchanged]
|
| 59 |
+
|
| 60 |
+
async def get_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch *url* through fetch_page, spoofing a freshly randomised User-Agent."""
    # Never mutate the shared base headers; build a per-request copy instead.
    request_headers = {**self.headers, "User-Agent": random.choice(self.user_agents)}

    logger.debug(f"Using user agent: {request_headers['User-Agent'][:30]}...")
    return await fetch_page(session, url, request_headers)
|
| 68 |
+
|
| 69 |
+
async def extract_amenities(self, session: aiohttp.ClientSession, hotel_element, hotel_url: Optional[str] = None) -> List[str]:
|
| 70 |
+
"""Extract popular facilities from hotel detail page"""
|
| 71 |
+
unique_amenities = set()
|
| 72 |
+
|
| 73 |
+
if hotel_url:
|
| 74 |
+
try:
|
| 75 |
+
html = await self.get_page(session, hotel_url)
|
| 76 |
+
if html:
|
| 77 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 78 |
+
|
| 79 |
+
popular_heading = soup.find(string=lambda text: text and text.strip() == "Most popular facilities")
|
| 80 |
+
|
| 81 |
+
if popular_heading:
|
| 82 |
+
current = popular_heading.parent
|
| 83 |
+
container = None
|
| 84 |
+
|
| 85 |
+
# Look for container with facility icons
|
| 86 |
+
for _ in range(3):
|
| 87 |
+
if not current:
|
| 88 |
+
break
|
| 89 |
+
|
| 90 |
+
if current.select("svg") or current.select("img"):
|
| 91 |
+
container = current
|
| 92 |
+
break
|
| 93 |
+
|
| 94 |
+
parent = current.parent
|
| 95 |
+
if parent and (parent.select("svg") or parent.select("img")):
|
| 96 |
+
container = parent
|
| 97 |
+
break
|
| 98 |
+
|
| 99 |
+
sibling = current.find_next_sibling()
|
| 100 |
+
if sibling and (sibling.select("svg") or sibling.select("img")):
|
| 101 |
+
container = sibling
|
| 102 |
+
break
|
| 103 |
+
|
| 104 |
+
current = parent
|
| 105 |
+
|
| 106 |
+
if not container:
|
| 107 |
+
heading_parent = popular_heading.parent
|
| 108 |
+
if heading_parent:
|
| 109 |
+
container = heading_parent.find_next_sibling()
|
| 110 |
+
|
| 111 |
+
# Extract facility items
|
| 112 |
+
if container:
|
| 113 |
+
facility_items = container.select("span") or container.select("div")
|
| 114 |
+
|
| 115 |
+
for item in facility_items:
|
| 116 |
+
text = item.get_text().strip()
|
| 117 |
+
if text and text != "Most popular facilities" and len(text) < 30:
|
| 118 |
+
unique_amenities.add(text)
|
| 119 |
+
|
| 120 |
+
# Fallback method
|
| 121 |
+
if not unique_amenities:
|
| 122 |
+
try:
|
| 123 |
+
rows = soup.select(".f6b6d2a959") or soup.select_one("div:-soup-contains('Most popular facilities')").parent.find_next_sibling().select("span")
|
| 124 |
+
|
| 125 |
+
for item in rows:
|
| 126 |
+
text = item.get_text().strip()
|
| 127 |
+
if text and text != "Most popular facilities" and len(text) < 30:
|
| 128 |
+
unique_amenities.add(text)
|
| 129 |
+
except AttributeError:
|
| 130 |
+
logger.debug("Could not find facilities using fallback selector")
|
| 131 |
+
except Exception as e:
|
| 132 |
+
logger.error(f"Error extracting amenities: {e}")
|
| 133 |
+
|
| 134 |
+
return list(unique_amenities)
|
| 135 |
+
|
| 136 |
+
async def get_room_images_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> List[str]:
|
| 137 |
+
"""Get a mix of property and room images from hotel detail page"""
|
| 138 |
+
all_images = []
|
| 139 |
+
|
| 140 |
+
try:
|
| 141 |
+
html = await self.get_page(session, url)
|
| 142 |
+
if html:
|
| 143 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 144 |
+
|
| 145 |
+
selectors = [
|
| 146 |
+
".bui-carousel__item img", ".bh-photo-grid img",
|
| 147 |
+
".hp-gallery img", ".hotel-photos img",
|
| 148 |
+
".room-gallery img", ".hotel-room-photographs-slides img",
|
| 149 |
+
"img.active-image", ".gallery-mosaic img", ".tour-360__image img",
|
| 150 |
+
"img[width='300'], img[width='350'], img[width='400'], img[width='500']",
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
all_images = extract_images_from_soup(soup, url, selectors)
|
| 154 |
+
|
| 155 |
+
if len(all_images) < 5:
|
| 156 |
+
for img in soup.select("img"):
|
| 157 |
+
width = img.get("width")
|
| 158 |
+
if width and int(width) < 100:
|
| 159 |
+
continue
|
| 160 |
+
|
| 161 |
+
src = img.get("src") or img.get("data-src")
|
| 162 |
+
if src and not is_logo_image(src) and src not in all_images:
|
| 163 |
+
if not src.startswith("http"):
|
| 164 |
+
src = urljoin(url, src)
|
| 165 |
+
all_images.append(src)
|
| 166 |
+
if len(all_images) >= 5:
|
| 167 |
+
break
|
| 168 |
+
|
| 169 |
+
return filter_logo_images(all_images)[:5]
|
| 170 |
+
|
| 171 |
+
except Exception as e:
|
| 172 |
+
logger.error(f"Error getting hotel images: {e}", exc_info=True)
|
| 173 |
+
|
| 174 |
+
return all_images[:5] if all_images else []
|
| 175 |
+
|
| 176 |
+
async def extract_rating_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> Optional[float]:
|
| 177 |
+
"""Extract rating from hotel detail page"""
|
| 178 |
+
try:
|
| 179 |
+
html = await self.get_page(session, url)
|
| 180 |
+
if not html:
|
| 181 |
+
return None
|
| 182 |
+
|
| 183 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 184 |
+
|
| 185 |
+
guest_reviews_section = soup.find("h2", string="Guest reviews")
|
| 186 |
+
if guest_reviews_section:
|
| 187 |
+
rating_div = soup.select_one("div[aria-label*='Scored'] strong") or soup.select_one(".b5cd09854e")
|
| 188 |
+
if rating_div:
|
| 189 |
+
text = rating_div.get_text().strip()
|
| 190 |
+
match = re.search(r"(\d+[.,]\d+)", text)
|
| 191 |
+
if match:
|
| 192 |
+
return float(match.group(1).replace(',', '.'))
|
| 193 |
+
|
| 194 |
+
nearby_elements = guest_reviews_section.parent.select("div")
|
| 195 |
+
for elem in nearby_elements:
|
| 196 |
+
text = elem.get_text().strip()
|
| 197 |
+
if re.match(r"^\d+[.,]\d+$", text):
|
| 198 |
+
return float(text.replace(',', '.'))
|
| 199 |
+
|
| 200 |
+
score_elements = soup.select(".review-score-badge, .b5cd09854e")
|
| 201 |
+
for elem in score_elements:
|
| 202 |
+
text = elem.get_text().strip()
|
| 203 |
+
match = re.search(r"(\d+[.,]\d+)", text)
|
| 204 |
+
if match:
|
| 205 |
+
return float(match.group(1).replace(',', '.'))
|
| 206 |
+
|
| 207 |
+
review_text = soup.find(string=lambda text: text and ("Review score" in text))
|
| 208 |
+
if review_text:
|
| 209 |
+
parent_text = review_text.parent.get_text() if review_text.parent else ""
|
| 210 |
+
match = re.search(r"(\d+[.,]\d+)", parent_text)
|
| 211 |
+
if match:
|
| 212 |
+
return float(match.group(1).replace(',', '.'))
|
| 213 |
+
|
| 214 |
+
except Exception as e:
|
| 215 |
+
logger.error(f"Error extracting rating: {e}")
|
| 216 |
+
|
| 217 |
+
return None
|
| 218 |
+
|
| 219 |
+
def extract_rating(self, hotel_element) -> Optional[float]:
|
| 220 |
+
"""Extract rating from hotel element"""
|
| 221 |
+
return extract_rating_from_element(hotel_element)
|
| 222 |
+
|
| 223 |
+
def is_name_similar(self, name1: str, name2: str) -> bool:
    """Decide whether two hotel names plausibly refer to the same property.

    A match is either direct (case-insensitive) containment, or at least a
    50% word overlap relative to the shorter name's word count.
    """
    if not name1 or not name2:
        return False

    a, b = name1.lower(), name2.lower()

    # Containment either way is an immediate match.
    if a in b or b in a:
        return True

    tokens_a = set(re.findall(r'\w+', a))
    tokens_b = set(re.findall(r'\w+', b))
    if not tokens_a or not tokens_b:
        return False

    # Ratio of shared words to the smaller token set.
    shared = tokens_a & tokens_b
    return len(shared) / min(len(tokens_a), len(tokens_b)) >= 0.5  # 50% word overlap
|
| 246 |
+
|
| 247 |
+
async def search_hotel(self, session: aiohttp.ClientSession, destination: str, hotel_name: str) -> Dict[str, Any]:
|
| 248 |
+
"""Search for a specific hotel on Booking.com"""
|
| 249 |
+
search_query = f"{hotel_name} {destination}"
|
| 250 |
+
search_url = f"https://www.booking.com/search.html?ss={quote(search_query)}"
|
| 251 |
+
|
| 252 |
+
html = await self.get_page(session, search_url)
|
| 253 |
+
|
| 254 |
+
if not html:
|
| 255 |
+
return {
|
| 256 |
+
"destination": destination,
|
| 257 |
+
"hotel_name": hotel_name,
|
| 258 |
+
"error": "Failed to retrieve search results"
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 262 |
+
hotel_cards = soup.select("[data-testid='property-card'], .sr_property_block, .sr_item")
|
| 263 |
+
|
| 264 |
+
if not hotel_cards:
|
| 265 |
+
return {
|
| 266 |
+
"destination": destination,
|
| 267 |
+
"hotel_name": hotel_name,
|
| 268 |
+
"error": "No hotels found"
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
# Find matching hotel card
|
| 272 |
+
hotel_card = None
|
| 273 |
+
for card in hotel_cards:
|
| 274 |
+
name_elem = card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
|
| 275 |
+
if name_elem:
|
| 276 |
+
card_hotel_name = name_elem.text.strip()
|
| 277 |
+
if self.is_name_similar(card_hotel_name, hotel_name):
|
| 278 |
+
hotel_card = card
|
| 279 |
+
break
|
| 280 |
+
|
| 281 |
+
if not hotel_card:
|
| 282 |
+
hotel_card = hotel_cards[0]
|
| 283 |
+
|
| 284 |
+
name_elem = hotel_card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
|
| 285 |
+
name = name_elem.text.strip() if name_elem else hotel_name
|
| 286 |
+
rating = self.extract_rating(hotel_card)
|
| 287 |
+
|
| 288 |
+
link_elem = hotel_card.select_one("a[href*='hotel'], a.hotel_name_link")
|
| 289 |
+
hotel_url = ""
|
| 290 |
+
if link_elem and 'href' in link_elem.attrs:
|
| 291 |
+
href = link_elem['href']
|
| 292 |
+
hotel_url = urljoin("https://www.booking.com", href) if not href.startswith('http') else href
|
| 293 |
+
|
| 294 |
+
if hotel_url:
|
| 295 |
+
tasks = [
|
| 296 |
+
self.extract_rating_from_detail_page(session, hotel_url),
|
| 297 |
+
self.get_room_images_from_detail_page(session, hotel_url),
|
| 298 |
+
self.extract_amenities(session, hotel_card, hotel_url)
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
detail_rating, images, amenities = await asyncio.gather(*tasks)
|
| 302 |
+
|
| 303 |
+
if detail_rating is not None:
|
| 304 |
+
rating = detail_rating
|
| 305 |
+
else:
|
| 306 |
+
images = []
|
| 307 |
+
amenities = []
|
| 308 |
+
|
| 309 |
+
# If scraping didn't return any images, use Google Custom Search API as fallback
|
| 310 |
+
if not images and self.google_api_available:
|
| 311 |
+
logger.info(f"No images found via scraping for {hotel_name} in {destination}. Using Google API as fallback.")
|
| 312 |
+
images = await fetch_hotel_images_from_google(session, hotel_name, destination)
|
| 313 |
+
|
| 314 |
+
return {
|
| 315 |
+
"destination": destination,
|
| 316 |
+
"hotel_name": hotel_name,
|
| 317 |
+
"data": {
|
| 318 |
+
"name": name,
|
| 319 |
+
"rating": rating,
|
| 320 |
+
"images": images,
|
| 321 |
+
"amenities": amenities,
|
| 322 |
+
"booking_link": hotel_url
|
| 323 |
+
}
|
| 324 |
+
}
|
services/utils/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .html_utils import extract_rating_from_element, extract_images_from_soup
|
| 2 |
+
from .http_utils import fetch_page
|
| 3 |
+
from .image_utils import filter_logo_images, is_logo_image
|
| 4 |
+
from .google_search_utils import fetch_hotel_images_from_google
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'extract_rating_from_element',
|
| 8 |
+
'extract_images_from_soup',
|
| 9 |
+
'fetch_page',
|
| 10 |
+
'filter_logo_images',
|
| 11 |
+
'is_logo_image',
|
| 12 |
+
'fetch_hotel_images_from_google'
|
| 13 |
+
]
|
services/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (505 Bytes). View file
|
|
|
services/utils/__pycache__/google_search_utils.cpython-310.pyc
ADDED
|
Binary file (2.06 kB). View file
|
|
|
services/utils/__pycache__/html_utils.cpython-310.pyc
ADDED
|
Binary file (1.76 kB). View file
|
|
|
services/utils/__pycache__/http_utils.cpython-310.pyc
ADDED
|
Binary file (989 Bytes). View file
|
|
|
services/utils/__pycache__/image_utils.cpython-310.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
services/utils/__pycache__/selector_manager.cpython-310.pyc
ADDED
|
Binary file (5.15 kB). View file
|
|
|
services/utils/google_search_utils.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import aiohttp
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load environment variables
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
# Get API credentials from environment variables
|
| 11 |
+
GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY")
|
| 12 |
+
GOOGLE_SEARCH_ENGINE_ID = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
async def fetch_hotel_images_from_google(
    session: aiohttp.ClientSession,
    hotel_name: str,
    destination: str,
    max_results: int = 5
) -> List[str]:
    """
    Fetch hotel images using Google Custom Search API as a fallback
    when scraping fails to return any images.

    Args:
        session: aiohttp client session
        hotel_name: Name of the hotel to search for
        destination: Location/destination of the hotel
        max_results: Maximum number of images to return (default: 5)

    Returns:
        List of image URLs (empty on missing credentials or any API error)
    """
    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_SEARCH_ENGINE_ID:
        logger.error("Google Search API credentials not configured")
        return []

    # Construct the search query
    search_query = f"{hotel_name} {destination} hotel"

    # API endpoint
    url = "https://www.googleapis.com/customsearch/v1"

    # Parameters for the API request
    params = {
        'q': search_query,
        'cx': GOOGLE_SEARCH_ENGINE_ID,
        'key': GOOGLE_SEARCH_API_KEY,
        'searchType': 'image',
        # The Custom Search JSON API rejects `num` outside 1..10 with HTTP 400,
        # which previously made any request for >10 images silently return [].
        # Clamp the caller's request into the supported range instead.
        'num': min(max(max_results, 1), 10),
        'imgSize': 'large',   # Prefer large images
        'imgType': 'photo',   # Only return photos, not illustrations
        'safe': 'active'      # Safe search
    }

    try:
        async with session.get(url, params=params) as response:
            if response.status == 200:
                data = await response.json()

                # Extract image URLs from the response; 'items' is absent
                # when the search produced no results.
                image_urls = [
                    item['link']
                    for item in data.get('items', [])
                    if 'link' in item
                ]

                logger.info(f"Google API returned {len(image_urls)} images for {hotel_name} in {destination}")
                return image_urls
            else:
                error_data = await response.text()
                logger.error(f"Google API error: {response.status} - {error_data}")
                return []
    except Exception as e:
        # Network failures / bad JSON must not break the scraping pipeline;
        # an empty list signals "no fallback images available".
        logger.error(f"Error fetching hotel images from Google: {e}")
        return []
|
services/utils/html_utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup # type: ignore
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
from urllib.parse import urljoin
|
| 4 |
+
import re
|
| 5 |
+
import logging
|
| 6 |
+
from .image_utils import is_logo_image
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
def extract_rating_from_element(element) -> Optional[float]:
    """Pull a numeric review score out of a hotel-card element.

    Tries the dedicated score badge first, then falls back to scanning
    the text of the review container. Returns the score rounded to one
    decimal place, or None when no score can be found.
    """
    try:
        badge = element.select_one(".bui-review-score__badge") or element.select_one("[data-testid='review-score']")

        if badge:
            found = re.search(r"(\d+[.,]?\d*)", badge.text.strip())
            if found:
                # Normalise the decimal comma used by some locales.
                return round(float(found.group(1).replace(',', '.')), 1)

        # Look for review text near ratings
        container = element.select_one(".bui-review-score, .d10a6220b4")
        if container:
            found = re.search(r"(\d+[.,]\d+)", container.get_text())
            if found:
                return round(float(found.group(1).replace(',', '.')), 1)
    except Exception as e:
        logger.error(f"Error extracting rating: {e}")

    return None
|
| 34 |
+
|
| 35 |
+
def extract_images_from_soup(soup: BeautifulSoup, url: str, selectors: List[str], max_images: int = 5) -> List[str]:
    """Collect up to *max_images* distinct, non-logo image URLs from the page.

    Each CSS selector is tried in order; lazy-loaded sources
    (data-src / data-lazy-src) are honoured and relative URLs are
    resolved against *url*.
    """
    collected: List[str] = []

    for css in selectors:
        for tag in soup.select(css):
            candidate = tag.get("src") or tag.get("data-src") or tag.get("data-lazy-src")
            if not candidate or is_logo_image(candidate):
                continue

            # Resolve relative paths against the page URL.
            absolute = candidate if candidate.startswith("http") else urljoin(url, candidate)

            if absolute in collected:
                continue
            collected.append(absolute)
            if len(collected) >= max_images:
                return collected

    return collected
|
services/utils/http_utils.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import aiohttp # type: ignore
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch *url* and return the response body as text, or None on failure.

    Args:
        session: Shared aiohttp client session.
        url: Absolute URL to request.
        headers: HTTP headers to send with the request.

    Returns:
        The decoded body for a 200 response; None for any other status
        or on a network/timeout error (failures are logged, not raised).
    """
    try:
        logger.info(f"Requesting URL: {url}")
        # Passing a bare number as `timeout=` is the deprecated aiohttp form;
        # ClientTimeout(total=15) is the supported equivalent.
        async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as response:
            if response.status == 200:
                logger.debug(f"Successfully retrieved content from {url}")
                return await response.text()
            else:
                logger.error(f"Error retrieving URL {url}: Status code {response.status}")
                return None
    except Exception as e:
        logger.error(f"Request failed for {url}: {e}")
        return None
|
services/utils/image_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
def filter_logo_images(images: List[str]) -> List[str]:
    """Return only the image URLs that do not look like logos/icons."""
    return [candidate for candidate in images if not is_logo_image(candidate)]
|
| 10 |
+
|
| 11 |
+
def is_logo_image(url: str) -> bool:
    """Heuristically decide whether *url* points to a logo/icon rather than a photo.

    An empty/missing URL counts as a logo (i.e. it is filtered out).
    Matching is a case-insensitive substring test against known
    logo/branding URL fragments.
    """
    if not url:
        return True

    hints = (
        "logo", "icon", "brand", "marker", "thumb", "tiny",
        "avatar", "badge", "symbol", "sign", "favicon",
        "design-assets", "googleusercontent", "images-flags",
    )

    lowered = url.lower()
    for hint in hints:
        if hint in lowered:
            return True
    return False
|