Upload 27 files
Browse files- api/routes/__pycache__/hotel_routes.cpython-310.pyc +0 -0
- api/routes/__pycache__/system_routes.cpython-310.pyc +0 -0
- api/routes/hotel_routes.py +42 -0
- api/routes/system_routes.py +25 -0
- core/__pycache__/scraper.cpython-310.pyc +0 -0
- core/__pycache__/utils.cpython-310.pyc +0 -0
- core/scraper.py +46 -0
- core/utils.py +59 -0
- main.py +57 -0
- models/__pycache__/requests.cpython-310.pyc +0 -0
- models/__pycache__/responses.cpython-310.pyc +0 -0
- models/requests.py +9 -0
- models/responses.py +26 -0
- requirements.txt +9 -0
- services/__pycache__/booking_service.cpython-310.pyc +0 -0
- services/booking_service.py +324 -0
- services/utils/__init__.py +13 -0
- services/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- services/utils/__pycache__/google_search_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/html_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/http_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/image_utils.cpython-310.pyc +0 -0
- services/utils/__pycache__/selector_manager.cpython-310.pyc +0 -0
- services/utils/google_search_utils.py +78 -0
- services/utils/html_utils.py +51 -0
- services/utils/http_utils.py +20 -0
- services/utils/image_utils.py +22 -0
api/routes/__pycache__/hotel_routes.cpython-310.pyc
ADDED
|
Binary file (1.53 kB). View file
|
|
|
api/routes/__pycache__/system_routes.cpython-310.pyc
ADDED
|
Binary file (1.21 kB). View file
|
|
|
api/routes/hotel_routes.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException, Query
from models.requests import HotelSearchRequest
from models.responses import HotelSearchResponse, HotelResponse
from core.scraper import HotelScraper
import logging
from typing import Optional

router = APIRouter(tags=["hotels"], prefix="/api")
logger = logging.getLogger(__name__)


@router.post("/hotels", response_model=HotelSearchResponse)
async def search_hotels(
    request: HotelSearchRequest,
    token: Optional[str] = Query(None, description="API Access Token")
):
    """
    Search for multiple hotels with room images.

    Input format:
    ```json
    {
        "hotels": [
            {"hotel_name": "....", "destination": "...."},
            {"hotel_name": "....", "destination": "...."}
        ]
    }
    ```

    Requires authentication via token (either in query parameter or
    X-API-Token header). The `token` parameter here only surfaces the
    query parameter in the OpenAPI docs; actual verification happens in
    the router-level dependency wired up in main.py.
    """
    try:
        # One scraper instance per request; it fans the queries out concurrently.
        scraper = HotelScraper()
        found = await scraper.scrape_hotels(request.hotels)
        return {
            "results": found,
            "status": "success",
            "count": len(found),
        }
    except Exception as e:
        # Per-hotel failures are already folded into the result list, so this
        # only fires on unexpected, request-level errors.
        logger.error(f"Error processing hotel search request: {e}")
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
api/routes/system_routes.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Depends
from datetime import datetime, timezone

# NOTE(review): circular import — main.py imports this module. It only works
# because main.py defines verify_token *before* importing the routers; moving
# verify_token into a dedicated auth module would be more robust.
from main import verify_token

router = APIRouter(tags=["system"])


@router.get("/")
async def root():
    """Root endpoint for uptime monitoring"""
    return {"status": "online", "service": "hotel-image-api"}


@router.get("/health")
async def health_check():
    """Health check endpoint for uptime monitoring"""
    # Timezone-aware UTC timestamp. datetime.utcnow() returns a *naive*
    # datetime and is deprecated since Python 3.12; datetime.now(timezone.utc)
    # yields an unambiguous ISO-8601 string with an explicit +00:00 offset.
    return {"status": "healthy", "timestamp": datetime.now(timezone.utc).isoformat()}


@router.get("/status")
async def status():
    """Status check without token verification"""
    return {"status": "running"}


@router.get("/token-test", dependencies=[Depends(verify_token)])
async def token_test():
    """Actually tests if token authentication is working"""
    return {"status": "success", "message": "Token authentication successful"}
|
core/__pycache__/scraper.cpython-310.pyc
ADDED
|
Binary file (1.71 kB). View file
|
|
|
core/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (2.08 kB). View file
|
|
|
core/scraper.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
import aiohttp  # type: ignore
import logging
from typing import List, Dict, Any
from services.booking_service import BookingService
from models.requests import HotelQuery

logger = logging.getLogger(__name__)


class HotelScraper:
    """Coordinates concurrent scraping of many hotels via BookingService."""

    def __init__(self):
        self.booking_service = BookingService()

    async def scrape_hotels(self, hotel_queries: List[HotelQuery]) -> List[Dict[str, Any]]:
        """Scrape every query concurrently; failures become per-hotel error dicts."""
        logger.info(f"Starting to scrape {len(hotel_queries)} hotels")

        async with aiohttp.ClientSession() as session:
            # Fan out one search coroutine per query and run them together.
            outcomes = await asyncio.gather(
                *(
                    self.booking_service.search_hotel(
                        session=session,
                        destination=q.destination,
                        hotel_name=q.hotel_name,
                    )
                    for q in hotel_queries
                ),
                return_exceptions=True,
            )

            # Normalise: exceptions are converted to error records so one bad
            # hotel never sinks the whole batch.
            processed: List[Dict[str, Any]] = []
            for query, outcome in zip(hotel_queries, outcomes):
                if isinstance(outcome, Exception):
                    logger.error(f"Error scraping hotel {query.hotel_name}: {outcome}")
                    processed.append({
                        "destination": query.destination,
                        "hotel_name": query.hotel_name,
                        "error": f"Scraping failed: {str(outcome)}",
                    })
                else:
                    processed.append(outcome)

            return processed
|
core/utils.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import logging
|
| 3 |
+
import random
|
| 4 |
+
from typing import Optional, Dict, Any, List
|
| 5 |
+
from urllib.parse import urljoin, quote
|
| 6 |
+
|
| 7 |
+
logging.basicConfig(
|
| 8 |
+
level=logging.INFO,
|
| 9 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 10 |
+
)
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
def get_clean_text(element) -> str:
    """Return the whitespace-stripped text of an HTML element ("" for falsy input)."""
    return element.text.strip() if element else ""
|
| 18 |
+
|
| 19 |
+
def clean_url(base_url: str, href: str) -> str:
    """Resolve *href* against *base_url*; an empty/missing href yields ""."""
    if href:
        return urljoin(base_url, href)
    return ""
|
| 24 |
+
|
| 25 |
+
def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Extract the first decimal number from *text* (dot or comma separator).

    Returns *default* when *text* is empty, contains no number, or the
    matched token cannot be parsed as a float.
    """
    if not text:
        return default

    match = re.search(r'(\d+[\.,]?\d*)', text)
    if not match:
        return default

    try:
        # Normalise European decimal commas before parsing.
        return float(match.group(1).replace(',', '.'))
    except ValueError:
        return default
|
| 37 |
+
|
| 38 |
+
def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination, optionally scoped to a hotel."""
    if hotel_name:
        query = f"{hotel_name} {destination}"
    else:
        query = destination
    return "https://www.booking.com/search.html?ss=" + quote(query)
|
| 42 |
+
|
| 43 |
+
def is_valid_image_url(url: str) -> bool:
    """Heuristic filter: True only for absolute http(s) URLs that don't look like logos/icons."""
    # Empty values and inline data: URIs are never usable.
    if not url or url.startswith("data:"):
        return False

    # URLs hinting at branding assets or tiny thumbnails are rejected.
    for marker in ("icon", "logo", "badge", "thumb"):
        if marker in url:
            return False

    # Only fully-qualified http(s) URLs survive.
    return url.startswith(("http://", "https://"))
|
main.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, Depends, HTTPException, status
|
| 2 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
+
from fastapi.security import APIKeyHeader
|
| 4 |
+
import os
|
| 5 |
+
import uvicorn
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
API_TOKEN = os.getenv("API_ACCESS_TOKEN")
|
| 12 |
+
if not API_TOKEN:
|
| 13 |
+
print("WARNING: API_ACCESS_TOKEN not set. Generating a random token.")
|
| 14 |
+
import secrets
|
| 15 |
+
API_TOKEN = secrets.token_urlsafe(32)
|
| 16 |
+
print(f"Generated token: {API_TOKEN}")
|
| 17 |
+
|
| 18 |
+
api_key_header = APIKeyHeader(name="X-API-Token", auto_error=False)
|
| 19 |
+
api_key_query = "token"
|
| 20 |
+
|
| 21 |
+
async def verify_token(
    api_key_header: str = Depends(api_key_header),
    token: Optional[str] = None,
) -> bool:
    """FastAPI dependency: allow the request only with a valid API token.

    The token may arrive via the X-API-Token header or the ?token= query
    parameter (FastAPI binds the bare `token` parameter from the query
    string). Raises HTTP 401 when neither matches.
    """
    # Local import: the module-level `import secrets` only runs in the
    # missing-token fallback branch, so it may not be in scope here.
    import secrets

    # secrets.compare_digest performs a constant-time comparison, preventing
    # timing side-channels from leaking token prefixes (plain == short-circuits
    # at the first differing character).
    if api_key_header is not None and secrets.compare_digest(api_key_header, API_TOKEN):
        return True

    if token is not None and secrets.compare_digest(token, API_TOKEN):
        return True

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid or missing API token",
        headers={"WWW-Authenticate": "APIKey"},
    )
|
| 36 |
+
|
| 37 |
+
# FastAPI application instance; interactive docs served at /docs by default.
app = FastAPI(
    title="Hotel Image API",
    description="API for retrieving hotel images with Google Search fallback"
)

# Fully permissive CORS. NOTE(review): browsers reject allow_origins=["*"]
# combined with allow_credentials=True for credentialed requests — confirm
# whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Routers are imported *after* verify_token is defined: system_routes does
# `from main import verify_token`, so this late import is what keeps the
# circular import from failing. Do not move these to the top of the file.
from api.routes.hotel_routes import router as hotel_router
from api.routes.system_routes import router as system_router

# System endpoints (/, /health, /status) are public; every hotel endpoint is
# gated behind the token check.
app.include_router(system_router)
app.include_router(hotel_router, dependencies=[Depends(verify_token)])

if __name__ == "__main__":
    # 0.0.0.0:7860 — presumably targeting Hugging Face Spaces' default port.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
models/__pycache__/requests.cpython-310.pyc
ADDED
|
Binary file (699 Bytes). View file
|
|
|
models/__pycache__/responses.cpython-310.pyc
ADDED
|
Binary file (1.41 kB). View file
|
|
|
models/requests.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
from pydantic import BaseModel, Field


class HotelQuery(BaseModel):
    """A single hotel lookup: which hotel, and in which destination."""
    hotel_name: str
    destination: str


class HotelSearchRequest(BaseModel):
    """Request body for POST /api/hotels: one or more hotel queries."""
    # At least one query is required. Pydantic v2 (requirements.txt pins
    # pydantic==2.4.2) deprecates Field(min_items=...) in favour of
    # min_length, which applies to list lengths in v2.
    hotels: List[HotelQuery] = Field(..., min_length=1)
|
models/responses.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional
from pydantic import BaseModel, Field

# Response schemas for the hotel-search API.

class HotelImage(BaseModel):
    # URL of a single hotel/room photo.
    url: str

class HotelAmenity(BaseModel):
    # Human-readable facility name.
    name: str

class HotelData(BaseModel):
    # Scraped details for one matched hotel.
    name: str
    rating: Optional[float] = None  # guest review score; None when not found
    images: List[str] = []  # image URLs (pydantic copies the default per instance)
    amenities: List[str] = []  # facility names scraped from the detail page
    booking_link: Optional[str] = None  # Booking.com detail-page URL

class HotelResponse(BaseModel):
    # Per-query result: either `data` is populated or `error` explains the failure.
    destination: str
    hotel_name: str
    data: Optional[HotelData] = None
    error: Optional[str] = None

class HotelSearchResponse(BaseModel):
    # Envelope returned by POST /api/hotels.
    results: List[HotelResponse]
    status: str = "success"
    count: int  # number of entries in `results`
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
uvicorn==0.23.2
beautifulsoup4==4.12.2
requests==2.31.0
pydantic==2.4.2
# NOTE: removed "asyncio==3.4.3" — asyncio is part of the Python standard
# library; the ancient PyPI backport of that name can shadow and break the
# stdlib module on Python 3.
aiohttp==3.8.6
python-multipart==0.0.6
python-dotenv
|
services/__pycache__/booking_service.cpython-310.pyc
ADDED
|
Binary file (9.51 kB). View file
|
|
|
services/booking_service.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import random
|
| 3 |
+
import re
|
| 4 |
+
import aiohttp # type: ignore
|
| 5 |
+
from bs4 import BeautifulSoup # type: ignore
|
| 6 |
+
from typing import Dict, Any, List, Optional, Tuple
|
| 7 |
+
import logging
|
| 8 |
+
from urllib.parse import urljoin, quote
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
from .utils.http_utils import fetch_page
|
| 13 |
+
from .utils.image_utils import filter_logo_images, is_logo_image
|
| 14 |
+
from .utils.html_utils import extract_rating_from_element, extract_images_from_soup
|
| 15 |
+
from .utils.google_search_utils import fetch_hotel_images_from_google
|
| 16 |
+
|
| 17 |
+
# Load environment variables
|
| 18 |
+
load_dotenv()
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class BookingService:
|
| 23 |
+
"""Service for scraping hotel data from Booking.com"""
|
| 24 |
+
|
| 25 |
+
def __init__(self):
|
| 26 |
+
# List of diverse user agents for rotation
|
| 27 |
+
self.user_agents = [
|
| 28 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
| 29 |
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
|
| 30 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
|
| 31 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
| 32 |
+
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
|
| 33 |
+
"Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",
|
| 34 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0 Safari/537.36",
|
| 35 |
+
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
# Base headers - User-Agent will be overridden in get_page
|
| 39 |
+
self.headers = {
|
| 40 |
+
"Accept-Language": "en-US,en;q=0.9",
|
| 41 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
| 42 |
+
"Connection": "keep-alive",
|
| 43 |
+
"Upgrade-Insecure-Requests": "1",
|
| 44 |
+
"sec-ch-ua": '"Google Chrome";v="123", "Not:A-Brand";v="99"',
|
| 45 |
+
"sec-ch-ua-mobile": "?0",
|
| 46 |
+
"sec-ch-ua-platform": '"Windows"',
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
# Check if Google Search API credentials are available
|
| 50 |
+
self.google_api_available = bool(os.getenv("GOOGLE_SEARCH_API_KEY") and os.getenv("GOOGLE_SEARCH_ENGINE_ID"))
|
| 51 |
+
|
| 52 |
+
logger.info(f"BookingService initialized at 2025-05-21 15:22:38 by Garvit-Nagok")
|
| 53 |
+
if self.google_api_available:
|
| 54 |
+
logger.info("Google Custom Search API configured as fallback for hotel images")
|
| 55 |
+
else:
|
| 56 |
+
logger.warning("Google Custom Search API credentials not found - fallback will not be available")
|
| 57 |
+
|
| 58 |
+
# [Keep all existing methods unchanged]
|
| 59 |
+
|
| 60 |
+
async def get_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
    """Fetch *url* through fetch_page, spoofing a freshly randomised User-Agent."""
    # Never mutate the shared base headers; build a per-request copy instead.
    request_headers = {**self.headers, "User-Agent": random.choice(self.user_agents)}

    logger.debug(f"Using user agent: {request_headers['User-Agent'][:30]}...")
    return await fetch_page(session, url, request_headers)
|
| 68 |
+
|
| 69 |
+
async def extract_amenities(self, session: aiohttp.ClientSession, hotel_element, hotel_url: Optional[str] = None) -> List[str]:
|
| 70 |
+
"""Extract popular facilities from hotel detail page"""
|
| 71 |
+
unique_amenities = set()
|
| 72 |
+
|
| 73 |
+
if hotel_url:
|
| 74 |
+
try:
|
| 75 |
+
html = await self.get_page(session, hotel_url)
|
| 76 |
+
if html:
|
| 77 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 78 |
+
|
| 79 |
+
popular_heading = soup.find(string=lambda text: text and text.strip() == "Most popular facilities")
|
| 80 |
+
|
| 81 |
+
if popular_heading:
|
| 82 |
+
current = popular_heading.parent
|
| 83 |
+
container = None
|
| 84 |
+
|
| 85 |
+
# Look for container with facility icons
|
| 86 |
+
for _ in range(3):
|
| 87 |
+
if not current:
|
| 88 |
+
break
|
| 89 |
+
|
| 90 |
+
if current.select("svg") or current.select("img"):
|
| 91 |
+
container = current
|
| 92 |
+
break
|
| 93 |
+
|
| 94 |
+
parent = current.parent
|
| 95 |
+
if parent and (parent.select("svg") or parent.select("img")):
|
| 96 |
+
container = parent
|
| 97 |
+
break
|
| 98 |
+
|
| 99 |
+
sibling = current.find_next_sibling()
|
| 100 |
+
if sibling and (sibling.select("svg") or sibling.select("img")):
|
| 101 |
+
container = sibling
|
| 102 |
+
break
|
| 103 |
+
|
| 104 |
+
current = parent
|
| 105 |
+
|
| 106 |
+
if not container:
|
| 107 |
+
heading_parent = popular_heading.parent
|
| 108 |
+
if heading_parent:
|
| 109 |
+
container = heading_parent.find_next_sibling()
|
| 110 |
+
|
| 111 |
+
# Extract facility items
|
| 112 |
+
if container:
|
| 113 |
+
facility_items = container.select("span") or container.select("div")
|
| 114 |
+
|
| 115 |
+
for item in facility_items:
|
| 116 |
+
text = item.get_text().strip()
|
| 117 |
+
if text and text != "Most popular facilities" and len(text) < 30:
|
| 118 |
+
unique_amenities.add(text)
|
| 119 |
+
|
| 120 |
+
# Fallback method
|
| 121 |
+
if not unique_amenities:
|
| 122 |
+
try:
|
| 123 |
+
rows = soup.select(".f6b6d2a959") or soup.select_one("div:-soup-contains('Most popular facilities')").parent.find_next_sibling().select("span")
|
| 124 |
+
|
| 125 |
+
for item in rows:
|
| 126 |
+
text = item.get_text().strip()
|
| 127 |
+
if text and text != "Most popular facilities" and len(text) < 30:
|
| 128 |
+
unique_amenities.add(text)
|
| 129 |
+
except AttributeError:
|
| 130 |
+
logger.debug("Could not find facilities using fallback selector")
|
| 131 |
+
except Exception as e:
|
| 132 |
+
logger.error(f"Error extracting amenities: {e}")
|
| 133 |
+
|
| 134 |
+
return list(unique_amenities)
|
| 135 |
+
|
| 136 |
+
async def get_room_images_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> List[str]:
|
| 137 |
+
"""Get a mix of property and room images from hotel detail page"""
|
| 138 |
+
all_images = []
|
| 139 |
+
|
| 140 |
+
try:
|
| 141 |
+
html = await self.get_page(session, url)
|
| 142 |
+
if html:
|
| 143 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 144 |
+
|
| 145 |
+
selectors = [
|
| 146 |
+
".bui-carousel__item img", ".bh-photo-grid img",
|
| 147 |
+
".hp-gallery img", ".hotel-photos img",
|
| 148 |
+
".room-gallery img", ".hotel-room-photographs-slides img",
|
| 149 |
+
"img.active-image", ".gallery-mosaic img", ".tour-360__image img",
|
| 150 |
+
"img[width='300'], img[width='350'], img[width='400'], img[width='500']",
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
all_images = extract_images_from_soup(soup, url, selectors)
|
| 154 |
+
|
| 155 |
+
if len(all_images) < 5:
|
| 156 |
+
for img in soup.select("img"):
|
| 157 |
+
width = img.get("width")
|
| 158 |
+
if width and int(width) < 100:
|
| 159 |
+
continue
|
| 160 |
+
|
| 161 |
+
src = img.get("src") or img.get("data-src")
|
| 162 |
+
if src and not is_logo_image(src) and src not in all_images:
|
| 163 |
+
if not src.startswith("http"):
|
| 164 |
+
src = urljoin(url, src)
|
| 165 |
+
all_images.append(src)
|
| 166 |
+
if len(all_images) >= 5:
|
| 167 |
+
break
|
| 168 |
+
|
| 169 |
+
return filter_logo_images(all_images)[:5]
|
| 170 |
+
|
| 171 |
+
except Exception as e:
|
| 172 |
+
logger.error(f"Error getting hotel images: {e}", exc_info=True)
|
| 173 |
+
|
| 174 |
+
return all_images[:5] if all_images else []
|
| 175 |
+
|
| 176 |
+
async def extract_rating_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> Optional[float]:
|
| 177 |
+
"""Extract rating from hotel detail page"""
|
| 178 |
+
try:
|
| 179 |
+
html = await self.get_page(session, url)
|
| 180 |
+
if not html:
|
| 181 |
+
return None
|
| 182 |
+
|
| 183 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 184 |
+
|
| 185 |
+
guest_reviews_section = soup.find("h2", string="Guest reviews")
|
| 186 |
+
if guest_reviews_section:
|
| 187 |
+
rating_div = soup.select_one("div[aria-label*='Scored'] strong") or soup.select_one(".b5cd09854e")
|
| 188 |
+
if rating_div:
|
| 189 |
+
text = rating_div.get_text().strip()
|
| 190 |
+
match = re.search(r"(\d+[.,]\d+)", text)
|
| 191 |
+
if match:
|
| 192 |
+
return float(match.group(1).replace(',', '.'))
|
| 193 |
+
|
| 194 |
+
nearby_elements = guest_reviews_section.parent.select("div")
|
| 195 |
+
for elem in nearby_elements:
|
| 196 |
+
text = elem.get_text().strip()
|
| 197 |
+
if re.match(r"^\d+[.,]\d+$", text):
|
| 198 |
+
return float(text.replace(',', '.'))
|
| 199 |
+
|
| 200 |
+
score_elements = soup.select(".review-score-badge, .b5cd09854e")
|
| 201 |
+
for elem in score_elements:
|
| 202 |
+
text = elem.get_text().strip()
|
| 203 |
+
match = re.search(r"(\d+[.,]\d+)", text)
|
| 204 |
+
if match:
|
| 205 |
+
return float(match.group(1).replace(',', '.'))
|
| 206 |
+
|
| 207 |
+
review_text = soup.find(string=lambda text: text and ("Review score" in text))
|
| 208 |
+
if review_text:
|
| 209 |
+
parent_text = review_text.parent.get_text() if review_text.parent else ""
|
| 210 |
+
match = re.search(r"(\d+[.,]\d+)", parent_text)
|
| 211 |
+
if match:
|
| 212 |
+
return float(match.group(1).replace(',', '.'))
|
| 213 |
+
|
| 214 |
+
except Exception as e:
|
| 215 |
+
logger.error(f"Error extracting rating: {e}")
|
| 216 |
+
|
| 217 |
+
return None
|
| 218 |
+
|
| 219 |
+
def extract_rating(self, hotel_element) -> Optional[float]:
|
| 220 |
+
"""Extract rating from hotel element"""
|
| 221 |
+
return extract_rating_from_element(hotel_element)
|
| 222 |
+
|
| 223 |
+
def is_name_similar(self, name1: str, name2: str) -> bool:
    """Decide whether two hotel names plausibly refer to the same property.

    A match is either direct (case-insensitive) containment, or at least a
    50% word overlap relative to the shorter name's word count.
    """
    if not name1 or not name2:
        return False

    a, b = name1.lower(), name2.lower()

    # Containment either way is an immediate match.
    if a in b or b in a:
        return True

    tokens_a = set(re.findall(r'\w+', a))
    tokens_b = set(re.findall(r'\w+', b))
    if not tokens_a or not tokens_b:
        return False

    # Ratio of shared words to the smaller token set.
    shared = tokens_a & tokens_b
    return len(shared) / min(len(tokens_a), len(tokens_b)) >= 0.5  # 50% word overlap
|
| 246 |
+
|
| 247 |
+
async def search_hotel(self, session: aiohttp.ClientSession, destination: str, hotel_name: str) -> Dict[str, Any]:
|
| 248 |
+
"""Search for a specific hotel on Booking.com"""
|
| 249 |
+
search_query = f"{hotel_name} {destination}"
|
| 250 |
+
search_url = f"https://www.booking.com/search.html?ss={quote(search_query)}"
|
| 251 |
+
|
| 252 |
+
html = await self.get_page(session, search_url)
|
| 253 |
+
|
| 254 |
+
if not html:
|
| 255 |
+
return {
|
| 256 |
+
"destination": destination,
|
| 257 |
+
"hotel_name": hotel_name,
|
| 258 |
+
"error": "Failed to retrieve search results"
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
soup = BeautifulSoup(html, 'html.parser')
|
| 262 |
+
hotel_cards = soup.select("[data-testid='property-card'], .sr_property_block, .sr_item")
|
| 263 |
+
|
| 264 |
+
if not hotel_cards:
|
| 265 |
+
return {
|
| 266 |
+
"destination": destination,
|
| 267 |
+
"hotel_name": hotel_name,
|
| 268 |
+
"error": "No hotels found"
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
# Find matching hotel card
|
| 272 |
+
hotel_card = None
|
| 273 |
+
for card in hotel_cards:
|
| 274 |
+
name_elem = card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
|
| 275 |
+
if name_elem:
|
| 276 |
+
card_hotel_name = name_elem.text.strip()
|
| 277 |
+
if self.is_name_similar(card_hotel_name, hotel_name):
|
| 278 |
+
hotel_card = card
|
| 279 |
+
break
|
| 280 |
+
|
| 281 |
+
if not hotel_card:
|
| 282 |
+
hotel_card = hotel_cards[0]
|
| 283 |
+
|
| 284 |
+
name_elem = hotel_card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
|
| 285 |
+
name = name_elem.text.strip() if name_elem else hotel_name
|
| 286 |
+
rating = self.extract_rating(hotel_card)
|
| 287 |
+
|
| 288 |
+
link_elem = hotel_card.select_one("a[href*='hotel'], a.hotel_name_link")
|
| 289 |
+
hotel_url = ""
|
| 290 |
+
if link_elem and 'href' in link_elem.attrs:
|
| 291 |
+
href = link_elem['href']
|
| 292 |
+
hotel_url = urljoin("https://www.booking.com", href) if not href.startswith('http') else href
|
| 293 |
+
|
| 294 |
+
if hotel_url:
|
| 295 |
+
tasks = [
|
| 296 |
+
self.extract_rating_from_detail_page(session, hotel_url),
|
| 297 |
+
self.get_room_images_from_detail_page(session, hotel_url),
|
| 298 |
+
self.extract_amenities(session, hotel_card, hotel_url)
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
detail_rating, images, amenities = await asyncio.gather(*tasks)
|
| 302 |
+
|
| 303 |
+
if detail_rating is not None:
|
| 304 |
+
rating = detail_rating
|
| 305 |
+
else:
|
| 306 |
+
images = []
|
| 307 |
+
amenities = []
|
| 308 |
+
|
| 309 |
+
# If scraping didn't return any images, use Google Custom Search API as fallback
|
| 310 |
+
if not images and self.google_api_available:
|
| 311 |
+
logger.info(f"No images found via scraping for {hotel_name} in {destination}. Using Google API as fallback.")
|
| 312 |
+
images = await fetch_hotel_images_from_google(session, hotel_name, destination)
|
| 313 |
+
|
| 314 |
+
return {
|
| 315 |
+
"destination": destination,
|
| 316 |
+
"hotel_name": hotel_name,
|
| 317 |
+
"data": {
|
| 318 |
+
"name": name,
|
| 319 |
+
"rating": rating,
|
| 320 |
+
"images": images,
|
| 321 |
+
"amenities": amenities,
|
| 322 |
+
"booking_link": hotel_url
|
| 323 |
+
}
|
| 324 |
+
}
|
services/utils/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .html_utils import extract_rating_from_element, extract_images_from_soup
|
| 2 |
+
from .http_utils import fetch_page
|
| 3 |
+
from .image_utils import filter_logo_images, is_logo_image
|
| 4 |
+
from .google_search_utils import fetch_hotel_images_from_google
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
'extract_rating_from_element',
|
| 8 |
+
'extract_images_from_soup',
|
| 9 |
+
'fetch_page',
|
| 10 |
+
'filter_logo_images',
|
| 11 |
+
'is_logo_image',
|
| 12 |
+
'fetch_hotel_images_from_google'
|
| 13 |
+
]
|
services/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (505 Bytes). View file
|
|
|
services/utils/__pycache__/google_search_utils.cpython-310.pyc
ADDED
|
Binary file (2.06 kB). View file
|
|
|
services/utils/__pycache__/html_utils.cpython-310.pyc
ADDED
|
Binary file (1.76 kB). View file
|
|
|
services/utils/__pycache__/http_utils.cpython-310.pyc
ADDED
|
Binary file (989 Bytes). View file
|
|
|
services/utils/__pycache__/image_utils.cpython-310.pyc
ADDED
|
Binary file (1.01 kB). View file
|
|
|
services/utils/__pycache__/selector_manager.cpython-310.pyc
ADDED
|
Binary file (5.15 kB). View file
|
|
|
services/utils/google_search_utils.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import aiohttp
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
# Load environment variables
|
| 8 |
+
load_dotenv()
|
| 9 |
+
|
| 10 |
+
# Get API credentials from environment variables
|
| 11 |
+
GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY")
|
| 12 |
+
GOOGLE_SEARCH_ENGINE_ID = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
async def fetch_hotel_images_from_google(
    session: aiohttp.ClientSession,
    hotel_name: str,
    destination: str,
    max_results: int = 5
) -> List[str]:
    """
    Fetch hotel images using Google Custom Search API as a fallback
    when scraping fails to return any images.

    Args:
        session: aiohttp client session
        hotel_name: Name of the hotel to search for
        destination: Location/destination of the hotel
        max_results: Maximum number of images to return (default: 5)

    Returns:
        List of image URLs (empty on missing credentials or any API error)
    """
    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_SEARCH_ENGINE_ID:
        logger.error("Google Search API credentials not configured")
        return []

    # Construct the search query
    search_query = f"{hotel_name} {destination} hotel"

    # API endpoint
    url = "https://www.googleapis.com/customsearch/v1"

    # Parameters for the API request
    params = {
        'q': search_query,
        'cx': GOOGLE_SEARCH_ENGINE_ID,
        'key': GOOGLE_SEARCH_API_KEY,
        'searchType': 'image',
        # The Custom Search JSON API rejects `num` outside 1..10 with HTTP 400,
        # which previously made any request for >10 images silently return [].
        # Clamp the caller's request into the supported range instead.
        'num': min(max(max_results, 1), 10),
        'imgSize': 'large',   # Prefer large images
        'imgType': 'photo',   # Only return photos, not illustrations
        'safe': 'active'      # Safe search
    }

    try:
        async with session.get(url, params=params) as response:
            if response.status == 200:
                data = await response.json()

                # Extract image URLs from the response; 'items' is absent
                # when the search produced no results.
                image_urls = [
                    item['link']
                    for item in data.get('items', [])
                    if 'link' in item
                ]

                logger.info(f"Google API returned {len(image_urls)} images for {hotel_name} in {destination}")
                return image_urls
            else:
                error_data = await response.text()
                logger.error(f"Google API error: {response.status} - {error_data}")
                return []
    except Exception as e:
        # Network failures / bad JSON must not break the scraping pipeline;
        # an empty list signals "no fallback images available".
        logger.error(f"Error fetching hotel images from Google: {e}")
        return []
|
services/utils/html_utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup # type: ignore
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
from urllib.parse import urljoin
|
| 4 |
+
import re
|
| 5 |
+
import logging
|
| 6 |
+
from .image_utils import is_logo_image
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
def extract_rating_from_element(element) -> Optional[float]:
    """Pull a numeric review score out of a hotel-card element.

    Tries the dedicated score badge first, then falls back to scanning
    the text of the review container. Returns the score rounded to one
    decimal place, or None when no score can be found.
    """
    try:
        badge = element.select_one(".bui-review-score__badge") or element.select_one("[data-testid='review-score']")

        if badge:
            found = re.search(r"(\d+[.,]?\d*)", badge.text.strip())
            if found:
                # Normalise the decimal comma used by some locales.
                return round(float(found.group(1).replace(',', '.')), 1)

        # Look for review text near ratings
        container = element.select_one(".bui-review-score, .d10a6220b4")
        if container:
            found = re.search(r"(\d+[.,]\d+)", container.get_text())
            if found:
                return round(float(found.group(1).replace(',', '.')), 1)
    except Exception as e:
        logger.error(f"Error extracting rating: {e}")

    return None
|
| 34 |
+
|
| 35 |
+
def extract_images_from_soup(soup: BeautifulSoup, url: str, selectors: List[str], max_images: int = 5) -> List[str]:
    """Collect up to *max_images* distinct, non-logo image URLs from the page.

    Each CSS selector is tried in order; lazy-loaded sources
    (data-src / data-lazy-src) are honoured and relative URLs are
    resolved against *url*.
    """
    collected: List[str] = []

    for css in selectors:
        for tag in soup.select(css):
            candidate = tag.get("src") or tag.get("data-src") or tag.get("data-lazy-src")
            if not candidate or is_logo_image(candidate):
                continue

            # Resolve relative paths against the page URL.
            absolute = candidate if candidate.startswith("http") else urljoin(url, candidate)

            if absolute in collected:
                continue
            collected.append(absolute)
            if len(collected) >= max_images:
                return collected

    return collected
|
services/utils/http_utils.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import aiohttp # type: ignore
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch *url* and return the response body as text, or None on failure.

    Args:
        session: Shared aiohttp client session.
        url: Absolute URL to request.
        headers: HTTP headers to send with the request.

    Returns:
        The decoded body for a 200 response; None for any other status
        or on a network/timeout error (failures are logged, not raised).
    """
    try:
        logger.info(f"Requesting URL: {url}")
        # Passing a bare number as `timeout=` is the deprecated aiohttp form;
        # ClientTimeout(total=15) is the supported equivalent.
        async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as response:
            if response.status == 200:
                logger.debug(f"Successfully retrieved content from {url}")
                return await response.text()
            else:
                logger.error(f"Error retrieving URL {url}: Status code {response.status}")
                return None
    except Exception as e:
        logger.error(f"Request failed for {url}: {e}")
        return None
|
services/utils/image_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
|
| 3 |
+
def filter_logo_images(images: List[str]) -> List[str]:
    """Return only the image URLs that do not look like logos/icons."""
    return [candidate for candidate in images if not is_logo_image(candidate)]
|
| 10 |
+
|
| 11 |
+
def is_logo_image(url: str) -> bool:
    """Heuristically decide whether *url* points to a logo/icon rather than a photo.

    An empty/missing URL counts as a logo (i.e. it is filtered out).
    Matching is a case-insensitive substring test against known
    logo/branding URL fragments.
    """
    if not url:
        return True

    hints = (
        "logo", "icon", "brand", "marker", "thumb", "tiny",
        "avatar", "badge", "symbol", "sign", "favicon",
        "design-assets", "googleusercontent", "images-flags",
    )

    lowered = url.lower()
    for hint in hints:
        if hint in lowered:
            return True
    return False
|