garvitcpp committed on
Commit
28df1e8
·
verified ·
1 Parent(s): 19bd205

Upload 27 files

Browse files
api/routes/__pycache__/hotel_routes.cpython-310.pyc ADDED
Binary file (1.53 kB). View file
 
api/routes/__pycache__/system_routes.cpython-310.pyc ADDED
Binary file (1.21 kB). View file
 
api/routes/hotel_routes.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, HTTPException, Query
from models.requests import HotelSearchRequest
from models.responses import HotelSearchResponse, HotelResponse
from core.scraper import HotelScraper
import logging
from typing import Optional

# All hotel endpoints are mounted under /api and tagged "hotels" in OpenAPI.
router = APIRouter(tags=["hotels"], prefix="/api")
logger = logging.getLogger(__name__)


@router.post("/hotels", response_model=HotelSearchResponse)
async def search_hotels(
    request: HotelSearchRequest,
    # Declared so the token appears in the OpenAPI schema; the actual check is
    # the router-level verify_token dependency attached in main.py.
    token: Optional[str] = Query(None, description="API Access Token"),
):
    """
    Search for multiple hotels with room images

    Input format:
    ```json
    {
        "hotels": [
            {"hotel_name": "....", "destination": "...."},
            {"hotel_name": "....", "destination": "...."}
        ]
    }
    ```

    Requires authentication via token (either in query parameter or X-API-Token header)
    """
    try:
        scraper = HotelScraper()
        results = await scraper.scrape_hotels(request.hotels)

        # Shape matches HotelSearchResponse: results + status + count.
        return {
            "results": results,
            "status": "success",
            "count": len(results),
        }
    except Exception as e:
        # logger.exception records the full traceback (logger.error with an
        # f-string did not), and lazy %s args avoid formatting when disabled.
        logger.exception("Error processing hotel search request: %s", e)
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
api/routes/system_routes.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import APIRouter, Depends
import datetime
# NOTE(review): this import is circular — main.py imports this module. It only
# works because main defines verify_token before importing the routers, and
# when main.py runs as a script the "main" module is imported a second time
# (re-running token generation). Consider moving verify_token into a separate
# auth module. TODO confirm with the owner.
from main import verify_token

router = APIRouter(tags=["system"])

@router.get("/")
async def root():
    """Root endpoint for uptime monitoring"""
    return {"status": "online", "service": "hotel-image-api"}

@router.get("/health")
async def health_check():
    """Health check endpoint for uptime monitoring"""
    # datetime.utcnow() is naive and deprecated since Python 3.12; emit an
    # explicit UTC-aware ISO-8601 timestamp (now includes the +00:00 offset).
    return {
        "status": "healthy",
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
    }

@router.get("/status")
async def status():
    """Status check without token verification"""
    return {"status": "running"}

@router.get("/token-test", dependencies=[Depends(verify_token)])
async def token_test():
    """Actually tests if token authentication is working"""
    return {"status": "success", "message": "Token authentication successful"}
core/__pycache__/scraper.cpython-310.pyc ADDED
Binary file (1.71 kB). View file
 
core/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.08 kB). View file
 
core/scraper.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp # type: ignore
3
+ import logging
4
+ from typing import List, Dict, Any
5
+ from services.booking_service import BookingService
6
+ from models.requests import HotelQuery
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class HotelScraper:
    """Main scraper class that coordinates the scraping process"""

    def __init__(self):
        # Delegates all per-hotel work to the Booking.com service layer.
        self.booking_service = BookingService()

    async def scrape_hotels(self, hotel_queries: List[HotelQuery]) -> List[Dict[str, Any]]:
        """Scrape multiple hotels concurrently"""
        logger.info(f"Starting to scrape {len(hotel_queries)} hotels")

        async with aiohttp.ClientSession() as session:
            # One search coroutine per query; all share the session.
            lookups = [
                self.booking_service.search_hotel(
                    session=session,
                    destination=query.destination,
                    hotel_name=query.hotel_name,
                )
                for query in hotel_queries
            ]

            # return_exceptions=True keeps one failure from sinking the batch.
            outcomes = await asyncio.gather(*lookups, return_exceptions=True)

            # Turn failures into per-hotel error payloads instead of raising.
            processed_results: List[Dict[str, Any]] = []
            for query, outcome in zip(hotel_queries, outcomes):
                if isinstance(outcome, Exception):
                    logger.error(f"Error scraping hotel {query.hotel_name}: {outcome}")
                    processed_results.append({
                        "destination": query.destination,
                        "hotel_name": query.hotel_name,
                        "error": f"Scraping failed: {str(outcome)}",
                    })
                else:
                    processed_results.append(outcome)

            return processed_results
core/utils.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ import random
4
+ from typing import Optional, Dict, Any, List
5
+ from urllib.parse import urljoin, quote
6
+
7
+ logging.basicConfig(
8
+ level=logging.INFO,
9
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
10
+ )
11
+ logger = logging.getLogger(__name__)
12
+
13
def get_clean_text(element) -> str:
    """Return the stripped text of an HTML element, or "" for a falsy element."""
    return element.text.strip() if element else ""
18
+
19
def clean_url(base_url: str, href: str) -> str:
    """Resolve *href* against *base_url*; an empty href yields ""."""
    if href:
        return urljoin(base_url, href)
    return ""
24
+
25
def extract_float_from_text(text: str, default: Optional[float] = None) -> Optional[float]:
    """Parse the first number in *text* (dot or comma decimal separator).

    Returns *default* when text is empty, contains no number, or the
    matched fragment cannot be converted to float.
    """
    if not text:
        return default

    found = re.search(r'(\d+[\.,]?\d*)', text)
    if found is None:
        return default

    try:
        # Normalize European comma decimals before conversion.
        return float(found.group(1).replace(',', '.'))
    except ValueError:
        return default
37
+
38
def construct_booking_search_url(destination: str, hotel_name: Optional[str] = None) -> str:
    """Build a Booking.com search URL for a destination, optionally scoped to one hotel."""
    if hotel_name:
        query = f"{hotel_name} {destination}"
    else:
        query = destination
    return f"https://www.booking.com/search.html?ss={quote(query)}"
42
+
43
def is_valid_image_url(url: str) -> bool:
    """Heuristically decide whether *url* looks like a real photo.

    Rejects empty values, data: URIs, URLs containing icon/logo keywords,
    and anything that is not an absolute http(s) URL.
    """
    if not url:
        return False

    if url.startswith("data:"):
        return False

    # Keywords that usually mark tiny UI assets rather than room photos.
    for marker in ("icon", "logo", "badge", "thumb"):
        if marker in url:
            return False

    # Only absolute http(s) URLs are usable downstream.
    return url.startswith(("http://", "https://"))
main.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, Depends, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import APIKeyHeader
import os
import uvicorn
from typing import Optional
from dotenv import load_dotenv

# Pull configuration (API_ACCESS_TOKEN, Google credentials, ...) from a .env file.
load_dotenv()

API_TOKEN = os.getenv("API_ACCESS_TOKEN")
if not API_TOKEN:
    # Fall back to a one-off random token so the service still starts.
    # NOTE(review): the token is printed to stdout and changes on every
    # restart; and because api.routes.system_routes imports this module back,
    # running `python main.py` loads this file under two module names, each
    # generating a DIFFERENT token — set API_ACCESS_TOKEN in production.
    print("WARNING: API_ACCESS_TOKEN not set. Generating a random token.")
    import secrets
    API_TOKEN = secrets.token_urlsafe(32)
    print(f"Generated token: {API_TOKEN}")

# The token may arrive in the X-API-Token header or the ?token= query param.
api_key_header = APIKeyHeader(name="X-API-Token", auto_error=False)
api_key_query = "token"  # NOTE(review): appears unused — verify_token declares `token` directly

async def verify_token(
    api_key_header: str = Depends(api_key_header),
    token: Optional[str] = None,  # exposed by FastAPI as a query parameter
) -> bool:
    """FastAPI dependency: allow the request if either credential matches API_TOKEN.

    Raises HTTPException 401 (with a WWW-Authenticate hint) otherwise.
    """
    if api_key_header == API_TOKEN:
        return True

    if token == API_TOKEN:
        return True

    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid or missing API token",
        headers={"WWW-Authenticate": "APIKey"},
    )

app = FastAPI(
    title="Hotel Image API",
    description="API for retrieving hotel images with Google Search fallback"
)

# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec and browsers will refuse credentialed requests —
# confirm whether credentials are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Imported late on purpose: system_routes imports verify_token from this
# module, so these imports must run only after verify_token is defined.
from api.routes.hotel_routes import router as hotel_router
from api.routes.system_routes import router as system_router

app.include_router(system_router)
# Every hotel endpoint requires a valid token; system endpoints stay open
# except /token-test, which declares the dependency itself.
app.include_router(hotel_router, dependencies=[Depends(verify_token)])

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
models/__pycache__/requests.cpython-310.pyc ADDED
Binary file (699 Bytes). View file
 
models/__pycache__/responses.cpython-310.pyc ADDED
Binary file (1.41 kB). View file
 
models/requests.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
from typing import List
from pydantic import BaseModel, Field

class HotelQuery(BaseModel):
    """A single hotel lookup: the hotel's name plus the destination to search in."""
    hotel_name: str
    destination: str

class HotelSearchRequest(BaseModel):
    """Request body for POST /api/hotels: one or more hotels to scrape."""
    # Pydantic v2 is pinned in requirements.txt (2.4.2): `min_length` is the
    # supported list-size constraint; `min_items` is a deprecated v1 alias.
    hotels: List[HotelQuery] = Field(..., min_length=1)
models/responses.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List, Optional
from pydantic import BaseModel, Field

class HotelImage(BaseModel):
    """A single image URL.

    NOTE(review): appears unused in this commit — HotelData.images is a
    plain list of str; confirm before removing.
    """
    url: str

class HotelAmenity(BaseModel):
    """A single amenity name.

    NOTE(review): appears unused in this commit — HotelData.amenities is a
    plain list of str; confirm before removing.
    """
    name: str

class HotelData(BaseModel):
    """Scraped details for one hotel (produced by BookingService.search_hotel)."""
    name: str
    rating: Optional[float] = None       # review score, e.g. 8.5; None if not found
    images: List[str] = []               # up to 5 image URLs (scraped or Google fallback)
    amenities: List[str] = []            # "Most popular facilities" entries
    booking_link: Optional[str] = None   # absolute Booking.com detail-page URL

class HotelResponse(BaseModel):
    """Per-hotel result: echoes the query plus either `data` or `error`."""
    destination: str
    hotel_name: str
    data: Optional[HotelData] = None
    error: Optional[str] = None

class HotelSearchResponse(BaseModel):
    """Envelope returned by POST /api/hotels."""
    results: List[HotelResponse]
    status: str = "success"
    count: int                           # len(results)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn==0.23.2
3
+ beautifulsoup4==4.12.2
4
+ requests==2.31.0
5
+ pydantic==2.4.2
6
+ # asyncio==3.4.3 removed — the PyPI "asyncio" package is an obsolete Python 3.3 backport that shadows the standard-library module
7
+ aiohttp==3.8.6
8
+ python-multipart==0.0.6
9
+ python-dotenv
services/__pycache__/booking_service.cpython-310.pyc ADDED
Binary file (9.51 kB). View file
 
services/booking_service.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import random
3
+ import re
4
+ import aiohttp # type: ignore
5
+ from bs4 import BeautifulSoup # type: ignore
6
+ from typing import Dict, Any, List, Optional, Tuple
7
+ import logging
8
+ from urllib.parse import urljoin, quote
9
+ from dotenv import load_dotenv
10
+ import os
11
+
12
+ from .utils.http_utils import fetch_page
13
+ from .utils.image_utils import filter_logo_images, is_logo_image
14
+ from .utils.html_utils import extract_rating_from_element, extract_images_from_soup
15
+ from .utils.google_search_utils import fetch_hotel_images_from_google
16
+
17
+ # Load environment variables
18
+ load_dotenv()
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ class BookingService:
23
+ """Service for scraping hotel data from Booking.com"""
24
+
25
+ def __init__(self):
26
+ # List of diverse user agents for rotation
27
+ self.user_agents = [
28
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
29
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
30
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
31
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
32
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
33
+ "Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/123.0.0.0 Mobile/15E148 Safari/604.1",
34
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/123.0.0.0 Safari/537.36",
35
+ "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
36
+ ]
37
+
38
+ # Base headers - User-Agent will be overridden in get_page
39
+ self.headers = {
40
+ "Accept-Language": "en-US,en;q=0.9",
41
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
42
+ "Connection": "keep-alive",
43
+ "Upgrade-Insecure-Requests": "1",
44
+ "sec-ch-ua": '"Google Chrome";v="123", "Not:A-Brand";v="99"',
45
+ "sec-ch-ua-mobile": "?0",
46
+ "sec-ch-ua-platform": '"Windows"',
47
+ }
48
+
49
+ # Check if Google Search API credentials are available
50
+ self.google_api_available = bool(os.getenv("GOOGLE_SEARCH_API_KEY") and os.getenv("GOOGLE_SEARCH_ENGINE_ID"))
51
+
52
+ logger.info(f"BookingService initialized at 2025-05-21 15:22:38 by Garvit-Nagok")
53
+ if self.google_api_available:
54
+ logger.info("Google Custom Search API configured as fallback for hotel images")
55
+ else:
56
+ logger.warning("Google Custom Search API credentials not found - fallback will not be available")
57
+
58
+ # [Keep all existing methods unchanged]
59
+
60
+ async def get_page(self, session: aiohttp.ClientSession, url: str) -> Optional[str]:
61
+ """Wrapper for fetch_page with rotating user agents"""
62
+ # Create a copy of headers and use a random user agent
63
+ current_headers = self.headers.copy()
64
+ current_headers["User-Agent"] = random.choice(self.user_agents)
65
+
66
+ logger.debug(f"Using user agent: {current_headers['User-Agent'][:30]}...")
67
+ return await fetch_page(session, url, current_headers)
68
+
69
+ async def extract_amenities(self, session: aiohttp.ClientSession, hotel_element, hotel_url: Optional[str] = None) -> List[str]:
70
+ """Extract popular facilities from hotel detail page"""
71
+ unique_amenities = set()
72
+
73
+ if hotel_url:
74
+ try:
75
+ html = await self.get_page(session, hotel_url)
76
+ if html:
77
+ soup = BeautifulSoup(html, 'html.parser')
78
+
79
+ popular_heading = soup.find(string=lambda text: text and text.strip() == "Most popular facilities")
80
+
81
+ if popular_heading:
82
+ current = popular_heading.parent
83
+ container = None
84
+
85
+ # Look for container with facility icons
86
+ for _ in range(3):
87
+ if not current:
88
+ break
89
+
90
+ if current.select("svg") or current.select("img"):
91
+ container = current
92
+ break
93
+
94
+ parent = current.parent
95
+ if parent and (parent.select("svg") or parent.select("img")):
96
+ container = parent
97
+ break
98
+
99
+ sibling = current.find_next_sibling()
100
+ if sibling and (sibling.select("svg") or sibling.select("img")):
101
+ container = sibling
102
+ break
103
+
104
+ current = parent
105
+
106
+ if not container:
107
+ heading_parent = popular_heading.parent
108
+ if heading_parent:
109
+ container = heading_parent.find_next_sibling()
110
+
111
+ # Extract facility items
112
+ if container:
113
+ facility_items = container.select("span") or container.select("div")
114
+
115
+ for item in facility_items:
116
+ text = item.get_text().strip()
117
+ if text and text != "Most popular facilities" and len(text) < 30:
118
+ unique_amenities.add(text)
119
+
120
+ # Fallback method
121
+ if not unique_amenities:
122
+ try:
123
+ rows = soup.select(".f6b6d2a959") or soup.select_one("div:-soup-contains('Most popular facilities')").parent.find_next_sibling().select("span")
124
+
125
+ for item in rows:
126
+ text = item.get_text().strip()
127
+ if text and text != "Most popular facilities" and len(text) < 30:
128
+ unique_amenities.add(text)
129
+ except AttributeError:
130
+ logger.debug("Could not find facilities using fallback selector")
131
+ except Exception as e:
132
+ logger.error(f"Error extracting amenities: {e}")
133
+
134
+ return list(unique_amenities)
135
+
136
+ async def get_room_images_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> List[str]:
137
+ """Get a mix of property and room images from hotel detail page"""
138
+ all_images = []
139
+
140
+ try:
141
+ html = await self.get_page(session, url)
142
+ if html:
143
+ soup = BeautifulSoup(html, 'html.parser')
144
+
145
+ selectors = [
146
+ ".bui-carousel__item img", ".bh-photo-grid img",
147
+ ".hp-gallery img", ".hotel-photos img",
148
+ ".room-gallery img", ".hotel-room-photographs-slides img",
149
+ "img.active-image", ".gallery-mosaic img", ".tour-360__image img",
150
+ "img[width='300'], img[width='350'], img[width='400'], img[width='500']",
151
+ ]
152
+
153
+ all_images = extract_images_from_soup(soup, url, selectors)
154
+
155
+ if len(all_images) < 5:
156
+ for img in soup.select("img"):
157
+ width = img.get("width")
158
+ if width and int(width) < 100:
159
+ continue
160
+
161
+ src = img.get("src") or img.get("data-src")
162
+ if src and not is_logo_image(src) and src not in all_images:
163
+ if not src.startswith("http"):
164
+ src = urljoin(url, src)
165
+ all_images.append(src)
166
+ if len(all_images) >= 5:
167
+ break
168
+
169
+ return filter_logo_images(all_images)[:5]
170
+
171
+ except Exception as e:
172
+ logger.error(f"Error getting hotel images: {e}", exc_info=True)
173
+
174
+ return all_images[:5] if all_images else []
175
+
176
+ async def extract_rating_from_detail_page(self, session: aiohttp.ClientSession, url: str) -> Optional[float]:
177
+ """Extract rating from hotel detail page"""
178
+ try:
179
+ html = await self.get_page(session, url)
180
+ if not html:
181
+ return None
182
+
183
+ soup = BeautifulSoup(html, 'html.parser')
184
+
185
+ guest_reviews_section = soup.find("h2", string="Guest reviews")
186
+ if guest_reviews_section:
187
+ rating_div = soup.select_one("div[aria-label*='Scored'] strong") or soup.select_one(".b5cd09854e")
188
+ if rating_div:
189
+ text = rating_div.get_text().strip()
190
+ match = re.search(r"(\d+[.,]\d+)", text)
191
+ if match:
192
+ return float(match.group(1).replace(',', '.'))
193
+
194
+ nearby_elements = guest_reviews_section.parent.select("div")
195
+ for elem in nearby_elements:
196
+ text = elem.get_text().strip()
197
+ if re.match(r"^\d+[.,]\d+$", text):
198
+ return float(text.replace(',', '.'))
199
+
200
+ score_elements = soup.select(".review-score-badge, .b5cd09854e")
201
+ for elem in score_elements:
202
+ text = elem.get_text().strip()
203
+ match = re.search(r"(\d+[.,]\d+)", text)
204
+ if match:
205
+ return float(match.group(1).replace(',', '.'))
206
+
207
+ review_text = soup.find(string=lambda text: text and ("Review score" in text))
208
+ if review_text:
209
+ parent_text = review_text.parent.get_text() if review_text.parent else ""
210
+ match = re.search(r"(\d+[.,]\d+)", parent_text)
211
+ if match:
212
+ return float(match.group(1).replace(',', '.'))
213
+
214
+ except Exception as e:
215
+ logger.error(f"Error extracting rating: {e}")
216
+
217
+ return None
218
+
219
+ def extract_rating(self, hotel_element) -> Optional[float]:
220
+ """Extract rating from hotel element"""
221
+ return extract_rating_from_element(hotel_element)
222
+
223
+ def is_name_similar(self, name1: str, name2: str) -> bool:
224
+ """Check if two hotel names are similar enough"""
225
+ if not name1 or not name2:
226
+ return False
227
+
228
+ name1 = name1.lower()
229
+ name2 = name2.lower()
230
+
231
+ if name1 in name2 or name2 in name1:
232
+ return True
233
+
234
+ # Compare words
235
+ words1 = set(re.findall(r'\w+', name1))
236
+ words2 = set(re.findall(r'\w+', name2))
237
+
238
+ if not words1 or not words2:
239
+ return False
240
+
241
+ # Calculate word overlap
242
+ common_words = words1.intersection(words2)
243
+ similarity = len(common_words) / min(len(words1), len(words2))
244
+
245
+ return similarity >= 0.5 # 50% word overlap
246
+
247
+ async def search_hotel(self, session: aiohttp.ClientSession, destination: str, hotel_name: str) -> Dict[str, Any]:
248
+ """Search for a specific hotel on Booking.com"""
249
+ search_query = f"{hotel_name} {destination}"
250
+ search_url = f"https://www.booking.com/search.html?ss={quote(search_query)}"
251
+
252
+ html = await self.get_page(session, search_url)
253
+
254
+ if not html:
255
+ return {
256
+ "destination": destination,
257
+ "hotel_name": hotel_name,
258
+ "error": "Failed to retrieve search results"
259
+ }
260
+
261
+ soup = BeautifulSoup(html, 'html.parser')
262
+ hotel_cards = soup.select("[data-testid='property-card'], .sr_property_block, .sr_item")
263
+
264
+ if not hotel_cards:
265
+ return {
266
+ "destination": destination,
267
+ "hotel_name": hotel_name,
268
+ "error": "No hotels found"
269
+ }
270
+
271
+ # Find matching hotel card
272
+ hotel_card = None
273
+ for card in hotel_cards:
274
+ name_elem = card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
275
+ if name_elem:
276
+ card_hotel_name = name_elem.text.strip()
277
+ if self.is_name_similar(card_hotel_name, hotel_name):
278
+ hotel_card = card
279
+ break
280
+
281
+ if not hotel_card:
282
+ hotel_card = hotel_cards[0]
283
+
284
+ name_elem = hotel_card.select_one("[data-testid='title'], .sr-hotel__name, .hotel_name")
285
+ name = name_elem.text.strip() if name_elem else hotel_name
286
+ rating = self.extract_rating(hotel_card)
287
+
288
+ link_elem = hotel_card.select_one("a[href*='hotel'], a.hotel_name_link")
289
+ hotel_url = ""
290
+ if link_elem and 'href' in link_elem.attrs:
291
+ href = link_elem['href']
292
+ hotel_url = urljoin("https://www.booking.com", href) if not href.startswith('http') else href
293
+
294
+ if hotel_url:
295
+ tasks = [
296
+ self.extract_rating_from_detail_page(session, hotel_url),
297
+ self.get_room_images_from_detail_page(session, hotel_url),
298
+ self.extract_amenities(session, hotel_card, hotel_url)
299
+ ]
300
+
301
+ detail_rating, images, amenities = await asyncio.gather(*tasks)
302
+
303
+ if detail_rating is not None:
304
+ rating = detail_rating
305
+ else:
306
+ images = []
307
+ amenities = []
308
+
309
+ # If scraping didn't return any images, use Google Custom Search API as fallback
310
+ if not images and self.google_api_available:
311
+ logger.info(f"No images found via scraping for {hotel_name} in {destination}. Using Google API as fallback.")
312
+ images = await fetch_hotel_images_from_google(session, hotel_name, destination)
313
+
314
+ return {
315
+ "destination": destination,
316
+ "hotel_name": hotel_name,
317
+ "data": {
318
+ "name": name,
319
+ "rating": rating,
320
+ "images": images,
321
+ "amenities": amenities,
322
+ "booking_link": hotel_url
323
+ }
324
+ }
services/utils/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .html_utils import extract_rating_from_element, extract_images_from_soup
2
+ from .http_utils import fetch_page
3
+ from .image_utils import filter_logo_images, is_logo_image
4
+ from .google_search_utils import fetch_hotel_images_from_google
5
+
6
+ __all__ = [
7
+ 'extract_rating_from_element',
8
+ 'extract_images_from_soup',
9
+ 'fetch_page',
10
+ 'filter_logo_images',
11
+ 'is_logo_image',
12
+ 'fetch_hotel_images_from_google'
13
+ ]
services/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (505 Bytes). View file
 
services/utils/__pycache__/google_search_utils.cpython-310.pyc ADDED
Binary file (2.06 kB). View file
 
services/utils/__pycache__/html_utils.cpython-310.pyc ADDED
Binary file (1.76 kB). View file
 
services/utils/__pycache__/http_utils.cpython-310.pyc ADDED
Binary file (989 Bytes). View file
 
services/utils/__pycache__/image_utils.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
services/utils/__pycache__/selector_manager.cpython-310.pyc ADDED
Binary file (5.15 kB). View file
 
services/utils/google_search_utils.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import aiohttp
3
+ import logging
4
+ from typing import List, Optional
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables
8
+ load_dotenv()
9
+
10
+ # Get API credentials from environment variables
11
+ GOOGLE_SEARCH_API_KEY = os.getenv("GOOGLE_SEARCH_API_KEY")
12
+ GOOGLE_SEARCH_ENGINE_ID = os.getenv("GOOGLE_SEARCH_ENGINE_ID")
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
async def fetch_hotel_images_from_google(
    session: aiohttp.ClientSession,
    hotel_name: str,
    destination: str,
    max_results: int = 5
) -> List[str]:
    """
    Fallback image lookup via the Google Custom Search JSON API.

    Used when scraping fails to return any images for a hotel.

    Args:
        session: aiohttp client session
        hotel_name: Name of the hotel to search for
        destination: Location/destination of the hotel
        max_results: Maximum number of images to return (default: 5)

    Returns:
        List of image URLs; empty when credentials are missing or on any error.
    """
    if not GOOGLE_SEARCH_API_KEY or not GOOGLE_SEARCH_ENGINE_ID:
        logger.error("Google Search API credentials not configured")
        return []

    # Image-search request against the Custom Search endpoint.
    params = {
        'q': f"{hotel_name} {destination} hotel",
        'cx': GOOGLE_SEARCH_ENGINE_ID,
        'key': GOOGLE_SEARCH_API_KEY,
        'searchType': 'image',
        'num': max_results,
        'imgSize': 'large',   # Prefer large images
        'imgType': 'photo',   # Only return photos, not illustrations
        'safe': 'active'      # Safe search
    }

    try:
        async with session.get("https://www.googleapis.com/customsearch/v1", params=params) as response:
            if response.status != 200:
                error_data = await response.text()
                logger.error(f"Google API error: {response.status} - {error_data}")
                return []

            payload = await response.json()

            # Each result item carries the image URL in its 'link' field.
            image_urls = [item['link'] for item in payload.get('items', []) if 'link' in item]

            logger.info(f"Google API returned {len(image_urls)} images for {hotel_name} in {destination}")
            return image_urls
    except Exception as e:
        logger.error(f"Error fetching hotel images from Google: {e}")
        return []
services/utils/html_utils.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup # type: ignore
2
+ from typing import List, Optional
3
+ from urllib.parse import urljoin
4
+ import re
5
+ import logging
6
+ from .image_utils import is_logo_image
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
def extract_rating_from_element(element) -> Optional[float]:
    """Pull a review score (rounded to one decimal) out of a result-card element.

    Checks the score badge first, then the wider review container; returns
    None when no score can be parsed or on any error.
    """
    try:
        badge = element.select_one(".bui-review-score__badge") or element.select_one("[data-testid='review-score']")

        if badge:
            found = re.search(r"(\d+[.,]?\d*)", badge.text.strip())
            if found:
                return round(float(found.group(1).replace(',', '.')), 1)

        # Look for review text near ratings
        container = element.select_one(".bui-review-score, .d10a6220b4")
        if container:
            found = re.search(r"(\d+[.,]\d+)", container.get_text())
            if found:
                return round(float(found.group(1).replace(',', '.')), 1)
    except Exception as e:
        logger.error(f"Error extracting rating: {e}")

    return None
34
+
35
def extract_images_from_soup(soup: BeautifulSoup, url: str, selectors: List[str], max_images: int = 5) -> List[str]:
    """Collect up to *max_images* distinct, non-logo image URLs matching *selectors*."""
    collected: List[str] = []

    for selector in selectors:
        for img in soup.select(selector):
            # Booking.com lazy-loads images under several attribute names.
            src = img.get("src") or img.get("data-src") or img.get("data-lazy-src")
            if not src or is_logo_image(src):
                continue

            # Resolve relative paths against the page URL.
            if not src.startswith("http"):
                src = urljoin(url, src)

            if src in collected:
                continue

            collected.append(src)
            if len(collected) >= max_images:
                return collected

    return collected
services/utils/http_utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import aiohttp # type: ignore
2
+ import logging
3
+ from typing import Optional
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) -> Optional[str]:
    """Fetch *url* and return its body text; None on non-200 status or any error."""
    try:
        logger.info(f"Requesting URL: {url}")
        async with session.get(url, headers=headers, timeout=15) as response:
            if response.status != 200:
                logger.error(f"Error retrieving URL {url}: Status code {response.status}")
                return None
            logger.debug(f"Successfully retrieved content from {url}")
            return await response.text()
    except Exception as e:
        # Covers timeouts, DNS failures, connection resets, etc.
        logger.error(f"Request failed for {url}: {e}")
        return None
+ return None
services/utils/image_utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
def filter_logo_images(images: List[str]) -> List[str]:
    """Return *images* with every likely-logo URL removed (order preserved)."""
    return [candidate for candidate in images if not is_logo_image(candidate)]
10
+
11
def is_logo_image(url: str) -> bool:
    """Heuristic: treat empty URLs, or URLs containing branding keywords, as logos."""
    if not url:
        return True

    # Substrings that typically mark logos/icons/UI assets rather than photos.
    logo_patterns = (
        "logo", "icon", "brand", "marker", "thumb", "tiny",
        "avatar", "badge", "symbol", "sign", "favicon",
        "design-assets", "googleusercontent", "images-flags",
    )

    lowered = url.lower()
    for pattern in logo_patterns:
        if pattern in lowered:
            return True
    return False