garvitcpp committed on
Commit
ebb0ab7
·
verified ·
1 Parent(s): c297293

Update services/utils/http_utils.py

Browse files
Files changed (1) hide show
  1. services/utils/http_utils.py +6 -12
services/utils/http_utils.py CHANGED
@@ -84,16 +84,6 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
84
  proxy_url = get_proxy_url()
85
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
86
 
87
- # Special adjustments for "hosteller" searches
88
- if "hosteller" in url.lower():
89
- enhanced_headers["Referer"] = "https://www.google.com/search?q=hosteller+hotels+india"
90
- # Remove any location qualifiers for better results
91
- if "old+manali" in url.lower():
92
- url = url.replace("old+manali", "manali")
93
- elif "narkanda" in url.lower() and "hosteller" in url.lower():
94
- # Try a more general search for Narkanda hostels
95
- url = url.replace("the+hosteller+narkanda", "hostels+in+narkanda")
96
-
97
  try:
98
  # Use a longer timeout for proxies
99
  async with session.get(
@@ -109,16 +99,20 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
109
 
110
  # Check if we got actual content, not a bot detection page
111
  if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
112
- # If this is a search page, additional validation
113
  if "searchresults" in url or "search" in url:
114
- if "property-card" in content or "sr_property_block" in content:
115
  logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
116
  return content
117
  else:
118
  logger.warning("No property cards found in search results")
 
119
  else:
120
  logger.info(f"Successfully retrieved content ({len(content)} bytes)")
121
  return content
 
 
 
122
  else:
123
  logger.warning(f"Response status {response.status} from proxy {proxy_url}")
124
  mark_proxy_failure(proxy_url)
 
84
  proxy_url = get_proxy_url()
85
  logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
86
 
 
 
 
 
 
 
 
 
 
 
87
  try:
88
  # Use a longer timeout for proxies
89
  async with session.get(
 
99
 
100
  # Check if we got actual content, not a bot detection page
101
  if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
102
+ # Check for property content on search pages
103
  if "searchresults" in url or "search" in url:
104
+ if "property-card" in content or "sr_property_block" in content or "sr_item" in content:
105
  logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
106
  return content
107
  else:
108
  logger.warning("No property cards found in search results")
109
+ mark_proxy_failure(proxy_url)
110
  else:
111
  logger.info(f"Successfully retrieved content ({len(content)} bytes)")
112
  return content
113
+ else:
114
+ logger.warning(f"Response too short or not HTML: {len(content)} bytes")
115
+ mark_proxy_failure(proxy_url)
116
  else:
117
  logger.warning(f"Response status {response.status} from proxy {proxy_url}")
118
  mark_proxy_failure(proxy_url)