Update services/utils/http_utils.py
Browse files
- services/utils/http_utils.py +6 -12
services/utils/http_utils.py
CHANGED
|
@@ -84,16 +84,6 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
|
|
| 84 |
proxy_url = get_proxy_url()
|
| 85 |
logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
|
| 86 |
|
| 87 |
-
# Special adjustments for "hosteller" searches
|
| 88 |
-
if "hosteller" in url.lower():
|
| 89 |
-
enhanced_headers["Referer"] = "https://www.google.com/search?q=hosteller+hotels+india"
|
| 90 |
-
# Remove any location qualifiers for better results
|
| 91 |
-
if "old+manali" in url.lower():
|
| 92 |
-
url = url.replace("old+manali", "manali")
|
| 93 |
-
elif "narkanda" in url.lower() and "hosteller" in url.lower():
|
| 94 |
-
# Try a more general search for Narkanda hostels
|
| 95 |
-
url = url.replace("the+hosteller+narkanda", "hostels+in+narkanda")
|
| 96 |
-
|
| 97 |
try:
|
| 98 |
# Use a longer timeout for proxies
|
| 99 |
async with session.get(
|
|
@@ -109,16 +99,20 @@ async def fetch_page(session: aiohttp.ClientSession, url: str, headers: dict) ->
|
|
| 109 |
|
| 110 |
# Check if we got actual content, not a bot detection page
|
| 111 |
if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
|
| 112 |
-
#
|
| 113 |
if "searchresults" in url or "search" in url:
|
| 114 |
-
if "property-card" in content or "sr_property_block" in content:
|
| 115 |
logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
|
| 116 |
return content
|
| 117 |
else:
|
| 118 |
logger.warning("No property cards found in search results")
|
|
|
|
| 119 |
else:
|
| 120 |
logger.info(f"Successfully retrieved content ({len(content)} bytes)")
|
| 121 |
return content
|
|
|
|
|
|
|
|
|
|
| 122 |
else:
|
| 123 |
logger.warning(f"Response status {response.status} from proxy {proxy_url}")
|
| 124 |
mark_proxy_failure(proxy_url)
|
|
|
|
| 84 |
proxy_url = get_proxy_url()
|
| 85 |
logger.info(f"Using proxy {proxy_url} (attempt {attempt+1})")
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
try:
|
| 88 |
# Use a longer timeout for proxies
|
| 89 |
async with session.get(
|
|
|
|
| 99 |
|
| 100 |
# Check if we got actual content, not a bot detection page
|
| 101 |
if len(content) > 5000 and ("<html" in content or "<!DOCTYPE" in content):
|
| 102 |
+
# Check for property content on search pages
|
| 103 |
if "searchresults" in url or "search" in url:
|
| 104 |
+
if "property-card" in content or "sr_property_block" in content or "sr_item" in content:
|
| 105 |
logger.info(f"Successfully retrieved search results ({len(content)} bytes)")
|
| 106 |
return content
|
| 107 |
else:
|
| 108 |
logger.warning("No property cards found in search results")
|
| 109 |
+
mark_proxy_failure(proxy_url)
|
| 110 |
else:
|
| 111 |
logger.info(f"Successfully retrieved content ({len(content)} bytes)")
|
| 112 |
return content
|
| 113 |
+
else:
|
| 114 |
+
logger.warning(f"Response too short or not HTML: {len(content)} bytes")
|
| 115 |
+
mark_proxy_failure(proxy_url)
|
| 116 |
else:
|
| 117 |
logger.warning(f"Response status {response.status} from proxy {proxy_url}")
|
| 118 |
mark_proxy_failure(proxy_url)
|