# NOTE: this file was captured from a web file viewer. The viewer reported
# "Spaces: Sleeping" and "File size: 12,023 Bytes". Its per-line gutter of
# what appear to be abbreviated commit hashes (d24de5d, 10aba13, 8980309,
# b968a2c, 83403d6, 4ec4487, a526fb5, 2667439) and its 1-331 line-number
# column are page residue, not part of the program.
import os
import re
import json
import base64
import logging
import time
from typing import List, Dict, Any
from fastapi import FastAPI
from pydantic import BaseModel
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# -------------------------
# Logging
# -------------------------
# LOG_LEVEL selects the root logging level (default INFO). The value is
# upper-cased because logging only accepts the canonical upper-case level
# names; a lower-case value such as "debug" would otherwise raise ValueError
# while the module is being imported. An empty value falls back to INFO.
logging.basicConfig(level=(os.getenv("LOG_LEVEL") or "INFO").strip().upper())
logger = logging.getLogger("hackrx-round5")
# -------------------------
# FastAPI app
# -------------------------
# Application instance on which the @app.get / @app.post route handlers in
# this file are registered.
app = FastAPI(title="HackRx Round 5 API", version="1.0.0")
# -------------------------
# Models
# -------------------------
class ChallengeRequest(BaseModel):
    """Request body accepted by the POST /challenge endpoint."""

    # Address of the page that is loaded and scraped.
    url: str
    # Questions to answer from the scraped page data.
    questions: List[str]
class ChallengeResponse(BaseModel):
    """Response body returned by the POST /challenge endpoint."""

    # One answer string per submitted question, in request order.
    answers: List[str]
# -------------------------
# Helpers
# -------------------------
def try_decode_jwt(token: str) -> Dict[str, Any]:
    """Decode the payload segment of a JWT without verifying its signature.

    The token is split on "."; anything that does not have exactly three
    segments is rejected. The middle segment is base64url-decoded (after the
    stripped padding is restored), read as UTF-8 and parsed as JSON.

    Args:
        token: Candidate JWT string in ``header.payload.signature`` form.

    Returns:
        The decoded payload when it is a JSON object. An empty dict is
        returned for a wrong segment count, an undecodable payload, or a
        payload that is valid JSON but not an object.
    """
    try:
        parts = token.split(".")
        if len(parts) != 3:
            return {}
        payload_segment = parts[1]
        # JWT segments omit base64 padding; restore it so the decoder accepts them.
        payload_b64 = payload_segment + "=" * (-len(payload_segment) % 4)
        payload_json = base64.urlsafe_b64decode(payload_b64).decode("utf-8")
        decoded_payload = json.loads(payload_json)
    except Exception as e:
        # Deliberately best-effort: any failure means "not a usable token".
        logger.error(f"JWT decode error: {e}")
        return {}

    if not isinstance(decoded_payload, dict):
        # A JSON list/number/string would break callers that rely on the
        # annotated Dict return type (.items(), dict.update()).
        logger.error(
            "JWT decode error: payload is not a JSON object "
            f"(got {type(decoded_payload).__name__})"
        )
        return {}

    logger.info(f"Decoded JWT payload: {decoded_payload}")
    return decoded_payload
def setup_chrome_driver():
    """Create a headless Chrome WebDriver for scraping.

    Returns:
        A ``webdriver.Chrome`` instance, or ``None`` when the driver could
        not be started (the failure is logged).
    """
    chrome_options = Options()
    chrome_arguments = (
        "--headless",  # Run in background
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36",
        # Enable logging to capture console messages
        "--enable-logging",
        "--log-level=0",
    )
    for argument in chrome_arguments:
        chrome_options.add_argument(argument)

    # The switches above only affect Chrome's own log output. The WebDriver
    # "browser" log read via driver.get_log('browser') is configured through
    # this capability; without it, console.log-level messages are typically
    # not reported.
    chrome_options.set_capability("goog:loggingPrefs", {"browser": "ALL"})

    try:
        driver = webdriver.Chrome(options=chrome_options)
        return driver
    except Exception as e:
        logger.error(f"Failed to create Chrome driver: {e}")
        return None
# -------------------------
# Interactive Scraper
# -------------------------
def scrape_with_selenium(url: str) -> Dict[str, Any]:
    """Load *url* in Chrome, try to start the challenge, and collect page data.

    The function waits for ``<body>``, clicks the first element that looks
    like a "Start" button, then gathers the page source, browser console
    logs, JWT-like tokens, completion codes, a few ``window`` globals and
    matching ``localStorage`` entries. The driver is always quit on exit.

    Args:
        url: Address of the page to open.

    Returns:
        A dict with the keys ``title``, ``visible_text`` (first 6000
        characters of the body text), ``hidden_values`` (labelled strings
        such as ``"jwt key=value"``), ``jwt_data``, ``console_logs`` and
        ``button_clicked``. An empty dict is returned when the driver cannot
        be created or an unhandled error occurs while scraping.
    """
    driver = None
    try:
        driver = setup_chrome_driver()
        if not driver:
            return {}
        logger.info(f"Loading URL: {url}")
        driver.get(url)
        # Wait for page to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        time.sleep(2)

        # Look for and click "Start Challenge" button
        start_button_selectors = [
            "button:contains('Start Challenge')",
            "button[id*='start']",
            "button[class*='start']",
            "input[value*='Start']",
            "a[href*='start']",
            ".btn:contains('Start')",
            "[onclick*='start']",
        ]
        # Entries containing "contains" stand for text matching and are
        # resolved with this XPath instead of being used as CSS selectors.
        text_xpath = (
            "//button[contains(text(), 'Start Challenge')]"
            " | //button[contains(text(), 'Start')]"
            " | //input[contains(@value, 'Start')]"
        )
        button_clicked = False
        for selector in start_button_selectors:
            try:
                if "contains" in selector:
                    # Use XPath for text-based selection
                    elements = driver.find_elements(By.XPATH, text_xpath)
                else:
                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
                if elements:
                    logger.info(f"Found start button with selector: {selector}")
                    elements[0].click()
                    button_clicked = True
                    time.sleep(3)  # Wait for challenge to start
                    break
            except Exception as e:
                # A failing selector or click is not fatal; try the next one.
                logger.debug(f"Selector {selector} failed: {e}")
                continue
        if not button_clicked:
            logger.warning("Could not find Start Challenge button, proceeding with current page")

        # Get page source after interaction
        html = driver.page_source

        # Get console logs
        console_logs = []
        try:
            logs = driver.get_log('browser')
            for log in logs:
                console_logs.append(log['message'])
                logger.info(f"Console log: {log['message']}")
        except Exception as e:
            logger.warning(f"Could not get console logs: {e}")

        # Extract data from HTML
        hidden_values: List[str] = []
        jwt_data: Dict[str, Any] = {}

        # Look for JWT tokens in HTML and console logs. Every piece is joined
        # with a space so a token at the end of the HTML cannot fuse with the
        # start of the first console message.
        all_text = " ".join([html, *console_logs])
        jwt_patterns = [
            r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+",
            r"[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}"
        ]
        # The two patterns overlap, so remember which tokens were already
        # handled to avoid decoding them twice and duplicating hidden values.
        seen_tokens = set()
        for pattern in jwt_patterns:
            for token in re.findall(pattern, all_text):
                if token in seen_tokens:
                    continue
                seen_tokens.add(token)
                logger.info(f"Found JWT token: {token[:50]}...")
                data = try_decode_jwt(token)
                # Only a non-empty mapping can be merged and iterated below.
                if not isinstance(data, dict) or not data:
                    continue
                jwt_data.update(data)
                for k, v in data.items():
                    hidden_values.append(f"jwt {k}={v}")

        # Look for completion codes in console logs
        for log in console_logs:
            # Look for completion codes
            completion_matches = re.findall(r"completion[_\s]*code[:\s]*([A-Za-z0-9\-_]{6,})", log, flags=re.I)
            for code in completion_matches:
                hidden_values.append(f"completion_code {code}")
            # Look for challenge completion messages
            lowered = log.lower()
            if "challenge" in lowered and ("complete" in lowered or "finished" in lowered):
                hidden_values.append(f"console_message {log}")

        # Execute JavaScript to check for global variables or challenge data
        try:
            js_result = driver.execute_script("""
var data = {};
if (window.challengeData) data.challengeData = window.challengeData;
if (window.challenge) data.challenge = window.challenge;
if (window.completionCode) data.completionCode = window.completionCode;
return data;
""")
            if js_result:
                for k, v in js_result.items():
                    hidden_values.append(f"js_global {k}={v}")
                    logger.info(f"Found JS global: {k} = {v}")
        except Exception as e:
            logger.debug(f"JS execution failed: {e}")

        # Look for data in local storage
        try:
            local_storage = driver.execute_script("return window.localStorage;")
            if local_storage:
                for k, v in local_storage.items():
                    if any(keyword in k.lower() for keyword in ['challenge', 'code', 'completion']):
                        hidden_values.append(f"localStorage {k}={v}")
        except Exception as e:
            logger.debug(f"LocalStorage check failed: {e}")

        logger.info(f"Found {len(hidden_values)} hidden values")
        logger.info(f"JWT data: {jwt_data}")
        return {
            "title": driver.title,
            "visible_text": driver.find_element(By.TAG_NAME, "body").text[:6000],
            "hidden_values": hidden_values,
            "jwt_data": jwt_data,
            "console_logs": console_logs,
            "button_clicked": button_clicked
        }
    except Exception as e:
        logger.error(f"Selenium scraping failed for {url}: {e}")
        return {}
    finally:
        # Always release the browser process, including on early returns.
        if driver:
            driver.quit()
# -------------------------
# Answer extractor
# -------------------------
def _hidden_entry_value(entry: str) -> str:
    """Return the value part of a labelled hidden-value entry.

    Entries look like ``"label key=value"`` or, without a key,
    ``"label value"`` (for example ``"completion_code ABC123"``). The text
    after the first "=" wins; otherwise the text after the first space is
    used. An entry with neither separator is returned stripped.
    """
    if "=" in entry:
        return entry.split("=", 1)[1].strip()
    _label, _sep, value = entry.partition(" ")
    return value.strip() or entry.strip()


def answer_question(question: str, content: Dict[str, Any]) -> str:
    """Answer *question* from scraped page data using keyword rules.

    Rules are tried in order and the first hit is returned:

    1. Challenge ID: the ``challengeID`` JWT field, then any hidden value
       whose text mentions ``challengeid``.
    2. Completion code: hidden values labelled ``completion_code``, then the
       console logs, then any hidden "token"/"code" value longer than 15
       characters.
    3. Challenge name: the ``coolGuy`` JWT field.
    4. Fallback: the first string-valued JWT field other than ``iat``/``exp``.

    Args:
        question: Natural-language question; matched case-insensitively.
        content: Scrape result; the keys ``hidden_values``, ``jwt_data`` and
            ``console_logs`` are read and may be absent.

    Returns:
        The extracted answer, or ``"Challenge information not found"`` when
        no rule matches.
    """
    ql = question.lower()
    hidden = content.get("hidden_values", [])
    jwt_data = content.get("jwt_data", {})
    console_logs = content.get("console_logs", [])
    logger.info(f"Answering question: {question}")
    logger.info(f"Available JWT data: {jwt_data}")
    logger.info(f"Hidden values count: {len(hidden)}")

    # Challenge ID extraction
    if "challenge id" in ql or "challengeid" in ql:
        # First check JWT data directly
        if "challengeID" in jwt_data:
            result = str(jwt_data["challengeID"])
            logger.info(f"Found challengeID in JWT: {result}")
            return result
        # Check hidden values
        for h in hidden:
            if "challengeid" in h.lower():
                result = _hidden_entry_value(h)
                logger.info(f"Found challengeID in hidden values: {result}")
                return result

    # Completion code extraction
    if "completion" in ql and "code" in ql:
        # Look for explicit completion codes. These entries may carry no "=",
        # so the label has to be stripped rather than splitting on "=" only.
        for h in hidden:
            if "completion_code" in h.lower():
                result = _hidden_entry_value(h)
                logger.info(f"Found completion code: {result}")
                return result
        # Look in console logs for completion codes
        for log in console_logs:
            completion_matches = re.findall(r"completion[_\s]*code[:\s]*([A-Za-z0-9\-_]{6,})", log, flags=re.I)
            if completion_matches:
                result = completion_matches[0]
                logger.info(f"Found completion code in console: {result}")
                return result
        # Look for any long tokens that might be completion codes
        for h in hidden:
            lowered = h.lower()
            if "token" in lowered or "code" in lowered:
                token = _hidden_entry_value(h)
                if len(token) > 15:  # Assuming completion codes are reasonably long
                    logger.info(f"Found potential completion code: {token}")
                    return token

    # Challenge name extraction
    if "challenge name" in ql:
        if "coolGuy" in jwt_data:
            result = str(jwt_data["coolGuy"])
            logger.info(f"Found challenge name in JWT: {result}")
            return result

    # Fallback: return any relevant data from JWT
    if jwt_data:
        for key, value in jwt_data.items():
            if key not in ["iat", "exp"] and isinstance(value, str):
                logger.info(f"Fallback: returning JWT field {key}: {value}")
                return str(value)

    logger.warning("No matching data found for question")
    return "Challenge information not found"
# -------------------------
# Routes
# -------------------------
@app.get("/")
def root():
    """Return a readiness message together with the available endpoints."""
    endpoint_map = {"challenge": "POST /challenge", "health": "GET /health"}
    return {
        "message": "HackRx Round 5 API - Ready (with Selenium support)",
        "endpoints": endpoint_map,
    }
@app.get("/health")
def health():
    """Return a fixed payload reporting the service as healthy."""
    status_payload = {"status": "healthy"}
    return status_payload
@app.post("/challenge", response_model=ChallengeResponse)
def challenge(req: ChallengeRequest):
    """Scrape the requested page and answer each question from the result.

    When scraping yields nothing, every question receives the placeholder
    answer "Challenge information not found".
    """
    logger.info(f"Round 5 request: url={req.url}, questions={req.questions}")
    scraped = scrape_with_selenium(req.url)

    # Guard clause: nothing scraped means nothing to extract answers from.
    if not scraped:
        placeholders = ["Challenge information not found"] * len(req.questions)
        return ChallengeResponse(answers=placeholders)

    answers = []
    for q in req.questions:
        ans = answer_question(q, scraped)
        answers.append(ans)
        logger.info(f"Q: {q} → A: {ans}")

    logger.info(f"Final answers: {answers}")
    return ChallengeResponse(answers=answers)