Spaces:
Sleeping
Sleeping
change
Browse files
app.py
CHANGED
|
@@ -59,6 +59,14 @@ def _default_headers(cookie: Optional[str]) -> dict:
|
|
| 59 |
"Pragma": "no-cache",
|
| 60 |
"Connection": "keep-alive",
|
| 61 |
"Referer": "https://www.linkedin.com/jobs/",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
}
|
| 63 |
if cookie:
|
| 64 |
headers["Cookie"] = cookie
|
|
@@ -280,8 +288,12 @@ def _build_search_params(
|
|
| 280 |
params["geoId"] = str(geo_id)
|
| 281 |
|
| 282 |
# Sort: relevance (R) or date (DD)
|
| 283 |
-
if sort_by
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
# Time posted
|
| 287 |
if date_posted:
|
|
@@ -318,18 +330,24 @@ def _search_page(
|
|
| 318 |
base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
|
| 319 |
logger.debug("GET main page: %s", base_url)
|
| 320 |
resp = client.get(base_url, follow_redirects=True, timeout=20.0)
|
| 321 |
-
resp.raise_for_status()
|
| 322 |
logger.debug(
|
| 323 |
"Main page status=%d bytes=%d content-type=%s",
|
| 324 |
resp.status_code,
|
| 325 |
len(resp.content),
|
| 326 |
resp.headers.get("content-type"),
|
| 327 |
)
|
| 328 |
-
|
| 329 |
-
if
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
# If nothing parsed, try the fragment endpoint as a fallback regardless of page
|
| 335 |
if len(jobs) == 0:
|
|
@@ -356,7 +374,7 @@ def _search_page(
|
|
| 356 |
if len(jobs) == 0:
|
| 357 |
logger.info(
|
| 358 |
"Zero jobs after main+fragment. Body sample: %s",
|
| 359 |
-
_summarize_body(resp.text
|
| 360 |
)
|
| 361 |
|
| 362 |
return jobs
|
|
|
|
| 59 |
"Pragma": "no-cache",
|
| 60 |
"Connection": "keep-alive",
|
| 61 |
"Referer": "https://www.linkedin.com/jobs/",
|
| 62 |
+
"Accept-Encoding": "gzip, deflate, br, zstd",
|
| 63 |
+
"Upgrade-Insecure-Requests": "1",
|
| 64 |
+
"sec-ch-ua": '"Chromium";v="125", "Not.A/Brand";v="24", "Google Chrome";v="125"',
|
| 65 |
+
"sec-ch-ua-mobile": "?0",
|
| 66 |
+
"sec-ch-ua-platform": '"macOS"',
|
| 67 |
+
"Sec-Fetch-Site": "same-origin",
|
| 68 |
+
"Sec-Fetch-Mode": "navigate",
|
| 69 |
+
"Sec-Fetch-Dest": "document",
|
| 70 |
}
|
| 71 |
if cookie:
|
| 72 |
headers["Cookie"] = cookie
|
|
|
|
| 288 |
params["geoId"] = str(geo_id)
|
| 289 |
|
| 290 |
# Sort: relevance (R) or date (DD)
|
| 291 |
+
if sort_by:
|
| 292 |
+
sb = sort_by.lower()
|
| 293 |
+
if sb in {"relevance", "r"}:
|
| 294 |
+
params["sortBy"] = "R"
|
| 295 |
+
elif sb in {"date", "recent", "dd"}:
|
| 296 |
+
params["sortBy"] = "DD"
|
| 297 |
|
| 298 |
# Time posted
|
| 299 |
if date_posted:
|
|
|
|
| 330 |
base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
|
| 331 |
logger.debug("GET main page: %s", base_url)
|
| 332 |
resp = client.get(base_url, follow_redirects=True, timeout=20.0)
|
|
|
|
| 333 |
logger.debug(
|
| 334 |
"Main page status=%d bytes=%d content-type=%s",
|
| 335 |
resp.status_code,
|
| 336 |
len(resp.content),
|
| 337 |
resp.headers.get("content-type"),
|
| 338 |
)
|
| 339 |
+
jobs: list[JobPosting] = []
|
| 340 |
+
if resp.status_code == 200:
|
| 341 |
+
block_hint = _detect_block_or_wall(resp.text)
|
| 342 |
+
if block_hint:
|
| 343 |
+
logger.warning("Main page may be blocked/walled (hint=%r)", block_hint)
|
| 344 |
+
jobs = _parse_jobs_from_html(resp.text)
|
| 345 |
+
logger.debug("Parsed %d jobs from main page", len(jobs))
|
| 346 |
+
elif resp.status_code in (999, 401, 403, 429):
|
| 347 |
+
logger.warning("Main page blocked with status=%d; will try fragment", resp.status_code)
|
| 348 |
+
else:
|
| 349 |
+
# For other errors, raise to caller
|
| 350 |
+
resp.raise_for_status()
|
| 351 |
|
| 352 |
# If nothing parsed, try the fragment endpoint as a fallback regardless of page
|
| 353 |
if len(jobs) == 0:
|
|
|
|
| 374 |
if len(jobs) == 0:
|
| 375 |
logger.info(
|
| 376 |
"Zero jobs after main+fragment. Body sample: %s",
|
| 377 |
+
_summarize_body(resp.text if resp is not None and resp.text else (frag_resp.text if frag_resp is not None else "")),
|
| 378 |
)
|
| 379 |
|
| 380 |
return jobs
|