Jofthomas committed on
Commit
435fcc1
·
verified ·
1 Parent(s): 8962f9b

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +16 -0
  2. app.py +188 -0
  3. fastmcp.json +10 -0
  4. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image keeps the final container small.
FROM python:3.11-slim

# Set the working directory in the container
WORKDIR /app

# Copy the dependency list first so the pip layer is cached
# across source-code changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application source.
COPY . .

# Port the FastMCP server listens on (must match app.py / fastmcp.json).
EXPOSE 7861

ENV PORT=7861

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import time
6
+ import html
7
+ from typing import List, Optional
8
+ from urllib.parse import urlencode
9
+
10
+ import httpx
11
+ from pydantic import BaseModel, Field, HttpUrl
12
+
13
+ from fastmcp import FastMCP
14
+
15
+
16
# MCP server instance. Host/port are hard-coded here and must stay in sync
# with the Dockerfile (EXPOSE/PORT 7861) and fastmcp.json.
mcp = FastMCP(
    name="linkedin-jobs",
    host="0.0.0.0",
    port=7861,
)
21
+
22
+
23
class JobPosting(BaseModel):
    """One LinkedIn job listing parsed out of search-result HTML.

    Only ``title`` and ``url`` are guaranteed; the remaining fields are
    best-effort extractions and may be ``None`` when the markup omits them.
    """

    title: str = Field(..., description="Job title")
    company: Optional[str] = Field(None, description="Company name if available")
    location: Optional[str] = Field(None, description="Job location if available")
    url: HttpUrl = Field(..., description="Direct link to the LinkedIn job page")
    job_id: Optional[str] = Field(None, description="LinkedIn job ID parsed from URL, if found")
    listed_text: Optional[str] = Field(None, description="Human-readable posted time text, e.g., '3 days ago'")
30
+
31
+
32
+ def _default_headers(cookie: Optional[str]) -> dict:
33
+ headers = {
34
+ "User-Agent": (
35
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
36
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
37
+ "Chrome/125.0.0.0 Safari/537.36"
38
+ ),
39
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
40
+ "Accept-Language": "en-US,en;q=0.9",
41
+ "Cache-Control": "no-cache",
42
+ "Pragma": "no-cache",
43
+ "Connection": "keep-alive",
44
+ }
45
+ if cookie:
46
+ headers["Cookie"] = cookie
47
+ return headers
48
+
49
+
50
def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
    """Extract ``JobPosting`` records from a LinkedIn job-search HTML page.

    Cards lacking a URL or title are skipped, as are cards that fail
    model validation. Returns an empty list when nothing matches.
    """
    try:
        from selectolax.parser import HTMLParser
    except Exception:
        raise RuntimeError(
            "selectolax is required. Ensure it is listed in requirements.txt and installed."
        )

    def _clean(text: str) -> str:
        # Collapse whitespace runs and decode HTML entities.
        return html.unescape(re.sub(r"\s+", " ", text))

    results: list[JobPosting] = []

    # LinkedIn search uses job cards with these classes
    for card in HTMLParser(html_text).css(".base-search-card, .job-search-card"):
        anchor = card.css_first("a.base-card__full-link, a.hidden-nested-link, a")
        href = (anchor.attributes.get("href") if anchor else None) or ""

        heading = card.css_first("h3.base-search-card__title, .base-search-card__title, .sr-only")
        job_title = (heading.text(strip=True) if heading else "").strip()

        # A card without both a link and a title is unusable.
        if not href or not job_title:
            continue

        subtitle = card.css_first(
            "h4.base-search-card__subtitle, .base-search-card__subtitle, .job-search-card__subtitle, .hidden-nested-link+div"
        )
        place = card.css_first(".job-search-card__location, .base-search-card__metadata > .job-search-card__location")
        posted = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")

        employer = subtitle.text(strip=True) if subtitle else None
        where = place.text(strip=True) if place else None
        when = posted.text(strip=True) if posted else None

        # Derive the numeric job id from URLs shaped like /jobs/view/<id>/
        id_match = re.search(r"/jobs/view/(\d+)", href)

        try:
            results.append(
                JobPosting(
                    title=_clean(job_title),
                    company=_clean(employer) if employer else employer,
                    location=_clean(where) if where else where,
                    url=href,  # type: ignore[arg-type]
                    job_id=id_match.group(1) if id_match else None,
                    listed_text=_clean(when) if when else when,
                )
            )
        except Exception:
            # Skip malformed entries gracefully
            continue

    return results
110
+
111
+
112
def _search_page(client: httpx.Client, query: str, location: Optional[str], start: int) -> list[JobPosting]:
    """Fetch one page (~25 results) of LinkedIn job-search results.

    Tries the main search page first; for deeper offsets where that page
    yields nothing, falls back to the guest fragment endpoint.
    Raises ``httpx.HTTPStatusError`` when the main page returns an error.
    """
    query_params = {"keywords": query, "start": start}
    if location:
        query_params["location"] = location

    # First request the main search page (richer HTML for the first 25 results)
    search_url = "https://www.linkedin.com/jobs/search/?" + urlencode(query_params)
    response = client.get(search_url, follow_redirects=True, timeout=20.0)
    response.raise_for_status()
    parsed = _parse_jobs_from_html(response.text)

    # For subsequent starts (>0), LinkedIn often uses this fragment endpoint
    if start > 0 and not parsed:
        fragment_url = (
            "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(query_params)
        )
        fragment_response = client.get(fragment_url, follow_redirects=True, timeout=20.0)
        # Fragment failures are non-fatal: we simply return an empty page.
        if fragment_response.status_code == 200:
            parsed = _parse_jobs_from_html(fragment_response.text)

    return parsed
136
+
137
+
138
@mcp.tool(description="Search LinkedIn job listings and return structured job postings. Optionally set LINKEDIN_COOKIE env for authenticated scraping.")
def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int = 25, pages: int = 1) -> List[JobPosting]:
    """
    - query: Search keywords, e.g. "machine learning engineer"
    - location: Optional location filter, e.g. "Paris, Île-de-France, France"
    - limit: Maximum number of jobs to return (<= 200)
    - pages: Number of pages to fetch (each page is ~25 results)

    Note: LinkedIn may throttle or require authentication. You can set the environment
    variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
    """
    cookie = os.environ.get("LINKEDIN_COOKIE")

    # Clamp user-supplied bounds to sane ranges (1..200 items, 1..8 pages).
    max_items = max(1, min(limit, 200))
    pages = max(1, min(pages, 8))

    headers = _default_headers(cookie)
    all_jobs: list[JobPosting] = []

    with httpx.Client(headers=headers) as client:
        start = 0
        for page in range(pages):
            try:
                jobs = _search_page(client, query=query, location=location, start=start)
            except httpx.HTTPStatusError as e:
                # If unauthorized or blocked, break early
                status = e.response.status_code
                if status in (401, 403, 429):
                    break
                raise
            except Exception:
                # Any other (likely transient) failure: treat as an empty page,
                # which stops pagination below rather than hammering LinkedIn.
                jobs = []

            if not jobs:
                # If no jobs were parsed, stop to avoid hammering
                break

            all_jobs.extend(jobs)
            if len(all_jobs) >= max_items:
                break

            start += 25
            # Be polite to avoid rate-limiting — but don't waste 0.8 s
            # after the final page when there is no next request.
            if page < pages - 1:
                time.sleep(0.8)

    return all_jobs[:max_items]
185
+
186
+
187
if __name__ == "__main__":
    # Serve the MCP tools over HTTP using the host/port configured
    # on the FastMCP instance above.
    mcp.run(transport="http")
fastmcp.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "linkedin-jobs",
3
+ "description": "MCP server that searches LinkedIn job listings and returns structured results",
4
+ "entrypoint": "app.py",
5
+ "transport": "streamable-http",
6
+ "http": {
7
+ "host": "0.0.0.0",
8
+ "port": 7861
9
+ }
10
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastmcp>=2.12.2
2
+ httpx>=0.27.0
3
+ pydantic>=2.7.0
4
+ selectolax>=0.3.15