Jofthomas committed on
Commit
435fcc1
·
verified ·
1 Parent(s): 8962f9b

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +16 -0
  2. app.py +188 -0
  3. fastmcp.json +10 -0
  4. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base image keeps the final container small.
FROM python:3.11-slim

# Set the working directory in the container
WORKDIR /app

# Copy the dependency list first so the pip layer is cached
# across source-code changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application source.
COPY . .

# Port the FastMCP server listens on (must match app.py / fastmcp.json).
EXPOSE 7861

ENV PORT=7861

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ import time
6
+ import html
7
+ from typing import List, Optional
8
+ from urllib.parse import urlencode
9
+
10
+ import httpx
11
+ from pydantic import BaseModel, Field, HttpUrl
12
+
13
+ from fastmcp import FastMCP
14
+
15
+
16
# MCP server instance. Host/port are hard-coded here and must stay in sync
# with the Dockerfile (EXPOSE/PORT 7861) and fastmcp.json.
mcp = FastMCP(
    name="linkedin-jobs",
    host="0.0.0.0",
    port=7861,
)
21
+
22
+
23
class JobPosting(BaseModel):
    """One LinkedIn job listing parsed out of search-result HTML.

    Only ``title`` and ``url`` are guaranteed; the remaining fields are
    best-effort extractions and may be ``None`` when the markup omits them.
    """

    title: str = Field(..., description="Job title")
    company: Optional[str] = Field(None, description="Company name if available")
    location: Optional[str] = Field(None, description="Job location if available")
    url: HttpUrl = Field(..., description="Direct link to the LinkedIn job page")
    job_id: Optional[str] = Field(None, description="LinkedIn job ID parsed from URL, if found")
    listed_text: Optional[str] = Field(None, description="Human-readable posted time text, e.g., '3 days ago'")
30
+
31
+
32
+ def _default_headers(cookie: Optional[str]) -> dict:
33
+ headers = {
34
+ "User-Agent": (
35
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
36
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
37
+ "Chrome/125.0.0.0 Safari/537.36"
38
+ ),
39
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
40
+ "Accept-Language": "en-US,en;q=0.9",
41
+ "Cache-Control": "no-cache",
42
+ "Pragma": "no-cache",
43
+ "Connection": "keep-alive",
44
+ }
45
+ if cookie:
46
+ headers["Cookie"] = cookie
47
+ return headers
48
+
49
+
50
def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
    """Extract ``JobPosting`` records from a LinkedIn job-search HTML page.

    Cards lacking a URL or title are skipped, as are cards that fail
    model validation. Returns an empty list when nothing matches.
    """
    try:
        from selectolax.parser import HTMLParser
    except Exception:
        raise RuntimeError(
            "selectolax is required. Ensure it is listed in requirements.txt and installed."
        )

    def _clean(text: str) -> str:
        # Collapse whitespace runs and decode HTML entities.
        return html.unescape(re.sub(r"\s+", " ", text))

    results: list[JobPosting] = []

    # LinkedIn search uses job cards with these classes
    for card in HTMLParser(html_text).css(".base-search-card, .job-search-card"):
        anchor = card.css_first("a.base-card__full-link, a.hidden-nested-link, a")
        href = (anchor.attributes.get("href") if anchor else None) or ""

        heading = card.css_first("h3.base-search-card__title, .base-search-card__title, .sr-only")
        job_title = (heading.text(strip=True) if heading else "").strip()

        # A card without both a link and a title is unusable.
        if not href or not job_title:
            continue

        subtitle = card.css_first(
            "h4.base-search-card__subtitle, .base-search-card__subtitle, .job-search-card__subtitle, .hidden-nested-link+div"
        )
        place = card.css_first(".job-search-card__location, .base-search-card__metadata > .job-search-card__location")
        posted = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")

        employer = subtitle.text(strip=True) if subtitle else None
        where = place.text(strip=True) if place else None
        when = posted.text(strip=True) if posted else None

        # Derive the numeric job id from URLs shaped like /jobs/view/<id>/
        id_match = re.search(r"/jobs/view/(\d+)", href)

        try:
            results.append(
                JobPosting(
                    title=_clean(job_title),
                    company=_clean(employer) if employer else employer,
                    location=_clean(where) if where else where,
                    url=href,  # type: ignore[arg-type]
                    job_id=id_match.group(1) if id_match else None,
                    listed_text=_clean(when) if when else when,
                )
            )
        except Exception:
            # Skip malformed entries gracefully
            continue

    return results
110
+
111
+
112
def _search_page(client: httpx.Client, query: str, location: Optional[str], start: int) -> list[JobPosting]:
    """Fetch one page (~25 results) of LinkedIn job-search results.

    Tries the main search page first; for deeper offsets where that page
    yields nothing, falls back to the guest fragment endpoint.
    Raises ``httpx.HTTPStatusError`` when the main page returns an error.
    """
    query_params = {"keywords": query, "start": start}
    if location:
        query_params["location"] = location

    # First request the main search page (richer HTML for the first 25 results)
    search_url = "https://www.linkedin.com/jobs/search/?" + urlencode(query_params)
    response = client.get(search_url, follow_redirects=True, timeout=20.0)
    response.raise_for_status()
    parsed = _parse_jobs_from_html(response.text)

    # For subsequent starts (>0), LinkedIn often uses this fragment endpoint
    if start > 0 and not parsed:
        fragment_url = (
            "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(query_params)
        )
        fragment_response = client.get(fragment_url, follow_redirects=True, timeout=20.0)
        # Fragment failures are non-fatal: we simply return an empty page.
        if fragment_response.status_code == 200:
            parsed = _parse_jobs_from_html(fragment_response.text)

    return parsed
136
+
137
+
138
@mcp.tool(description="Search LinkedIn job listings and return structured job postings. Optionally set LINKEDIN_COOKIE env for authenticated scraping.")
def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int = 25, pages: int = 1) -> List[JobPosting]:
    """
    - query: Search keywords, e.g. "machine learning engineer"
    - location: Optional location filter, e.g. "Paris, Île-de-France, France"
    - limit: Maximum number of jobs to return (<= 200)
    - pages: Number of pages to fetch (each page is ~25 results)

    Note: LinkedIn may throttle or require authentication. You can set the environment
    variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
    """
    cookie = os.environ.get("LINKEDIN_COOKIE")

    # Clamp user-supplied bounds to sane ranges (1..200 items, 1..8 pages).
    max_items = max(1, min(limit, 200))
    pages = max(1, min(pages, 8))

    headers = _default_headers(cookie)
    all_jobs: list[JobPosting] = []

    with httpx.Client(headers=headers) as client:
        start = 0
        for page in range(pages):
            try:
                jobs = _search_page(client, query=query, location=location, start=start)
            except httpx.HTTPStatusError as e:
                # If unauthorized or blocked, break early
                status = e.response.status_code
                if status in (401, 403, 429):
                    break
                raise
            except Exception:
                # Any other (likely transient) failure: treat as an empty page,
                # which stops pagination below rather than hammering LinkedIn.
                jobs = []

            if not jobs:
                # If no jobs were parsed, stop to avoid hammering
                break

            all_jobs.extend(jobs)
            if len(all_jobs) >= max_items:
                break

            start += 25
            # Be polite to avoid rate-limiting — but don't waste 0.8 s
            # after the final page when there is no next request.
            if page < pages - 1:
                time.sleep(0.8)

    return all_jobs[:max_items]
185
+
186
+
187
if __name__ == "__main__":
    # Serve the MCP tools over HTTP using the host/port configured
    # on the FastMCP instance above.
    mcp.run(transport="http")
fastmcp.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "linkedin-jobs",
3
+ "description": "MCP server that searches LinkedIn job listings and returns structured results",
4
+ "entrypoint": "app.py",
5
+ "transport": "streamable-http",
6
+ "http": {
7
+ "host": "0.0.0.0",
8
+ "port": 7861
9
+ }
10
+ }
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastmcp>=2.12.2
2
+ httpx>=0.27.0
3
+ pydantic>=2.7.0
4
+ selectolax>=0.3.15