|
|
import os
import re
from typing import Callable, Iterable, Optional

from llm_client import google_search
from metrics import log_metric
from text_extractor import JobCore
|
|
|
|
|
|
|
|
def patch_missing(core: JobCore) -> JobCore:
    """Fill empty JobCore fields with values found in Google search snippets.

    The record is returned untouched when patching is switched off through
    the ``GOOGLE_PATCH_ENABLED`` environment variable (any value other than
    ``true``/``1``/``yes``, compared case-insensitively; unset means enabled)
    or when ``core.company`` is empty, because every lookup is keyed on the
    company name.

    Fields that already hold a truthy value are never overwritten. Every
    field that gets patched is recorded in ``core.source_map`` with the value
    ``"google"``, and one ``patch_missing`` metric is logged per completed
    call.

    Args:
        core: Job record to patch; it is modified in place.

    Returns:
        The same ``core`` object that was passed in.
    """
    enabled_flag = os.getenv("GOOGLE_PATCH_ENABLED", "true").strip().lower()
    if enabled_flag not in ("true", "1", "yes"):
        return core

    if not core.company:
        return core

    patches_applied = 0

    # Salary is looked up only when *both* bounds are missing, so a partially
    # known range is never replaced by a searched one.
    if not core.salary_low and not core.salary_high:
        salary_info = _patch_salary(core.company, core.role)
        if salary_info:
            core.salary_low, core.salary_high = salary_info
            core.source_map["salary"] = "google"
            patches_applied += 1

    # The remaining fields share one shape: a single attribute filled by a
    # lookup that takes only the company name.
    single_value_lookups = (
        ("funding", _patch_funding),
        ("mission", _patch_mission),
        ("location", _patch_location),
    )
    for field_name, lookup in single_value_lookups:
        if getattr(core, field_name):
            continue
        found = lookup(core.company)
        if found:
            setattr(core, field_name, found)
            core.source_map[field_name] = "google"
            patches_applied += 1

    log_metric("patch_missing", {
        "company": core.company,
        "patches_applied": patches_applied,
        "source_map": core.source_map,
    })

    return core
|
|
|
|
|
|
|
|
def _patch_salary(company: str, role: str) -> Optional[tuple[int, int]]: |
|
|
"""Search for salary information and extract range.""" |
|
|
if not company or not role: |
|
|
return None |
|
|
|
|
|
query = f'"{company}" "{role}" salary range' |
|
|
snippets = google_search(query, top=3, timeout=5) |
|
|
|
|
|
for snippet in snippets: |
|
|
|
|
|
salary_patterns = [ |
|
|
r'\$(\d+)k?[-β]\$?(\d+)k?', |
|
|
r'\$(\d+),?(\d+)[-β]\$?(\d+),?(\d+)', |
|
|
r'(\d+)k?[-β](\d+)k?\s*(?:per|/)?\s*year', |
|
|
] |
|
|
|
|
|
for pattern in salary_patterns: |
|
|
match = re.search(pattern, snippet, re.IGNORECASE) |
|
|
if match: |
|
|
try: |
|
|
if 'k' in match.group(0).lower(): |
|
|
low = int(match.group(1)) * 1000 |
|
|
high = int(match.group(2)) * 1000 |
|
|
else: |
|
|
low = int(match.group(1)) |
|
|
high = int(match.group(2)) |
|
|
|
|
|
|
|
|
if 30000 <= low <= 500000 and 30000 <= high <= 500000 and low < high: |
|
|
return (low, high) |
|
|
except (ValueError, IndexError): |
|
|
continue |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def _patch_funding(company: str) -> Optional[str]: |
|
|
"""Search for funding information.""" |
|
|
if not company: |
|
|
return None |
|
|
|
|
|
query = f'"{company}" funding round raised' |
|
|
snippets = google_search(query, top=3, timeout=5) |
|
|
|
|
|
for snippet in snippets: |
|
|
|
|
|
funding_patterns = [ |
|
|
r'raised \$(\d+(?:\.\d+)?[MB]?)', |
|
|
r'Series [A-Z] \$(\d+(?:\.\d+)?[MB]?)', |
|
|
r'\$(\d+(?:\.\d+)?[MB]?) (?:Series|round|funding)', |
|
|
r'(\$\d+(?:\.\d+)?[MB]? (?:million|billion))', |
|
|
] |
|
|
|
|
|
for pattern in funding_patterns: |
|
|
match = re.search(pattern, snippet, re.IGNORECASE) |
|
|
if match: |
|
|
return match.group(0)[:50] |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def _patch_mission(company: str) -> Optional[str]: |
|
|
"""Search for company mission/tagline.""" |
|
|
if not company: |
|
|
return None |
|
|
|
|
|
query = f'"{company}" company mission tagline about' |
|
|
snippets = google_search(query, top=3, timeout=5) |
|
|
|
|
|
for snippet in snippets: |
|
|
|
|
|
sentences = re.split(r'[.!?]+', snippet) |
|
|
for sentence in sentences: |
|
|
sentence = sentence.strip() |
|
|
|
|
|
if (len(sentence) > 20 and len(sentence) < 200 and |
|
|
any(word in sentence.lower() for word in ['build', 'create', 'develop', 'provide', 'help', 'enable', 'platform'])): |
|
|
return sentence |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def _patch_location(company: str) -> Optional[str]: |
|
|
"""Search for company headquarters location.""" |
|
|
if not company: |
|
|
return None |
|
|
|
|
|
query = f'"{company}" headquarters location' |
|
|
snippets = google_search(query, top=3, timeout=5) |
|
|
|
|
|
for snippet in snippets: |
|
|
|
|
|
location_patterns = [ |
|
|
r'([A-Z][a-z]+,\s*[A-Z]{2})', |
|
|
r'([A-Z][a-z]+\s+[A-Z][a-z]+,\s*[A-Z]{2})', |
|
|
r'([A-Z][a-z]+,\s*[A-Z][a-z]+)', |
|
|
] |
|
|
|
|
|
for pattern in location_patterns: |
|
|
match = re.search(pattern, snippet) |
|
|
if match: |
|
|
location = match.group(1).strip() |
|
|
|
|
|
if any(state in location for state in ['CA', 'NY', 'WA', 'TX', 'MA']): |
|
|
return location |
|
|
|
|
|
return None |