IQKiller / micro /patch_missing.py
AvikalpK's picture
🚀 IQKiller: AI-Powered Job Analysis Platform
4655858
import re
import os
from typing import Optional
from text_extractor import JobCore
from llm_client import google_search
from metrics import log_metric
def patch_missing(core: JobCore) -> JobCore:
    """Fill empty fields of ``core`` from search snippets, in place.

    Nothing is looked up when the ``GOOGLE_PATCH_ENABLED`` environment
    variable is set to anything other than ``true``/``1``/``yes``
    (case-insensitive; an unset variable counts as enabled) or when
    ``core.company`` is empty.

    Salary is patched only when both bounds are empty; funding, mission and
    location are each patched when that single field is empty. Every field
    that gets filled is tagged ``"google"`` in ``core.source_map``, and a
    ``patch_missing`` metric carrying the number of patched fields is logged
    before returning.

    Returns the same ``core`` object that was passed in.
    """
    enabled_flag = os.getenv("GOOGLE_PATCH_ENABLED", "true").lower()
    if enabled_flag not in ["true", "1", "yes"]:
        return core

    # Every lookup below is keyed on the company name.
    if not core.company:
        return core

    applied = 0

    # Salary is the only patch that fills two fields at once.
    if not (core.salary_low or core.salary_high):
        salary_range = _patch_salary(core.company, core.role)
        if salary_range:
            core.salary_low, core.salary_high = salary_range
            core.source_map["salary"] = "google"
            applied += 1

    # The remaining patches share one shape: a single attribute filled by a
    # lookup that takes only the company name.
    single_field_lookups = (
        ("funding", _patch_funding),
        ("mission", _patch_mission),
        ("location", _patch_location),
    )
    for field_name, lookup in single_field_lookups:
        if getattr(core, field_name):
            continue
        found = lookup(core.company)
        if found:
            setattr(core, field_name, found)
            core.source_map[field_name] = "google"
            applied += 1

    log_metric("patch_missing", {
        "company": core.company,
        "patches_applied": applied,
        "source_map": core.source_map
    })
    return core
# Plausibility bounds used to reject matched numbers that are not salaries.
_SALARY_MIN = 30000
_SALARY_MAX = 500000

# One salary figure: plain digits or comma-grouped thousands, followed by an
# optional "k" suffix -- e.g. "120", "120k", "150,000".
_SALARY_AMOUNT = r'(\d{1,3}(?:,\d{3})+|\d+)(k)?'

# Range separator: hyphen, en dash (U+2013), em dash (U+2014) or the word
# "to", with optional surrounding whitespace. The dashes are written as
# escapes so the pattern does not depend on the file's encoding.
_SALARY_SEPARATOR = r'\s*(?:[-\u2013\u2014]|to)\s*'

# Tried in order. Each pattern exposes four groups:
# (low digits, low "k", high digits, high "k").
_SALARY_RANGE_PATTERNS = (
    # "$120k-$180k", "$150,000 - $200,000", "$120-180k"
    re.compile(
        r'\$' + _SALARY_AMOUNT + _SALARY_SEPARATOR + r'\$?' + _SALARY_AMOUNT,
        re.IGNORECASE,
    ),
    # "120k-180k per year", "120000-180000/year"
    re.compile(
        _SALARY_AMOUNT + _SALARY_SEPARATOR + _SALARY_AMOUNT
        + r'\s*(?:per|/)?\s*year',
        re.IGNORECASE,
    ),
)


def _parse_salary_range(text: str) -> Optional[tuple[int, int]]:
    """Return the first plausible ``(low, high)`` salary range found in ``text``."""
    for pattern in _SALARY_RANGE_PATTERNS:
        # Examine every match, not just the first one: an implausible early
        # hit (e.g. "$5-$10") must not hide a real range later in the text.
        for match in pattern.finditer(text):
            low_digits, low_k, high_digits, high_k = match.groups()
            low = int(low_digits.replace(',', ''))
            high = int(high_digits.replace(',', ''))

            if low_k or high_k:
                # A single "k" is shared by both ends ("$120-180k"), but only
                # a figure below 1000 can be shorthand for thousands; a full
                # figure such as 180000 is left untouched.
                if low_k or low < 1000:
                    low *= 1000
                if high_k or high < 1000:
                    high *= 1000

            if _SALARY_MIN <= low < high <= _SALARY_MAX:
                return (low, high)
    return None


def _patch_salary(company: str, role: str) -> Optional[tuple[int, int]]:
    """Search for salary information and extract a salary range.

    Runs one search for the quoted company and role and scans the returned
    snippets in order for a range such as ``$120k-$180k``,
    ``$150,000-$200,000`` or ``120k-180k per year``.

    Args:
        company: Company name; placed in quotes in the search query.
        role: Job title; placed in quotes in the search query.

    Returns:
        ``(low, high)`` as integers with any ``k`` suffix expanded, taken from
        the first snippet that contains a range with
        ``30000 <= low < high <= 500000``; ``None`` when ``company`` or
        ``role`` is empty or no snippet contains such a range.
    """
    if not company or not role:
        return None

    query = f'"{company}" "{role}" salary range'
    for snippet in google_search(query, top=3, timeout=5):
        salary_range = _parse_salary_range(snippet)
        if salary_range:
            return salary_range
    return None
# A dollar amount with optional thousands separators, optional decimals and
# an optional magnitude: either a spelled-out "million"/"billion" or a bare
# M/B suffix -- e.g. "$50M", "$1.2 billion", "$1,500,000".
_FUNDING_AMOUNT = (
    r'\$\d+(?:,\d{3})*(?:\.\d+)?(?:\s*(?:million|billion)|[MB])?'
)

# Tried in order; the whole match (context words included) is what gets
# returned to the caller.
_FUNDING_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'raised ' + _FUNDING_AMOUNT,
        r'Series [A-Z] ' + _FUNDING_AMOUNT,
        _FUNDING_AMOUNT + r' (?:Series|round|funding)',
        # Bare amount with no context word: require the magnitude word.
        r'\$\d+(?:,\d{3})*(?:\.\d+)?[MB]?\s*(?:million|billion)',
    )
)

# Upper bound on the length of the returned text.
_FUNDING_MAX_LENGTH = 50


def _patch_funding(company: str) -> Optional[str]:
    """Search for funding information.

    Runs one search for the quoted company name and scans the returned
    snippets in order for a funding phrase such as ``raised $50 million``,
    ``Series B $30M`` or ``$12.5M round``. The magnitude word and any
    thousands separators are part of the match, so ``raised $50 million`` is
    returned whole rather than cut down to ``raised $50``.

    Args:
        company: Company name; placed in quotes in the search query.

    Returns:
        The matched phrase, truncated to 50 characters, from the first snippet
        and first pattern that match; ``None`` when ``company`` is empty or
        nothing matches.
    """
    if not company:
        return None

    query = f'"{company}" funding round raised'
    for snippet in google_search(query, top=3, timeout=5):
        for pattern in _FUNDING_PATTERNS:
            match = pattern.search(snippet)
            if match:
                return match.group(0)[:_FUNDING_MAX_LENGTH]
    return None
def _patch_mission(company: str) -> Optional[str]:
    """Search for a sentence describing what the company does.

    Runs one search for the quoted company name, splits each returned snippet
    on runs of ``.``, ``!`` and ``?``, and returns the first stripped fragment
    that is longer than 20 and shorter than 200 characters and contains one of
    a fixed set of keywords (matched as case-insensitive substrings).

    Args:
        company: Company name; placed in quotes in the search query.

    Returns:
        The first qualifying fragment, or ``None`` when ``company`` is empty
        or no fragment qualifies.
    """
    if not company:
        return None

    keywords = ('build', 'create', 'develop', 'provide', 'help', 'enable', 'platform')
    results = google_search(
        f'"{company}" company mission tagline about', top=3, timeout=5
    )

    for text in results:
        for fragment in re.split(r'[.!?]+', text):
            candidate = fragment.strip()
            # Skip fragments too short or too long to be a tagline.
            if not 20 < len(candidate) < 200:
                continue
            lowered = candidate.lower()
            if any(keyword in lowered for keyword in keywords):
                return candidate
    return None
# State codes accepted by the sanity check; matches with any other two-letter
# code are ignored.
_LOCATION_STATE_CODES = frozenset({'CA', 'NY', 'WA', 'TX', 'MA'})

# "City, ST" with an optional leading capitalised word so two-word cities are
# captured whole ("San Francisco, CA" rather than "Francisco, CA"). The state
# code is captured on its own, and the trailing \b stops it from matching the
# first two letters of a longer word ("Toronto, CANADA").
# Trade-off: a capitalised word directly before a one-word city is taken as
# part of the city name ("The Seattle, WA").
_CITY_STATE_PATTERN = re.compile(
    r'(?:[A-Z][a-z]+\s+)?[A-Z][a-z]+,\s*([A-Z]{2})\b'
)


def _patch_location(company: str) -> Optional[str]:
    """Search for the company's headquarters location.

    Runs one search for the quoted company name and scans the returned
    snippets in order for ``City, ST`` text whose state code is one of
    CA, NY, WA, TX or MA. Every candidate in a snippet is examined, so a
    leading match with another code does not hide a later acceptable one.

    A "City, Country" form is not handled: a capitalised country name
    contains no two-letter uppercase code, so it can never pass the
    state-code check.

    Args:
        company: Company name; placed in quotes in the search query.

    Returns:
        The matched ``City, ST`` text exactly as it appears in the snippet,
        or ``None`` when ``company`` is empty or no snippet contains an
        accepted location.
    """
    if not company:
        return None

    query = f'"{company}" headquarters location'
    for snippet in google_search(query, top=3, timeout=5):
        for match in _CITY_STATE_PATTERN.finditer(snippet):
            if match.group(1) in _LOCATION_STATE_CODES:
                return match.group(0)
    return None