| """ |
| Feature Extraction Layer |
| |
| Uses LLM to extract structured features from raw text inputs. |
| Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER. |
| """ |
|
|
import json
import os
import re
from typing import Optional

from .input_processor import ProcessedInput
from .prompts.feature_extraction import (
    ROLE_FEATURE_EXTRACTION_PROMPT,
    CANDIDATE_FEATURE_EXTRACTION_PROMPT,
    MATCH_ANALYSIS_PROMPT,
)
|
|
|
|
class LLMClient:
    """Thin wrapper over a chat-completion backend.

    All provider-specific SDK calls sit behind a single `complete()`
    entry point so the backend can be swapped without touching callers.
    """

    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        self.provider = provider
        self.model = model if model else self._default_model()

    def _default_model(self) -> str:
        # Per-provider default model; unknown providers fall back to the
        # anthropic default (complete() will still reject them).
        return {
            "anthropic": "claude-sonnet-4-20250514",
            "openai": "gpt-4o",
            "google": "gemini-2.5-flash-lite",
        }.get(self.provider, "claude-sonnet-4-20250514")

    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Send `prompt` to the configured provider and return the reply text.

        Raises:
            ValueError: if `self.provider` is not a supported provider name.
        """
        handlers = {
            "anthropic": self._call_anthropic,
            "openai": self._call_openai,
            "google": self._call_google,
        }
        handler = handlers.get(self.provider)
        if handler is None:
            raise ValueError(f"Unsupported provider: {self.provider}")
        return handler(prompt, temperature)

    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        # SDK import deferred so the package is only required when used.
        import anthropic

        reply = anthropic.Anthropic().messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return reply.content[0].text

    def _call_openai(self, prompt: str, temperature: float) -> str:
        from openai import OpenAI

        completion = OpenAI().chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return completion.choices[0].message.content

    def _call_google(self, prompt: str, temperature: float) -> str:
        import google.generativeai as genai

        key = os.environ.get("GEMINI_API_KEY")
        if not key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")

        genai.configure(api_key=key)
        gemini = genai.GenerativeModel(self.model)
        try:
            # `.text` access stays inside the try: it can raise too
            # (e.g. on blocked responses) and should also be wrapped.
            return gemini.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            ).text
        except Exception as e:
            raise RuntimeError(f"Gemini API call failed (model={self.model}): {e}")
|
|
|
|
| def _extract_json(text: str) -> dict: |
| """Extract JSON from LLM response, handling markdown code fences.""" |
| import re |
|
|
| text = text.strip() |
|
|
| |
| try: |
| return json.loads(text) |
| except json.JSONDecodeError: |
| pass |
|
|
| |
| if "```" in text: |
| |
| match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL) |
| if match: |
| try: |
| return json.loads(match.group(1).strip()) |
| except json.JSONDecodeError: |
| pass |
|
|
| |
| lines = text.split("\n") |
| if lines[0].strip().startswith("```"): |
| lines = lines[1:] |
| if lines and lines[-1].strip() == "```": |
| lines = lines[:-1] |
| try: |
| return json.loads("\n".join(lines)) |
| except json.JSONDecodeError: |
| pass |
|
|
| |
| match = re.search(r"\{.*\}", text, re.DOTALL) |
| if match: |
| try: |
| return json.loads(match.group(0)) |
| except json.JSONDecodeError: |
| pass |
|
|
| raise json.JSONDecodeError( |
| f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}", |
| text, 0 |
| ) |
|
|
|
|
class FeatureExtractor:
    """Turns processed inputs into structured feature dicts via the LLM.

    Each public method formats a task-specific prompt, sends it through
    the configured `LLMClient`, and parses the JSON out of the reply.
    """

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Fall back to a default client so callers can omit configuration.
        self.llm = llm_client if llm_client else LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from a job description."""
        ctx = processed.company_context
        prompt = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from a resume."""
        prompt = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(prompt))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Analyze how well the candidate features fit the role features."""
        ctx = processed.company_context
        prompt = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))
|