ejqs
Fix requirement extraction logic and update sample payload for Accounting Specialist role
65e65ac
import os
from typing import Dict, List, Any

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor
from spacy.matcher import PhraseMatcher
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, LongformerTokenizer
from transformers import LongformerModel
# Number of job-section labels. Starts unset; EndpointHandler.__init__ assigns
# the real value once its label table has been built.
Job_num_labels = None
class EndpointHandler:
    """Inference handler that pulls required qualifications and skills from a job post.

    The post is split into non-blank lines, every line is classified into a
    job-section label by a Longformer sentence classifier, the lines labelled
    "RQC" (content of the required-qualifications section) are kept, and a
    skillNer extractor is run over them.
    """

    def __init__(self, path=""):
        """Load the tokenizer, the classifier weights and the skill extractor.

        Args:
            path: Directory containing ``JobSegmentClassifier3rdEpoch_v2.pth``.
                An empty string resolves the file relative to the current
                working directory.
        """
        global Job_num_labels

        # The position of an entry in this list is the class index used to
        # look up a prediction (see ``extract_job_requirements``).
        self.Job_labels = [
            {"value": "JT", "label": "Job Title"},
            {"value": "JS", "label": "Job Summary"},
            {"value": "COT", "label": "Title of Company Overview Section"},
            {"value": "COC", "label": "Content of Company Overview Section"},
            {"value": "RT", "label": "Title of Responsibilites Section"},
            {"value": "RC", "label": "Content of Responsibilites Section"},
            {"value": "RQT", "label": "Title of Required Qualifications Section"},
            {"value": "RQC", "label": "Content of Required Qualifications Section"},
            {"value": "PQT", "label": "Title of Preferred Qualifications Section"},
            {"value": "PQC", "label": "Content of Preferred Qualifications Section"},
            {"value": "ET", "label": "Employment Type"},
            {"value": "SBC", "label": "Content of Salary and Benefits Section"},
            {"value": "SBT", "label": "Title of Salary and Benefits Section"},
        ]
        # Label code -> class index, derived from the list above so the two
        # tables cannot drift apart.
        self.Job_label_map = {
            entry["value"]: index for index, entry in enumerate(self.Job_labels)
        }

        # Keep publishing the label count through the module-level global.
        self.Job_num_labels = len(self.Job_label_map)
        Job_num_labels = self.Job_num_labels

        self.Job_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.Job_model = LongformerSentenceClassifier(num_labels=self.Job_num_labels)
        self.Job_model.to(self.device)

        # os.path.join keeps the default path="" relative to the working
        # directory; plain concatenation produced "/JobSegment...pth", i.e. a
        # file at the filesystem root.
        weights_path = os.path.join(path, "JobSegmentClassifier3rdEpoch_v2.pth")
        # NOTE: torch.load unpickles the checkpoint; only load files from a
        # trusted source.
        self.Job_model.load_state_dict(torch.load(weights_path, map_location=self.device))
        self.Job_model.eval()

        nlp = spacy.load("en_core_web_lg")
        self.skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

    @staticmethod
    def _non_empty_lines(text):
        """Return the lines of ``text`` that are not blank, left unstripped."""
        return [line for line in text.splitlines() if line.strip()]

    @staticmethod
    def _unique_skills(results, skill_db):
        """Build the de-duplicated skill list from skillNer annotation results.

        Args:
            results: Mapping that may hold ``"full_matches"`` and
                ``"ngram_scored"`` lists of match dicts, each with a
                ``"skill_id"`` (n-gram matches also carry a ``"score"``).
            skill_db: Mapping from skill id to a dict with a ``"skill_name"``.

        Returns:
            A list of ``{"name", "skill_id"}`` dicts: full matches first, then
            n-gram matches scoring at least 1, keeping the first occurrence of
            each skill name.
        """
        candidates = list(results.get("full_matches", []))
        candidates.extend(
            match for match in results.get("ngram_scored", []) if match["score"] >= 1
        )

        skills = []
        seen_names = set()
        for match in candidates:
            skill_id = match["skill_id"]
            # Standardize on the database's canonical skill name.
            name = skill_db.get(skill_id, {}).get("skill_name", "Unknown Skill")
            if name not in seen_names:
                seen_names.add(name)
                skills.append({"name": name, "skill_id": skill_id})
        return skills

    def predict_job_sections(self, model, text, tokenizer, device, max_length=4096):
        """Predict a label index for every CLS token found in the encoded text.

        Args:
            model: Callable taking ``input_ids``, ``attention_mask``,
                ``global_attention_mask`` and ``cls_positions`` keyword
                arguments and returning logits; it is switched to eval mode.
            text: A single string in which each sentence is preceded by the
                tokenizer's CLS token.
            tokenizer: Tokenizer exposing ``cls_token_id`` and returning
                ``"input_ids"`` / ``"attention_mask"`` tensors.
            device: Device the input tensors are moved to.
            max_length: Length the input is truncated and padded to.

        Returns:
            A NumPy array of predicted label indices, one per CLS token that
            is present after truncation.
        """
        model.eval()

        encoding = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        # NOTE(review): if the tokenizer also prepends its own CLS token, the
        # first prediction belongs to that token rather than to the first
        # sentence — confirm against how the model was trained.
        is_cls = input_ids == tokenizer.cls_token_id
        # Column indices of the CLS tokens, shaped (1, num_cls); the input is
        # a single text, so the batch dimension is 1.
        cls_positions = is_cls.nonzero(as_tuple=True)[1].unsqueeze(0)
        # Global attention on exactly the CLS tokens.
        global_attention_mask = is_cls.to(input_ids.dtype)

        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                global_attention_mask=global_attention_mask,
                cls_positions=cls_positions,
            )

        # argmax over raw logits selects the same class as argmax over their
        # softmax, so no explicit probability conversion is needed.
        predictions = torch.argmax(logits.squeeze(0), dim=-1)
        return predictions.cpu().numpy()

    def extract_job_sections(self, text):
        """Classify every non-blank line of a job post.

        Args:
            text: The raw job post.

        Returns:
            A ``(predictions, lines)`` tuple: the array of predicted label
            indices and the list of non-blank lines that were classified.
        """
        lines = self._non_empty_lines(text)
        cls_token = self.Job_tokenizer.cls_token
        concatenated_text = " ".join(f"{cls_token} {line}" for line in lines)
        predictions = self.predict_job_sections(
            self.Job_model, concatenated_text, self.Job_tokenizer, self.device
        )
        return predictions, lines

    def extract_job_requirements(self, text):
        """Return the lines predicted to be required-qualification content.

        Args:
            text: The raw job post.

        Returns:
            The non-blank lines whose predicted label is "RQC", in order.
        """
        predictions, lines = self.extract_job_sections(text)
        required_index = self.Job_label_map["RQC"]
        # zip stops at the shorter sequence, so surplus predictions (or lines
        # lost to truncation) are ignored instead of raising IndexError.
        return [
            line
            for prediction, line in zip(predictions, lines)
            if prediction == required_index
        ]

    def label_job_post(self, text):
        """Extract the requirements of a job post and the skills they mention.

        Args:
            text: The raw job post.

        Returns:
            A dict with ``"requirements"`` (list of lines) and ``"skills"``
            (list of ``{"name", "skill_id"}`` dicts without repeated names).
        """
        requirements = list(self.extract_job_requirements(text))
        response = {"requirements": requirements, "skills": []}
        if not requirements:
            # Nothing to annotate; skip the skill extractor entirely.
            return response

        annotations = self.skill_extractor.annotate(" ".join(requirements))
        response["skills"] = self._unique_skills(annotations.get("results", {}), SKILL_DB)
        return response

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one inference request.

        Args:
            data: Request payload; ``data["inputs"]`` is the job post text.

        Returns:
            The dict produced by ``label_job_post``.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` entry.
        """
        return self.label_job_post(data["inputs"])
class LongformerSentenceClassifier(nn.Module):
    """Longformer encoder with a linear classification head applied at each CLS position."""

    def __init__(self, model_name="allenai/longformer-base-4096", num_labels=None):
        """Build the encoder and the per-sentence classification head.

        Args:
            model_name (str): Hugging Face Longformer checkpoint to load.
            num_labels (int): Number of possible sentence labels. When omitted,
                the module-level ``Job_num_labels`` is read at call time.

        Raises:
            ValueError: If no label count is given and ``Job_num_labels`` is unset.
        """
        if num_labels is None:
            # A default written in the signature is frozen when the class is
            # defined, while the module global is still None; look it up now.
            num_labels = globals().get("Job_num_labels")
        if num_labels is None:
            # Fail before the expensive checkpoint load, with a clear message.
            raise ValueError(
                "num_labels must be provided (or Job_num_labels must be set) "
                "before constructing LongformerSentenceClassifier"
            )

        super().__init__()
        self.longformer = LongformerModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.longformer.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, global_attention_mask, cls_positions):
        """Compute label logits for every CLS token.

        Args:
            input_ids (Tensor): Token IDs, shape (batch_size, max_length).
            attention_mask (Tensor): Attention mask, shape (batch_size, max_length).
            global_attention_mask (Tensor): Global attention mask, shape
                (batch_size, max_length).
            cls_positions (Tensor): Integer indices of the CLS tokens; reshaped
                to (batch_size, num_cls) before use.

        Returns:
            Tensor: Logits of shape (batch_size, num_cls, num_labels).
        """
        outputs = self.longformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
        )
        hidden_states = outputs.last_hidden_state

        batch_size = input_ids.shape[0]
        hidden_size = hidden_states.size(-1)
        cls_positions = cls_positions.view(batch_size, -1)
        # Repeat each position across the hidden dimension so gather picks the
        # full hidden vector at every CLS token.
        gather_index = cls_positions.unsqueeze(-1).expand(-1, -1, hidden_size)
        cls_embeddings = hidden_states.gather(1, gather_index)
        return self.classifier(cls_embeddings)
if __name__ == "__main__":
    # Local smoke test: load the handler from the current directory, send one
    # sample job post through it and print the extracted requirements/skills.
    handler = EndpointHandler(path=".")

    payload = {"inputs": """
We are seeking an experienced Accounting Specialist to join our team.
The Accounting Specialist will be responsible for various financial tasks, including reconciling accounts, assist with accounts payable,
preparing financial reports, and assisting the Controller.
The ideal candidate will have a strong background in accounting principles and practices, as well as proficiency in Quickbooks accounting
software, Excel and financial concepts.
Responsibilities:
- Perform general ledger reconciliation to ensure accuracy of financial data
- Prepare and analyze financial reports, bank reconciliations and analysis.
- Collaborate with internal teams to ensure compliance with accounting policies and procedures
- Support financial audits by providing necessary documentation and information
- Accounts Payable - Multiple Companies
- GL Reconciliations
- Prepare Weekly, Monthly and Quarterly Commission Reports
- Daily Bank Deposits
Skills:
- Proficiency in accounting software QuickBooks Online
- Strong knowledge of corporate finance principles and practices
- Experience with general ledger reconciliation
- Ability to understand concise financial reports
- Strong analytical skills for financial analysis
- Knowledge of financial auditing processes
- Understanding of cash flow analysis
- Solid grasp of financial concepts such as revenue recognition, depreciation, and accruals
- Ability to manage multiple priorities and time effectively.
Pay:
$50,000 - $60,000 per year
Benefits:
401(k) matching
Dental insurance
Health insurance
Paid time off
Vision insurance
Experience:
Microsoft Excel: 3 years (Required)
QuickBooks Online: 3 years (Required)
"""}

    prediction = handler(payload)
    print(prediction)