import re
import string
import numpy as np
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer
from semantic_similarity.semantic_similarity import model as embedding_model
from sentence_transformers import util

# Initialize FastAPI
app = FastAPI()

# Load models and tokenizers. aggregation_strategy="first" makes the
# pipeline group word pieces into whole words, labelling each word by
# its first sub-token.
knowledge_model_name = "jjzha/jobbert_knowledge_extraction"
knowledge_tokenizer = AutoTokenizer.from_pretrained(knowledge_model_name)
knowledge_nlp = pipeline(
    model=knowledge_model_name,
    tokenizer=knowledge_tokenizer,
    aggregation_strategy="first",
)

skill_model_name = "jjzha/jobbert_skill_extraction"
skill_tokenizer = AutoTokenizer.from_pretrained(skill_model_name)
skill_nlp = pipeline(
    model=skill_model_name,
    tokenizer=skill_tokenizer,
    aggregation_strategy="first",
)


class TextInput(BaseModel):
    jobDescription: str


def convert_from_numpy(predictions):
    """Cast numpy scalars in pipeline output to plain floats so the
    predictions can be JSON-serialized by FastAPI."""
    for pred in predictions:
        for key, value in pred.items():
            if isinstance(value, (np.floating, np.integer)):
                pred[key] = float(value)
    return predictions


def merge_BI_and_get_results(predictions):
    """Merge B-/I-tagged tokens into full entity spans, averaging the
    per-token scores into a single confidence per span."""
    results, curSkill, curScore, curNoWords = [], "", 0.0, 0
    for pred in predictions:
        if pred["entity_group"] == "B":
            # A new span starts; flush the span being built, if any.
            if curSkill:
                results.append(
                    {"name": curSkill.strip(), "confidence": curScore / curNoWords}
                )
            curSkill, curScore, curNoWords = pred["word"], pred["score"], 1
        else:
            # An I-tagged token continues the current span.
            curSkill += " " + pred["word"]
            curScore += pred["score"]
            curNoWords += 1
    if curSkill:
        results.append({"name": curSkill.strip(), "confidence": curScore / curNoWords})
    return results
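
# Worked example (hypothetical pipeline output): the tagged tokens
#   [{"entity_group": "B", "word": "machine", "score": 0.98},
#    {"entity_group": "I", "word": "learning", "score": 0.96}]
# merge into [{"name": "machine learning", "confidence": 0.97}].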


def chunk_text(text, tokenizer, max_length=500, overlap=100):
    """
    Uses the tokenizer's built-in overflow mechanism to split `text` into
    chunks of at most `max_length` tokens, each overlapping the previous
    by `overlap` tokens.
    """
    enc = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        stride=overlap,
        return_overflowing_tokens=True,
        return_special_tokens_mask=False,
    )
    chunks = []
    for ids in enc["input_ids"]:
        # Decode each chunk back to a string (decode/re-encode is not a
        # perfect round trip, but close enough to re-run the pipeline).
        chunks.append(tokenizer.decode(ids, skip_special_tokens=True))
    return chunks
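
# For instance (illustrative numbers), with max_length=500 and overlap=100,
# a ~900-token description is split into two chunks that share roughly 100
# tokens around the boundary, so an entity straddling the cut is still seen
# whole in at least one chunk.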


def deduplicate_by_similarity(items, embeddings, threshold=0.7):
    """Greedy deduplication: keep each item unless its embedding has
    cosine similarity above `threshold` with an earlier kept item."""
    keep = []
    used = set()
    sim_matrix = util.cos_sim(embeddings, embeddings)

    for i in range(len(items)):
        if i in used:
            continue
        keep.append(items[i])
        # Mark all later items that are too similar to this one.
        for j in range(i + 1, len(items)):
            if sim_matrix[i][j] > threshold:
                used.add(j)
    return keep
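
# For example (hypothetical names), if "python" and "python programming"
# embed with cosine similarity above 0.7, only the first occurrence is
# kept; order in `items` therefore decides which duplicate survives.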


def filter_knowledge(results):
    """Normalize extracted names and drop noisy, low-confidence entries."""
    filtered_results = []
    for result in results:
        # Strip punctuation and collapse repeated whitespace.
        name = result["name"].strip()
        name = re.sub(r"[^\w\s]", "", name)
        name = re.sub(r"\s+", " ", name)
        result["name"] = name
        # Keep only short, confident phrases: at most 3 words, more than
        # 2 characters, and a confidence of at least 0.95.
        if len(name.split()) > 3 or len(name) <= 2 or result["confidence"] < 0.95:
            continue
        filtered_results.append(result)
    return filtered_results


@app.post("/predict_knowledge")
def predict_knowledge(input_data: TextInput):
    # Clean non-printable chars
    text = "".join(filter(lambda x: x in string.printable, input_data.jobDescription))
    chunks = chunk_text(text, knowledge_tokenizer)
    all_preds = []
    for chunk in chunks:
        preds = knowledge_nlp(chunk)
        all_preds.extend(convert_from_numpy(preds))
    result = merge_BI_and_get_results(all_preds)
    if not result:
        return {"knowledge_predictions": []}

    result = filter_knowledge(result)
    if not result:  # filtering may have removed everything
        return {"knowledge_predictions": []}

    # Embed the remaining names and drop near-duplicates.
    knowledge_names = [r["name"] for r in result]
    embeddings = embedding_model.encode(knowledge_names, convert_to_tensor=True)
    deduped_results = deduplicate_by_similarity(result, embeddings)

    return {"knowledge_predictions": deduped_results}


@app.post("/predict_skills")
def predict_skills(input_data: TextInput):
    text = "".join(filter(lambda x: x in string.printable, input_data.jobDescription))
    chunks = chunk_text(text, skill_tokenizer)
    all_preds = []
    for chunk in chunks:
        preds = skill_nlp(chunk)
        all_preds.extend(convert_from_numpy(preds))
    return {"skills_predictions": merge_BI_and_get_results(all_preds)}

# Run with:
# uvicorn main:app --host 0.0.0.0 --port 8000
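
# Example request (hypothetical payload, for illustration):
# curl -X POST http://localhost:8000/predict_knowledge \
#      -H "Content-Type: application/json" \
#      -d '{"jobDescription": "We need a Python developer with AWS experience."}'

# A minimal local-run sketch (assumes uvicorn is installed; host and port
# mirror the command above):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)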