# NOTE(review): removed Hugging Face Spaces page residue (commit hashes and a
# line-number gutter) that was not valid Python and broke the module.
import re
import string
import numpy as np
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer
from semantic_similarity.semantic_similarity import model as embedding_model
from sentence_transformers import util
# Initialize FastAPI
app = FastAPI()

# Load models and tokenizers.
# Both pipelines are module-level singletons so the (slow) model download and
# load happens once at startup, not per request.
# NOTE(review): no explicit task is passed to pipeline(); transformers infers
# it from the model config — presumably token-classification; confirm.
knowledge_model_name = "jjzha/jobbert_knowledge_extraction"
knowledge_tokenizer = AutoTokenizer.from_pretrained(knowledge_model_name)
knowledge_nlp = pipeline(
    model=knowledge_model_name,
    tokenizer=knowledge_tokenizer,
    # "first" merges sub-word pieces into whole words, keeping the label/score
    # of each word's first sub-token.
    aggregation_strategy="first",
)

skill_model_name = "jjzha/jobbert_skill_extraction"
skill_tokenizer = AutoTokenizer.from_pretrained(skill_model_name)
skill_nlp = pipeline(
    model=skill_model_name,
    tokenizer=skill_tokenizer,
    aggregation_strategy="first",
)
class TextInput(BaseModel):
    """Request payload shared by both prediction endpoints."""

    # Raw job-posting text to run extraction on. Field name is part of the
    # JSON wire format — do not rename.
    jobDescription: str
def convert_from_numpy(predictions):
    """Convert numpy scalar values inside prediction dicts to native Python types.

    The HF pipeline returns numpy scalars (e.g. ``np.float32`` scores,
    ``np.int64`` offsets) that FastAPI's JSON encoder cannot serialize.
    Mutates the dicts in place and returns the same list for convenience.

    Fix vs. original: match any numpy scalar via ``np.generic`` (the old
    check missed ``np.float64``) and convert with ``.item()`` so integer
    fields stay ``int`` instead of being coerced to ``float``.
    """
    for pred in predictions:
        for key, value in pred.items():
            if isinstance(value, np.generic):
                # .item() picks the matching native type (int -> int, float -> float)
                pred[key] = value.item()
    return predictions
def merge_BI_and_get_results(predictions):
    """Merge B-/I-tagged token predictions into whole entities.

    A "B" tag starts a new entity; every other tag extends the current one.
    Each merged entity is reported with the mean score of its words:
    ``{"name": <joined words>, "confidence": <mean score>}``.
    """
    merged = []
    entity_text = ""
    score_sum = 0
    word_count = 0

    for token in predictions:
        starts_new = token["entity_group"] == "B"
        if starts_new and entity_text:
            # Flush the entity accumulated so far before starting the next one.
            merged.append(
                {"name": entity_text.strip(), "confidence": score_sum / word_count}
            )
        if starts_new:
            entity_text = token["word"]
            score_sum = token["score"]
            word_count = 1
        else:
            entity_text = entity_text + " " + token["word"]
            score_sum = score_sum + token["score"]
            word_count = word_count + 1

    # Flush the trailing entity, if any.
    if entity_text:
        merged.append(
            {"name": entity_text.strip(), "confidence": score_sum / word_count}
        )
    return merged
def chunk_text(text, tokenizer, max_length=500, overlap=100):
    """Split `text` into overlapping token windows, decoded back to strings.

    Relies on the tokenizer's built-in overflow mechanism: each window holds
    at most `max_length` tokens and shares `overlap` tokens with the
    previous window, so no entity is lost at a chunk boundary.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        stride=overlap,
        return_overflowing_tokens=True,
        return_special_tokens_mask=False,
    )
    # Decode every overflow window back into plain text.
    return [
        tokenizer.decode(window, skip_special_tokens=True)
        for window in encoded["input_ids"]
    ]
def deduplicate_by_similarity(items, embeddings, threshold=0.7):
    """Drop near-duplicate items based on cosine similarity of embeddings.

    `items[i]` corresponds to `embeddings[i]`. Greedy, order-preserving:
    the first occurrence wins and any later item whose similarity to it
    exceeds `threshold` is discarded.
    """
    similarity = util.cos_sim(embeddings, embeddings)
    discarded = set()
    unique = []
    for idx, item in enumerate(items):
        if idx in discarded:
            continue
        unique.append(item)
        # Mark every later item that is too similar to this one.
        for other in range(idx + 1, len(items)):
            if similarity[idx][other] > threshold:
                discarded.add(other)
    return unique
def filter_knowledge(results):
    """Heuristically prune noisy knowledge entities.

    Normalizes each entity name in place (strip, drop punctuation, collapse
    whitespace), then keeps only entries that are at most 3 words, longer
    than 2 characters, and have confidence >= 0.95.
    """
    kept = []
    for entry in results:
        # Normalize: trim -> strip punctuation -> collapse runs of whitespace.
        # Mutates the dict in place, matching the rest of the module's style.
        cleaned = entry["name"].strip()
        cleaned = re.sub(r'[^\w\s]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned)
        entry["name"] = cleaned

        short_enough = len(cleaned.split()) <= 3
        long_enough = len(cleaned) > 2
        confident = entry['confidence'] >= 0.95
        if short_enough and long_enough and confident:
            kept.append(entry)
    return kept
@app.post("/predict_knowledge")
def predict_knowledge(input_data: TextInput):
    """Extract deduplicated knowledge entities from a job description.

    Pipeline: strip non-printable chars -> chunk to fit the model context ->
    run NER per chunk -> merge B/I fragments -> heuristic filtering ->
    semantic deduplication. Returns ``{"knowledge_predictions": [...]}``.
    """
    # Clean non-printable chars before tokenization.
    text = "".join(filter(lambda x: x in string.printable, input_data.jobDescription))
    chunks = chunk_text(text, knowledge_tokenizer)
    all_preds = []
    for chunk in chunks:
        preds = knowledge_nlp(chunk)
        all_preds.extend(convert_from_numpy(preds))
    result = merge_BI_and_get_results(all_preds)
    if not result:
        return {"knowledge_predictions": []}
    result = filter_knowledge(result)
    # BUG FIX: filtering can empty the list; the original only checked before
    # filtering, so encode([]) / cos_sim on an empty batch could fail below.
    if not result:
        return {"knowledge_predictions": []}
    knowledge_names = [r["name"] for r in result]
    embeddings_tensor = embedding_model.encode(knowledge_names, convert_to_tensor=True)
    embeddings = embeddings_tensor.cpu().numpy()
    deduped_results = deduplicate_by_similarity(result, embeddings)
    return {"knowledge_predictions": deduped_results}
@app.post("/predict_skills")
def predict_skills(input_data: TextInput):
    """Extract skill entities from a job description.

    Same chunk -> NER -> merge flow as the knowledge endpoint, but without
    filtering or deduplication. Returns ``{"skills_predictions": [...]}``.
    """
    # Drop non-printable characters before tokenization.
    clean_text = "".join(filter(lambda x: x in string.printable, input_data.jobDescription))
    predictions = []
    for piece in chunk_text(clean_text, skill_tokenizer):
        predictions.extend(convert_from_numpy(skill_nlp(piece)))
    return {"skills_predictions": merge_BI_and_get_results(predictions)}
# Run with:
# uvicorn main:app --host 0.0.0.0 --port 8000