Spaces:

NithyaAla
/

RecSys_Skills

Running

App Files Files Community

RecSys_Skills / skill_extraction.py

NithyaAla

Update skill_extraction.py

3f04f41 verified 5 months ago

raw

history blame contribute delete

2.13 kB

	# skill_extraction.py

	import json
	import spacy
	import numpy as np
	from sentence_transformers import SentenceTransformer
	from spacy.cli import download

	# Load the small English spaCy pipeline; if loading fails with OSError
	# (presumably because the model is not installed), download it and retry.
	try:
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    download("en_core_web_sm")
	    nlp = spacy.load("en_core_web_sm")

	# Sentence-embedding model used for both the skill vocabulary and input text
	model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

	# Read the skill vocabulary from the "skills" key of the JSON file
	with open("skills_vocab.json", encoding="utf-8") as vocab_file:
	    SKILL_VOCAB = json.load(vocab_file)["skills"]

	# Encode every vocabulary entry once, at import time
	skill_embeddings = model.encode(
	    SKILL_VOCAB,
	    convert_to_numpy=True,
	    normalize_embeddings=True,
	)

	# Skill name -> its embedding row, for direct lookup by name
	SKILL_TO_EMB = dict(zip(SKILL_VOCAB, skill_embeddings))

	def extract_skills(text, threshold=0.50):
	    """
	    Extract vocabulary skills that are semantically close to some sentence of text.

	    The text is split into sentences with spaCy, each sentence is embedded,
	    and every skill in SKILL_VOCAB is scored by its highest dot-product
	    similarity against any sentence.

	    Args:
	        text: Free-form input text. None or an empty string yields [].
	        threshold: Minimum best-sentence similarity for a skill to be kept.

	    Returns:
	        A list of (skill, confidence, evidence_sentence) tuples, ordered from
	        most to least similar. confidence is the best similarity rounded to
	        2 decimals; evidence_sentence is the single sentence that produced it.
	    """
	    # Nothing to analyse; also avoids handing None to the spaCy pipeline.
	    if not text:
	        return []

	    doc = nlp(text)
	    stripped = (sent.text.strip() for sent in doc.sents)
	    sentences = [sentence for sentence in stripped if sentence]
	    if not sentences:
	        return []

	    # Encoded with normalize_embeddings=True, the same flag used for the skill
	    # embeddings, so the dot product below is presumably cosine similarity.
	    sentence_embeddings = model.encode(sentences, convert_to_numpy=True, normalize_embeddings=True)

	    # skill -> (unrounded similarity, evidence sentence). Keyed by name so a
	    # skill listed twice in the vocabulary still yields a single result.
	    matches = {}
	    for skill, skill_emb in zip(SKILL_VOCAB, skill_embeddings):
	        sims = np.dot(sentence_embeddings, skill_emb)  # one score per sentence
	        best_idx = int(np.argmax(sims))
	        best_sim = float(sims[best_idx])
	        if best_sim >= threshold:
	            matches[skill] = (best_sim, sentences[best_idx])

	    # Rank on the unrounded similarity so that skills whose scores round to
	    # the same 2-decimal value are still ordered by actual closeness.
	    ranked = sorted(matches.items(), key=lambda item: item[1][0], reverse=True)
	    return [(skill, round(sim, 2), evidence) for skill, (sim, evidence) in ranked]


	# --- Skill embedding lookup ---
	def get_skill_embedding(skill_name):
	    """Look up skill_name in SKILL_TO_EMB; return its embedding, or None if absent."""
	    embedding = SKILL_TO_EMB.get(skill_name)
	    return embedding