import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rake_nltk import Rake
import nltk
import importlib.util
import sys
import subprocess
import logging
import re
import os

class NLTKResourceManager:
    """Manages NLTK resource initialization and verification."""

    REQUIRED_RESOURCES = [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('tokenizers/punkt_tab', 'punkt_tab')
    ]

    @staticmethod
    def initialize_nltk_resources() -> None:
        """Initialize all required NLTK resources with proper error handling."""
        def verify_resource(resource_path: str) -> bool:
            try:
                nltk.data.find(resource_path)
                return True
            except LookupError:
                return False

        # Create an nltk_data directory in the user's home if it doesn't exist
        nltk_data_dir = os.path.expanduser('~/nltk_data')
        os.makedirs(nltk_data_dir, exist_ok=True)

        # Ensure NLTK searches the correct data directory
        nltk.data.path.append(nltk_data_dir)

        # Download any missing resources
        for resource_path, resource_name in NLTKResourceManager.REQUIRED_RESOURCES:
            if not verify_resource(resource_path):
                print(f"Downloading {resource_name}...")
                nltk.download(resource_name, quiet=True)
                # Verify the download succeeded
                if not verify_resource(resource_path):
                    raise RuntimeError(f"Failed to download NLTK resource: {resource_name}")

        print("All NLTK resources successfully initialized")

class EnhancedRelevanceAnalyzer:
    """
    A class for analyzing the relevance of interview questions against job descriptions
    using multiple NLP techniques and scoring mechanisms.
    """

    def __init__(self):
        """Initialize the analyzer with the necessary models and vectorizers."""
        self.tfidf = TfidfVectorizer(
            stop_words='english',
            ngram_range=(1, 3),
            max_features=5000
        )
        NLTKResourceManager.initialize_nltk_resources()
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.keyword_extractor = Rake()
        # Initialize spaCy with proper error handling
        self.nlp = self._initialize_spacy()

    def _initialize_spacy(self):
        """Initialize spaCy with proper error handling, installing it if needed."""
        try:
            import spacy
            try:
                return spacy.load('en_core_web_sm')
            except OSError:
                print("Downloading required spaCy model...")
                subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
                return spacy.load('en_core_web_sm')
        except ImportError:
            print("Installing required dependencies...")
            subprocess.run([sys.executable, "-m", "pip", "install", "spacy"], check=True)
            import spacy
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
            return spacy.load('en_core_web_sm')
        except Exception as e:
            print(f"Warning: Could not initialize spaCy ({e}). Falling back to basic analysis.")
            return None

    def check_title_jd_match(self, job_title, jd_text, threshold=0.45):
        """Check the semantic match between a job title and JD using sentence transformers."""
        # Encode to numpy arrays so sklearn's cosine_similarity works regardless of device
        title_embed = self.semantic_model.encode([job_title], convert_to_tensor=False)
        jd_embed = self.semantic_model.encode([jd_text[:5000]], convert_to_tensor=False)  # Use the first 5000 chars for efficiency
        similarity = cosine_similarity(title_embed, jd_embed)[0][0]
        return similarity >= threshold

    def calculate_question_scores(self, job_description, questions):
        """
        Calculate relevance scores for a list of questions against a job description.

        Args:
            job_description (str): The job description text
            questions (list): List of question strings to analyze

        Returns:
            list: List of relevance scores (0-100) for each question
        """
        # Extract key phrases using RAKE
        self.keyword_extractor.extract_keywords_from_text(job_description)
        jd_keywords = set(self.keyword_extractor.get_ranked_phrases()[:20])

        # Extract entities if spaCy is available
        jd_entities = set()
        if self.nlp:
            jd_doc = self.nlp(job_description)
            jd_entities = set(ent.text.lower() for ent in jd_doc.ents)

        # Clean and prepare texts
        jd_clean = self._clean_text(job_description)
        questions_clean = [self._clean_text(q) for q in questions]

        # Calculate scores for each question
        scores = []
        for i, question in enumerate(questions):
            # Calculate base scores
            tfidf_score = self._calculate_tfidf_score(jd_clean, questions_clean[i])
            semantic_score = self._calculate_semantic_score(jd_clean, questions_clean[i])
            keyword_score = self._calculate_keyword_score(jd_keywords, question)
            question_words = set(self._clean_text(question).split())
            keyword_overlap = len(jd_keywords & question_words)

            # Calculate additional scores if spaCy is available
            if self.nlp:
                entity_score = self._calculate_entity_score(jd_entities, question)
                context_score = self._calculate_context_score(job_description, question)
                # Combine all scores with weights
                weighted_score = (
                    tfidf_score * 0.15 +      # Term frequency importance
                    semantic_score * 0.35 +   # Semantic meaning importance
                    keyword_score * 0.20 +    # Keyword matching importance
                    entity_score * 0.15 +     # Named entity importance
                    context_score * 0.15      # Contextual relevance importance
                )
            else:
                # Fallback scoring without spaCy-dependent components
                weighted_score = (
                    tfidf_score * 0.25 +
                    semantic_score * 0.45 +
                    keyword_score * 0.30
                )

            # Normalize and boost the final score
            final_score = self._normalize_and_boost_score(weighted_score, keyword_overlap)
            scores.append(final_score)

        return [round(score * 100, 2) for score in scores]
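
    # Note: the component weights above are heuristic. jd_keywords holds
    # multi-word RAKE phrases while keyword_overlap counts matches against
    # single cleaned words, so the overlap-based boosts only fire on
    # single-word phrases that appear verbatim in the question.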

    def _calculate_tfidf_score(self, jd_text, question):
        """Calculate a TF-IDF based similarity score."""
        tfidf_matrix = self.tfidf.fit_transform([jd_text, question])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

    def _calculate_semantic_score(self, jd_text, question):
        """Calculate semantic similarity using sentence transformers."""
        # Encode to numpy arrays so sklearn's cosine_similarity works regardless of device
        jd_embedding = self.semantic_model.encode([jd_text], convert_to_tensor=False)
        question_embedding = self.semantic_model.encode([question], convert_to_tensor=False)
        return cosine_similarity(jd_embedding, question_embedding)[0][0]

    def _calculate_keyword_score(self, jd_keywords, question):
        """Enhanced keyword scoring with threshold-based boosting."""
        question_words = set(self._clean_text(question).split())
        overlap = len(jd_keywords & question_words)

        # Base score calculation
        base_score = min(1.0, overlap / max(len(jd_keywords) * 0.25, 1))

        # Threshold-based boosting
        if overlap >= 3:  # Absolute threshold
            base_score = min(1.0, base_score * 1.25)
        if len(question_words) > 0 and (overlap / len(question_words)) >= 0.25:  # Relative threshold
            base_score = min(1.0, base_score * 1.15)

        return base_score

    def _calculate_entity_score(self, jd_entities, question):
        """Calculate the named entity overlap score."""
        if not self.nlp:
            return 0.0
        question_doc = self.nlp(question)
        question_entities = set(ent.text.lower() for ent in question_doc.ents)
        overlap = len(jd_entities & question_entities)
        return min(1.0, overlap / max(len(jd_entities) * 0.2, 1))

    def _calculate_context_score(self, job_description, question):
        """Calculate a contextual relevance score using noun phrases."""
        if not self.nlp:
            return 0.0
        jd_doc = self.nlp(job_description)
        question_doc = self.nlp(question)

        # Extract noun phrases
        jd_phrases = set(chunk.text.lower() for chunk in jd_doc.noun_chunks)
        question_phrases = set(chunk.text.lower() for chunk in question_doc.noun_chunks)

        # Calculate phrase overlap with boosting
        phrase_overlap = len(jd_phrases & question_phrases) / max(len(jd_phrases), 1)
        return min(1.0, phrase_overlap * 1.5)

    def _normalize_and_boost_score(self, score, keyword_overlap):
        """Enhanced normalization with keyword-based boosting."""
        # Sigmoid normalization
        normalized = 1 / (1 + np.exp(-6 * (score - 0.5)))

        # Additional boost based on keyword overlap
        if keyword_overlap >= 2:
            normalized = min(1.0, normalized * 1.1)
        if keyword_overlap >= 4:
            normalized = min(1.0, normalized * 1.15)

        return normalized

    def _clean_text(self, text):
        """Clean and normalize text with technical term handling."""
        # Basic cleaning
        text = re.sub(r'[^\w\s-]', '', text.lower())
        text = re.sub(r'\s+', ' ', text).strip()

        # Handle common technical terms and abbreviations
        tech_mappings = {
            'js': 'javascript',
            'py': 'python',
            'ml': 'machine learning',
            'ai': 'artificial intelligence',
            'dl': 'deep learning',
            'nlp': 'natural language processing',
            'db': 'database',
            'ui': 'user interface',
            'ux': 'user experience',
            'api': 'application programming interface',
            'oop': 'object oriented programming',
            'ci': 'continuous integration',
            'cd': 'continuous deployment',
            'aws': 'amazon web services',
            'azure': 'microsoft azure',
            'gcp': 'google cloud platform'
        }
        words = text.split()
        cleaned_words = [tech_mappings.get(word, word) for word in words]
        return ' '.join(cleaned_words)
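

# Minimal usage sketch (illustrative only, not part of the original module):
# it constructs the analyzer, gates on a title/JD match, then scores two
# questions. The sample job description, title, and questions below are
# hypothetical placeholders.
if __name__ == "__main__":
    analyzer = EnhancedRelevanceAnalyzer()

    sample_jd = (
        "We are hiring a machine learning engineer with experience in Python, "
        "natural language processing, and deploying models on amazon web services."
    )
    sample_questions = [
        "Describe a time you deployed an nlp model to production on aws.",
        "What is your favourite holiday destination?",
    ]

    if analyzer.check_title_jd_match("Machine Learning Engineer", sample_jd):
        print("Job title is consistent with the job description.")

    # Scores come back on a 0-100 scale, one per question
    for question, score in zip(sample_questions, analyzer.calculate_question_scores(sample_jd, sample_questions)):
        print(f"{score:6.2f}  {question}")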