import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rake_nltk import Rake
import nltk
import sys
import subprocess
import re
import os
class NLTKResourceManager:
"""Manages NLTK resource initialization and verification"""
REQUIRED_RESOURCES = [
('tokenizers/punkt', 'punkt'),
('corpora/stopwords', 'stopwords'),
('tokenizers/punkt_tab', 'punkt_tab')
]
@staticmethod
def initialize_nltk_resources() -> None:
"""Initialize all required NLTK resources with proper error handling"""
def verify_resource(resource_path: str) -> bool:
try:
nltk.data.find(resource_path)
return True
except LookupError:
return False
# Create nltk_data directory in user's home if it doesn't exist
nltk_data_dir = os.path.expanduser('~/nltk_data')
os.makedirs(nltk_data_dir, exist_ok=True)
# Ensure NLTK uses the correct data directory
nltk.data.path.append(nltk_data_dir)
# Download missing resources
for resource_path, resource_name in NLTKResourceManager.REQUIRED_RESOURCES:
if not verify_resource(resource_path):
print(f"Downloading {resource_name}...")
nltk.download(resource_name, quiet=True)
# Verify successful download
if not verify_resource(resource_path):
raise RuntimeError(f"Failed to download NLTK resource: {resource_name}")
print("All NLTK resources successfully initialized")
class EnhancedRelevanceAnalyzer:
"""
A class for analyzing the relevance of interview questions against job descriptions
using multiple NLP techniques and scoring mechanisms.
"""
def __init__(self):
"""Initialize the analyzer with necessary models and vectorizers."""
self.tfidf = TfidfVectorizer(
stop_words='english',
ngram_range=(1, 3),
max_features=5000
)
        # Rake() below loads NLTK stopwords/punkt at construction time, so the
        # resources must be downloaded first.
        NLTKResourceManager.initialize_nltk_resources()
self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
self.keyword_extractor = Rake()
# Initialize spaCy with proper error handling
self.nlp = self._initialize_spacy()
def _initialize_spacy(self):
"""Initialize spaCy with proper error handling and installation if needed."""
try:
import spacy
try:
return spacy.load('en_core_web_sm')
except OSError:
print("Downloading required spaCy model...")
subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
return spacy.load('en_core_web_sm')
except ImportError:
print("Installing required dependencies...")
subprocess.run([sys.executable, "-m", "pip", "install", "spacy"], check=True)
import spacy
subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
return spacy.load('en_core_web_sm')
except Exception as e:
print(f"Warning: Could not initialize spaCy ({str(e)}). Falling back to basic analysis.")
return None
def check_title_jd_match(self, job_title, jd_text, threshold=0.45):
"""Check semantic match between job title and JD using sentence transformers"""
        # encode() returns numpy arrays by default; sklearn's cosine_similarity
        # needs arrays, and torch tensors would fail once the model runs on a GPU.
        title_embed = self.semantic_model.encode([job_title])
        jd_embed = self.semantic_model.encode([jd_text[:5000]])  # first 5000 chars for efficiency
        similarity = cosine_similarity(title_embed, jd_embed)[0][0]
return similarity >= threshold
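    # Illustrative call, assuming `analyzer` is an EnhancedRelevanceAnalyzer
    # and `jd_text` holds the job description text:
    #
    #     analyzer.check_title_jd_match("Data Scientist", jd_text)
    #
    # returns True when title/JD cosine similarity meets the 0.45 threshold.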
def calculate_question_scores(self, job_description, questions):
"""
Calculate relevance scores for a list of questions against a job description.
Args:
job_description (str): The job description text
questions (list): List of question strings to analyze
Returns:
list: List of relevance scores (0-100) for each question
"""
# Extract key phrases using RAKE
self.keyword_extractor.extract_keywords_from_text(job_description)
        jd_keywords = set(self.keyword_extractor.get_ranked_phrases()[:20])
        # RAKE returns multi-word phrases; flatten them into a word-level set so
        # they can actually intersect with single-word question tokens below.
        jd_keyword_words = set(word for phrase in jd_keywords for word in phrase.split())
# Extract entities if spaCy is available
jd_entities = set()
if self.nlp:
jd_doc = self.nlp(job_description)
            jd_entities = {ent.text.lower() for ent in jd_doc.ents}
# Clean and prepare texts
jd_clean = self._clean_text(job_description)
questions_clean = [self._clean_text(q) for q in questions]
# Calculate scores for each question
scores = []
for i, question in enumerate(questions):
# Calculate base scores
tfidf_score = self._calculate_tfidf_score(jd_clean, questions_clean[i])
semantic_score = self._calculate_semantic_score(jd_clean, questions_clean[i])
            keyword_score = self._calculate_keyword_score(jd_keyword_words, question)
            question_words = set(self._clean_text(question).split())
            keyword_overlap = len(jd_keyword_words & question_words)
# Calculate additional scores if spaCy is available
if self.nlp:
entity_score = self._calculate_entity_score(jd_entities, question)
context_score = self._calculate_context_score(job_description, question)
# Combine all scores with weights
weighted_score = (
tfidf_score * 0.15 + # Term frequency importance
semantic_score * 0.35 + # Semantic meaning importance
keyword_score * 0.20 + # Keyword matching importance
entity_score * 0.15 + # Named entity importance
context_score * 0.15 # Contextual relevance importance
)
else:
# Fallback scoring without spaCy-dependent components
weighted_score = (
tfidf_score * 0.25 +
semantic_score * 0.45 +
keyword_score * 0.30
)
# Normalize and boost the final score
final_score = self._normalize_and_boost_score(weighted_score, keyword_overlap)
scores.append(final_score)
return [round(score * 100, 2) for score in scores]
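    # Worked example of the weighted combination above (illustrative numbers):
    # tfidf=0.30, semantic=0.60, keyword=0.50, entity=0.40, context=0.20 gives
    # 0.30*0.15 + 0.60*0.35 + 0.50*0.20 + 0.40*0.15 + 0.20*0.15 = 0.445,
    # which _normalize_and_boost_score then maps onto the final 0-1 scale.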
    def _calculate_tfidf_score(self, jd_text, question):
        """Calculate TF-IDF based similarity score."""
        # The vectorizer is refit on each JD/question pair so the vocabulary
        # always covers both texts; fine for small batches, slow at scale.
        tfidf_matrix = self.tfidf.fit_transform([jd_text, question])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
def _calculate_semantic_score(self, jd_text, question):
"""Calculate semantic similarity using sentence transformers."""
        # Use the default numpy output so sklearn's cosine_similarity works
        # regardless of whether the model sits on CPU or GPU.
        jd_embedding = self.semantic_model.encode([jd_text])
        question_embedding = self.semantic_model.encode([question])
return cosine_similarity(jd_embedding, question_embedding)[0][0]
    def _calculate_keyword_score(self, jd_keywords, question):
        """Enhanced keyword scoring with threshold-based boosting.

        Expects jd_keywords as a word-level set (RAKE phrases already split).
        """
        question_words = set(self._clean_text(question).split())
        overlap = len(jd_keywords & question_words)
        # Base score saturates once the question covers ~25% of the keywords
        base_score = min(1.0, overlap / max(len(jd_keywords) * 0.25, 1))
# Threshold-based boosting
if overlap >= 3: # Absolute threshold
base_score = min(1.0, base_score * 1.25)
if len(question_words) > 0 and (overlap/len(question_words)) >= 0.25: # Relative threshold
base_score = min(1.0, base_score * 1.15)
return base_score
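    # Worked example (illustrative): with 20 keyword words and an overlap of 5,
    # base_score = min(1.0, 5 / (20 * 0.25)) = 1.0 before boosting; an overlap
    # of 2 gives 2 / 5 = 0.4 and is boosted only when those 2 words make up at
    # least 25% of the question's tokens.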
def _calculate_entity_score(self, jd_entities, question):
"""Calculate named entity overlap score."""
if not self.nlp:
return 0.0
question_doc = self.nlp(question)
        question_entities = {ent.text.lower() for ent in question_doc.ents}
overlap = len(jd_entities & question_entities)
return min(1.0, overlap / max(len(jd_entities) * 0.2, 1))
def _calculate_context_score(self, job_description, question):
"""Calculate contextual relevance score using noun phrases."""
if not self.nlp:
return 0.0
        # Note: the JD is re-parsed for every question; cache the parsed doc
        # upstream if this becomes a bottleneck on long descriptions.
        jd_doc = self.nlp(job_description)
question_doc = self.nlp(question)
# Extract noun phrases
        jd_phrases = {chunk.text.lower() for chunk in jd_doc.noun_chunks}
        question_phrases = {chunk.text.lower() for chunk in question_doc.noun_chunks}
# Calculate phrase overlap with boosting
phrase_overlap = len(jd_phrases & question_phrases) / max(len(jd_phrases), 1)
return min(1.0, phrase_overlap * 1.5)
    def _normalize_and_boost_score(self, score, keyword_overlap):
"""Enhanced normalization with keyword-based boosting"""
# Sigmoid normalization
normalized = 1 / (1 + np.exp(-6 * (score - 0.5)))
# Additional boost based on keyword overlap
if keyword_overlap >= 2:
normalized = min(1.0, normalized * 1.1)
if keyword_overlap >= 4:
normalized = min(1.0, normalized * 1.15)
return normalized
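    # Worked example (illustrative): the sigmoid 1 / (1 + exp(-6 * (s - 0.5)))
    # maps s = 0.5 -> 0.50, s = 0.7 -> ~0.77, s = 0.3 -> ~0.23; with
    # keyword_overlap >= 4 both boosts apply, e.g. 0.77 * 1.1 * 1.15 ~= 0.97.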
def _clean_text(self, text):
"""Clean and normalize text with technical term handling."""
# Basic cleaning
text = re.sub(r'[^\w\s-]', '', text.lower())
text = re.sub(r'\s+', ' ', text).strip()
# Handle common technical terms and abbreviations
tech_mappings = {
'js': 'javascript',
'py': 'python',
'ml': 'machine learning',
'ai': 'artificial intelligence',
'dl': 'deep learning',
'nlp': 'natural language processing',
'db': 'database',
'ui': 'user interface',
'ux': 'user experience',
'api': 'application programming interface',
'oop': 'object oriented programming',
'ci': 'continuous integration',
'cd': 'continuous deployment',
'aws': 'amazon web services',
'azure': 'microsoft azure',
'gcp': 'google cloud platform'
}
words = text.split()
cleaned_words = [tech_mappings.get(word, word) for word in words]
return ' '.join(cleaned_words)
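# Minimal end-to-end usage sketch (illustrative only; the job description and
# questions below are made-up sample inputs, not part of the analyzer itself).
if __name__ == "__main__":
    analyzer = EnhancedRelevanceAnalyzer()
    sample_jd = (
        "We are hiring a machine learning engineer with strong Python skills, "
        "experience deploying models on amazon web services, and familiarity "
        "with natural language processing."
    )
    sample_questions = [
        "How would you deploy a machine learning model on amazon web services?",
        "What is your favorite holiday destination?",
    ]
    if analyzer.check_title_jd_match("Machine Learning Engineer", sample_jd):
        print("Title matches the job description.")
    for question, score in zip(sample_questions,
                               analyzer.calculate_question_scores(sample_jd, sample_questions)):
        print(f"{score:6.2f}  {question}")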