import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rake_nltk import Rake
import nltk
import sys
import subprocess
import re
import os
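
# Third-party packages assumed installed (PyPI names): numpy, scikit-learn,
# sentence-transformers, rake-nltk, nltk and spacy; the en_core_web_sm spaCy model
# is fetched on demand by _initialize_spacy below if it is missing.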

class NLTKResourceManager:
    """Manages NLTK resource initialization and verification"""
    
    REQUIRED_RESOURCES = [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('tokenizers/punkt_tab', 'punkt_tab')
    ]
    
    @staticmethod
    def initialize_nltk_resources() -> None:
        """Initialize all required NLTK resources with proper error handling"""
        
        def verify_resource(resource_path: str) -> bool:
            try:
                nltk.data.find(resource_path)
                return True
            except LookupError:
                return False
        
        # Create nltk_data directory in user's home if it doesn't exist
        nltk_data_dir = os.path.expanduser('~/nltk_data')
        os.makedirs(nltk_data_dir, exist_ok=True)
        
        # Ensure NLTK uses the correct data directory
        nltk.data.path.append(nltk_data_dir)
        
        # Download missing resources
        for resource_path, resource_name in NLTKResourceManager.REQUIRED_RESOURCES:
            if not verify_resource(resource_path):
                print(f"Downloading {resource_name}...")
                nltk.download(resource_name, quiet=True)
                
                # Verify successful download
                if not verify_resource(resource_path):
                    raise RuntimeError(f"Failed to download NLTK resource: {resource_name}")
                
        print("All NLTK resources successfully initialized")

class EnhancedRelevanceAnalyzer:
    """

    A class for analyzing the relevance of interview questions against job descriptions

    using multiple NLP techniques and scoring mechanisms.

    """
    
    def __init__(self):
        """Initialize the analyzer with necessary models and vectorizers."""
        self.tfidf = TfidfVectorizer(
            stop_words='english', 
            ngram_range=(1, 3),
            max_features=5000
        )
        NLTKResourceManager.initialize_nltk_resources()
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.keyword_extractor = Rake()
        
        # Initialize spaCy with proper error handling
        self.nlp = self._initialize_spacy()
        
    def _initialize_spacy(self):
        """Initialize spaCy, installing the package and model if needed; fall back to None on failure."""
        try:
            import spacy
        except ImportError:
            print("Installing required dependencies...")
            try:
                subprocess.run([sys.executable, "-m", "pip", "install", "spacy"], check=True)
                import spacy
            except Exception as e:
                print(f"Warning: Could not install spaCy ({e}). Falling back to basic analysis.")
                return None
        try:
            return spacy.load('en_core_web_sm')
        except OSError:
            print("Downloading required spaCy model...")
            try:
                subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
                return spacy.load('en_core_web_sm')
            except Exception as e:
                print(f"Warning: Could not initialize spaCy ({e}). Falling back to basic analysis.")
                return None
        
    def check_title_jd_match(self, job_title, jd_text, threshold=0.45):
        """Check semantic match between job title and JD using sentence transformers."""
        title_embed = self.semantic_model.encode([job_title])  # numpy output plays nicely with sklearn
        jd_embed = self.semantic_model.encode([jd_text[:5000]])  # Use first 5000 chars for efficiency
        similarity = cosine_similarity(title_embed, jd_embed)[0][0]
        return similarity >= threshold

    def calculate_question_scores(self, job_description, questions):
        """

        Calculate relevance scores for a list of questions against a job description.

        

        Args:

            job_description (str): The job description text

            questions (list): List of question strings to analyze

            

        Returns:

            list: List of relevance scores (0-100) for each question

        """
        # Extract key phrases using RAKE
        self.keyword_extractor.extract_keywords_from_text(job_description)
        jd_keywords = set(self.keyword_extractor.get_ranked_phrases()[:20])
        # Extract entities if spaCy is available
        jd_entities = set()
        if self.nlp:
            jd_doc = self.nlp(job_description)
            jd_entities = set([ent.text.lower() for ent in jd_doc.ents])
        
        # Clean and prepare texts
        jd_clean = self._clean_text(job_description)
        questions_clean = [self._clean_text(q) for q in questions]
        
        # Calculate scores for each question
        scores = []
        for i, question in enumerate(questions):
            # Calculate base scores
            tfidf_score = self._calculate_tfidf_score(jd_clean, questions_clean[i])
            semantic_score = self._calculate_semantic_score(jd_clean, questions_clean[i])
            keyword_score = self._calculate_keyword_score(jd_keywords, question)
            
            question_words = set(self._clean_text(question).split())
            keyword_overlap = len(jd_keywords & question_words)
            # Calculate additional scores if spaCy is available
            if self.nlp:
                entity_score = self._calculate_entity_score(jd_entities, question)
                context_score = self._calculate_context_score(job_description, question)
                
                # Combine all scores with weights
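                # (the weights in each branch sum to 1.0, so the combined score stays in [0, 1])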
                weighted_score = (
                    tfidf_score * 0.15 +      # Term frequency importance
                    semantic_score * 0.35 +    # Semantic meaning importance
                    keyword_score * 0.20 +     # Keyword matching importance
                    entity_score * 0.15 +      # Named entity importance
                    context_score * 0.15       # Contextual relevance importance
                )
            else:
                # Fallback scoring without spaCy-dependent components
                weighted_score = (
                    tfidf_score * 0.25 +
                    semantic_score * 0.45 +
                    keyword_score * 0.30
                )
            
            # Normalize and boost the final score
            final_score = self._normalize_and_boost_score(weighted_score, keyword_overlap)
            scores.append(final_score)
            
        return [round(score * 100, 2) for score in scores]
    
    def _calculate_tfidf_score(self, jd_text, question):
        """Calculate TF-IDF based similarity score."""
        tfidf_matrix = self.tfidf.fit_transform([jd_text, question])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    
    def _calculate_semantic_score(self, jd_text, question):
        """Calculate semantic similarity using sentence transformers."""
        jd_embedding = self.semantic_model.encode([jd_text])
        question_embedding = self.semantic_model.encode([question])
        return cosine_similarity(jd_embedding, question_embedding)[0][0]
    
    def _calculate_keyword_score(self, jd_keywords, question):
        """Enhanced keyword scoring with threshold-based boosting"""
        question_words = set(self._clean_text(question).split())
        overlap = len(jd_keywords & question_words)
        
        # Base score calculation
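        # e.g. with the top-20 RAKE phrases the denominator is max(20 * 0.25, 1) = 5,
        # so five overlapping terms already saturate the base score at 1.0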
        base_score = min(1.0, overlap / max(len(jd_keywords)*0.25, 1))
        
        # Threshold-based boosting
        if overlap >= 3:  # Absolute threshold
            base_score = min(1.0, base_score * 1.25)
        if len(question_words) > 0 and (overlap/len(question_words)) >= 0.25:  # Relative threshold
            base_score = min(1.0, base_score * 1.15)
        return base_score
    
    def _calculate_entity_score(self, jd_entities, question):
        """Calculate named entity overlap score."""
        if not self.nlp:
            return 0.0
        question_doc = self.nlp(question)
        question_entities = set([ent.text.lower() for ent in question_doc.ents])
        overlap = len(jd_entities & question_entities)
        return min(1.0, overlap / max(len(jd_entities) * 0.2, 1))
    
    def _calculate_context_score(self, job_description, question):
        """Calculate contextual relevance score using noun phrases."""
        if not self.nlp:
            return 0.0
        jd_doc = self.nlp(job_description)
        question_doc = self.nlp(question)
        
        # Extract noun phrases
        jd_phrases = set([chunk.text.lower() for chunk in jd_doc.noun_chunks])
        question_phrases = set([chunk.text.lower() for chunk in question_doc.noun_chunks])
        
        # Calculate phrase overlap with boosting
        phrase_overlap = len(jd_phrases & question_phrases) / max(len(jd_phrases), 1)
        return min(1.0, phrase_overlap * 1.5)
    
    def _normalize_and_boost_score(self, score, keyword_overlap):
        """Enhanced normalization with keyword-based boosting"""
        # Sigmoid normalization
        normalized = 1 / (1 + np.exp(-6 * (score - 0.5)))
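        # For reference: a weighted score of 0.5 maps to 0.5, ~0.8 to roughly 0.86,
        # and ~0.2 to roughly 0.14, pushing mid-range scores toward the extremes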
        
        # Additional boost based on keyword overlap
        if keyword_overlap >= 2:
            normalized = min(1.0, normalized * 1.1)
        if keyword_overlap >= 4:
            normalized = min(1.0, normalized * 1.15)
        
        return normalized
    
    def _clean_text(self, text):
        """Clean and normalize text with technical term handling."""
        # Basic cleaning
        text = re.sub(r'[^\w\s-]', '', text.lower())
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Handle common technical terms and abbreviations
        tech_mappings = {
            'js': 'javascript',
            'py': 'python',
            'ml': 'machine learning',
            'ai': 'artificial intelligence',
            'dl': 'deep learning',
            'nlp': 'natural language processing',
            'db': 'database',
            'ui': 'user interface',
            'ux': 'user experience',
            'api': 'application programming interface',
            'oop': 'object oriented programming',
            'ci': 'continuous integration',
            'cd': 'continuous deployment',
            'aws': 'amazon web services',
            'azure': 'microsoft azure',
            'gcp': 'google cloud platform'
        }
        
        words = text.split()
        cleaned_words = [tech_mappings.get(word, word) for word in words]
        return ' '.join(cleaned_words)
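

# Minimal usage sketch, added for illustration and not part of the original module:
# the job description, job title and questions below are placeholder text, and the
# printed output format is only a suggestion.
if __name__ == "__main__":
    analyzer = EnhancedRelevanceAnalyzer()

    sample_jd = (
        "We are hiring a machine learning engineer with experience in python, "
        "natural language processing, and deploying models on amazon web services."
    )
    sample_questions = [
        "Describe a natural language processing project you built in python.",
        "How do you deploy machine learning models on amazon web services?",
        "What is your favourite holiday destination?",
    ]

    # Optional sanity check that the title and the description describe the same role.
    print("Title/JD match:", analyzer.check_title_jd_match("Machine Learning Engineer", sample_jd))

    # Scores come back on a 0-100 scale, one per question, in input order.
    for question, score in zip(sample_questions,
                               analyzer.calculate_question_scores(sample_jd, sample_questions)):
        print(f"{score:6.2f}  {question}")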