import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rake_nltk import Rake
import nltk
import sys
import subprocess
import re
import os
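
# Third-party packages assumed installed (PyPI names): numpy, scikit-learn,
# sentence-transformers, rake-nltk, nltk and spacy; the en_core_web_sm spaCy model
# is fetched on demand by _initialize_spacy below if it is missing.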

class NLTKResourceManager:
    """Manages NLTK resource initialization and verification"""
    
    REQUIRED_RESOURCES = [
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('tokenizers/punkt_tab', 'punkt_tab')
    ]
    
    @staticmethod
    def initialize_nltk_resources() -> None:
        """Initialize all required NLTK resources with proper error handling"""
        
        def verify_resource(resource_path: str) -> bool:
            try:
                nltk.data.find(resource_path)
                return True
            except LookupError:
                return False
        
        # Create nltk_data directory in user's home if it doesn't exist
        nltk_data_dir = os.path.expanduser('~/nltk_data')
        os.makedirs(nltk_data_dir, exist_ok=True)
        
        # Ensure NLTK uses the correct data directory
        nltk.data.path.append(nltk_data_dir)
        
        # Download missing resources
        for resource_path, resource_name in NLTKResourceManager.REQUIRED_RESOURCES:
            if not verify_resource(resource_path):
                print(f"Downloading {resource_name}...")
                nltk.download(resource_name, quiet=True)
                
                # Verify successful download
                if not verify_resource(resource_path):
                    raise RuntimeError(f"Failed to download NLTK resource: {resource_name}")
                
        print("All NLTK resources successfully initialized")

class EnhancedRelevanceAnalyzer:
    """

    A class for analyzing the relevance of interview questions against job descriptions

    using multiple NLP techniques and scoring mechanisms.

    """
    
    def __init__(self):
        """Initialize the analyzer with necessary models and vectorizers."""
        self.tfidf = TfidfVectorizer(
            stop_words='english', 
            ngram_range=(1, 3),
            max_features=5000
        )
        NLTKResourceManager.initialize_nltk_resources()
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.keyword_extractor = Rake()
        
        # Initialize spaCy with proper error handling
        self.nlp = self._initialize_spacy()
        
    def _initialize_spacy(self):
        """Initialize spaCy, installing the package and model if needed; fall back to None on failure."""
        try:
            import spacy
        except ImportError:
            print("Installing required dependencies...")
            try:
                subprocess.run([sys.executable, "-m", "pip", "install", "spacy"], check=True)
                import spacy
            except Exception as e:
                print(f"Warning: Could not install spaCy ({e}). Falling back to basic analysis.")
                return None
        try:
            return spacy.load('en_core_web_sm')
        except OSError:
            print("Downloading required spaCy model...")
            try:
                subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
                return spacy.load('en_core_web_sm')
            except Exception as e:
                print(f"Warning: Could not initialize spaCy ({e}). Falling back to basic analysis.")
                return None
        
    def check_title_jd_match(self, job_title, jd_text, threshold=0.45):
        """Check semantic match between job title and JD using sentence transformers."""
        title_embed = self.semantic_model.encode([job_title])  # numpy output plays nicely with sklearn
        jd_embed = self.semantic_model.encode([jd_text[:5000]])  # Use first 5000 chars for efficiency
        similarity = cosine_similarity(title_embed, jd_embed)[0][0]
        return similarity >= threshold

    def calculate_question_scores(self, job_description, questions):
        """

        Calculate relevance scores for a list of questions against a job description.

        

        Args:

            job_description (str): The job description text

            questions (list): List of question strings to analyze

            

        Returns:

            list: List of relevance scores (0-100) for each question

        """
        # Extract key phrases using RAKE
        self.keyword_extractor.extract_keywords_from_text(job_description)
        jd_keywords = set(self.keyword_extractor.get_ranked_phrases()[:20])
        # Extract entities if spaCy is available
        jd_entities = set()
        if self.nlp:
            jd_doc = self.nlp(job_description)
            jd_entities = set([ent.text.lower() for ent in jd_doc.ents])
        
        # Clean and prepare texts
        jd_clean = self._clean_text(job_description)
        questions_clean = [self._clean_text(q) for q in questions]
        
        # Calculate scores for each question
        scores = []
        for i, question in enumerate(questions):
            # Calculate base scores
            tfidf_score = self._calculate_tfidf_score(jd_clean, questions_clean[i])
            semantic_score = self._calculate_semantic_score(jd_clean, questions_clean[i])
            keyword_score = self._calculate_keyword_score(jd_keywords, question)
            
            question_words = set(self._clean_text(question).split())
            keyword_overlap = len(jd_keywords & question_words)
            # Calculate additional scores if spaCy is available
            if self.nlp:
                entity_score = self._calculate_entity_score(jd_entities, question)
                context_score = self._calculate_context_score(job_description, question)
                
                # Combine all scores with weights
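                # (the weights in each branch sum to 1.0, so the combined score stays in [0, 1])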
                weighted_score = (
                    tfidf_score * 0.15 +      # Term frequency importance
                    semantic_score * 0.35 +    # Semantic meaning importance
                    keyword_score * 0.20 +     # Keyword matching importance
                    entity_score * 0.15 +      # Named entity importance
                    context_score * 0.15       # Contextual relevance importance
                )
            else:
                # Fallback scoring without spaCy-dependent components
                weighted_score = (
                    tfidf_score * 0.25 +
                    semantic_score * 0.45 +
                    keyword_score * 0.30
                )
            
            # Normalize and boost the final score
            final_score = self._normalize_and_boost_score(weighted_score, keyword_overlap)
            scores.append(final_score)
            
        return [round(score * 100, 2) for score in scores]
    
    def _calculate_tfidf_score(self, jd_text, question):
        """Calculate TF-IDF based similarity score."""
        tfidf_matrix = self.tfidf.fit_transform([jd_text, question])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    
    def _calculate_semantic_score(self, jd_text, question):
        """Calculate semantic similarity using sentence transformers."""
        jd_embedding = self.semantic_model.encode([jd_text])
        question_embedding = self.semantic_model.encode([question])
        return cosine_similarity(jd_embedding, question_embedding)[0][0]
    
    def _calculate_keyword_score(self, jd_keywords, question):
        """Enhanced keyword scoring with threshold-based boosting"""
        question_words = set(self._clean_text(question).split())
        overlap = len(jd_keywords & question_words)
        
        # Base score calculation
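        # e.g. with the top-20 RAKE phrases the denominator is max(20 * 0.25, 1) = 5,
        # so five overlapping terms already saturate the base score at 1.0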
        base_score = min(1.0, overlap / max(len(jd_keywords)*0.25, 1))
        
        # Threshold-based boosting
        if overlap >= 3:  # Absolute threshold
            base_score = min(1.0, base_score * 1.25)
        if len(question_words) > 0 and (overlap/len(question_words)) >= 0.25:  # Relative threshold
            base_score = min(1.0, base_score * 1.15)
        return base_score
    
    def _calculate_entity_score(self, jd_entities, question):
        """Calculate named entity overlap score."""
        if not self.nlp:
            return 0.0
        question_doc = self.nlp(question)
        question_entities = set([ent.text.lower() for ent in question_doc.ents])
        overlap = len(jd_entities & question_entities)
        return min(1.0, overlap / max(len(jd_entities) * 0.2, 1))
    
    def _calculate_context_score(self, job_description, question):
        """Calculate contextual relevance score using noun phrases."""
        if not self.nlp:
            return 0.0
        jd_doc = self.nlp(job_description)
        question_doc = self.nlp(question)
        
        # Extract noun phrases
        jd_phrases = set([chunk.text.lower() for chunk in jd_doc.noun_chunks])
        question_phrases = set([chunk.text.lower() for chunk in question_doc.noun_chunks])
        
        # Calculate phrase overlap with boosting
        phrase_overlap = len(jd_phrases & question_phrases) / max(len(jd_phrases), 1)
        return min(1.0, phrase_overlap * 1.5)
    
    def _normalize_and_boost_score(self, score, keyword_overlap):
        """Enhanced normalization with keyword-based boosting"""
        # Sigmoid normalization
        normalized = 1 / (1 + np.exp(-6 * (score - 0.5)))
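        # For reference: a weighted score of 0.5 maps to 0.5, ~0.8 to roughly 0.86,
        # and ~0.2 to roughly 0.14, pushing mid-range scores toward the extremes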
        
        # Additional boost based on keyword overlap
        if keyword_overlap >= 2:
            normalized = min(1.0, normalized * 1.1)
        if keyword_overlap >= 4:
            normalized = min(1.0, normalized * 1.15)
        
        return normalized
    
    def _clean_text(self, text):
        """Clean and normalize text with technical term handling."""
        # Basic cleaning
        text = re.sub(r'[^\w\s-]', '', text.lower())
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Handle common technical terms and abbreviations
        tech_mappings = {
            'js': 'javascript',
            'py': 'python',
            'ml': 'machine learning',
            'ai': 'artificial intelligence',
            'dl': 'deep learning',
            'nlp': 'natural language processing',
            'db': 'database',
            'ui': 'user interface',
            'ux': 'user experience',
            'api': 'application programming interface',
            'oop': 'object oriented programming',
            'ci': 'continuous integration',
            'cd': 'continuous deployment',
            'aws': 'amazon web services',
            'azure': 'microsoft azure',
            'gcp': 'google cloud platform'
        }
        
        words = text.split()
        cleaned_words = [tech_mappings.get(word, word) for word in words]
        return ' '.join(cleaned_words)
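

# Minimal usage sketch, added for illustration and not part of the original module:
# the job description, job title and questions below are placeholder text, and the
# printed output format is only a suggestion.
if __name__ == "__main__":
    analyzer = EnhancedRelevanceAnalyzer()

    sample_jd = (
        "We are hiring a machine learning engineer with experience in python, "
        "natural language processing, and deploying models on amazon web services."
    )
    sample_questions = [
        "Describe a natural language processing project you built in python.",
        "How do you deploy machine learning models on amazon web services?",
        "What is your favourite holiday destination?",
    ]

    # Optional sanity check that the title and the description describe the same role.
    print("Title/JD match:", analyzer.check_title_jd_match("Machine Learning Engineer", sample_jd))

    # Scores come back on a 0-100 scale, one per question, in input order.
    for question, score in zip(sample_questions,
                               analyzer.calculate_question_scores(sample_jd, sample_questions)):
        print(f"{score:6.2f}  {question}")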