Johnny committed on
Commit
8f8f414
·
1 Parent(s): 0c91845

added validations on evaluate, updated filter_resumes_by_keywords to use fuzzy matching, changed extract_keywords to use TfidfVectorizer, enforced strict keyword matching

Browse files
Files changed (3) hide show
  1. app.py +46 -24
  2. requirements.txt +3 -1
  3. utils.py +96 -25
app.py CHANGED
@@ -1,5 +1,8 @@
1
  import streamlit as st
2
- from utils import evaluate_resumes, generate_pdf_report, store_in_supabase, extract_email, score_candidate, parse_resume, summarize_resume
 
 
 
3
  from config import supabase
4
  from config import HF_API_TOKEN, HF_HEADERS, HF_MODELS
5
  import fitz # PyMuPDF
@@ -10,32 +13,51 @@ import requests
10
 
11
  def main():
12
  st.set_page_config(page_title="TalentLens.AI", layout="centered")
13
- st.markdown(
14
- "<h1 style='text-align: center;'>TalentLens.AI</h1>",
15
- unsafe_allow_html=True
16
- )
17
  st.divider()
18
- st.markdown(
19
- "<h3 style='text-align: center;'>AI-Powered Intelligent Resume Screening</h3>",
20
- unsafe_allow_html=True
21
- )
22
- uploaded_files = st.file_uploader(
23
- "Upload Resumes (PDF Only, Max 10)",
24
- accept_multiple_files=True,
25
- type=["pdf"])
26
- job_description = st.text_area("Enter Job Description")
27
-
28
  if uploaded_files and len(uploaded_files) > 10:
29
- st.error("Please upload a maximum of 10 resumes.")
30
-
31
- elif st.button("Evaluate Resumes"):
32
- shortlisted = evaluate_resumes(uploaded_files, job_description)
33
- for candidate in shortlisted:
34
- st.write(f"**{candidate['name']}**") # removed - Score: {candidate['score']}
 
 
 
 
 
 
 
 
35
 
36
- # Generate PDF Report
37
- pdf_report = generate_pdf_report(shortlisted)
38
- st.download_button("Download Shortlist Report", pdf_report, "shortlist.pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  if __name__ == "__main__":
41
  main()
 
1
  import streamlit as st
2
+ from utils import (
3
+ evaluate_resumes, generate_pdf_report, store_in_supabase, extract_email,
4
+ score_candidate, parse_resume, summarize_resume, extract_keywords
5
+ )
6
  from config import supabase
7
  from config import HF_API_TOKEN, HF_HEADERS, HF_MODELS
8
  import fitz # PyMuPDF
 
13
 
14
def main():
    """Render the TalentLens.AI Streamlit UI and drive the resume-evaluation flow."""
    st.set_page_config(page_title="TalentLens.AI", layout="centered")

    st.markdown("<h1 style='text-align: center;'>TalentLens.AI</h1>", unsafe_allow_html=True)
    st.divider()
    st.markdown("<h3 style='text-align: center;'>AI-Powered Intelligent Resume Screening</h3>", unsafe_allow_html=True)

    # Limit resume uploads to 10
    uploaded_files = st.file_uploader("Upload Resumes (PDF Only, Max: 10)", accept_multiple_files=True, type=["pdf"])

    if uploaded_files and len(uploaded_files) > 10:
        st.error("⚠️ You can upload a maximum of 10 resumes at a time.")
        return

    job_description = st.text_area("Enter Job Description")

    if st.button("Evaluate Resumes"):
        # .strip() so a whitespace-only description is treated as missing too.
        if not job_description or not job_description.strip():
            st.error("⚠️ Please enter a job description.")
            return
        if not uploaded_files:
            st.error("⚠️ Please upload at least one resume.")
            return

        st.write("### 📊 Evaluating Resumes...")

        # 🔹 Extract required keywords dynamically from the job description.
        # Guard against an empty keyword list so we don't render a blank label.
        required_keywords = extract_keywords(job_description)
        if required_keywords:
            st.write(f"**Extracted Keywords:** {', '.join(required_keywords)}")
        else:
            st.warning("⚠️ No keywords could be extracted from the job description.")

        shortlisted, removed_candidates = evaluate_resumes(uploaded_files, job_description)

        if not shortlisted:
            st.warning("⚠️ No resumes matched the required keywords.")
        else:
            st.subheader("✅ Shortlisted Candidates:")
            for candidate in shortlisted:
                st.write(f"**{candidate['name']}**")

            # Generate PDF Report
            pdf_report = generate_pdf_report(shortlisted)
            st.download_button("Download Shortlist Report", pdf_report, "shortlist.pdf")

        # 🔻 Display removed candidates along with the reason they were dropped.
        if removed_candidates:
            st.subheader("❌ Resumes Removed:")
            for removed in removed_candidates:
                st.write(f"**{removed['name']}** - {removed['reason']}")
61
 
62
  if __name__ == "__main__":
63
  main()
requirements.txt CHANGED
@@ -4,4 +4,6 @@ python-dotenv
4
  supabase
5
  PyMuPDF
6
  pytest
7
- sentence-transformers
 
 
 
4
  supabase
5
  PyMuPDF
6
  pytest
7
+ sentence-transformers
8
+ spacy
9
+ fuzzywuzzy
utils.py CHANGED
@@ -4,22 +4,38 @@ import json
4
  import re
5
  from io import BytesIO
6
  import supabase
7
- from config import SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS, supabase, HF_MODELS, query, embedding_model
 
 
 
8
  from sentence_transformers import SentenceTransformer, util
 
 
 
 
 
9
 
10
  # These functions will be called in the app.py file
11
 
12
- def evaluate_resumes(uploaded_files, job_description):
13
- """Evaluates uploaded resumes and returns shortlisted candidates."""
 
 
 
14
  candidates = []
 
 
15
  for pdf_file in uploaded_files:
16
  resume_text = parse_resume(pdf_file)
17
  score = score_candidate(resume_text, job_description)
18
  email = extract_email(resume_text)
19
-
20
- # Generate a summary of the resume
21
  summary = summarize_resume(resume_text)
22
 
 
 
 
 
 
23
  candidates.append({
24
  "name": pdf_file.name,
25
  "resume": resume_text,
@@ -28,10 +44,71 @@ def evaluate_resumes(uploaded_files, job_description):
28
  "summary": summary
29
  })
30
 
31
- # Store all details including summary in Supabase
32
- store_in_supabase(resume_text, score, pdf_file.name, email, summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- return sorted(candidates, key=lambda x: x["score"], reverse=True)[:5] # Return top 5 candidates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def parse_resume(pdf_file):
37
  """Extracts text from a resume PDF."""
@@ -65,36 +142,30 @@ def score_candidate(resume_text, job_description):
65
  except Exception as e:
66
  print(f"Error computing similarity score: {e}")
67
  return 0 # Return 0 if scoring fails
68
-
69
- # create multiple agents for different scoring
70
- # agent_experience, agent_programming_language, agent_education, agent_soft_skills, etc etc
71
- # Scoring from 1-5
72
 
73
  def summarize_resume(resume_text):
74
  """
75
- Summarizes a resume using the Google bart model.
76
-
77
- :param resume_text: The resume text to summarize.
78
- :return: A summarized version of the resume.
79
  """
80
  payload = {"inputs": f"Summarize this resume: {resume_text}"}
81
-
82
- response = query(payload, model="bart") # Use bart for summarization
83
 
84
- if response is None:
85
  print("Error: API response is None")
86
  return "Summary could not be generated."
87
 
88
- # If the response is a list, extract the first element
89
- if isinstance(response, list) and len(response) > 0:
90
- response = response[0]
91
-
92
  try:
93
- if isinstance(response, dict) and "generated_text" in response:
94
- return response["generated_text"]
 
 
 
 
 
95
  else:
96
  print("Unexpected API response format:", response)
97
  return "Summary could not be generated."
 
98
  except (TypeError, ValueError) as e:
99
  print(f"Error parsing summary: {e}")
100
  return "Summary could not be generated."
 
4
  import re
5
  from io import BytesIO
6
  import supabase
7
+ from config import (
8
+ SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS,
9
+ supabase, HF_MODELS, query, embedding_model
10
+ )
11
  from sentence_transformers import SentenceTransformer, util
12
+ import spacy
13
+ from collections import Counter
14
+ from sklearn.feature_extraction.text import TfidfVectorizer
15
+ import streamlit as st
16
+ from fuzzywuzzy import fuzz
17
 
18
  # These functions will be called in the app.py file
19
 
20
+ # Load spaCy NLP model
21
+ nlp = spacy.load("en_core_web_sm")
22
+
23
def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
    """Evaluates uploaded resumes, filters by score and keywords, and returns
    a (shortlisted, removed_candidates) pair.

    :param uploaded_files: Iterable of uploaded PDF file objects.
    :param job_description: Job description text used for scoring/keywords.
    :param min_keyword_match: Minimum distinct keyword matches a resume needs.
    :return: (top-5 candidates sorted by score desc, list of {"name", "reason"} dicts).
    """
    candidates = []
    removed_candidates = []

    for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)
        email = extract_email(resume_text)

        # If score is below 0.20, remove the candidate immediately.
        # Checked BEFORE summarization so rejected resumes don't waste an API call.
        if score < 0.20:
            removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
            continue  # Skip adding to candidates list

        summary = summarize_resume(resume_text)

        candidates.append({
            "name": pdf_file.name,
            "resume": resume_text,
            "score": score,
            "email": email,
            "summary": summary
        })

    # Filter resumes based on job description keywords.
    # Pass the caller's threshold through (it was previously hard-coded to 2,
    # silently ignoring the min_keyword_match parameter).
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match=min_keyword_match
    )

    # Store removed candidates with a reason
    for name in keyword_removed:
        removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})

    return sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5], removed_candidates
55
+
56
def extract_keywords(text, top_n=10):
    """Extracts key terms from the job description using TF-IDF and spaCy."""
    # Empty or whitespace-only input yields no keywords.
    if not text.strip():
        return []

    doc = nlp(text.lower())

    # Keep only content words: nouns, proper nouns, verbs and adjectives
    # that spaCy does not flag as stop words.
    candidate_terms = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not token.is_stop
    ]
    if not candidate_terms:
        return []

    # Rank the surviving terms by TF-IDF over uni- and bi-grams.
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    try:
        matrix = vectorizer.fit_transform([" ".join(candidate_terms)])
    except ValueError:
        # Empty vocabulary (e.g. every term was an English stop word).
        return []

    terms = vectorizer.get_feature_names_out()
    scores = matrix.toarray()[0]

    # Highest-scoring terms first; return at most top_n of them.
    ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:top_n]]
84
+
85
def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """Filters resumes based on keyword presence and fuzzy similarity.

    :param resumes: List of candidate dicts containing at least "resume" and "name".
    :param job_description: Text the required keywords are extracted from.
    :param min_keyword_match: Minimum distinct fuzzy keyword matches required.
    :return: (filtered_resumes, removed_resume_names).
    """
    job_keywords = extract_keywords(job_description)
    filtered_resumes = []
    removed_resumes = []

    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description is either too short or absent for keyword filtering.")
        return resumes, []  # Skip keyword filtering if job description lacks enough keywords

    for resume in resumes:
        resume_words = resume["resume"].lower().split()

        # Fuzzy matching (>80% similarity) for flexible keyword detection.
        # any() stops scanning a resume's words at the first hit for a keyword,
        # and the set comprehension avoids the duplicate appends of the old loop —
        # the resulting matched-keyword set is identical, just computed faster.
        matched_keywords = {
            keyword
            for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume_words)
        }

        # Enforce minimum keyword matches
        if len(matched_keywords) >= min_keyword_match:
            filtered_resumes.append(resume)
        else:
            removed_resumes.append(resume["name"])

    return filtered_resumes, removed_resumes
112
 
113
  def parse_resume(pdf_file):
114
  """Extracts text from a resume PDF."""
 
142
  except Exception as e:
143
  print(f"Error computing similarity score: {e}")
144
  return 0 # Return 0 if scoring fails
 
 
 
 
145
 
146
def summarize_resume(resume_text):
    """
    Summarizes a resume using the Hugging Face BART model.

    :param resume_text: Raw resume text to summarize.
    :return: The generated summary string, or a fallback message on any failure.
    """
    payload = {"inputs": f"Summarize this resume: {resume_text}"}
    response = query(payload, model="bart")  # Call API

    if not response:
        print("Error: API response is None")
        return "Summary could not be generated."

    try:
        # Check if response is a list (sometimes HF returns a list with a dict inside)
        if isinstance(response, list) and len(response) > 0:
            response = response[0]

        # Accept either summary field name HF models use. This replaces the old
        # nested .get() whose "Summary not available" default was unreachable
        # (the membership check guaranteed one of the keys existed).
        if isinstance(response, dict):
            for key in ("generated_text", "summary_text"):
                if key in response:
                    return response[key]

        print("Unexpected API response format:", response)
        return "Summary could not be generated."

    except (TypeError, ValueError) as e:
        print(f"Error parsing summary: {e}")
        return "Summary could not be generated."