# TalentLensAI / utils.py
# === Imports ===
# Standard Library
import re
import subprocess
import sys
from io import BytesIO

# Third-Party Libraries
import fitz  # PyMuPDF
import spacy
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import util
from sklearn.feature_extraction.text import TfidfVectorizer

# Local Configuration
from config import supabase, embedding_model, client
# === Initialization ===
# The Hugging Face inference client (`client`) and the embedding model are
# initialized in config.py and imported above, so no inline setup is needed here.
# Load the spaCy model, downloading it on first run if necessary
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the current interpreter so the model installs into the active environment
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")
# === Core Resume Evaluation ===
def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
"""
Evaluate uploaded resumes and return shortlisted candidates with scores and summaries.
"""
candidates, removed_candidates = [], []
for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)
        # Reject low-scoring resumes before the comparatively expensive LLM summary call
        if score < 0.20:
            removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
            continue
        email = extract_email(resume_text)
        summary = summarize_resume(resume_text)
candidates.append({
"name": pdf_file.name,
"resume": resume_text,
"score": score,
"email": email,
"summary": summary
})
# 🔹 Step 2: Filter candidates based on keyword matches
filtered_candidates, keyword_removed = filter_resumes_by_keywords(
candidates, job_description, min_keyword_match
)
# 🔹 Step 3: Log removed candidates
for name in keyword_removed:
removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})
# 🔹 Step 4: Ensure the final list is sorted by score and limit to top 5 candidates
shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]
    # 🔹 Step 5: Store shortlisted candidates in Supabase
for candidate in shortlisted_candidates:
try:
store_in_supabase(
resume_text=candidate["resume"],
score=candidate["score"],
candidate_name=candidate["name"],
email=candidate["email"],
summary=candidate["summary"]
)
except Exception as e:
print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")
return shortlisted_candidates, removed_candidates
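# Illustrative usage (a hedged sketch; `files` and `job_description` are
# hypothetical names, e.g. from the Streamlit app that calls this module):
#   files = st.file_uploader("Resumes", type="pdf", accept_multiple_files=True)
#   if files:
#       shortlisted, removed = evaluate_resumes(files, job_description)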
# === Keyword & Scoring Functions ===
def extract_keywords(text, top_n=10):
"""
Extracts top keywords from the job description using spaCy and TF-IDF.
"""
if not text.strip():
return []
doc = nlp(text.lower())
keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]
if not keywords:
return []
    try:
        # NOTE: with a single input document, every term's IDF is identical,
        # so this ranking is effectively term-frequency based.
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
except ValueError:
return []
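# Illustrative example (actual output depends on the spaCy model and the text):
#   extract_keywords("Seeking a Python developer with AWS and Docker experience", top_n=3)
#   might return something like ["python", "aws", "docker"].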
def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
"""
Filters resumes by keyword match using fuzzy logic.
"""
job_keywords = extract_keywords(job_description)
if len(job_keywords) < min_keyword_match:
st.warning("⚠️ Job description too short or missing for keyword filtering.")
return resumes, []
    filtered, removed = [], []
    for resume in resumes:
        # Pre-split once per resume instead of once per keyword
        resume_words = resume["resume"].lower().split()
        matched = {
            keyword for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume_words)
        }
if len(matched) >= min_keyword_match:
filtered.append(resume)
else:
removed.append(resume["name"])
return filtered, removed
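# Illustrative example (hypothetical data): if the job description mentions
# "Python" and "SQL" and a resume contains both, it is kept; resumes matching
# fewer than min_keyword_match keywords have their names returned in the
# second list, mirroring how evaluate_resumes logs its removals.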
def score_candidate(resume_text, job_description):
"""
Computes cosine similarity between resume and job description using embeddings.
"""
try:
resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
score = util.pytorch_cos_sim(resume_vec, job_vec).item()
return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0.0
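# Illustrative example (exact scores vary by embedding model):
#   score_candidate("Senior Python engineer, 8 years of Django", "Hiring a Python engineer")
#   should score noticeably higher than the same resume against an unrelated role.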
# === Text Extraction & Summarization ===
def parse_resume(pdf_file):
    """
    Extract raw text from an uploaded PDF file-like object.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
def extract_email(resume_text):
    """
    Extract the first email address found in the text, or None.
    """
    match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", resume_text)
    return match.group(0) if match else None
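# Example: extract_email("Contact: jane.doe@example.com, 555-0100")
# returns "jane.doe@example.com"; it returns None when no address is present.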
def summarize_resume(resume_text):
    """
    Generate a recruiter-style professional summary of the resume via the LLM client.
    Returns a fallback message if the API call fails.
    """
prompt = (
"You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
"Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
"Format it as a professional summary paragraph.\n\n"
f"Resume:\n{resume_text}\n\n"
"Summary:"
)
try:
response = client.chat.completions.create(
model="tgi",
messages=[{"role": "user", "content": prompt}],
temperature=0.5,
max_tokens=300,
)
result = response.choices[0].message.content.strip()
# Clean up generic lead-ins from the model
cleaned = re.sub(
r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
"", result, flags=re.IGNORECASE
).strip()
return cleaned
except Exception as e:
print(f"❌ Error generating structured summary: {e}")
return "Summary unavailable due to API issues."
# === Data Storage & Reporting ===
def store_in_supabase(resume_text, score, candidate_name, email, summary):
"""
Saves candidate data to the Supabase table.
"""
data = {
"name": candidate_name,
"resume": resume_text,
"score": score or 0,
"email": email,
"summary": summary
}
return supabase.table("candidates").insert(data).execute()
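# Illustrative usage (assumes the "candidates" table exists in the configured
# Supabase project, as this module expects):
#   store_in_supabase(resume_text=text, score=0.87, candidate_name="a.pdf",
#                     email="jane.doe@example.com", summary="...")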
def generate_pdf_report(shortlisted_candidates, questions=None):
"""
Creates a PDF report summarizing top candidates and interview questions.
"""
pdf = BytesIO()
doc = fitz.open()
for candidate in shortlisted_candidates:
page = doc.new_page()
info = (
f"Candidate: {candidate['name']}\n"
f"Email: {candidate['email']}\n"
f"Score: {candidate['score']}\n\n"
f"Summary:\n{candidate.get('summary', 'No summary available')}"
)
page.insert_textbox(fitz.Rect(50, 50, 550, 750), info, fontsize=11, fontname="helv", align=0)
if questions:
q_page = doc.new_page()
q_text = "Suggested Interview Questions:\n\n" + "\n".join(questions)
q_page.insert_textbox(fitz.Rect(50, 50, 550, 750), q_text, fontsize=11, fontname="helv", align=0)
    doc.save(pdf)
    doc.close()
    pdf.seek(0)
    return pdf
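# Illustrative usage (e.g. behind a Streamlit download button):
#   report = generate_pdf_report(shortlisted, questions)
#   st.download_button("Download report", report, file_name="candidates_report.pdf")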
def generate_interview_questions_from_summaries(candidates):
    """
    Generate five interview questions from the shortlisted candidates' summaries.
    Returns a list of formatted question strings, or a warning entry on failure.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")
    summaries = " ".join(c.get("summary", "") for c in candidates)
    prompt = (
        "Based on the following summaries of top candidates for a job role, "
        "generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n\n"
        f"{summaries}"
    )
try:
response = client.chat.completions.create(
model="tgi",
messages=[{"role": "user", "content": prompt}],
temperature=0.7,
max_tokens=500,
)
result = response.choices[0].message.content
# Clean and normalize questions
raw_questions = result.split("\n")
questions = []
for q in raw_questions:
q = q.strip()
# Skip empty lines and markdown headers
if not q or re.match(r"^#+\s*", q):
continue
# Remove leading bullets like "1.", "1)", "- 1.", etc.
q = re.sub(r"^(?:[-*]?\s*)?(?:Q?\d+[\.\)\-]?\s*)+", "", q)
# Remove markdown bold/italics (**, *, etc.)
q = re.sub(r"[*_]+", "", q)
            # Strip surrounding spaces and stray trailing periods (questions keep their "?")
            q = q.strip(" .")
questions.append(q.strip())
return [f"Q{i+1}. {q}" for i, q in enumerate(questions[:5])] or ["⚠️ No questions generated."]
except Exception as e:
print(f"❌ Error generating interview questions: {e}")
return ["⚠️ Error generating questions."]
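# Illustrative usage (hypothetical data; model output varies):
#   questions = generate_interview_questions_from_summaries(shortlisted)
#   # -> ["Q1. ...", "Q2. ...", ...] or a single warning entry on failure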