# TalentLensAI / utils.py
# === Imports ===
# Standard Library
import re
import subprocess
import sys
from io import BytesIO

# Third-Party Libraries
import fitz  # PyMuPDF
import spacy
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import util
from sklearn.feature_extraction.text import TfidfVectorizer

# Local Configuration
from config import supabase, embedding_model, client
# === Initialization ===
# The Hugging Face inference client (`client`) and the embedding model are
# initialized in config.py and imported above, so no inline setup is needed here.
# Load the spaCy model, downloading it on first run if necessary
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the current interpreter so the model installs into the active environment
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")
# === Core Resume Evaluation ===
def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
"""
Evaluate uploaded resumes and return shortlisted candidates with scores and summaries.
"""
candidates, removed_candidates = [], []
for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)
        # Reject low-scoring resumes before the comparatively expensive LLM summary call
        if score < 0.20:
            removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
            continue
        email = extract_email(resume_text)
        summary = summarize_resume(resume_text)
candidates.append({
"name": pdf_file.name,
"resume": resume_text,
"score": score,
"email": email,
"summary": summary
})
# 🔹 Step 2: Filter candidates based on keyword matches
filtered_candidates, keyword_removed = filter_resumes_by_keywords(
candidates, job_description, min_keyword_match
)
# 🔹 Step 3: Log removed candidates
for name in keyword_removed:
removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})
# 🔹 Step 4: Ensure the final list is sorted by score and limit to top 5 candidates
shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]
    # 🔹 Step 5: Store shortlisted candidates in Supabase
for candidate in shortlisted_candidates:
try:
store_in_supabase(
resume_text=candidate["resume"],
score=candidate["score"],
candidate_name=candidate["name"],
email=candidate["email"],
summary=candidate["summary"]
)
except Exception as e:
print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")
return shortlisted_candidates, removed_candidates
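# Illustrative usage (a hedged sketch; `files` and `job_description` are
# hypothetical names, e.g. from the Streamlit app that calls this module):
#   files = st.file_uploader("Resumes", type="pdf", accept_multiple_files=True)
#   if files:
#       shortlisted, removed = evaluate_resumes(files, job_description)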
# === Keyword & Scoring Functions ===
def extract_keywords(text, top_n=10):
"""
Extracts top keywords from the job description using spaCy and TF-IDF.
"""
if not text.strip():
return []
doc = nlp(text.lower())
keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]
if not keywords:
return []
    try:
        # NOTE: with a single input document, every term's IDF is identical,
        # so this ranking is effectively term-frequency based.
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
except ValueError:
return []
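# Illustrative example (actual output depends on the spaCy model and the text):
#   extract_keywords("Seeking a Python developer with AWS and Docker experience", top_n=3)
#   might return something like ["python", "aws", "docker"].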
def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
"""
Filters resumes by keyword match using fuzzy logic.
"""
job_keywords = extract_keywords(job_description)
if len(job_keywords) < min_keyword_match:
st.warning("⚠️ Job description too short or missing for keyword filtering.")
return resumes, []
    filtered, removed = [], []
    for resume in resumes:
        # Pre-split once per resume instead of once per keyword
        resume_words = resume["resume"].lower().split()
        matched = {
            keyword for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume_words)
        }
if len(matched) >= min_keyword_match:
filtered.append(resume)
else:
removed.append(resume["name"])
return filtered, removed
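# Illustrative example (hypothetical data): if the job description mentions
# "Python" and "SQL" and a resume contains both, it is kept; resumes matching
# fewer than min_keyword_match keywords have their names returned in the
# second list, mirroring how evaluate_resumes logs its removals.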
def score_candidate(resume_text, job_description):
"""
Computes cosine similarity between resume and job description using embeddings.
"""
try:
resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
score = util.pytorch_cos_sim(resume_vec, job_vec).item()
return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0.0
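# Illustrative example (exact scores vary by embedding model):
#   score_candidate("Senior Python engineer, 8 years of Django", "Hiring a Python engineer")
#   should score noticeably higher than the same resume against an unrelated role.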
# === Text Extraction & Summarization ===
def parse_resume(pdf_file):
    """
    Extract raw text from an uploaded PDF file-like object.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
def extract_email(resume_text):
    """
    Extract the first email address found in the text, or None.
    """
    match = re.search(r"[\w.+-]+@[\w-]+\.[\w.-]+", resume_text)
    return match.group(0) if match else None
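# Example: extract_email("Contact: jane.doe@example.com, 555-0100")
# returns "jane.doe@example.com"; it returns None when no address is present.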
def summarize_resume(resume_text):
    """
    Generate a recruiter-style professional summary of the resume via the LLM client.
    Returns a fallback message if the API call fails.
    """
prompt = (
"You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
"Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
"Format it as a professional summary paragraph.\n\n"
f"Resume:\n{resume_text}\n\n"
"Summary:"
)
try:
response = client.chat.completions.create(
model="tgi",
messages=[{"role": "user", "content": prompt}],
temperature=0.5,
max_tokens=300,
)
result = response.choices[0].message.content.strip()
# Clean up generic lead-ins from the model
cleaned = re.sub(
r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
"", result, flags=re.IGNORECASE
).strip()
return cleaned
except Exception as e:
print(f"❌ Error generating structured summary: {e}")
return "Summary unavailable due to API issues."
# === Data Storage & Reporting ===
def store_in_supabase(resume_text, score, candidate_name, email, summary):
"""
Saves candidate data to the Supabase table.
"""
data = {
"name": candidate_name,
"resume": resume_text,
"score": score or 0,
"email": email,
"summary": summary
}
return supabase.table("candidates").insert(data).execute()
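# Illustrative usage (assumes the "candidates" table exists in the configured
# Supabase project, as this module expects):
#   store_in_supabase(resume_text=text, score=0.87, candidate_name="a.pdf",
#                     email="jane.doe@example.com", summary="...")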
def generate_pdf_report(shortlisted_candidates, questions=None):
"""
Creates a PDF report summarizing top candidates and interview questions.
"""
pdf = BytesIO()
doc = fitz.open()
for candidate in shortlisted_candidates:
page = doc.new_page()
info = (
f"Candidate: {candidate['name']}\n"
f"Email: {candidate['email']}\n"
f"Score: {candidate['score']}\n\n"
f"Summary:\n{candidate.get('summary', 'No summary available')}"
)
page.insert_textbox(fitz.Rect(50, 50, 550, 750), info, fontsize=11, fontname="helv", align=0)
if questions:
q_page = doc.new_page()
q_text = "Suggested Interview Questions:\n\n" + "\n".join(questions)
q_page.insert_textbox(fitz.Rect(50, 50, 550, 750), q_text, fontsize=11, fontname="helv", align=0)
    doc.save(pdf)
    doc.close()
    pdf.seek(0)
    return pdf
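# Illustrative usage (e.g. behind a Streamlit download button):
#   report = generate_pdf_report(shortlisted, questions)
#   st.download_button("Download report", report, file_name="candidates_report.pdf")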
def generate_interview_questions_from_summaries(candidates):
    """
    Generate five interview questions from the shortlisted candidates' summaries.
    Returns a list of formatted question strings, or a warning entry on failure.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")
    summaries = " ".join(c.get("summary", "") for c in candidates)
    prompt = (
        "Based on the following summaries of top candidates for a job role, "
        "generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n\n"
        f"{summaries}"
    )
try:
response = client.chat.completions.create(
model="tgi",
messages=[{"role": "user", "content": prompt}],
temperature=0.7,
max_tokens=500,
)
result = response.choices[0].message.content
# Clean and normalize questions
raw_questions = result.split("\n")
questions = []
for q in raw_questions:
q = q.strip()
# Skip empty lines and markdown headers
if not q or re.match(r"^#+\s*", q):
continue
# Remove leading bullets like "1.", "1)", "- 1.", etc.
q = re.sub(r"^(?:[-*]?\s*)?(?:Q?\d+[\.\)\-]?\s*)+", "", q)
# Remove markdown bold/italics (**, *, etc.)
q = re.sub(r"[*_]+", "", q)
            # Strip surrounding spaces and stray trailing periods (questions keep their "?")
            q = q.strip(" .")
questions.append(q.strip())
return [f"Q{i+1}. {q}" for i, q in enumerate(questions[:5])] or ["⚠️ No questions generated."]
except Exception as e:
print(f"❌ Error generating interview questions: {e}")
return ["⚠️ Error generating questions."]
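# Illustrative usage (hypothetical data; model output varies):
#   questions = generate_interview_questions_from_summaries(shortlisted)
#   # -> ["Q1. ...", "Q2. ...", ...] or a single warning entry on failure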