Spaces:

gauravbox
/

TalentLensAI

Sleeping

TalentLensAI / utils.py

Johnny

added generate-questions function with Gemma via the Hugging Face client

949011b 3 months ago

11 kB

	import fitz # PyMuPDF for PDF processing
	import requests
	import json
	import re
	from io import BytesIO
	import supabase
	from config import (
	SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS,
	supabase, HF_MODELS, query, embedding_model
	)
	from sentence_transformers import SentenceTransformer, util
	import spacy
	from collections import Counter
	from sklearn.feature_extraction.text import TfidfVectorizer
	import streamlit as st
	from fuzzywuzzy import fuzz
	import subprocess
	import random
	from huggingface_hub import InferenceClient
	import os

	# Module-level Hugging Face inference client, configured for the
	# "google/gemma-1.1-7b-it" model and authenticated with HF_API_TOKEN
	# (imported from config). It is used by
	# generate_interview_questions_from_summaries() for chat completions.
	client = InferenceClient(
	model="google/gemma-1.1-7b-it",
	token=HF_API_TOKEN
	)

	# These functions will be called in the app.py file

	# Load spaCy NLP model, downloading it once if it is not installed yet
	try:
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    import sys

	    # Run the download with the interpreter that is executing this module
	    # (a bare "python" may resolve to a different environment), and fail
	    # loudly if the download itself fails instead of retrying blindly.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=True,
	    )
	    nlp = spacy.load("en_core_web_sm")

	def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2,
	                     min_score=0.20, top_n=5):
	    """Evaluates uploaded resumes, filters by keywords and score, and returns shortlisted candidates.

	    :param uploaded_files: Iterable of uploaded PDF file objects; each must
	        expose ``.name`` and ``.read()``.
	    :param job_description: Job description text the resumes are compared to.
	    :param min_keyword_match: Minimum number of distinct job keywords a
	        resume must contain to survive keyword filtering.
	    :param min_score: Resumes scoring below this similarity are rejected.
	    :param top_n: Maximum number of candidates in the shortlist.
	    :return: Tuple ``(shortlisted_candidates, removed_candidates)``. The first
	        is a list of candidate dicts (name, resume, score, email, summary)
	        sorted by descending score; the second is a list of
	        ``{"name", "reason"}`` dicts for every rejected resume.
	    """
	    candidates = []
	    removed_candidates = []

	    # Step 1: parse and score every resume, dropping low scorers early
	    for pdf_file in uploaded_files:
	        resume_text = parse_resume(pdf_file)
	        score = score_candidate(resume_text, job_description)

	        if score < min_score:
	            removed_candidates.append({
	                "name": pdf_file.name,
	                "reason": f"Low confidence score (< {min_score:.2f})",
	            })
	            continue  # Skip adding to candidates list

	        candidates.append({
	            "name": pdf_file.name,
	            "resume": resume_text,
	            "score": score,
	            "email": extract_email(resume_text),
	        })

	    # Step 2: filter candidates based on keyword matches
	    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
	        candidates, job_description, min_keyword_match
	    )

	    # Step 3: log removed candidates
	    removed_candidates.extend(
	        {"name": name, "reason": "Insufficient keyword matches"}
	        for name in keyword_removed
	    )

	    # Step 4: sort by score and keep only the top candidates
	    shortlisted_candidates = sorted(
	        filtered_candidates, key=lambda c: c["score"], reverse=True
	    )[:top_n]

	    # Step 5: summarize only the shortlisted resumes. Summarization is a
	    # remote API call, so it is not spent on resumes that were rejected.
	    for candidate in shortlisted_candidates:
	        candidate["summary"] = summarize_resume(candidate["resume"])

	    return shortlisted_candidates, removed_candidates

	def extract_keywords(text, top_n=10):
	    """Extracts key terms from the job description using TF-IDF and spaCy.

	    :param text: Job description text; ``None``, empty, or whitespace-only
	        input yields an empty list.
	    :param top_n: Maximum number of keywords to return.
	    :return: List of keyword strings (unigrams and bigrams), highest TF-IDF
	        score first; empty if no usable keywords were found.
	    """
	    if not text or not text.strip():  # Handle missing/empty job descriptions
	        return []

	    doc = nlp(text.lower())

	    # Keep only meaningful words (nouns, proper nouns, verbs, adjectives)
	    keywords = [
	        token.text
	        for token in doc
	        if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not token.is_stop
	    ]
	    if not keywords:
	        return []

	    # Use TF-IDF to rank keywords
	    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
	    try:
	        tfidf_matrix = vectorizer.fit_transform([" ".join(keywords)])
	    except ValueError:  # Empty vocabulary after stop-word removal
	        return []

	    feature_array = vectorizer.get_feature_names_out()
	    tfidf_scores = tfidf_matrix.toarray()[0]

	    # Sort by highest TF-IDF scores
	    ranked = sorted(zip(feature_array, tfidf_scores), key=lambda pair: pair[1], reverse=True)
	    return [keyword for keyword, _ in ranked[:top_n]]

	def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
	    """Filters resumes based on keyword presence and similarity.

	    :param resumes: List of candidate dicts; each must have "resume" (text)
	        and "name" keys.
	    :param job_description: Job description the keywords are extracted from.
	    :param min_keyword_match: Minimum number of distinct job keywords that
	        must fuzzily match some word of the resume.
	    :return: Tuple ``(filtered_resumes, removed_names)``. If the job
	        description yields fewer keywords than ``min_keyword_match``, keyword
	        filtering is skipped and ``(resumes, [])`` is returned.
	    """
	    job_keywords = extract_keywords(job_description)

	    if len(job_keywords) < min_keyword_match:
	        st.warning("⚠️ Job description is either too short or absent for keyword filtering.")
	        return resumes, []  # Skip keyword filtering if job description lacks enough keywords

	    filtered_resumes = []
	    removed_resumes = []

	    for resume in resumes:
	        # Split once per resume and de-duplicate: only whether a keyword
	        # matches *some* word matters, not how often or where.
	        resume_words = set(resume["resume"].lower().split())

	        # Fuzzy matching (80% similarity threshold) allows flexible keyword
	        # detection; any() stops scanning at the first matching word.
	        matched_count = sum(
	            1
	            for keyword in job_keywords
	            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume_words)
	        )

	        # Enforce minimum keyword matches
	        if matched_count >= min_keyword_match:
	            filtered_resumes.append(resume)
	        else:
	            removed_resumes.append(resume["name"])

	    return filtered_resumes, removed_resumes

	def parse_resume(pdf_file):
	    """Extracts text from a resume PDF.

	    :param pdf_file: File-like object whose ``read()`` returns the PDF bytes.
	    :return: Text of all pages, joined with newlines.
	    """
	    # The context manager closes the document once the text is extracted,
	    # so its resources are not held for the rest of the run.
	    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
	        return "\n".join(page.get_text("text") for page in doc)

	def extract_email(resume_text):
	    """Extracts an email address from resume text.

	    :param resume_text: Text to search; ``None`` or empty text yields ``None``.
	    :return: The first email-like token found, or ``None`` if there is none.
	    """
	    if not resume_text:
	        return None

	    # Local part: starts with a word character and may contain ".", "+", "-"
	    # (so plus-addressing such as "jane+jobs@..." is kept whole).
	    # Domain: dot-separated labels, each ending on a word/hyphen character,
	    # so sentence punctuation after the address ("...@example.com.") is not
	    # captured as part of it.
	    match = re.search(r"\w[\w.+-]*@[\w-]+(?:\.[\w-]+)+", resume_text)
	    return match.group(0) if match else None

	def score_candidate(resume_text, job_description):
	    """
	    Measures how closely a resume matches a job description.

	    Both texts are embedded with the shared sentence-transformers model and
	    compared by cosine similarity.

	    :param resume_text: The extracted resume text.
	    :param job_description: The job description for comparison.
	    :return: Cosine similarity rounded to 4 decimal places, or 0 if the
	        score could not be computed.
	    """
	    try:
	        # Embed the resume first, then the job description
	        resume_vec, job_vec = (
	            embedding_model.encode(text, convert_to_tensor=True)
	            for text in (resume_text, job_description)
	        )
	        similarity = util.pytorch_cos_sim(resume_vec, job_vec).item()
	        return round(similarity, 4)
	    except Exception as e:
	        # Best-effort scoring: report the problem and fall back to 0
	        print(f"Error computing similarity score: {e}")
	        return 0

	def summarize_resume(resume_text):
	    """
	    Summarizes a resume using the Hugging Face BART model with improved error handling.

	    :param resume_text: The extracted resume text.
	    :return: The stripped summary text, or a fallback message
	        ("Summary unavailable." / "Summary unavailable due to API issues.")
	        when there is nothing to summarize or the API response is unusable.
	    """
	    # Nothing to summarize (e.g. a PDF without a text layer): skip the API
	    # call rather than asking the model to summarize an empty prompt.
	    if not resume_text or not resume_text.strip():
	        return "Summary unavailable."

	    payload = {"inputs": f"Summarize this resume: {resume_text}"}
	    response = query(payload, model="bart")

	    if not response:
	        print("⚠️ Error: API response is None. Returning fallback summary.")
	        return "Summary unavailable due to API issues."

	    # The API may wrap the result in a list; the falsy check above already
	    # guarantees such a list is non-empty.
	    if isinstance(response, list):
	        response = response[0]

	    # Always bind `summary`, so a non-dict response falls through to the
	    # "unexpected format" branch instead of raising on an unbound name.
	    summary = None
	    if isinstance(response, dict):
	        summary = response.get("generated_text") or response.get("summary_text")

	    if isinstance(summary, str) and summary.strip():
	        return summary.strip()

	    print("⚠️ Unexpected API response format:", response)
	    return "Summary unavailable."

	def store_in_supabase(resume_text, score, candidate_name, email, summary):
	    """
	    Inserts one candidate record into the Supabase "candidates" table.

	    :param resume_text: The extracted resume text.
	    :param score: The candidate's score; ``None`` is stored as 0.
	    :param candidate_name: The candidate's name.
	    :param email: Candidate's email address.
	    :param summary: A summarized version of the resume.
	    :return: The response object returned by executing the insert.
	    """
	    record = {
	        "name": candidate_name,
	        "resume": resume_text,
	        # Ensure score is never NULL
	        "score": 0 if score is None else score,
	        "email": email,
	        "summary": summary,
	    }
	    return supabase.table("candidates").insert(record).execute()

	def _write_paginated_text(doc, text_lines, text_rect, font_size, lines_per_page):
	    """Writes pre-wrapped lines into ``doc``, adding as many pages as needed."""
	    start = 0
	    while start < len(text_lines):
	        page = doc.new_page()
	        count = min(lines_per_page, len(text_lines) - start)

	        # insert_textbox() returns a negative value (and writes nothing) when
	        # the text does not fit, so shrink the chunk until it does.
	        while count > 1:
	            chunk = "\n".join(text_lines[start:start + count])
	            if page.insert_textbox(text_rect, chunk, fontsize=font_size, fontname="helv", align=0) >= 0:
	                break
	            count -= 1
	        else:
	            # Down to a single line: write it unconditionally so the loop
	            # always makes progress.
	            page.insert_textbox(text_rect, text_lines[start], fontsize=font_size, fontname="helv", align=0)

	        start += count


	def generate_pdf_report(shortlisted_candidates):
	    """Generates a PDF summary of shortlisted candidates with proper text wrapping.

	    Each candidate starts on a new page; text that does not fit in the text
	    area continues on following pages instead of being dropped.

	    :param shortlisted_candidates: List of candidate dicts with "name",
	        "email" and "score" keys and an optional "summary" key.
	    :return: ``BytesIO`` positioned at 0 containing the PDF bytes.
	    """
	    import textwrap  # local import: only needed for report layout

	    # Define text area properties (in points)
	    text_box_x = 50  # Left margin
	    text_box_y = 50  # Top margin
	    text_box_width = 500  # Max width before wrapping
	    text_box_height = 700  # Max height before splitting to a new page
	    font_size = 11  # Font size for better readability

	    # Deliberately conservative layout estimates; _write_paginated_text
	    # still checks insert_textbox()'s result and shrinks a chunk if needed.
	    chars_per_line = max(1, int(text_box_width / (font_size * 0.6)))
	    lines_per_page = max(1, int(text_box_height / (font_size * 1.5)))

	    text_rect = fitz.Rect(
	        text_box_x, text_box_y,
	        text_box_x + text_box_width, text_box_y + text_box_height,
	    )

	    pdf = BytesIO()
	    doc = fitz.open()
	    try:
	        for candidate in shortlisted_candidates:
	            # Use stored summary, or provide a fallback
	            summary = candidate.get("summary") or "No summary available"

	            # Format candidate details
	            candidate_info = (
	                f"Candidate: {candidate['name']}\n"
	                f"Email: {candidate['email']}\n"
	                f"Score: {candidate['score']}\n\n"
	                f"Summary:\n{summary}"
	            )

	            # Pre-wrap every paragraph, keeping blank lines as separators
	            text_lines = []
	            for paragraph in candidate_info.split("\n"):
	                text_lines.extend(textwrap.wrap(paragraph, width=chars_per_line) or [""])

	            _write_paginated_text(doc, text_lines, text_rect, font_size, lines_per_page)

	        # A document without pages cannot be saved; emit a placeholder page
	        if doc.page_count == 0:
	            _write_paginated_text(
	                doc, ["No shortlisted candidates."], text_rect, font_size, lines_per_page
	            )

	        doc.save(pdf)
	    finally:
	        doc.close()  # Release the document even if rendering/saving fails

	    pdf.seek(0)
	    return pdf

	def generate_interview_questions_from_summaries(candidates):
	    """
	    Generates common interview questions based on the combined summaries of shortlisted candidates.
	    Uses the Hugging Face Gemma model to generate questions.

	    :param candidates: List of candidate dicts; the "summary" value of each
	        is used, and candidates without a summary are ignored.
	    :return: List of up to 5 question strings, or a single-item list with a
	        warning message when nothing could be generated.
	    :raises TypeError: If ``candidates`` is not a list.
	    """
	    if not isinstance(candidates, list):
	        raise TypeError("Expected a list of candidate dictionaries.")

	    # Keep only non-empty summaries (missing/None/"" entries are skipped)
	    summaries = [s for s in (c.get("summary") for c in candidates) if s]
	    combined_summary = " ".join(summaries).strip()

	    # Without any summary text the prompt would be empty; skip the API call
	    # rather than have the model invent questions about nothing.
	    if not combined_summary:
	        return ["⚠️ No questions generated."]

	    prompt = (
	        "Based on the following summary of this top candidate for a job role, generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n"
	        f"{combined_summary}"
	    )

	    try:
	        response = client.chat_completion(
	            messages=[
	                {"role": "user", "content": prompt}
	            ],
	            temperature=0.7,
	            max_tokens=500
	        )
	        result_text = response.choices[0].message.content
	        # One question per non-empty line of the model output
	        questions = [q.strip() for q in result_text.split("\n") if q.strip()]
	        return questions[:5] if questions else ["⚠️ No questions generated."]
	    except Exception as e:
	        # Best-effort feature: report the failure and return a placeholder
	        print(f"❌ Error generating interview questions: {e}")
	        return ["⚠️ Error generating questions."]