Spaces:
Running
Running
import fitz # PyMuPDF for PDF processing | |
import requests | |
import json | |
import re | |
from io import BytesIO | |
import supabase | |
from config import ( | |
SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS, | |
supabase, HF_MODELS, query, embedding_model | |
) | |
from sentence_transformers import SentenceTransformer, util | |
import spacy | |
from collections import Counter | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import streamlit as st | |
from fuzzywuzzy import fuzz | |
import subprocess | |
import random | |
from huggingface_hub import InferenceClient | |
import os | |
# Hugging Face inference client used by
# generate_interview_questions_from_summaries().
client = InferenceClient(
    model="google/gemma-1.1-7b-it",
    token=HF_API_TOKEN,
)

# These functions will be called in the app.py file

# Load the spaCy English pipeline; if loading fails with OSError, download the
# model once and retry.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import sys

    # Run the download with the interpreter executing this module rather than
    # whatever "python" happens to be first on PATH, so the model is installed
    # into the active environment. check=True surfaces a failed download
    # immediately instead of letting the retry below fail with a vaguer error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
def evaluate_resumes(
    uploaded_files,
    job_description,
    min_keyword_match=2,
    min_score=0.20,
    max_shortlisted=5,
):
    """Evaluate uploaded resumes and return the shortlisted candidates.

    Each resume is parsed and scored against the job description. Resumes
    scoring below ``min_score`` are rejected immediately; the rest are
    summarized, filtered by keyword matches, sorted by score (highest first)
    and truncated to ``max_shortlisted`` entries.

    :param uploaded_files: Iterable of uploaded PDF file objects. Each must
        expose ``.name`` and be accepted by ``parse_resume``.
    :param job_description: Job description text to compare resumes against.
    :param min_keyword_match: Minimum number of distinct job keywords a resume
        must match to survive keyword filtering.
    :param min_score: Minimum similarity score a resume needs to be kept.
    :param max_shortlisted: Maximum number of candidates to return
        (``None`` means no limit).
    :return: Tuple ``(shortlisted_candidates, removed_candidates)``. The first
        is a list of candidate dicts (``name``, ``resume``, ``score``,
        ``email``, ``summary``); the second is a list of
        ``{"name", "reason"}`` dicts describing every rejected resume.
    """
    candidates = []
    removed_candidates = []

    # Step 1: parse and score every resume, rejecting low scorers early.
    for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)

        if score < min_score:
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Low confidence score (< {min_score:.2f})",
            })
            continue

        # Email extraction and (remote) summarization only happen for resumes
        # that passed the score check, so rejected resumes cost no API call.
        candidates.append({
            "name": pdf_file.name,
            "resume": resume_text,
            "score": score,
            "email": extract_email(resume_text),
            "summary": summarize_resume(resume_text),
        })

    # Step 2: filter candidates based on keyword matches.
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # Step 3: log candidates removed by the keyword filter.
    removed_candidates.extend(
        {"name": name, "reason": "Insufficient keyword matches"}
        for name in keyword_removed
    )

    # Step 4: best scores first, limited to the requested shortlist size.
    # sorted() always returns a list, so no further type check is needed.
    ranked = sorted(filtered_candidates, key=lambda c: c["score"], reverse=True)
    shortlisted_candidates = ranked[:max_shortlisted]

    return shortlisted_candidates, removed_candidates
def extract_keywords(text, top_n=10):
    """Extract key terms from a job description using spaCy and TF-IDF.

    The text is lower-cased and run through the spaCy pipeline; only
    non-stop-word nouns, proper nouns, verbs and adjectives are kept. The
    surviving tokens are joined into one string and ranked with a
    ``TfidfVectorizer`` over unigrams and bigrams.

    :param text: Job description text. ``None``, empty or whitespace-only
        input yields an empty list.
    :param top_n: Maximum number of keywords to return.
    :return: Up to ``top_n`` keywords (unigrams or bigrams), highest TF-IDF
        score first; an empty list when nothing usable is found.
    """
    # Treat a missing description the same as an empty one instead of crashing.
    if not text or not text.strip():
        return []

    doc = nlp(text.lower())

    # Keep meaningful words only (nouns, proper nouns, verbs, adjectives).
    content_tags = {"NOUN", "PROPN", "VERB", "ADJ"}
    keywords = [
        token.text
        for token in doc
        if token.pos_ in content_tags and not token.is_stop
    ]
    if not keywords:
        return []

    # NOTE: bigrams are built from the filtered token sequence, so a bigram
    # may pair words that were not adjacent in the original text.
    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform([" ".join(keywords)])
    except ValueError:
        # Raised when the vocabulary is empty after the vectorizer's own
        # stop-word filtering.
        return []

    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Highest-scoring terms first.
    ranked = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term for term, _ in ranked[:top_n]]
def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """Filter resumes by fuzzy presence of the job description's keywords.

    :param resumes: List of candidate dicts; each must contain ``"resume"``
        (the resume text) and ``"name"``.
    :param job_description: Job description text the keywords are taken from.
    :param min_keyword_match: Minimum number of distinct keywords a resume must
        match to be kept.
    :return: Tuple ``(filtered_resumes, removed_names)``: the candidate dicts
        that passed, and the ``"name"`` of every candidate that did not. When
        the job description yields fewer than ``min_keyword_match`` keywords,
        filtering is skipped and ``(resumes, [])`` is returned.
    """
    job_keywords = extract_keywords(job_description)

    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description is either too short or absent for keyword filtering.")
        return resumes, []  # Not enough keywords to filter on.

    filtered_resumes = []
    removed_resumes = []

    for resume in resumes:
        # De-duplicate the resume's words once so each distinct word is
        # compared at most once per keyword.
        resume_words = set(resume["resume"].lower().split())

        # A keyword counts as matched as soon as any word is > 80% similar;
        # any() stops scanning at the first hit.
        # NOTE(review): partial_ratio aligns the shorter string inside the
        # longer one, so very short resume tokens can score highly against
        # longer keywords — confirm this leniency is intended.
        matched_keywords = {
            keyword
            for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume_words)
        }

        # Enforce the minimum number of distinct keyword matches.
        if len(matched_keywords) >= min_keyword_match:
            filtered_resumes.append(resume)
        else:
            removed_resumes.append(resume["name"])

    return filtered_resumes, removed_resumes
def parse_resume(pdf_file):
    """Extract the plain text of a resume PDF.

    :param pdf_file: Binary file-like object whose ``read()`` returns the PDF
        bytes (for example an uploaded file).
    :return: The text of all pages, joined with newlines.
    """
    # The context manager closes the document (and frees its buffers) even if
    # text extraction raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
def extract_email(resume_text):
    """Extract the first email address found in resume text.

    :param resume_text: Resume text to search; may be ``None`` or empty.
    :return: The first email address found, or ``None`` when there is none.
    """
    if not resume_text:
        return None

    # Local part: word characters plus ".", "+" and "-".
    # Domain: dot-separated labels with at least one dot, each label starting
    # and ending on a label character — so sentence punctuation right after
    # the address ("... at a@b.com.") is not captured.
    email_pattern = r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+"
    match = re.search(email_pattern, resume_text)
    return match.group(0) if match else None
def score_candidate(resume_text, job_description):
    """
    Scores the candidate's resume against the job description using
    sentence-transformers embeddings.

    :param resume_text: The extracted resume text.
    :param job_description: The job description for comparison.
    :return: Cosine similarity of the two embeddings rounded to 4 decimal
        places (cosine similarity can range from -1 to 1), or ``0.0`` when
        either text is empty or scoring fails.
    """
    # Nothing meaningful to compare (e.g. an image-only PDF that produced no
    # text): skip the model instead of scoring an empty string.
    if not resume_text or not resume_text.strip():
        return 0.0
    if not job_description or not job_description.strip():
        return 0.0

    try:
        # Generate embeddings
        resume_embedding = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_embedding = embedding_model.encode(job_description, convert_to_tensor=True)

        # Compute cosine similarity
        score = util.pytorch_cos_sim(resume_embedding, job_embedding).item()
        return round(score, 4)
    except Exception as e:
        # Deliberate best-effort: a scoring failure must not abort the whole
        # batch, so report it and score the resume as 0.
        print(f"Error computing similarity score: {e}")
        return 0.0
def summarize_resume(resume_text):
    """
    Summarizes a resume through the ``query`` helper (called with
    ``model="bart"``), falling back to a placeholder string on any failure.

    :param resume_text: The extracted resume text.
    :return: The stripped summary text, ``"Summary unavailable due to API
        issues."`` when the API returns nothing, or ``"Summary unavailable."``
        when there is no text to summarize or the response is unusable.
    """
    fallback = "Summary unavailable."

    # No text to summarize (e.g. an image-only PDF): skip the API call.
    if not resume_text or not resume_text.strip():
        return fallback

    payload = {"inputs": f"Summarize this resume: {resume_text}"}
    response = query(payload, model="bart")

    if not response:
        print("⚠️ Error: API response is None. Returning fallback summary.")
        return "Summary unavailable due to API issues."

    # A list response carries the result in its first element. The list is
    # non-empty here because empty responses were handled above.
    if isinstance(response, list):
        response = response[0]

    if isinstance(response, dict):
        summary = response.get("generated_text") or response.get("summary_text")
        # Only accept a real, non-blank string; anything else is treated as an
        # unexpected format below.
        if isinstance(summary, str) and summary.strip():
            return summary.strip()

    print("⚠️ Unexpected API response format:", response)
    return fallback
def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Inserts one candidate record into the Supabase ``candidates`` table.

    :param resume_text: The extracted resume text.
    :param score: The candidate's score; ``None`` is stored as 0.
    :param candidate_name: The candidate's name.
    :param email: Candidate's email address.
    :param summary: A summarized version of the resume.
    :return: The response returned by executing the insert.
    """
    record = {
        "name": candidate_name,
        "resume": resume_text,
        # The score column must never receive NULL.
        "score": 0 if score is None else score,
        "email": email,
        "summary": summary,
    }
    return supabase.table("candidates").insert(record).execute()
def generate_pdf_report(shortlisted_candidates):
    """Generate a PDF summary of shortlisted candidates with text wrapping.

    Every candidate starts on a new page. Text that does not fit on one page
    continues on following pages.

    :param shortlisted_candidates: Iterable of candidate dicts with ``name``,
        ``email`` and ``score`` keys and an optional ``summary`` key.
    :return: ``BytesIO`` holding the PDF, positioned at the start.
    """
    import textwrap

    # Text area properties.
    text_box_x = 50        # Left margin
    text_box_y = 50        # Top margin
    text_box_width = 500   # Max width before wrapping
    text_box_height = 700  # Max height before continuing on a new page
    font_size = 11         # Font size for better readability
    wrap_columns = 85      # Pre-wrap width in characters (see below)
    # Rough per-line height, used only to guess how many lines to drop when a
    # chunk overflows; an imprecise guess just costs an extra retry.
    approx_line_height = font_size * 1.5

    text_rect = fitz.Rect(
        text_box_x,
        text_box_y,
        text_box_x + text_box_width,
        text_box_y + text_box_height,
    )

    def write_paginated(doc, text):
        """Write *text* into *doc*, adding as many pages as it needs."""
        # Pre-wrap into short lines so there are line boundaries to paginate
        # on; insert_textbox still does the final fitting to the box width.
        lines = []
        for raw_line in text.splitlines():
            lines.extend(textwrap.wrap(raw_line, width=wrap_columns) or [""])

        while lines:
            page = doc.new_page()
            count = len(lines)
            while True:
                # insert_textbox returns >= 0 (unused height) on success. A
                # negative value means the text did not fit and NOTHING was
                # written, so retry on the same page with fewer lines.
                leftover = page.insert_textbox(
                    text_rect,
                    "\n".join(lines[:count]),
                    fontsize=font_size,
                    fontname="helv",
                    align=0,
                )
                # count == 1 is a safety stop: one pre-wrapped line should
                # always fit, and stopping here guarantees termination.
                if leftover >= 0 or count == 1:
                    break
                deficit_lines = int(-leftover // approx_line_height)
                count = max(1, count - max(1, deficit_lines))
            del lines[:count]

    pdf = BytesIO()
    # The context manager closes the document even if rendering fails.
    with fitz.open() as doc:
        for candidate in shortlisted_candidates:
            # Use the stored summary, or a fallback when missing or empty.
            summary = candidate.get("summary") or "No summary available"

            candidate_info = (
                f"Candidate: {candidate['name']}\n"
                f"Email: {candidate['email']}\n"
                f"Score: {candidate['score']}\n\n"
                f"Summary:\n{summary}"
            )
            write_paginated(doc, candidate_info)

        if len(doc) == 0:
            # A document without pages cannot be saved; emit a placeholder
            # page so an empty shortlist still produces a valid PDF.
            write_paginated(doc, "No shortlisted candidates.")

        doc.save(pdf)

    pdf.seek(0)
    return pdf
def generate_interview_questions_from_summaries(candidates):
    """
    Generates common interview questions based on the combined summaries of
    shortlisted candidates, using the module-level chat ``client``.

    :param candidates: List of candidate dicts; the ``"summary"`` value of each
        is used when it is a non-blank string.
    :return: Up to 5 non-empty lines of the model's reply, or a single-item
        list with a warning message when there is nothing to ask about, the
        reply is empty, or the request fails.
    :raises TypeError: If ``candidates`` is not a list.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")

    # Keep only real, non-blank summaries; a None or missing summary must not
    # break the join below.
    summaries = [
        text
        for text in (c.get("summary") for c in candidates)
        if isinstance(text, str) and text.strip()
    ]
    if not summaries:
        # Nothing to base questions on: skip the model call entirely.
        return ["⚠️ No questions generated."]

    combined_summary = " ".join(summaries)
    prompt = (
        "Based on the following summary of this top candidate for a job role, generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n"
        f"{combined_summary}"
    )

    try:
        response = client.chat_completion(
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=500
        )
        result_text = response.choices[0].message.content
        # One question per non-empty line of the reply.
        questions = [q.strip() for q in result_text.split("\n") if q.strip()]
        return questions[:5] if questions else ["⚠️ No questions generated."]
    except Exception as e:
        # Best-effort: a failed generation must not break the calling page.
        print(f"❌ Error generating interview questions: {e}")
        return ["⚠️ Error generating questions."]