# === Imports ===
# Standard Library
import re
import subprocess
import sys
from io import BytesIO

# Third-Party Libraries
import fitz  # PyMuPDF
import spacy
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import util
from sklearn.feature_extraction.text import TfidfVectorizer

# Local Configuration
from config import supabase, embedding_model, client
# === Initialization ===
# `client` (a Hugging Face inference client for the Gemma model, served via a
# TGI endpoint), `embedding_model`, and `supabase` are created in config.py
# and imported above.
# Load the spaCy model, downloading it first if it is not installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")
# === Core Resume Evaluation ===
def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
    """
    Evaluate uploaded resumes and return shortlisted candidates with scores and summaries.
    """
    candidates, removed_candidates = [], []

    # 🔹 Step 1: Parse and score each resume; skip low scorers before the
    # (comparatively expensive) LLM summary call
    for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)
        if score < 0.20:
            removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
            continue
        candidates.append({
            "name": pdf_file.name,
            "resume": resume_text,
            "score": score,
            "email": extract_email(resume_text),
            "summary": summarize_resume(resume_text)
        })
    # 🔹 Step 2: Filter candidates based on keyword matches
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # 🔹 Step 3: Log candidates removed by the keyword filter
    for name in keyword_removed:
        removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})

    # 🔹 Step 4: Sort by score and keep the top 5 candidates
    shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]

    # 🔹 Step 5: Store shortlisted candidates in Supabase
    for candidate in shortlisted_candidates:
        try:
            store_in_supabase(
                resume_text=candidate["resume"],
                score=candidate["score"],
                candidate_name=candidate["name"],
                email=candidate["email"],
                summary=candidate["summary"]
            )
        except Exception as e:
            print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")

    return shortlisted_candidates, removed_candidates
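
# Usage sketch (hypothetical): the Space's Streamlit UI would feed uploads into
# the pipeline roughly like this; the names below are illustrative only.
def _demo_evaluate_resumes(uploaded_files, job_description):
    shortlisted, removed = evaluate_resumes(uploaded_files, job_description, min_keyword_match=2)
    for c in shortlisted:
        print(f"{c['name']}: score={c['score']}, email={c['email']}")
    for r in removed:
        print(f"Removed {r['name']}: {r['reason']}")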
# === Keyword & Scoring Functions ===
def extract_keywords(text, top_n=10):
    """
    Extracts top keywords from text using spaCy part-of-speech filtering and TF-IDF.
    """
    if not text.strip():
        return []

    # Keep content-bearing tokens: nouns, proper nouns, verbs, adjectives
    doc = nlp(text.lower())
    keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]
    if not keywords:
        return []

    try:
        # Note: TF-IDF is fit on a single document here, so IDF is uniform and
        # the ranking effectively reflects term frequency within the text.
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
    except ValueError:
        return []
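
# Minimal sketch of extract_keywords on a made-up job description; the exact
# terms and their order depend on the spaCy model and TF-IDF settings.
def _demo_extract_keywords():
    jd = "We need a Python developer with AWS, Docker, and CI/CD experience."
    print(extract_keywords(jd, top_n=5))  # e.g. ['aws', 'docker', 'python', ...]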
def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Filters resumes by keyword matches against the job description, using fuzzy string matching.
    """
    job_keywords = extract_keywords(job_description)
    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []
    for resume in resumes:
        matched = {
            keyword for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume["resume"].lower().split())
        }
        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])
    return filtered, removed
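
# Sketch of the matching rule above: fuzz.partial_ratio scores best-substring
# overlap on a 0-100 scale, so a keyword still matches a hyphenated or
# inflected variant in the resume text (sample values are illustrative).
def _demo_fuzzy_match():
    print(fuzz.partial_ratio("kubernetes", "kubernetes-based"))  # 100, exact substring
    print(fuzz.partial_ratio("kubernetes", "jenkins"))           # well below the 80 threshold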
def score_candidate(resume_text, job_description):
    """
    Computes cosine similarity between resume and job description embeddings.
    """
    try:
        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
        return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0
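
# Sketch of the scoring scale: cosine similarity lies in [-1, 1], and the 0.20
# cutoff in evaluate_resumes discards resumes with little semantic overlap
# with the job description. The sample strings below are made up.
def _demo_score_candidate():
    resume = "Data engineer with five years of Python, Spark, and Airflow."
    jd = "Hiring a data engineer experienced with Python and Spark."
    print(score_candidate(resume, jd))  # a float, plausibly somewhere around 0.6-0.8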
# === Text Extraction & Summarization ===
def parse_resume(pdf_file):
    """
    Extracts raw text from a PDF file.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)
def extract_email(resume_text):
    """
    Extracts the first email-like string found in the text.
    """
    match = re.search(r"[\w\.-]+@[\w\.-]+", resume_text)
    return match.group(0) if match else None
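
# Quick sketch of extract_email (the address is made up):
def _demo_extract_email():
    print(extract_email("Jane Doe | jane.doe@example.com | 555-0100"))  # jane.doe@example.com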
def summarize_resume(resume_text):
    """
    Generates a recruiter-style professional summary of a resume via the LLM client.
    """
    prompt = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
        "Format it as a professional summary paragraph.\n\n"
        f"Resume:\n{resume_text}\n\n"
        "Summary:"
    )
    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        result = response.choices[0].message.content.strip()

        # Strip generic lead-ins such as "Sure, here is the professional summary:"
        cleaned = re.sub(
            r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
            "", result, flags=re.IGNORECASE
        ).strip()
        return cleaned
    except Exception as e:
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."
# === Data Storage & Reporting ===
def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Saves candidate data to the Supabase "candidates" table.
    """
    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score or 0,
        "email": email,
        "summary": summary
    }
    return supabase.table("candidates").insert(data).execute()
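
# Note: the insert above implies a "candidates" table with text columns
# name/resume/email/summary and a numeric score column; that schema is
# inferred from the payload, not defined anywhere in this file.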
def generate_pdf_report(shortlisted_candidates, questions=None):
    """
    Creates an in-memory PDF report summarizing top candidates and interview questions.
    """
    pdf = BytesIO()
    doc = fitz.open()
    for candidate in shortlisted_candidates:
        page = doc.new_page()
        info = (
            f"Candidate: {candidate['name']}\n"
            f"Email: {candidate['email']}\n"
            f"Score: {candidate['score']}\n\n"
            f"Summary:\n{candidate.get('summary', 'No summary available')}"
        )
        page.insert_textbox(fitz.Rect(50, 50, 550, 750), info, fontsize=11, fontname="helv", align=0)
    if questions:
        q_page = doc.new_page()
        q_text = "Suggested Interview Questions:\n\n" + "\n".join(questions)
        q_page.insert_textbox(fitz.Rect(50, 50, 550, 750), q_text, fontsize=11, fontname="helv", align=0)
    doc.save(pdf)  # PyMuPDF can save directly into a BytesIO buffer
    doc.close()
    pdf.seek(0)
    return pdf
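
# Usage sketch: the returned buffer can be handed straight to Streamlit's
# download widget (the candidate dict here is a made-up example):
def _demo_pdf_report():
    candidate = {"name": "resume.pdf", "email": "jane.doe@example.com", "score": 0.71, "summary": "Sample summary."}
    report = generate_pdf_report([candidate], questions=["Q1. Tell me about your Python experience."])
    st.download_button("Download report", data=report, file_name="report.pdf", mime="application/pdf")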
def generate_interview_questions_from_summaries(candidates):
    """
    Generates five interview questions from the shortlisted candidates' summaries via the LLM client.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")

    summaries = " ".join(c.get("summary", "") for c in candidates)
    prompt = (
        "Based on the following summaries of top candidates for a job role, "
        "generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n\n"
        f"{summaries}"
    )
    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=500,
        )
        result = response.choices[0].message.content

        # Clean and normalize the generated questions
        raw_questions = result.split("\n")
        questions = []
        for q in raw_questions:
            q = q.strip()
            # Skip empty lines and markdown headers
            if not q or re.match(r"^#+\s*", q):
                continue
            # Remove leading bullets and numbering like "1.", "1)", "- 1.", "Q1."
            q = re.sub(r"^(?:[-*]?\s*)?(?:Q?\d+[\.\)\-]?\s*)+", "", q)
            # Remove markdown bold/italic markers (**, *, _)
            q = re.sub(r"[*_]+", "", q)
            # Trim stray trailing punctuation and whitespace
            q = q.strip(" .")
            questions.append(q.strip())
        return [f"Q{i+1}. {q}" for i, q in enumerate(questions[:5])] or ["⚠️ No questions generated."]
    except Exception as e:
        print(f"❌ Error generating interview questions: {e}")
        return ["⚠️ Error generating questions."]