# === Imports ===
# Standard Library
import os
import re
import json
import random
import subprocess
import sys
from io import BytesIO
from collections import Counter

# Third-Party Libraries
import fitz  # PyMuPDF
import requests
import spacy
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from huggingface_hub import InferenceClient
from openai import OpenAI

# Local Configuration
from config import (
    SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS,
    supabase, HF_MODELS, query, embedding_model, client
)

# === Initialization ===

# Hugging Face inference client for Gemma model
# client = InferenceClient(
#     model="tgi",
#     token=HF_API_TOKEN
# )

# Load or download spaCy model.
# NOTE: sys.executable (not the bare string "python") guarantees the download
# runs in the same interpreter/virtualenv that is executing this module —
# "python" on PATH may point at a different environment where the model
# would be installed uselessly.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")


# === Core Resume Evaluation ===

def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
    """
    Evaluate uploaded resumes and return shortlisted candidates with scores
    and summaries.

    Args:
        uploaded_files: iterable of uploaded PDF file objects (each must have
            a ``.name`` attribute and a ``.read()`` method).
        job_description: job description text to score resumes against.
        min_keyword_match: minimum number of job-description keywords a resume
            must (fuzzily) contain to survive keyword filtering.

    Returns:
        tuple ``(shortlisted_candidates, removed_candidates)`` where the first
        element is a list of at most 5 candidate dicts (sorted by score,
        descending) and the second is a list of ``{"name", "reason"}`` dicts
        explaining each rejection.
    """
    candidates, removed_candidates = [], []

    for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)

        # Reject low-confidence matches BEFORE calling the LLM summarizer —
        # summarize_resume() is an expensive API call and its output would be
        # discarded for rejected candidates anyway.
        if score < 0.20:
            removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
            continue

        email = extract_email(resume_text)
        summary = summarize_resume(resume_text)

        candidates.append({
            "name": pdf_file.name,
            "resume": resume_text,
            "score": score,
            "email": email,
            "summary": summary
        })

    # 🔹 Step 2: Filter candidates based on keyword matches
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # 🔹 Step 3: Log removed candidates
    for name in keyword_removed:
        removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})

    # 🔹 Step 4: Ensure the final list is sorted by score and limit to top 5 candidates
    shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]

    # 🔹 Step 4.5: Store shortlisted candidates in Supabase (best-effort:
    # a storage failure for one candidate must not abort the evaluation).
    for candidate in shortlisted_candidates:
        try:
            store_in_supabase(
                resume_text=candidate["resume"],
                score=candidate["score"],
                candidate_name=candidate["name"],
                email=candidate["email"],
                summary=candidate["summary"]
            )
        except Exception as e:
            print(f"❌ Failed to store {candidate['name']} in Supabase: {e}")

    # 🔹 Step 5: Ensure return value is always a list
    if not isinstance(shortlisted_candidates, list):
        print("⚠️ ERROR: shortlisted_candidates is not a list! Returning empty list.")
        return [], removed_candidates

    return shortlisted_candidates, removed_candidates


# === Keyword & Scoring Functions ===

def extract_keywords(text, top_n=10):
    """
    Extracts top keywords from the job description using spaCy and TF-IDF.

    Content words (nouns, proper nouns, verbs, adjectives that are not stop
    words) are selected with spaCy, then ranked by TF-IDF over 1-2 grams.

    Args:
        text: source text (typically the job description).
        top_n: maximum number of keywords to return.

    Returns:
        List of up to ``top_n`` keyword strings; empty list when the text is
        blank, contains no content words, or TF-IDF fails on the vocabulary.
    """
    if not text.strip():
        return []

    doc = nlp(text.lower())
    keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]

    if not keywords:
        return []

    try:
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
        scores = matrix.toarray()[0]
        features = tfidf.get_feature_names_out()
        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
        return [kw for kw, _ in ranked[:top_n]]
    except ValueError:
        # TfidfVectorizer raises ValueError when every token is a stop word
        # (empty vocabulary) — treat as "no keywords found".
        return []


def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Filters resumes by keyword match using fuzzy logic.

    A keyword counts as matched when any single whitespace-separated word in
    the resume has a fuzzy partial-ratio above 80 against it.

    Args:
        resumes: list of candidate dicts (each with "name" and "resume" keys).
        job_description: job description to extract keywords from.
        min_keyword_match: minimum distinct keywords required to keep a resume.

    Returns:
        tuple ``(filtered, removed)`` — kept candidate dicts and the names of
        rejected candidates. When the job description yields fewer keywords
        than ``min_keyword_match``, filtering is skipped entirely.
    """
    job_keywords = extract_keywords(job_description)

    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []
    for resume in resumes:
        matched = {
            keyword for keyword in job_keywords
            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume["resume"].lower().split())
        }
        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])

    return filtered, removed


def score_candidate(resume_text, job_description):
    """
    Computes cosine similarity between resume and job description using
    sentence-transformer embeddings.

    Returns:
        Similarity rounded to 4 decimal places, or 0 when embedding or
        similarity computation fails for any reason.
    """
    try:
        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
        return round(score, 4)
    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0


# === Text Extraction & Summarization ===

def parse_resume(pdf_file):
    """
    Extracts raw text from a PDF file.

    The document handle is opened in a context manager so PyMuPDF resources
    are released deterministically (the previous version leaked the open
    document).

    Args:
        pdf_file: file-like object whose ``.read()`` yields PDF bytes.

    Returns:
        All page text joined with newlines.
    """
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)


def extract_email(resume_text):
    """
    Extracts the first valid email found in text.

    The character class ``[\\w\\.-]`` also matches sentence punctuation, so a
    trailing "." or "-" (e.g. "contact me at a@b.com.") would otherwise be
    captured as part of the address — strip it from the tail.

    Returns:
        The email string, or None when no email-like pattern is present.
    """
    match = re.search(r"[\w\.-]+@[\w\.-]+", resume_text)
    return match.group(0).rstrip(".-") if match else None


def summarize_resume(resume_text):
    """
    Generates a structured professional summary for a resume via the LLM
    client.

    Returns:
        Cleaned summary paragraph, or a fallback message when the API call
        fails.
    """
    prompt = (
        "You are an expert technical recruiter. Extract a professional summary for this candidate based on their resume text. "
        "Include: full name (if found), job title, years of experience, key technologies/tools, industries worked in, and certifications. "
        "Format it as a professional summary paragraph.\n\n"
        f"Resume:\n{resume_text}\n\n"
        "Summary:"
    )

    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=300,
        )
        result = response.choices[0].message.content.strip()

        # Clean up generic lead-ins from the model ("Sure, here is the summary: ...")
        cleaned = re.sub(
            r"^(Sure,|Certainly,)?\s*(here is|here’s|this is)?\s*(the)?\s*(extracted)?\s*(professional)?\s*summary.*?:\s*",
            "", result, flags=re.IGNORECASE
        ).strip()

        return cleaned

    except Exception as e:
        print(f"❌ Error generating structured summary: {e}")
        return "Summary unavailable due to API issues."


# === Data Storage & Reporting ===

def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Saves candidate data to the Supabase table.

    Args:
        resume_text: full extracted resume text.
        score: similarity score (falsy values are stored as 0).
        candidate_name: display name (source file name).
        email: extracted email or None.
        summary: generated summary text.

    Returns:
        The Supabase insert-execution result.
    """
    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score or 0,
        "email": email,
        "summary": summary
    }
    return supabase.table("candidates").insert(data).execute()


def generate_pdf_report(shortlisted_candidates, questions=None):
    """
    Creates a PDF report summarizing top candidates and interview questions.

    One page per candidate, plus an optional final page of interview
    questions.

    Args:
        shortlisted_candidates: candidate dicts with "name", "email", "score"
            and (optionally) "summary" keys.
        questions: optional list of question strings.

    Returns:
        BytesIO buffer positioned at the start of the rendered PDF.
    """
    pdf = BytesIO()
    doc = fitz.open()

    for candidate in shortlisted_candidates:
        page = doc.new_page()
        info = (
            f"Candidate: {candidate['name']}\n"
            f"Email: {candidate['email']}\n"
            f"Score: {candidate['score']}\n\n"
            f"Summary:\n{candidate.get('summary', 'No summary available')}"
        )
        page.insert_textbox(fitz.Rect(50, 50, 550, 750), info, fontsize=11, fontname="helv", align=0)

    if questions:
        q_page = doc.new_page()
        q_text = "Suggested Interview Questions:\n\n" + "\n".join(questions)
        q_page.insert_textbox(fitz.Rect(50, 50, 550, 750), q_text, fontsize=11, fontname="helv", align=0)

    doc.save(pdf)
    doc.close()  # release PyMuPDF resources once the bytes are in the buffer
    pdf.seek(0)
    return pdf


def generate_interview_questions_from_summaries(candidates):
    """
    Generates 5 normalized interview questions from candidate summaries.

    NOTE(review): the prompt speaks of "a top candidate" (singular) but all
    candidates' summaries are concatenated and sent together — confirm this
    is intended before relying on per-candidate questions.

    Args:
        candidates: list of candidate dicts (each may carry a "summary" key).

    Returns:
        List of up to 5 strings formatted "Q1. ...", or a single warning /
        error message on failure.

    Raises:
        TypeError: when ``candidates`` is not a list.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")

    summaries = " ".join(c.get("summary", "") for c in candidates)

    prompt = (
        "Based on the following summary of a top candidate for a job role, "
        "generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n\n"
        f"{summaries}"
    )

    try:
        response = client.chat.completions.create(
            model="tgi",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=500,
        )
        result = response.choices[0].message.content

        # Clean and normalize questions
        raw_questions = result.split("\n")
        questions = []
        for q in raw_questions:
            q = q.strip()
            # Skip empty lines and markdown headers
            if not q or re.match(r"^#+\s*", q):
                continue
            # Remove leading bullets like "1.", "1)", "- 1.", etc.
            q = re.sub(r"^(?:[-*]?\s*)?(?:Q?\d+[\.\)\-]?\s*)+", "", q)
            # Remove markdown bold/italics (**, *, etc.)
            q = re.sub(r"[*_]+", "", q)
            # Remove duplicate trailing punctuation
            q = q.strip(" .")
            questions.append(q.strip())

        return [f"Q{i+1}. {q}" for i, q in enumerate(questions[:5])] or ["⚠️ No questions generated."]

    except Exception as e:
        print(f"❌ Error generating interview questions: {e}")
        return ["⚠️ Error generating questions."]