# === Imports ===

# Standard Library
import os
import re
import json
import random
import subprocess
import sys
from io import BytesIO
from collections import Counter

# Third-Party Libraries
import fitz  # PyMuPDF
import requests
import spacy
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from huggingface_hub import InferenceClient

# Local Configuration
from config import (
    SUPABASE_URL,
    SUPABASE_KEY,
    HF_API_TOKEN,
    HF_HEADERS,
    supabase,
    HF_MODELS,
    query,
    embedding_model
)

# === Initialization ===

# Hugging Face inference client for Gemma model
client = InferenceClient(
    model="google/gemma-1.1-7b-it",
    token=HF_API_TOKEN
)

# Load or download spaCy model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the interpreter that is running this module (not whatever "python"
    # resolves to on PATH) so the model lands in the active environment, and
    # fail loudly if the download itself fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Local part allows the usual "._%+-" characters; the domain must end in a
# dot followed by an alphabetic TLD, so trailing sentence punctuation such as
# "a@b.com." is not captured.
_EMAIL_RE = re.compile(
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}"
)


# === Core Resume Evaluation ===

def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2,
                     min_score=0.20, top_n=5):
    """
    Evaluate uploaded resumes and return shortlisted candidates with scores and summaries.

    Args:
        uploaded_files: Iterable of file-like objects exposing ``read()`` and
            a ``name`` attribute (one PDF resume each).
        job_description: Job description text the resumes are compared to.
        min_keyword_match: Minimum number of job keywords a resume must match.
        min_score: Candidates scoring below this similarity are removed.
        top_n: Maximum number of candidates in the shortlist.

    Returns:
        Tuple ``(shortlisted, removed_candidates)``. ``shortlisted`` is a list
        of dicts with keys ``name``, ``resume``, ``score``, ``email`` and
        ``summary``, sorted by descending score. ``removed_candidates`` is a
        list of dicts with keys ``name`` and ``reason``.
    """
    candidates, removed_candidates = [], []

    for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)

        if score < min_score:
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Low confidence score (< {min_score:.2f})",
            })
            continue

        candidates.append({
            "name": pdf_file.name,
            "resume": resume_text,
            "score": score,
            "email": extract_email(resume_text),
        })

    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )
    removed_candidates.extend(
        {"name": name, "reason": "Insufficient keyword matches"}
        for name in keyword_removed
    )

    shortlisted = sorted(
        filtered_candidates, key=lambda c: c["score"], reverse=True
    )[:top_n]

    # Summaries come from a remote API call, so only produce them for the
    # candidates that are actually returned.
    for candidate in shortlisted:
        candidate["summary"] = summarize_resume(candidate["resume"])

    return shortlisted, removed_candidates


# === Keyword & Scoring Functions ===

def extract_keywords(text, top_n=10):
    """
    Extracts top keywords from the job description using spaCy and TF-IDF.

    Args:
        text: Text to extract keywords from.
        top_n: Maximum number of keywords to return.

    Returns:
        List of up to ``top_n`` unigram/bigram keywords ranked by TF-IDF
        score; an empty list for blank input or when no usable terms remain.
    """
    if not text or not text.strip():
        return []

    doc = nlp(text.lower())
    keywords = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not token.is_stop
    ]
    if not keywords:
        return []

    try:
        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
        matrix = tfidf.fit_transform([" ".join(keywords)])
    except ValueError:
        # Raised by scikit-learn when the vocabulary ends up empty.
        return []

    scores = matrix.toarray()[0]
    features = tfidf.get_feature_names_out()
    ranked = sorted(zip(features, scores), key=lambda pair: pair[1], reverse=True)
    return [kw for kw, _ in ranked[:top_n]]


def _keyword_matches(keyword, words, threshold=80):
    """Return True if any resume word fuzzily matches ``keyword``."""
    parts = keyword.split()
    if not parts:
        return False
    # partial_ratio scores 100 whenever the shorter string occurs inside the
    # longer one, so a token like "a" would "match" the keyword "data".
    # Ignore tokens shorter than the keyword's shortest word.
    min_len = min(len(part) for part in parts)
    return any(
        len(word) >= min_len and fuzz.partial_ratio(keyword, word) > threshold
        for word in words
    )


def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """
    Filters resumes by keyword match using fuzzy logic.

    Args:
        resumes: List of candidate dicts; each must have ``resume`` (text)
            and ``name`` keys.
        job_description: Text the keywords are extracted from.
        min_keyword_match: Minimum number of distinct keywords to match.

    Returns:
        Tuple ``(filtered, removed)``: the candidate dicts that matched
        enough keywords, and the names of those that did not. If fewer than
        ``min_keyword_match`` keywords can be extracted, a Streamlit warning
        is shown and all resumes are returned unfiltered.
    """
    job_keywords = extract_keywords(job_description)

    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description too short or missing for keyword filtering.")
        return resumes, []

    filtered, removed = [], []

    for resume in resumes:
        # Tokenize once per resume (and de-duplicate) instead of once per keyword.
        words = set(resume["resume"].lower().split())
        matched = {
            keyword for keyword in job_keywords
            if _keyword_matches(keyword, words)
        }

        if len(matched) >= min_keyword_match:
            filtered.append(resume)
        else:
            removed.append(resume["name"])

    return filtered, removed


def score_candidate(resume_text, job_description):
    """
    Computes cosine similarity between resume and job description using embeddings.

    Returns:
        Similarity rounded to 4 decimals, or ``0.0`` when either text is
        blank or the embedding model raises.
    """
    # A resume with no extractable text cannot be compared meaningfully.
    if not resume_text or not resume_text.strip():
        return 0.0
    if not job_description or not job_description.strip():
        return 0.0

    try:
        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
        return round(score, 4)
    except Exception as e:
        # Best-effort boundary around the model: a failure scores the
        # candidate 0 rather than aborting the whole batch.
        print(f"Error computing similarity: {e}")
        return 0.0


# === Text Extraction & Summarization ===

def parse_resume(pdf_file):
    """
    Extracts raw text from a PDF file.

    Args:
        pdf_file: File-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The text of all pages joined with newlines.
    """
    # The context manager closes the document once the text is extracted.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)


def extract_email(resume_text):
    """
    Extracts the first valid email found in text.

    Returns:
        The first email-like match, or ``None`` if there is none (or the
        input is empty).
    """
    if not resume_text:
        return None
    match = _EMAIL_RE.search(resume_text)
    return match.group(0) if match else None


def summarize_resume(resume_text):
    """
    Generates a summary of the resume using Hugging Face BART.

    Sends the text through the project's ``query`` helper with
    ``model="bart"`` and reads ``generated_text`` or ``summary_text`` from
    the (first) response item.

    Returns:
        The stripped summary, or a fixed fallback message when the API
        returns nothing or the response cannot be parsed.
    """
    payload = {"inputs": f"Summarize this resume: {resume_text}"}
    response = query(payload, model="bart")

    if not response:
        return "Summary unavailable due to API issues."

    try:
        response = response[0] if isinstance(response, list) else response
        summary = response.get("generated_text") or response.get("summary_text")
        return summary.strip() if summary else "Summary unavailable."
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        # Response had an unexpected shape (e.g. not a dict, non-string text).
        print(f"Error parsing summary: {e}")
        return "Summary unavailable."
# === Data Storage & Reporting ===

# Strips leading numbering such as "1.", "Q2)", "**3. " from a model output line.
_QUESTION_PREFIX_RE = re.compile(r"^(?:\*\*)?(Q?\d+[\.\)\-]?\s*)+(?:\*\*)?")


def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """
    Saves candidate data to the Supabase table.

    Inserts one row into the ``candidates`` table with the keys ``name``,
    ``resume``, ``score`` (falsy scores are stored as 0), ``email`` and
    ``summary``.

    Returns:
        Whatever the Supabase client's ``execute()`` call returns.
    """
    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score or 0,
        "email": email,
        "summary": summary
    }

    return supabase.table("candidates").insert(data).execute()


def _insert_text_fitting(page, text, fontsize=11, min_fontsize=6):
    """Write ``text`` into the page's text area, shrinking or truncating to fit."""
    rect = fitz.Rect(50, 50, 550, 750)

    # insert_textbox returns a negative value, and writes nothing, when the
    # text does not fit the rectangle; try smaller fonts first.
    size = fontsize
    while size >= min_fontsize:
        if page.insert_textbox(rect, text, fontsize=size, fontname="helv", align=0) >= 0:
            return
        size -= 1

    # Still too long at the smallest font: cut the text down until it fits so
    # the page is never left blank.
    clipped = text
    while clipped:
        clipped = clipped[: len(clipped) * 3 // 4]
        if page.insert_textbox(
            rect, clipped + "\n[... truncated]",
            fontsize=min_fontsize, fontname="helv", align=0,
        ) >= 0:
            return


def generate_pdf_report(shortlisted_candidates, questions=None):
    """
    Creates a PDF report summarizing top candidates and interview questions.

    Args:
        shortlisted_candidates: Iterable of candidate dicts; ``name`` is
            required, ``email``, ``score`` and ``summary`` are optional.
        questions: Optional list of question strings, written on a final page.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the PDF.
    """
    pdf = BytesIO()
    doc = fitz.open()

    try:
        for candidate in shortlisted_candidates:
            page = doc.new_page()
            info = (
                f"Candidate: {candidate['name']}\n"
                f"Email: {candidate.get('email') or 'N/A'}\n"
                f"Score: {candidate.get('score', 'N/A')}\n\n"
                f"Summary:\n{candidate.get('summary', 'No summary available')}"
            )
            _insert_text_fitting(page, info)

        if questions:
            q_page = doc.new_page()
            q_text = "Suggested Interview Questions:\n\n" + "\n".join(questions)
            _insert_text_fitting(q_page, q_text)

        if doc.page_count == 0:
            # A document without pages cannot be saved; emit a placeholder
            # page instead of failing on an empty shortlist.
            _insert_text_fitting(doc.new_page(), "No candidates shortlisted.")

        doc.save(pdf)
    finally:
        doc.close()

    pdf.seek(0)
    return pdf


def generate_interview_questions_from_summaries(candidates):
    """
    Generates 5 interview questions based on combined summaries using Gemma model.

    Args:
        candidates: List of candidate dicts; the ``summary`` value of each
            (if any) is concatenated into the prompt.

    Returns:
        Up to five strings formatted ``"Q<n>. <question>"``, or a
        single-element list with a warning message when nothing could be
        generated or the model call failed.

    Raises:
        TypeError: If ``candidates`` is not a list.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")

    # ``or ""`` guards against a summary that is present but None.
    summaries = " ".join(c.get("summary") or "" for c in candidates)
    if not summaries.strip():
        # Nothing to base questions on; skip the model call.
        return ["⚠️ No questions generated."]

    prompt = (
        "Based on the following summary of this top candidate for a job role, "
        "generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n"
        f"{summaries}"
    )

    try:
        response = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=500
        )
        result = response.choices[0].message.content

        questions = []
        for raw_line in result.split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            text = _QUESTION_PREFIX_RE.sub("", line).strip()
            # Skip lines that were only numbering, and intro/heading lines
            # such as "Here are 5 questions:" that would otherwise be
            # numbered as a question.
            if not text or text.rstrip("*").rstrip().endswith(":"):
                continue
            questions.append(text)

        return [f"Q{i+1}. {q}" for i, q in enumerate(questions[:5])] or ["⚠️ No questions generated."]
    except Exception as e:
        # Best-effort boundary around the remote model call.
        print(f"❌ Error generating interview questions: {e}")
        return ["⚠️ Error generating questions."]