"""Resume-screening helpers: PDF parsing, scoring, filtering, summarising, reporting.

These functions are called from app.py.
"""

import json
import os
import random
import re
import subprocess
import sys
import textwrap
from collections import Counter
from io import BytesIO

import fitz  # PyMuPDF for PDF processing
import requests
import spacy
import streamlit as st
import supabase
from fuzzywuzzy import fuzz
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: `supabase` imported from config rebinds the name of the `supabase`
# package imported above; from here on `supabase` is whatever config exports.
from config import (
    SUPABASE_URL,
    SUPABASE_KEY,
    HF_API_TOKEN,
    HF_HEADERS,
    supabase,
    HF_MODELS,
    query,
    embedding_model,
)

# Initialize the Hugging Face client used for interview-question generation.
client = InferenceClient(
    model="google/gemma-1.1-7b-it",
    token=HF_API_TOKEN,
)

# Load the spaCy NLP model, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Use the running interpreter (not whatever "python" resolves to on PATH)
    # so the model lands in this environment; fail loudly if the download fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Email pattern: local part may contain "+", and the domain must contain at
# least one dot. A trailing sentence period is not captured because every
# dot in the domain must be followed by at least one word character.
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

# Fuzzy keyword matching parameters.
_FUZZY_MATCH_THRESHOLD = 80  # partial_ratio must be strictly greater than this
# partial_ratio scores 100 whenever the shorter string is a substring of the
# longer one, so a resume word like "a" or "on" would "match" almost every
# keyword. Words shorter than min(len(keyword), this) are therefore ignored.
_MIN_FUZZY_WORD_LEN = 4

# Approximate characters per line used only when a report entry has to be
# split across pages (insert_textbox still re-wraps lines that are too wide).
_PDF_WRAP_COLUMNS = 85


def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2,
                     *, min_score=0.20, top_n=5):
    """Score uploaded resumes, filter them, and return the shortlist.

    :param uploaded_files: Iterable of uploaded PDF file objects; each must
        provide ``read()`` and a ``name`` attribute.
    :param job_description: Job description text to compare against.
    :param min_keyword_match: Minimum number of distinct job keywords a
        resume must match to survive keyword filtering.
    :param min_score: Resumes scoring below this similarity are rejected.
    :param top_n: Maximum number of candidates in the shortlist.
    :return: ``(shortlisted_candidates, removed_candidates)``. The shortlist
        is a list of dicts with keys ``name``, ``resume``, ``score``,
        ``email`` and ``summary``, sorted by descending score. The removed
        list holds dicts with keys ``name`` and ``reason``.
    """
    candidates = []
    removed_candidates = []

    # Step 1: score every resume and drop low scorers early.
    for pdf_file in uploaded_files:
        resume_text = parse_resume(pdf_file)
        score = score_candidate(resume_text, job_description)

        if score < min_score:
            removed_candidates.append({
                "name": pdf_file.name,
                "reason": f"Low confidence score (< {min_score:.2f})",
            })
            continue

        candidates.append({
            "name": pdf_file.name,
            "resume": resume_text,
            "score": score,
            "email": extract_email(resume_text),
        })

    # Step 2: filter candidates based on keyword matches.
    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
        candidates, job_description, min_keyword_match
    )

    # Step 3: log candidates removed by the keyword filter.
    removed_candidates.extend(
        {"name": name, "reason": "Insufficient keyword matches"}
        for name in keyword_removed
    )

    # Step 4: sort by score and keep the top candidates.
    shortlisted_candidates = sorted(
        filtered_candidates, key=lambda c: c["score"], reverse=True
    )[:top_n]

    # Step 5: summarise only the final shortlist. Summarisation is a remote
    # API call, so it is not spent on candidates that are discarded anyway.
    for candidate in shortlisted_candidates:
        candidate["summary"] = summarize_resume(candidate["resume"])

    return shortlisted_candidates, removed_candidates


def extract_keywords(text, top_n=10):
    """Extract key terms from a job description using spaCy and TF-IDF.

    Tokens tagged as noun, proper noun, verb or adjective (and not stop
    words) are kept, then ranked with a unigram+bigram TF-IDF vectorizer
    fitted on that single document.

    :param text: Job description text.
    :param top_n: Maximum number of keywords to return.
    :return: List of up to ``top_n`` keyword strings, highest score first;
        an empty list when the text is empty or yields no vocabulary.
    """
    if not text or not text.strip():  # Handle missing/empty job descriptions
        return []

    doc = nlp(text.lower())

    # Keep meaningful words only (nouns, proper nouns, verbs, adjectives).
    keywords = [
        token.text
        for token in doc
        if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not token.is_stop
    ]
    if not keywords:
        return []

    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
    try:
        tfidf_matrix = vectorizer.fit_transform([" ".join(keywords)])
    except ValueError:  # Raised when the vocabulary ends up empty
        return []

    feature_array = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]

    # Sort by highest TF-IDF score.
    ranked = sorted(
        zip(feature_array, tfidf_scores), key=lambda pair: pair[1], reverse=True
    )
    return [keyword for keyword, _ in ranked[:top_n]]


def _keyword_in_words(keyword, words):
    """Return True if ``keyword`` fuzzily matches any word in ``words``."""
    min_len = min(len(keyword), _MIN_FUZZY_WORD_LEN)
    return any(
        len(word) >= min_len
        and fuzz.partial_ratio(keyword, word) > _FUZZY_MATCH_THRESHOLD
        for word in words
    )


def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
    """Filter resumes by fuzzy presence of job-description keywords.

    :param resumes: List of candidate dicts; each must have ``"resume"``
        (text) and ``"name"`` keys.
    :param job_description: Job description text the keywords come from.
    :param min_keyword_match: Minimum number of distinct keywords a resume
        must match to be kept.
    :return: ``(filtered_resumes, removed_names)``. If the job description
        yields fewer than ``min_keyword_match`` keywords, filtering is
        skipped, a Streamlit warning is shown, and all resumes are returned.
    """
    job_keywords = extract_keywords(job_description)

    if len(job_keywords) < min_keyword_match:
        st.warning("⚠️ Job description is either too short or absent for keyword filtering.")
        return resumes, []  # Not enough keywords to filter on

    filtered_resumes = []
    removed_resumes = []

    for resume in resumes:
        # Deduplicate words so each distinct word is compared once per keyword.
        resume_words = set(resume["resume"].lower().split())
        matched_keywords = {
            keyword
            for keyword in job_keywords
            if _keyword_in_words(keyword, resume_words)
        }

        if len(matched_keywords) >= min_keyword_match:
            filtered_resumes.append(resume)
        else:
            removed_resumes.append(resume["name"])

    return filtered_resumes, removed_resumes


def parse_resume(pdf_file):
    """Extract the text of every page of a resume PDF.

    :param pdf_file: File-like object whose ``read()`` returns the PDF bytes.
    :return: Page texts joined with newlines.
    """
    # The context manager closes the document once the text is extracted.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "\n".join(page.get_text("text") for page in doc)


def extract_email(resume_text):
    """Return the first email address found in ``resume_text``, or None."""
    if not resume_text:
        return None
    match = _EMAIL_RE.search(resume_text)
    return match.group(0) if match else None


def score_candidate(resume_text, job_description):
    """Score a resume against a job description with sentence embeddings.

    :param resume_text: The extracted resume text.
    :param job_description: The job description for comparison.
    :return: Cosine similarity of the two embeddings, rounded to 4 decimal
        places (cosine similarity can be negative), or 0 if scoring fails.
    """
    try:
        resume_embedding = embedding_model.encode(resume_text, convert_to_tensor=True)
        job_embedding = embedding_model.encode(job_description, convert_to_tensor=True)
        score = util.pytorch_cos_sim(resume_embedding, job_embedding).item()
        return round(score, 4)
    except Exception as e:
        # Deliberate best-effort: one failed resume must not abort the batch.
        print(f"Error computing similarity score: {e}")
        return 0


def summarize_resume(resume_text):
    """Summarize a resume via the ``query`` helper using the "bart" model.

    :param resume_text: The extracted resume text.
    :return: The stripped summary text, or a fallback message when the API
        returns nothing or a response without a usable summary.
    """
    # NOTE(review): the full resume text is sent untruncated; whether `query`
    # or the remote model truncates long inputs is not visible here — confirm.
    payload = {"inputs": f"Summarize this resume: {resume_text}"}
    response = query(payload, model="bart")

    if not response:
        print("⚠️ Error: API response is None. Returning fallback summary.")
        return "Summary unavailable due to API issues."

    try:
        # The API may wrap the result in a single-element list.
        if isinstance(response, list) and len(response) > 0:
            response = response[0]

        if isinstance(response, dict):
            summary = response.get("generated_text") or response.get("summary_text")
            if summary:
                return summary.strip()

        print("⚠️ Unexpected API response format:", response)
        return "Summary unavailable."
    except Exception as e:
        print(f"⚠️ Error parsing summary: {e}")
        return "Summary unavailable."


def store_in_supabase(resume_text, score, candidate_name, email, summary):
    """Insert one candidate row into the Supabase ``candidates`` table.

    :param resume_text: The extracted resume text.
    :param score: The candidate's score; ``None`` is stored as 0.
    :param candidate_name: The candidate's name.
    :param email: Candidate's email address.
    :param summary: A summarized version of the resume.
    :return: The response object returned by the insert's ``execute()``.
    """
    if score is None:
        score = 0  # Ensure score is never NULL

    data = {
        "name": candidate_name,
        "resume": resume_text,
        "score": score,
        "email": email,
        "summary": summary,
    }
    return supabase.table("candidates").insert(data).execute()


def _write_paginated_text(doc, text, rect, font_size, font_name="helv"):
    """Write ``text`` into ``rect`` on new page(s) of ``doc``.

    ``Page.insert_textbox`` returns a negative number (and writes nothing)
    when the text does not fit. In that case the text is split into lines
    and written in chunks, one chunk per page.
    """
    def fits(page, chunk):
        # >= 0 means the text was written; < 0 means nothing was written.
        return page.insert_textbox(
            rect, chunk, fontsize=font_size, fontname=font_name, align=0
        ) >= 0

    page = doc.new_page()
    if fits(page, text):
        return

    # Overflow: nothing was written above, so `page` is still blank and is
    # reused for the first chunk.
    pending = []
    for paragraph in text.split("\n"):
        # textwrap.wrap("") returns [], so keep blank lines explicitly.
        pending.extend(textwrap.wrap(paragraph, width=_PDF_WRAP_COLUMNS) or [""])

    # Conservative first guess; halved below whenever a chunk still overflows.
    lines_per_page = max(1, int(rect.height // (font_size * 1.5)))

    while pending:
        take = min(len(pending), lines_per_page)
        while not fits(page, "\n".join(pending[:take])) and take > 1:
            take = max(1, take // 2)
        # Always consume at least one line so the loop terminates.
        del pending[:take]
        if pending:
            page = doc.new_page()


def generate_pdf_report(shortlisted_candidates):
    """Generate a PDF summary of shortlisted candidates with text wrapping.

    Each candidate starts on a new page; an entry too long for one page
    continues on following pages. Interview questions are not included.

    :param shortlisted_candidates: List of candidate dicts with ``name``,
        ``email`` and ``score`` keys and an optional ``summary`` key.
    :return: ``BytesIO`` positioned at the start of the PDF data. For an
        empty list the PDF contains a single placeholder page.
    """
    # Text area properties.
    text_box_x = 50        # Left margin
    text_box_y = 50        # Top margin
    text_box_width = 500   # Max width before wrapping
    text_box_height = 700  # Max height before splitting to a new page
    font_size = 11

    text_rect = fitz.Rect(
        text_box_x,
        text_box_y,
        text_box_x + text_box_width,
        text_box_y + text_box_height,
    )

    doc = fitz.open()
    try:
        for candidate in shortlisted_candidates:
            # Fall back when the summary is missing, None or empty.
            summary = candidate.get("summary") or "No summary available"

            candidate_info = (
                f"Candidate: {candidate['name']}\n"
                f"Email: {candidate['email']}\n"
                f"Score: {candidate['score']}\n\n"
                f"Summary:\n{summary}"
            )
            _write_paginated_text(doc, candidate_info, text_rect, font_size)

        if len(doc) == 0:
            # PyMuPDF refuses to save a document with zero pages.
            _write_paginated_text(
                doc, "No shortlisted candidates.", text_rect, font_size
            )

        pdf = BytesIO()
        doc.save(pdf)
    finally:
        doc.close()

    pdf.seek(0)
    return pdf


def generate_interview_questions_from_summaries(candidates):
    """Generate interview questions from shortlisted candidates' summaries.

    The non-empty summaries are joined into one prompt and sent to the
    module-level Hugging Face chat client.

    :param candidates: List of candidate dicts; ``summary`` values are used.
    :return: Up to 5 non-empty lines of the model's reply, or a one-element
        list with a warning message when nothing was generated or the call
        failed.
    :raises TypeError: If ``candidates`` is not a list.
    """
    if not isinstance(candidates, list):
        raise TypeError("Expected a list of candidate dictionaries.")

    # Skip missing/None/empty summaries so the join cannot fail on None.
    summaries = [c["summary"] for c in candidates if c.get("summary")]
    combined_summary = " ".join(summaries)

    prompt = (
        "Based on the following summary of this top candidate for a job role, generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n"
        f"{combined_summary}"
    )

    try:
        response = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=500,
        )
        result_text = response.choices[0].message.content
        questions = [q.strip() for q in result_text.split("\n") if q.strip()]
        return questions[:5] if questions else ["⚠️ No questions generated."]
    except Exception as e:
        print(f"❌ Error generating interview questions: {e}")
        return ["⚠️ Error generating questions."]