Spaces:

gauravbox
/

TalentLensAI

Running

App Files Community

Johnny committed on Apr 7

Commit

2854e2c

1 Parent(s): 949011b

Added interview questions to the PDF report; refactored the question format

Browse files

Files changed (2) hide show

app.py +37 -22
utils.py +129 -165

app.py CHANGED Viewed

@@ -1,16 +1,25 @@
 import streamlit as st
-from utils import (
-    evaluate_resumes, generate_pdf_report, store_in_supabase, extract_email,
-    score_candidate, parse_resume, summarize_resume, extract_keywords, generate_interview_questions_from_summaries
-)
-from config import supabase
-from config import HF_API_TOKEN, HF_HEADERS, HF_MODELS
 import fitz  # PyMuPDF
-from io import BytesIO
-from dotenv import load_dotenv
-import os
 import requests
 def main():
     st.set_page_config(page_title="TalentLens.AI", layout="centered")
@@ -18,28 +27,33 @@ def main():
     st.divider()
     st.markdown("<h3 style='text-align: center;'>AI-Powered Intelligent Resume Screening</h3>", unsafe_allow_html=True)
-    # Limit resume uploads to 10
-    uploaded_files = st.file_uploader("Upload Resumes (PDF Only, Max: 10)", accept_multiple_files=True, type=["pdf"])
     if uploaded_files and len(uploaded_files) > 10:
         st.error("⚠️ You can upload a maximum of 10 resumes at a time.")
         return
     job_description = st.text_area("Enter Job Description")
     if st.button("Evaluate Resumes"):
         if not job_description:
             st.error("⚠️ Please enter a job description.")
             return
         if not uploaded_files:
             st.error("⚠️ Please upload at least one resume.")
             return
-        #if uploaded_files and job_description:
         st.write("### 📊 Evaluating Resumes...")
-        # 🔹 Extract required keywords dynamically from the job description
-        # required_keywords = extract_keywords(job_description)
-        # st.write(f"**Extracted Keywords:** {', '.join(required_keywords)}")
         shortlisted, removed_candidates = evaluate_resumes(uploaded_files, job_description)
         if not shortlisted:
@@ -49,21 +63,22 @@ def main():
             for candidate in shortlisted:
                 st.write(f"**{candidate['name']}**")
-            # Generate PDF Report
-            pdf_report = generate_pdf_report(shortlisted)
-            st.download_button("Download Shortlist Report", pdf_report, "shortlist.pdf")
             # Generate Interview Questions
             questions = generate_interview_questions_from_summaries(shortlisted)
             st.subheader("🧠 Suggested Interview Questions:")
             for idx, q in enumerate(questions, 1):
-                st.markdown(f"**Q{idx}.** {q}")
-        # 🔻 Display removed candidates due to missing keywords
         if removed_candidates:
             st.subheader("❌ Resumes Removed:")
             for removed in removed_candidates:
                 st.write(f"**{removed['name']}** - {removed['reason']}")
 if __name__ == "__main__":
     main()

+import os
+from io import BytesIO
 import streamlit as st
 import fitz  # PyMuPDF
 import requests
+from dotenv import load_dotenv
+from config import supabase, HF_API_TOKEN, HF_HEADERS, HF_MODELS
+from utils import (
+    evaluate_resumes,
+    generate_pdf_report,
+    store_in_supabase,
+    extract_email,
+    score_candidate,
+    parse_resume,
+    summarize_resume,
+    extract_keywords,
+    generate_interview_questions_from_summaries,
+)
+# ------------------------- Main App Function -------------------------
 def main():
     st.set_page_config(page_title="TalentLens.AI", layout="centered")
     st.divider()
     st.markdown("<h3 style='text-align: center;'>AI-Powered Intelligent Resume Screening</h3>", unsafe_allow_html=True)
+    # Upload resumes (limit: 10 files)
+    uploaded_files = st.file_uploader(
+        "Upload Resumes (PDF Only, Max: 10)",
+        accept_multiple_files=True,
+        type=["pdf"]
+    )
     if uploaded_files and len(uploaded_files) > 10:
         st.error("⚠️ You can upload a maximum of 10 resumes at a time.")
         return
+    # Input job description
     job_description = st.text_area("Enter Job Description")
+    # Evaluation trigger
     if st.button("Evaluate Resumes"):
         if not job_description:
             st.error("⚠️ Please enter a job description.")
             return
         if not uploaded_files:
             st.error("⚠️ Please upload at least one resume.")
             return
         st.write("### 📊 Evaluating Resumes...")
+        # Resume Evaluation
         shortlisted, removed_candidates = evaluate_resumes(uploaded_files, job_description)
         if not shortlisted:
             for candidate in shortlisted:
                 st.write(f"**{candidate['name']}**")
             # Generate Interview Questions
             questions = generate_interview_questions_from_summaries(shortlisted)
             st.subheader("🧠 Suggested Interview Questions:")
             for idx, q in enumerate(questions, 1):
+                st.markdown(f"{q}")
+            # Downloadable PDF Report
+            pdf_report = generate_pdf_report(shortlisted, questions)
+            st.download_button("Download Shortlist Report", pdf_report, "shortlist.pdf")
+        # Removed Candidates Info
         if removed_candidates:
             st.subheader("❌ Resumes Removed:")
             for removed in removed_candidates:
                 st.write(f"**{removed['name']}** - {removed['reason']}")
+# ------------------------- Run the App -------------------------
 if __name__ == "__main__":
     main()

utils.py CHANGED Viewed

@@ -1,43 +1,53 @@
-import fitz  # PyMuPDF for PDF processing
-import requests
-import json
 import re
 from io import BytesIO
-import supabase
-from config import (
-    SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS,
-    supabase, HF_MODELS, query, embedding_model
-)
-from sentence_transformers import SentenceTransformer, util
-import spacy
 from collections import Counter
-from sklearn.feature_extraction.text import TfidfVectorizer
 import streamlit as st
 from fuzzywuzzy import fuzz
-import subprocess
-import random
 from huggingface_hub import InferenceClient
-import os
-# Initialize the client
 client = InferenceClient(
     model="google/gemma-1.1-7b-it",
     token=HF_API_TOKEN
 )
-# These functions will be called in the app.py file
-# Load spaCy NLP model
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
-    """Evaluates uploaded resumes, filters by keywords and score, and returns shortlisted candidates."""
-    candidates = []
-    removed_candidates = []
     for pdf_file in uploaded_files:
         resume_text = parse_resume(pdf_file)
@@ -47,7 +57,7 @@ def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
         if score < 0.20:
             removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
-            continue  # Skip adding to candidates list
         candidates.append({
             "name": pdf_file.name,
@@ -57,240 +67,194 @@ def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
             "summary": summary
         })
-    # 🔹 Step 2: Filter candidates based on keyword matches
-    filtered_candidates, keyword_removed = filter_resumes_by_keywords(candidates, job_description, min_keyword_match)
-    # 🔹 Step 3: Log removed candidates
     for name in keyword_removed:
         removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})
-    # 🔹 Step 4: Ensure the final list is sorted by score and limit to top 5 candidates
-    shortlisted_candidates = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]
-    # 🔹 Step 5: Ensure return value is always a list
-    if not isinstance(shortlisted_candidates, list):
-        print("⚠️ ERROR: shortlisted_candidates is not a list! Returning empty list.")
-        return [], removed_candidates
-    return shortlisted_candidates, removed_candidates
 def extract_keywords(text, top_n=10):
-    """Extracts key terms from the job description using TF-IDF and spaCy."""
-    if not text.strip():  # Handle empty job descriptions
         return []
     doc = nlp(text.lower())
-    # Extract meaningful words (nouns, proper nouns, verbs, adjectives)
-    keywords = [token.text for token in doc if token.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not token.is_stop]
-    if not keywords:  # If no valid keywords were found, return an empty list
         return []
-    # Use TF-IDF to rank keywords
-    vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
     try:
-        tfidf_matrix = vectorizer.fit_transform([" ".join(keywords)])
-        feature_array = vectorizer.get_feature_names_out()
-        tfidf_scores = tfidf_matrix.toarray()[0]
-        # Sort by highest TF-IDF scores
-        keyword_scores = sorted(zip(feature_array, tfidf_scores), key=lambda x: x[1], reverse=True)
-        return [kw for kw, score in keyword_scores[:top_n]]
-    except ValueError:  # Catch empty vocabulary error
         return []
 def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
-    """Filters resumes based on keyword presence and similarity."""
     job_keywords = extract_keywords(job_description)
-    filtered_resumes = []
-    removed_resumes = []
     if len(job_keywords) < min_keyword_match:
-        st.warning("⚠️ Job description is either too short or absent for keyword filtering.")
-        return resumes, []  # Skip keyword filtering if job description lacks enough keywords
     for resume in resumes:
-        resume_text = resume["resume"].lower()
-        matched_keywords = []
-        # Apply fuzzy matching to allow flexible keyword detection
-        for keyword in job_keywords:
-            for word in resume_text.split():
-                if fuzz.partial_ratio(keyword, word) > 80:  # 80% similarity threshold
-                    matched_keywords.append(keyword)
-        # Enforce minimum keyword matches
-        if len(set(matched_keywords)) >= min_keyword_match:
-            filtered_resumes.append(resume)
-        else:
-            removed_resumes.append(resume["name"])
-    return filtered_resumes, removed_resumes
-def parse_resume(pdf_file):
-    """Extracts text from a resume PDF."""
-    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-    text = "\n".join([page.get_text("text") for page in doc])
-    return text
-def extract_email(resume_text):
-    """Extracts an email address from resume text."""
-    match = re.search(r"[\w\.-]+@[\w\.-]+", resume_text)
-    return match.group(0) if match else None
 def score_candidate(resume_text, job_description):
     """
-    Scores the candidate's resume based on the job description using sentence-transformers.
-    :param resume_text: The extracted resume text.
-    :param job_description: The job description for comparison.
-    :return: A numerical score (cosine similarity between 0 and 1).
     """
     try:
-        # Generate embeddings
-        resume_embedding = embedding_model.encode(resume_text, convert_to_tensor=True)
-        job_embedding = embedding_model.encode(job_description, convert_to_tensor=True)
-        # Compute cosine similarity
-        score = util.pytorch_cos_sim(resume_embedding, job_embedding).item()
-        return round(score, 4)  # Return similarity score rounded to 4 decimal places
-    except Exception as e:
-        print(f"Error computing similarity score: {e}")
-        return 0  # Return 0 if scoring fails
 def summarize_resume(resume_text):
     """
-    Summarizes a resume using the Hugging Face BART model with improved error handling.
     """
     payload = {"inputs": f"Summarize this resume: {resume_text}"}
     response = query(payload, model="bart")
     if not response:
-        print("⚠️ Error: API response is None. Returning fallback summary.")
         return "Summary unavailable due to API issues."
     try:
-        if isinstance(response, list) and len(response) > 0:
-            response = response[0]
-        if isinstance(response, dict):
-            summary = response.get("generated_text") or response.get("summary_text")
-            if summary:
-                return summary.strip()
-            else:
-                print("⚠️ Unexpected API response format:", response)
-                return "Summary unavailable."
     except Exception as e:
-        print(f"⚠️ Error parsing summary: {e}")
         return "Summary unavailable."
-    return "Summary unavailable."
 def store_in_supabase(resume_text, score, candidate_name, email, summary):
     """
-    Stores resume data in Supabase.
-    :param resume_text: The extracted resume text.
-    :param score: The candidate's score (must be a valid number).
-    :param candidate_name: The candidate's name.
-    :param email: Candidate's email address.
-    :param summary: A summarized version of the resume.
     """
-    if score is None:
-        score = 0  # Ensure score is never NULL
     data = {
         "name": candidate_name,
         "resume": resume_text,
-        "score": score,
         "email": email,
         "summary": summary
     }
-    response = supabase.table("candidates").insert(data).execute()
-    return response
-def generate_pdf_report(shortlisted_candidates):
-    """Generates a PDF summary of shortlisted candidates with proper text wrapping."""
     pdf = BytesIO()
     doc = fitz.open()
     for candidate in shortlisted_candidates:
         page = doc.new_page()
-        # Use stored summary, or provide a fallback
-        summary = candidate.get("summary", "No summary available")
-        # Generate interview questions
-        #questions = generate_interview_questions_from_summaries(summary)
-        #questions_text = "\n".join([f"- {q}" for q in questions])
-        # Define text area properties
-        text_box_x = 50  # Left margin
-        text_box_y = 50  # Top margin
-        text_box_width = 500  # Max width before wrapping
-        text_box_height = 700  # Max height before splitting to a new page
-        font_size = 11  # Font size for better readability
-        # Format candidate details
-        candidate_info = (
             f"Candidate: {candidate['name']}\n"
             f"Email: {candidate['email']}\n"
             f"Score: {candidate['score']}\n\n"
-            f"Summary:\n{summary}"
-            #f"Suggested Interview Questions:\n{questions_text}"
         )
-        # Check if the text fits in the allowed area
-        text_rect = fitz.Rect(text_box_x, text_box_y, text_box_x + text_box_width, text_box_y + text_box_height)
-        text_length = page.insert_textbox(text_rect, candidate_info, fontsize=font_size, fontname="helv", align=0)
-        # If text overflows, split across multiple pages
-        while text_length == 0:  # 0 means text didn't fit
-            page = doc.new_page()  # Create new page
-            text_rect = fitz.Rect(text_box_x, text_box_y, text_box_x + text_box_width, text_box_y + text_box_height)
-            text_length = page.insert_textbox(text_rect, candidate_info, fontsize=font_size, fontname="helv", align=0)
     doc.save(pdf)
     pdf.seek(0)
     return pdf
 def generate_interview_questions_from_summaries(candidates):
     """
-    Generates common interview questions based on the combined summaries of shortlisted candidates.
-    Uses the Hugging Face Gemma model to generate questions.
     """
     if not isinstance(candidates, list):
         raise TypeError("Expected a list of candidate dictionaries.")
-    summaries = [c.get("summary", "") for c in candidates if "summary" in c]
-    combined_summary = " ".join(summaries)
     prompt = (
-        "Based on the following summary of this top candidate for a job role, generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n"
-        f"{combined_summary}"
     )
     try:
         response = client.chat_completion(
-            messages=[
-                {"role": "user", "content": prompt}
-            ],
             temperature=0.7,
             max_tokens=500
         )
-        result_text = response.choices[0].message.content
-        questions = [q.strip() for q in result_text.split("\n") if q.strip()]
-        return questions[:5] if questions else ["⚠️ No questions generated."]
     except Exception as e:
         print(f"❌ Error generating interview questions: {e}")
-        return ["⚠️ Error generating questions."]

+# === Imports ===
+# Standard Library
+import os
 import re
+import json
+import random
+import subprocess
 from io import BytesIO
 from collections import Counter
+# Third-Party Libraries
+import fitz  # PyMuPDF
+import requests
+import spacy
 import streamlit as st
 from fuzzywuzzy import fuzz
+from sentence_transformers import SentenceTransformer, util
+from sklearn.feature_extraction.text import TfidfVectorizer
 from huggingface_hub import InferenceClient
+# Local Configuration
+from config import (
+    SUPABASE_URL, SUPABASE_KEY, HF_API_TOKEN, HF_HEADERS,
+    supabase, HF_MODELS, query, embedding_model
+)
+# === Initialization ===
+# Hugging Face inference client for Gemma model
 client = InferenceClient(
     model="google/gemma-1.1-7b-it",
     token=HF_API_TOKEN
 )
+# Load or download spaCy model
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
+# === Core Resume Evaluation ===
 def evaluate_resumes(uploaded_files, job_description, min_keyword_match=2):
+    """
+    Evaluate uploaded resumes and return shortlisted candidates with scores and summaries.
+    """
+    candidates, removed_candidates = [], []
     for pdf_file in uploaded_files:
         resume_text = parse_resume(pdf_file)
         if score < 0.20:
             removed_candidates.append({"name": pdf_file.name, "reason": "Low confidence score (< 0.20)"})
+            continue
         candidates.append({
             "name": pdf_file.name,
             "summary": summary
         })
+    filtered_candidates, keyword_removed = filter_resumes_by_keywords(
+        candidates, job_description, min_keyword_match
+    )
     for name in keyword_removed:
         removed_candidates.append({"name": name, "reason": "Insufficient keyword matches"})
+    shortlisted = sorted(filtered_candidates, key=lambda x: x["score"], reverse=True)[:5]
+    return shortlisted if isinstance(shortlisted, list) else [], removed_candidates
+# === Keyword & Scoring Functions ===
 def extract_keywords(text, top_n=10):
+    """
+    Extracts top keywords from the job description using spaCy and TF-IDF.
+    """
+    if not text.strip():
         return []
     doc = nlp(text.lower())
+    keywords = [t.text for t in doc if t.pos_ in {"NOUN", "PROPN", "VERB", "ADJ"} and not t.is_stop]
+    if not keywords:
         return []
     try:
+        tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
+        matrix = tfidf.fit_transform([" ".join(keywords)])
+        scores = matrix.toarray()[0]
+        features = tfidf.get_feature_names_out()
+        ranked = sorted(zip(features, scores), key=lambda x: x[1], reverse=True)
+        return [kw for kw, _ in ranked[:top_n]]
+    except ValueError:
         return []
 def filter_resumes_by_keywords(resumes, job_description, min_keyword_match=2):
+    """
+    Filters resumes by keyword match using fuzzy logic.
+    """
     job_keywords = extract_keywords(job_description)
     if len(job_keywords) < min_keyword_match:
+        st.warning("⚠️ Job description too short or missing for keyword filtering.")
+        return resumes, []
+    filtered, removed = [], []
     for resume in resumes:
+        matched = {
+            keyword for keyword in job_keywords
+            if any(fuzz.partial_ratio(keyword, word) > 80 for word in resume["resume"].lower().split())
+        }
+        if len(matched) >= min_keyword_match:
+            filtered.append(resume)
+        else:
+            removed.append(resume["name"])
+    return filtered, removed
 def score_candidate(resume_text, job_description):
     """
+    Computes cosine similarity between resume and job description using embeddings.
     """
     try:
+        resume_vec = embedding_model.encode(resume_text, convert_to_tensor=True)
+        job_vec = embedding_model.encode(job_description, convert_to_tensor=True)
+        score = util.pytorch_cos_sim(resume_vec, job_vec).item()
+        return round(score, 4)
+    except Exception as e:
+        print(f"Error computing similarity: {e}")
+        return 0
+# === Text Extraction & Summarization ===
+def parse_resume(pdf_file):
+    """
+    Extracts raw text from a PDF file.
+    """
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    return "\n".join([page.get_text("text") for page in doc])
+def extract_email(resume_text):
+    """
+    Extracts the first valid email found in text.
+    """
+    match = re.search(r"[\w\.-]+@[\w\.-]+", resume_text)
+    return match.group(0) if match else None
 def summarize_resume(resume_text):
     """
+    Generates a summary of the resume using Hugging Face BART.
     """
     payload = {"inputs": f"Summarize this resume: {resume_text}"}
     response = query(payload, model="bart")
     if not response:
         return "Summary unavailable due to API issues."
     try:
+        response = response[0] if isinstance(response, list) else response
+        summary = response.get("generated_text") or response.get("summary_text")
+        return summary.strip() if summary else "Summary unavailable."
     except Exception as e:
+        print(f"Error parsing summary: {e}")
         return "Summary unavailable."
+# === Data Storage & Reporting ===
 def store_in_supabase(resume_text, score, candidate_name, email, summary):
     """
+    Saves candidate data to the Supabase table.
     """
     data = {
         "name": candidate_name,
         "resume": resume_text,
+        "score": score or 0,
         "email": email,
         "summary": summary
     }
+    return supabase.table("candidates").insert(data).execute()
+def generate_pdf_report(shortlisted_candidates, questions=None):
+    """
+    Creates a PDF report summarizing top candidates and interview questions.
+    """
     pdf = BytesIO()
     doc = fitz.open()
     for candidate in shortlisted_candidates:
         page = doc.new_page()
+        info = (
             f"Candidate: {candidate['name']}\n"
             f"Email: {candidate['email']}\n"
             f"Score: {candidate['score']}\n\n"
+            f"Summary:\n{candidate.get('summary', 'No summary available')}"
         )
+        page.insert_textbox(fitz.Rect(50, 50, 550, 750), info, fontsize=11, fontname="helv", align=0)
+    if questions:
+        q_page = doc.new_page()
+        q_text = "Suggested Interview Questions:\n\n" + "\n".join(questions)
+        q_page.insert_textbox(fitz.Rect(50, 50, 550, 750), q_text, fontsize=11, fontname="helv", align=0)
     doc.save(pdf)
     pdf.seek(0)
     return pdf
 def generate_interview_questions_from_summaries(candidates):
     """
+    Generates 5 interview questions based on combined summaries using Gemma model.
     """
     if not isinstance(candidates, list):
         raise TypeError("Expected a list of candidate dictionaries.")
+    summaries = " ".join(c.get("summary", "") for c in candidates)
     prompt = (
+        "Based on the following summary of this top candidate for a job role, "
+        "generate 5 thoughtful, general interview questions that would help a recruiter assess their fit:\n"
+        f"{summaries}"
     )
     try:
         response = client.chat_completion(
+            messages=[{"role": "user", "content": prompt}],
             temperature=0.7,
             max_tokens=500
         )
+        result = response.choices[0].message.content
+        questions = [re.sub(r"^(?:\*\*)?(Q?\d+[\.\)\-]?\s*)+(?:\*\*)?", "", q.strip())
+                     for q in result.split("\n") if q.strip()]
+        return [f"Q{i+1}. {q}" for i, q in enumerate(questions[:5])] or ["⚠️ No questions generated."]
     except Exception as e:
         print(f"❌ Error generating interview questions: {e}")
+        return ["⚠️ Error generating questions."]