import os
import re
import fitz  # PyMuPDF, used for PDF text extraction
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import gradio as gr
# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text.lower())  # Lowercase, then collapse runs of non-word characters into single spaces
    return text
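# Quick illustration: punctuation runs collapse to single spaces, so a trailing
# space can remain, e.g. preprocess_text("Hello, World!") -> "hello world "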
# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(text, max_features=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray().flatten()
    keyword_scores = sorted(zip(tfidf_scores, feature_names), reverse=True)
    return [keyword for score, keyword in keyword_scores]
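# Note: fitting TF-IDF on a single document makes every IDF weight identical,
# so this ranking effectively reduces to (normalized) term frequency; e.g.
# extract_keywords_tfidf("python python gradio") ranks "python" first.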
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    document.close()  # Release the file handle once all pages are read
    return text
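# Note: page.get_text() uses PyMuPDF's plain-text extraction, which follows the
# order of text blocks in the PDF; for multi-column resume layouts this may not
# match the visual reading order exactly.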
# Function to give feedback on a resume
def give_feedback(resume_text, job_description):
    feedback = []

    # Check formatting (example: consistency in bullet points)
    if '•' in resume_text and '-' in resume_text:
        feedback.append("Consider using a consistent bullet point style throughout your resume.")

    # Check for standard section headings
    if not re.search(r'\bexperience\b|\beducation\b|\bskills\b', resume_text, re.IGNORECASE):
        feedback.append("Make sure your resume includes sections like Experience, Education, and Skills.")

    # Extract keywords and check relevance against the job description
    jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
    resume_keywords = extract_keywords_tfidf(preprocess_text(resume_text))
    common_keywords = set(jd_keywords).intersection(set(resume_keywords))
    if len(common_keywords) < 8:
        feedback.append(f"Your resume could better match the job description. Consider adding keywords such as: {', '.join(jd_keywords[:5])}.")

    # Check for action verbs
    action_verbs = ["managed", "led", "developed", "designed", "implemented", "created"]
    if not any(verb in resume_text.lower() for verb in action_verbs):
        feedback.append("Consider using strong action verbs to describe your achievements and responsibilities.")

    # Check for a summary or objective statement
    if not re.search(r'\bsummary\b|\bobjective\b', resume_text, re.IGNORECASE):
        feedback.append("Consider adding a professional summary or objective statement to provide a quick overview of your qualifications.")

    # Check for quantifiable achievements
    if not re.findall(r'\d+', resume_text):
        feedback.append("Include quantifiable achievements in your experience section (e.g., increased sales by 20%).")

    # Provide positive feedback if none of the checks above fired
    if not feedback:
        feedback.append("Your resume is well-aligned with the job description. Ensure to keep it updated with relevant keywords and achievements.")

    return feedback
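# Note: the checks above are simple keyword and regex heuristics; they do not
# perform real grammar or spell checking, so the feedback is best treated as a
# rough screening aid rather than an authoritative review.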
# Function to calculate TF-IDF cosine similarity score
def tfidf_cosine_similarity(resume, jd):
    documents = [resume, jd]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]
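# Illustrative behaviour (hypothetical inputs): identical texts score ~1.0 and
# texts with disjoint vocabularies score 0.0, e.g.
#   tfidf_cosine_similarity("python developer", "python developer")  # ~1.0
#   tfidf_cosine_similarity("python developer", "sales manager")     # 0.0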
# Function to calculate Doc2Vec cosine similarity score
def doc2vec_cosine_similarity(resume, jd, model):
    resume_vector = model.infer_vector(resume.split())
    jd_vector = model.infer_vector(jd.split())
    cosine_sim = cosine_similarity([resume_vector], [jd_vector])
    return cosine_sim[0][0]
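# Note: Doc2Vec.infer_vector() is stochastic, so repeated calls on the same
# text can yield slightly different vectors (and scores) unless the model's
# random seed is pinned.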
# Function to extract years of experience from a resume
def extract_years_of_experience(text):
    years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
    if years:
        return sum(map(int, years))
    return 0
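# Caveat: this sums every "<N> years" mention in the text, so "3 years at X"
# plus "2 years at Y" gives 5, but repeated mentions of the same tenure
# (e.g. "10 years" stated twice) double-count.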
# Function to extract information from a set of uploaded resumes
def extract_info_from_resumes(resume_files, job_description):
    data = []

    # Train a Doc2Vec model on the resumes plus the job description
    documents = []
    for file in resume_files:
        text = extract_text_from_pdf(file.name)
        documents.append(preprocess_text(text))
    documents.append(preprocess_text(job_description))
    tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)

    for file in resume_files:
        text = extract_text_from_pdf(file.name)
        preprocessed_text = preprocess_text(text)
        resume_keywords = extract_keywords_tfidf(preprocessed_text)
        years_of_experience = extract_years_of_experience(text)

        # Append years of experience to the resume keywords
        if years_of_experience > 0:
            resume_keywords.append(f"{years_of_experience} years experience")

        name = os.path.splitext(os.path.basename(file.name))[0]
        feedback = give_feedback(text, job_description)

        # Calculate scores
        jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
        common_keywords = set(jd_keywords).intersection(set(resume_keywords))
        keyword_match_score = len(common_keywords)  # Count of common keywords, as a whole number
        tfidf_score = tfidf_cosine_similarity(text, job_description)
        doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, preprocess_text(job_description), model)

        data.append({
            'Name': name,
            'Keyword_Match_Score': keyword_match_score,  # Whole number
            'TFIDF_Score': tfidf_score,
            'Doc2Vec_Score': doc2vec_score,
            'Years_of_Experience': years_of_experience,
            'Feedback': '; '.join(feedback),  # Combine feedback into a single string
        })

    return data
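# Note: jd_keywords is recomputed for every resume even though the job
# description never changes inside the loop; hoisting that call above the loop
# would avoid redundant work.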
# Function to save data to an Excel file
def save_to_excel(data, output_file):
    df = pd.DataFrame(data)
    try:
        df.to_excel(output_file, index=False)
        return output_file
    except Exception as e:
        return f"Error saving file: {e}"
# Gradio interface function
def gradio_interface(resume_files, job_description):
    if resume_files:
        output_file = '/content/Resume_Analysis.xlsx'  # Colab-style path; adjust for other environments
        resumes = extract_info_from_resumes(resume_files, job_description)
        result = save_to_excel(resumes, output_file)
    else:
        result = "No resumes to process."
    return result
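# Caveat: both the "No resumes to process." message and any error string from
# save_to_excel() are returned where the UI expects a file path (see the
# gr.File output below), so failures may surface as a broken download rather
# than a readable message.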
# Gradio UI setup
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Files(label="Upload multiple Resumes", type="filepath"),  # Accept multiple file uploads
        gr.Textbox(label="Job Description", lines=5, placeholder="Enter the job description here...")
    ],
    outputs=gr.File(label="Download Results"),  # Provide the output file
    description="Upload multiple resume PDFs and provide a job description to analyze the resumes and get an Excel file with the results."
)
# Launch the Gradio interface
iface.launch()
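# Note: when running in a hosted notebook (e.g. Colab), iface.launch(share=True)
# can be used to expose a temporary public URL for the interface.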