File size: 7,249 Bytes
9e79309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a52026
9e79309
 
 
 
 
 
 
3a52026
9e79309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a52026
9e79309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a52026
9e79309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a52026
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import os
import re
import fitz  # Importing PyMuPDF for PDF text extraction
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import gradio as gr

# Fetch the NLTK data packages this script asks for, at import time.
# NOTE(review): no direct use of these packages is visible in this file —
# confirm they are still required before keeping the download.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# Function to preprocess text
def preprocess_text(text):
    """Lower-case *text* and collapse each run of non-word characters to a space.

    "Non-word" is the regex non-word class, so letters, digits and
    underscores survive; punctuation and whitespace runs each become a
    single space. Leading and trailing separators are not stripped.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(text, max_features=50):
    """Return the terms of *text* ranked by TF-IDF weight, highest first.

    The vectorizer is fitted on this single document only, with English
    stop words removed and at most *max_features* terms kept.

    Args:
        text: The (ideally pre-processed) document to analyse.
        max_features: Upper bound on the number of terms the vectorizer keeps.

    Returns:
        A list of terms sorted by descending score; ties are broken in
        reverse alphabetical order because (score, term) tuples are sorted
        in reverse. An empty list is returned when *text* is empty,
        whitespace-only, or contains nothing but stop words.
    """
    # Nothing to rank; also avoids the vectorizer's "empty vocabulary" error.
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError as err:
        # scikit-learn signals "no usable tokens" with a ValueError whose
        # message mentions an empty vocabulary; any other ValueError is a
        # genuine problem and must propagate.
        if "empty vocabulary" not in str(err):
            raise
        return []

    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray().flatten()
    ranked = sorted(zip(scores, feature_names), reverse=True)
    return [keyword for _, keyword in ranked]

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, with no separator
        added between pages.
    """
    # The context manager closes the document even if text extraction
    # fails part-way; previously the handle was never released.
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)

# Function to give feedback on resume
def give_feedback(resume_text, job_description):
    """Build a list of human-readable improvement suggestions for a resume.

    Args:
        resume_text: Raw text of the resume.
        job_description: Raw text of the job description to compare against.

    Returns:
        A list of suggestion strings. When no check fires, the list holds a
        single positive message instead.
    """
    feedback = []
    lowered = resume_text.lower()

    # Formatting: flag mixed bullet styles. Only a hyphen that starts a line
    # counts as a bullet; testing for any '-' in the text also fires on date
    # ranges, phone numbers and hyphenated words.
    uses_dot_bullets = '•' in resume_text
    uses_dash_bullets = re.search(r'^\s*-\s', resume_text, re.MULTILINE) is not None
    if uses_dot_bullets and uses_dash_bullets:
        feedback.append("Consider using a consistent bullet point style throughout your resume.")

    # Structure: at least one of the standard section headings should appear.
    if not re.search(r'\b(?:experience|education|skills)\b', lowered):
        feedback.append("Make sure your resume includes sections like Experience, Education, and Skills.")

    # Relevance: compare the top TF-IDF keywords of both documents.
    jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
    resume_keywords = extract_keywords_tfidf(preprocess_text(resume_text))

    common_keywords = set(jd_keywords).intersection(resume_keywords)
    if len(common_keywords) < 8:
        feedback.append(f"Your resume could better match the job description. Consider adding keywords such as: {', '.join(jd_keywords[:5])}.")

    # Action verbs: match whole words only, otherwise "led" is found inside
    # "called" or "scheduled" and the check practically never fires.
    action_verbs = ["managed", "led", "developed", "designed", "implemented", "created"]
    action_verb_pattern = r'\b(?:' + '|'.join(action_verbs) + r')\b'
    if not re.search(action_verb_pattern, lowered):
        feedback.append("Consider using strong action verbs to describe your achievements and responsibilities.")

    # Summary / objective statement.
    if not re.search(r'\bsummary\b|\bobjective\b', resume_text, re.IGNORECASE):
        feedback.append("Consider adding a professional summary or objective statement to provide a quick overview of your qualifications.")

    # Quantifiable achievements: any digit at all is accepted as evidence.
    if not re.search(r'\d', resume_text):
        feedback.append("Include quantifiable achievements in your experience section (e.g., increased sales by 20%).")

    # Provide positive feedback if none of the checks above fired.
    if not feedback:
        feedback.append("Your resume is well-aligned with the job description. Ensure to keep it updated with relevant keywords and achievements.")

    return feedback

# Function to calculate TF-IDF cosine similarity score
def tfidf_cosine_similarity(resume, jd):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh vectorizer is fitted on exactly the two documents, so the
    vocabulary is the union of the terms of *resume* and *jd*.
    """
    matrix = TfidfVectorizer().fit_transform([resume, jd])
    # Keep both operands two-dimensional (one row each) for the pairwise call.
    resume_row, jd_row = matrix[:1], matrix[1:]
    return cosine_similarity(resume_row, jd_row)[0][0]

# Function to calculate Doc2Vec cosine similarity score
def doc2vec_cosine_similarity(resume, jd, model):
    """Return the cosine similarity of the vectors *model* infers for two texts.

    Each text is split on whitespace and passed to ``model.infer_vector``;
    the resume is inferred first, then the job description.
    """
    resume_vec, jd_vec = (model.infer_vector(doc.split()) for doc in (resume, jd))
    similarity = cosine_similarity([resume_vec], [jd_vec])
    return similarity[0][0]

# Function to extract years of experience from resume
def extract_years_of_experience(text):
    """Return the sum of every "<number> year(s)" figure mentioned in *text*.

    Matching is case-insensitive and accepts an optional "+" after the
    number and optional whitespace, so "5 years", "5+ years", "5 + years"
    and "5years" all count as 5. The word must end after "year" or
    "years", so "3 yearly" is not counted.

    Args:
        text: Raw resume text.

    Returns:
        The integer total of all matched figures (repeated mentions are
        added together), or 0 when nothing matches.
    """
    matches = re.findall(r'(\d+)\s*\+?\s*years?\b', text, re.IGNORECASE)
    # sum() of an empty sequence is already 0, so no special case is needed.
    return sum(int(value) for value in matches)

# Function to extract information from resumes in a folder
def extract_info_from_resumes(resume_files, job_description):
    """Score every uploaded resume against a job description.

    Args:
        resume_files: Iterable of uploaded resumes. Each item may be a path
            string or an object exposing the path as ``.name``.
        job_description: Raw text of the job description.

    Returns:
        A list with one dict per resume holding the keys ``Name``,
        ``Keyword_Match_Score``, ``TFIDF_Score``, ``Doc2Vec_Score``,
        ``Years_of_Experience`` and ``Feedback``. An empty list is returned
        when no resumes are given.
    """
    # Accept both plain path strings and upload objects that carry ``.name``.
    paths = [getattr(file, 'name', file) for file in (resume_files or [])]
    if not paths:
        return []

    # Parse each PDF once and reuse the text for both training and scoring.
    raw_texts = [extract_text_from_pdf(path) for path in paths]
    clean_texts = [preprocess_text(text) for text in raw_texts]

    # Everything derived from the job description is loop-invariant.
    clean_jd = preprocess_text(job_description)
    jd_keywords = set(extract_keywords_tfidf(clean_jd))

    # Train Doc2Vec on the resumes plus the job description (tagged by index).
    corpus = clean_texts + [clean_jd]
    tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(corpus)]
    model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)

    data = []
    for path, text, preprocessed_text in zip(paths, raw_texts, clean_texts):
        resume_keywords = extract_keywords_tfidf(preprocessed_text)
        years_of_experience = extract_years_of_experience(text)

        # Append years of experience to the resume keywords
        if years_of_experience > 0:
            resume_keywords.append(f"{years_of_experience} years experience")

        # The resume is identified by its file name without the extension.
        name = os.path.splitext(os.path.basename(path))[0]

        feedback = give_feedback(text, job_description)

        # Calculate scores
        keyword_match_score = len(jd_keywords.intersection(resume_keywords))
        tfidf_score = tfidf_cosine_similarity(text, job_description)
        doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, clean_jd, model)

        data.append({
            'Name': name,
            'Keyword_Match_Score': keyword_match_score,  # Whole number
            'TFIDF_Score': tfidf_score,
            'Doc2Vec_Score': doc2vec_score,
            'Years_of_Experience': years_of_experience,
            'Feedback': '; '.join(feedback),  # Combine feedback into a single string
        })

    return data

# Function to save data to an Excel file
def save_to_excel(data, output_file):
    """Write *data* to *output_file* as an Excel sheet without the index column.

    Returns *output_file* on success. If writing raises, the exception is
    swallowed and a string starting with "Error saving file: " is returned
    instead, so callers must inspect the return value.
    """
    frame = pd.DataFrame(data)
    try:
        frame.to_excel(output_file, index=False)
    except Exception as exc:
        return f"Error saving file: {exc}"
    else:
        return output_file

# Gradio interface function
def gradio_interface(resume_files, job_description):
    """Gradio callback: analyse the uploaded resumes and return the result.

    Returns whatever ``save_to_excel`` returns for the fixed output path
    (the path itself, or its error string), or the literal message
    "No resumes to process." when nothing was uploaded.

    NOTE(review): the output path lives under '/content', which looks
    notebook-specific — confirm that directory exists where this runs.
    NOTE(review): the non-path strings returned here are fed to a file
    output component — verify how the UI presents them.
    """
    # Guard clause: nothing uploaded, nothing to analyse.
    if not resume_files:
        return "No resumes to process."

    output_file = '/content/Resume_Analysis.xlsx'
    resumes = extract_info_from_resumes(resume_files, job_description)
    return save_to_excel(resumes, output_file)

# Gradio UI: a multi-file resume upload and a free-text job description go in,
# one downloadable results file comes out.
resume_upload = gr.Files(label="Upload multiple Resumes", type="filepath")
job_description_box = gr.Textbox(
    label="Job Description",
    lines=5,
    placeholder="Enter the job description here...",
)
results_download = gr.File(label="Download Results")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=[resume_upload, job_description_box],
    outputs=results_download,
    description="Upload multiple resume PDFs and provide a job description to analyze the resumes and get an Excel file with the results.",
)

# Start serving the interface as soon as this module is executed.
iface.launch()