import os
import re
import fitz  # PyMuPDF, used for PDF text extraction
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import gradio as gr
# Download NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text.lower())  # Lowercase, then collapse runs of non-word characters into single spaces
    return text
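# Quick illustration: punctuation runs collapse to single spaces, so a trailing
# space can remain, e.g. preprocess_text("Hello, World!") -> "hello world "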
# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(text, max_features=50):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform([text])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray().flatten()
    keyword_scores = sorted(zip(tfidf_scores, feature_names), reverse=True)
    return [keyword for score, keyword in keyword_scores]
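# Note: fitting TF-IDF on a single document makes every IDF weight identical,
# so this ranking effectively reduces to (normalized) term frequency; e.g.
# extract_keywords_tfidf("python python gradio") ranks "python" first.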
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    document = fitz.open(pdf_path)
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    document.close()  # Release the file handle once all pages are read
    return text
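# Note: page.get_text() uses PyMuPDF's plain-text extraction, which follows the
# order of text blocks in the PDF; for multi-column resume layouts this may not
# match the visual reading order exactly.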
# Function to give feedback on a resume
def give_feedback(resume_text, job_description):
    feedback = []

    # Check formatting (example: consistency in bullet points)
    if '•' in resume_text and '-' in resume_text:
        feedback.append("Consider using a consistent bullet point style throughout your resume.")

    # Check for standard section headings
    if not re.search(r'\bexperience\b|\beducation\b|\bskills\b', resume_text, re.IGNORECASE):
        feedback.append("Make sure your resume includes sections like Experience, Education, and Skills.")

    # Extract keywords and check relevance against the job description
    jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
    resume_keywords = extract_keywords_tfidf(preprocess_text(resume_text))
    common_keywords = set(jd_keywords).intersection(set(resume_keywords))
    if len(common_keywords) < 8:
        feedback.append(f"Your resume could better match the job description. Consider adding keywords such as: {', '.join(jd_keywords[:5])}.")

    # Check for action verbs
    action_verbs = ["managed", "led", "developed", "designed", "implemented", "created"]
    if not any(verb in resume_text.lower() for verb in action_verbs):
        feedback.append("Consider using strong action verbs to describe your achievements and responsibilities.")

    # Check for a summary or objective statement
    if not re.search(r'\bsummary\b|\bobjective\b', resume_text, re.IGNORECASE):
        feedback.append("Consider adding a professional summary or objective statement to provide a quick overview of your qualifications.")

    # Check for quantifiable achievements
    if not re.findall(r'\d+', resume_text):
        feedback.append("Include quantifiable achievements in your experience section (e.g., increased sales by 20%).")

    # Provide positive feedback if none of the checks above fired
    if not feedback:
        feedback.append("Your resume is well-aligned with the job description. Ensure to keep it updated with relevant keywords and achievements.")

    return feedback
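# Note: the checks above are simple keyword and regex heuristics; they do not
# perform real grammar or spell checking, so the feedback is best treated as a
# rough screening aid rather than an authoritative review.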
# Function to calculate TF-IDF cosine similarity score
def tfidf_cosine_similarity(resume, jd):
    documents = [resume, jd]
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]
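# Illustrative behaviour (hypothetical inputs): identical texts score ~1.0 and
# texts with disjoint vocabularies score 0.0, e.g.
#   tfidf_cosine_similarity("python developer", "python developer")  # ~1.0
#   tfidf_cosine_similarity("python developer", "sales manager")     # 0.0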
# Function to calculate Doc2Vec cosine similarity score
def doc2vec_cosine_similarity(resume, jd, model):
    resume_vector = model.infer_vector(resume.split())
    jd_vector = model.infer_vector(jd.split())
    cosine_sim = cosine_similarity([resume_vector], [jd_vector])
    return cosine_sim[0][0]
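# Note: Doc2Vec.infer_vector() is stochastic, so repeated calls on the same
# text can yield slightly different vectors (and scores) unless the model's
# random seed is pinned.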
# Function to extract years of experience from a resume
def extract_years_of_experience(text):
    years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
    if years:
        return sum(map(int, years))
    return 0
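# Caveat: this sums every "<N> years" mention in the text, so "3 years at X"
# plus "2 years at Y" gives 5, but repeated mentions of the same tenure
# (e.g. "10 years" stated twice) double-count.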
# Function to extract information from a set of uploaded resumes
def extract_info_from_resumes(resume_files, job_description):
    data = []

    # Train a Doc2Vec model on the resumes plus the job description
    documents = []
    for file in resume_files:
        text = extract_text_from_pdf(file.name)
        documents.append(preprocess_text(text))
    documents.append(preprocess_text(job_description))
    tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)

    for file in resume_files:
        text = extract_text_from_pdf(file.name)
        preprocessed_text = preprocess_text(text)
        resume_keywords = extract_keywords_tfidf(preprocessed_text)
        years_of_experience = extract_years_of_experience(text)

        # Append years of experience to the resume keywords
        if years_of_experience > 0:
            resume_keywords.append(f"{years_of_experience} years experience")

        name = os.path.splitext(os.path.basename(file.name))[0]
        feedback = give_feedback(text, job_description)

        # Calculate scores
        jd_keywords = extract_keywords_tfidf(preprocess_text(job_description))
        common_keywords = set(jd_keywords).intersection(set(resume_keywords))
        keyword_match_score = len(common_keywords)  # Count of common keywords, as a whole number
        tfidf_score = tfidf_cosine_similarity(text, job_description)
        doc2vec_score = doc2vec_cosine_similarity(preprocessed_text, preprocess_text(job_description), model)

        data.append({
            'Name': name,
            'Keyword_Match_Score': keyword_match_score,  # Whole number
            'TFIDF_Score': tfidf_score,
            'Doc2Vec_Score': doc2vec_score,
            'Years_of_Experience': years_of_experience,
            'Feedback': '; '.join(feedback),  # Combine feedback into a single string
        })

    return data
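# Note: jd_keywords is recomputed for every resume even though the job
# description never changes inside the loop; hoisting that call above the loop
# would avoid redundant work.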
# Function to save data to an Excel file
def save_to_excel(data, output_file):
    df = pd.DataFrame(data)
    try:
        df.to_excel(output_file, index=False)
        return output_file
    except Exception as e:
        return f"Error saving file: {e}"
# Gradio interface function
def gradio_interface(resume_files, job_description):
    if resume_files:
        output_file = '/content/Resume_Analysis.xlsx'  # Colab-style path; adjust for other environments
        resumes = extract_info_from_resumes(resume_files, job_description)
        result = save_to_excel(resumes, output_file)
    else:
        result = "No resumes to process."
    return result
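# Caveat: both the "No resumes to process." message and any error string from
# save_to_excel() are returned where the UI expects a file path (see the
# gr.File output below), so failures may surface as a broken download rather
# than a readable message.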
# Gradio UI setup
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Files(label="Upload multiple Resumes", type="filepath"),  # Accept multiple file uploads
        gr.Textbox(label="Job Description", lines=5, placeholder="Enter the job description here...")
    ],
    outputs=gr.File(label="Download Results"),  # Provide the output file
    description="Upload multiple resume PDFs and provide a job description to analyze the resumes and get an Excel file with the results."
)
# Launch the Gradio interface
iface.launch()
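# Note: when running in a hosted notebook (e.g. Colab), iface.launch(share=True)
# can be used to expose a temporary public URL for the interface.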