# Spaces: Sleeping -- status banner captured from the hosting page together
# with the code; it is not part of the program.
import os
import re
import tempfile

import fitz  # Importing PyMuPDF for PDF text extraction
import gradio as gr
import nltk
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Download NLTK data files (tokenizer models and stop-word lists) at import
# time so they are present before any request is served.
# NOTE(review): no code visible in this file calls NLTK tokenizers or stop
# words (stop-word removal is done by scikit-learn) -- confirm these
# downloads are still needed.
nltk.download('punkt')
nltk.download('stopwords')
# Function to preprocess text
def preprocess_text(text):
    """Return *text* lower-cased with punctuation collapsed to single spaces.

    Every run of non-word characters (anything other than letters, digits
    and the underscore) is replaced by one space, and leading/trailing
    whitespace is removed.

    Args:
        text: Raw text; ``None`` or an empty string is accepted.

    Returns:
        The normalised string, or ``""`` when there is nothing to normalise.
    """
    if not text:
        return ""
    # \W keeps underscores, so identifiers such as "python_3" stay intact.
    return re.sub(r'\W+', ' ', text.lower()).strip()
# Function to extract keywords using TF-IDF
def extract_keywords_tfidf(text, max_features=50):
    """Return the terms of *text* ranked by TF-IDF weight, highest first.

    The vectorizer is fitted on *text* alone, with English stop words
    removed and the vocabulary capped at *max_features* terms.

    Args:
        text: Document to extract keywords from.
        max_features: Maximum number of terms kept by the vectorizer.

    Returns:
        A list of keywords in descending score order; an empty list when
        *text* is blank or contains no usable terms.
    """
    if not text or not text.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError as err:
        # scikit-learn signals "nothing left after stop-word removal" with a
        # ValueError; any other ValueError is a real problem and is re-raised.
        if "empty vocabulary" not in str(err):
            raise
        return []

    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray().ravel()
    # Sorting (score, term) pairs in reverse puts the highest score first.
    ranked = sorted(zip(tfidf_scores, feature_names), reverse=True)
    return [keyword for _score, keyword in ranked]
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as document:
        return "".join(page.get_text() for page in document)
# Function to give feedback on resume
def give_feedback(resume_text, job_description):
    """Return a list of improvement suggestions for a resume.

    The checks are simple text heuristics: bullet-style consistency,
    presence of standard section headings, keyword overlap with the job
    description, action verbs, a summary/objective statement and at least
    one number.

    Args:
        resume_text: Raw text of the resume.
        job_description: Raw text of the job description.

    Returns:
        A non-empty list of feedback strings; a single positive message
        when no check fires.
    """
    min_common_keywords = 8
    max_suggestions = 5
    action_verbs = ["managed", "led", "developed", "designed", "implemented", "created"]

    def keywords_or_empty(text):
        # The vectorizer raises ValueError for text with no usable terms;
        # treat that as "no keywords" instead of aborting the whole review.
        try:
            return extract_keywords_tfidf(preprocess_text(text))
        except ValueError:
            return []

    feedback = []
    resume_lower = resume_text.lower()

    # Check formatting: only bullets at the start of a line count, so that
    # hyphenated words and date ranges are not mistaken for "-" bullets.
    has_dot_bullets = re.search(r'^\s*•', resume_text, re.MULTILINE)
    has_dash_bullets = re.search(r'^\s*-\s', resume_text, re.MULTILINE)
    if has_dot_bullets and has_dash_bullets:
        feedback.append("Consider using a consistent bullet point style throughout your resume.")

    # Check that at least one standard section heading is present
    if not re.search(r'\b(?:experience|education|skills)\b', resume_lower):
        feedback.append("Make sure your resume includes sections like Experience, Education, and Skills.")

    # Extract keywords and check relevance
    jd_keywords = keywords_or_empty(job_description)
    resume_keyword_set = set(keywords_or_empty(resume_text))
    common_keywords = resume_keyword_set.intersection(jd_keywords)
    # Suggest only keywords the resume does not already contain.
    missing_keywords = [kw for kw in jd_keywords if kw not in resume_keyword_set]
    if len(common_keywords) < min_common_keywords and missing_keywords:
        feedback.append(f"Your resume could better match the job description. Consider adding keywords such as: {', '.join(missing_keywords[:max_suggestions])}.")

    # Check for action verbs as whole words ("led" must not match "scheduled")
    verb_pattern = r'\b(?:' + '|'.join(map(re.escape, action_verbs)) + r')\b'
    if not re.search(verb_pattern, resume_lower):
        feedback.append("Consider using strong action verbs to describe your achievements and responsibilities.")

    # Check for a summary or objective statement
    if not re.search(r'\bsummary\b|\bobjective\b', resume_text, re.IGNORECASE):
        feedback.append("Consider adding a professional summary or objective statement to provide a quick overview of your qualifications.")

    # Check for quantifiable achievements (any digit at all)
    if not re.search(r'\d', resume_text):
        feedback.append("Include quantifiable achievements in your experience section (e.g., increased sales by 20%).")

    # Provide positive feedback if none of the above conditions are met
    if not feedback:
        feedback.append("Your resume is well-aligned with the job description. Ensure to keep it updated with relevant keywords and achievements.")
    return feedback
# Function to calculate TF-IDF cosine similarity score
def tfidf_cosine_similarity(resume, jd):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    The vectorizer is fitted on exactly these two documents.

    Args:
        resume: Resume text.
        jd: Job-description text.

    Returns:
        The similarity as a float; ``0.0`` when either text is blank or
        the pair yields no vocabulary at all.
    """
    if not resume or not jd or not resume.strip() or not jd.strip():
        # A blank document has an all-zero vector, so the similarity is 0.
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([resume, jd])
    except ValueError as err:
        # Raised when neither text contains a token the vectorizer accepts.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0

    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(similarity[0][0])
# Function to calculate Doc2Vec cosine similarity score
def doc2vec_cosine_similarity(resume, jd, model):
    """Return the cosine similarity of the inferred Doc2Vec vectors of two texts.

    Both texts are split on whitespace and passed to ``model.infer_vector``.

    Args:
        resume: Resume text, already preprocessed by the caller.
        jd: Job-description text, already preprocessed by the caller.
        model: Object exposing ``infer_vector(list_of_tokens)``.

    Returns:
        The similarity as a float; ``0.0`` when either text has no tokens,
        because a vector inferred from nothing carries no information.
    """
    resume_tokens = resume.split()
    jd_tokens = jd.split()
    if not resume_tokens or not jd_tokens:
        return 0.0

    resume_vector = model.infer_vector(resume_tokens)
    jd_vector = model.infer_vector(jd_tokens)
    similarity = cosine_similarity([resume_vector], [jd_vector])
    return float(similarity[0][0])
# Function to extract years of experience from resume
def extract_years_of_experience(text):
    """Return the sum of all "<N> year(s)" mentions found in *text*.

    Recognised forms include "5 years", "5+ years", "3-year" and "1 Year"
    (case-insensitive). Decimal figures such as "1.5 years" are skipped
    rather than misread as their fractional digits.

    Args:
        text: Resume text to scan.

    Returns:
        The total as an int; ``0`` when no mention is found.
    """
    # (?<![\d.]) anchors at the start of a whole number; \b after "years?"
    # rejects words like "yearly".
    pattern = r'(?<![\d.])(\d+)\s*\+?\s*-?\s*years?\b'
    return sum(int(count) for count in re.findall(pattern, text or "", re.IGNORECASE))
# Function to extract information from resumes in a folder
def extract_info_from_resumes(resume_files, job_description):
    """Score every uploaded resume against the job description.

    Each PDF is read once. A Doc2Vec model is trained on all resumes plus
    the job description, then each resume gets a keyword-overlap count, a
    TF-IDF cosine similarity, a Doc2Vec cosine similarity, a years-of-
    experience total and textual feedback.

    Args:
        resume_files: Iterable of uploaded files, either path strings or
            objects whose ``name`` attribute holds the path.
        job_description: Raw job-description text.

    Returns:
        A list with one dict per resume, keyed ``Name``,
        ``Keyword_Match_Score``, ``TFIDF_Score``, ``Doc2Vec_Score``,
        ``Years_of_Experience`` and ``Feedback``.
    """
    # Job-description features are identical for every resume: compute once.
    jd_clean = preprocess_text(job_description)
    jd_keywords = set(extract_keywords_tfidf(jd_clean))

    # Parse each PDF a single time and keep both raw and normalised text.
    resumes = []
    for file in resume_files:
        path = getattr(file, "name", file)  # accept file objects and plain paths
        raw_text = extract_text_from_pdf(path)
        resumes.append((path, raw_text, preprocess_text(raw_text)))

    # Train Doc2Vec model on resumes and job description
    corpus = [clean_text for _path, _raw, clean_text in resumes]
    corpus.append(jd_clean)
    tagged_docs = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(corpus)]
    model = Doc2Vec(tagged_docs, vector_size=50, window=2, min_count=1, workers=4)

    data = []
    for path, raw_text, clean_text in resumes:
        common_keywords = jd_keywords.intersection(extract_keywords_tfidf(clean_text))
        feedback = give_feedback(raw_text, job_description)
        data.append({
            'Name': os.path.splitext(os.path.basename(path))[0],
            'Keyword_Match_Score': len(common_keywords),  # Whole number
            'TFIDF_Score': tfidf_cosine_similarity(raw_text, job_description),
            'Doc2Vec_Score': doc2vec_cosine_similarity(clean_text, jd_clean, model),
            'Years_of_Experience': extract_years_of_experience(raw_text),
            'Feedback': '; '.join(feedback),  # Combine feedback into a single string
        })
    return data
# Function to save data to an Excel file
def save_to_excel(data, output_file):
    """Write *data* to an Excel workbook at *output_file*.

    The parent directory is created when it does not exist yet.

    Args:
        data: Records accepted by ``pandas.DataFrame`` (e.g. a list of dicts).
        output_file: Destination path of the workbook.

    Returns:
        *output_file* on success, otherwise a string starting with
        ``"Error saving file: "`` that describes the failure.
    """
    df = pd.DataFrame(data)
    try:
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        df.to_excel(output_file, index=False)
    except Exception as e:
        # Best-effort by design: callers receive the message instead of a crash.
        return f"Error saving file: {e}"
    return output_file
# Gradio interface function
def gradio_interface(resume_files, job_description):
    """Gradio callback: analyse the uploaded resumes and return a workbook path.

    Args:
        resume_files: Files uploaded through the ``gr.Files`` input.
        job_description: Text typed into the job-description box.

    Returns:
        Path of the generated ``Resume_Analysis.xlsx`` file.

    Raises:
        gr.Error: When no resume or no job description was supplied, or
            when the workbook could not be written. The output component
            is a file, so a plain message string cannot be returned to it.
    """
    if not resume_files:
        raise gr.Error("No resumes to process.")
    if not job_description or not job_description.strip():
        raise gr.Error("Please enter a job description.")

    # A fresh temp directory works on any host and keeps concurrent
    # requests from overwriting each other's results.
    output_file = os.path.join(tempfile.mkdtemp(), 'Resume_Analysis.xlsx')
    resumes = extract_info_from_resumes(resume_files, job_description)
    result = save_to_excel(resumes, output_file)
    if result != output_file:
        # save_to_excel reports failures as a message string, not a path.
        raise gr.Error(result)
    return result
# Gradio UI setup: two inputs (resume PDFs, job description) mapped onto
# gradio_interface, one downloadable file as output.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Files(label="Upload multiple Resumes", type="filepath"),  # Accept multiple file uploads
        gr.Textbox(label="Job Description", lines=5, placeholder="Enter the job description here..."),
    ],
    outputs=gr.File(label="Download Results"),  # Provide the output file
    description="Upload multiple resume PDFs and provide a job description to analyze the resumes and get an Excel file with the results.",
)

# Launch the Gradio interface
iface.launch()