Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,23 +1,17 @@
|
|
1 |
-
import os
|
2 |
-
import pandas as pd
|
3 |
-
import google.generativeai as genai
|
4 |
-
import PyPDF2 as pdf
|
5 |
-
import io
|
6 |
import re
|
7 |
-
import
|
8 |
-
|
|
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
11 |
|
12 |
-
#
|
13 |
-
api_key =
|
14 |
-
if not api_key:
|
15 |
-
raise ValueError("API key not found. Please set GOOGLE_API_KEY in your Hugging Face Space secrets.")
|
16 |
-
|
17 |
-
# Initialize the generative AI model
|
18 |
genai.configure(api_key=api_key)
|
19 |
|
20 |
-
#
|
21 |
skill_extractor = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
|
22 |
education_extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
|
23 |
|
@@ -30,41 +24,31 @@ def input_pdf_text(uploaded_file):
|
|
30 |
text += page.extract_text()
|
31 |
return text
|
32 |
|
33 |
-
# Extract
|
34 |
-
def extract_name_from_model_response(response_text):
|
35 |
-
match = re.search(r"Candidate Name:\s*(.*)", response_text)
|
36 |
-
if match:
|
37 |
-
return match.group(1)
|
38 |
-
return "Not Available"
|
39 |
-
|
40 |
-
# Extract email and phone numbers using regex
|
41 |
def extract_contact_info(resume_text):
|
42 |
email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", resume_text)
|
|
|
43 |
email = email_match.group(0) if email_match else "Not Available"
|
|
|
|
|
44 |
|
45 |
-
|
46 |
-
contact = contact_match.group(0) if contact_match else "Not Available"
|
47 |
-
|
48 |
-
return email, contact
|
49 |
-
|
50 |
-
# Extract skills using NER model
|
51 |
def extract_skills(resume_text):
|
52 |
ner_results = skill_extractor(resume_text)
|
53 |
skills = [entity['word'] for entity in ner_results if entity['entity_group'] == 'SKILL']
|
54 |
return ", ".join(skills) if skills else "Not Available"
|
55 |
|
56 |
-
# Extract education
|
57 |
def extract_education(resume_text):
|
58 |
ner_results = education_extractor(resume_text)
|
59 |
education_entities = [entity['word'] for entity in ner_results if entity['entity_group'] == 'EDUCATION']
|
60 |
|
61 |
-
# If no direct education found,
|
62 |
if education_entities:
|
63 |
return ", ".join(education_entities)
|
64 |
else:
|
65 |
-
# Using regex to capture education background
|
66 |
edu_patterns = [
|
67 |
-
r"(Bachelor of .+|Master of .+|PhD|BSc|MSc|MBA|B.A|M.A|B.Tech|M.Tech
|
68 |
r"(University of [A-Za-z]+.*)"
|
69 |
]
|
70 |
education = []
|
@@ -74,141 +58,53 @@ def extract_education(resume_text):
|
|
74 |
|
75 |
return ", ".join(education) if education else "Not Available"
|
76 |
|
77 |
-
#
|
78 |
-
def extract_experience_years(text):
|
79 |
-
years = 0
|
80 |
-
patterns = [
|
81 |
-
r"(\d{4})\s?[-to]+\s?(\d{4})", # From year to year
|
82 |
-
r"(\d+) years", # Exact mention of years
|
83 |
-
r"since (\d{4})", # Mentions "since"
|
84 |
-
r"(\d+)\s?[\-–]\s?(\d+)", # Handles year ranges with hyphens (e.g., 2015-2020)
|
85 |
-
r"(\d+)\s?[\–]\s?present", # Present with range (e.g., 2019–present)
|
86 |
-
]
|
87 |
-
|
88 |
-
for pattern in patterns:
|
89 |
-
matches = re.findall(pattern, text)
|
90 |
-
for match in matches:
|
91 |
-
if len(match) == 2:
|
92 |
-
start_year = int(match[0])
|
93 |
-
end_year = int(match[1])
|
94 |
-
years += end_year - start_year
|
95 |
-
elif len(match) == 1:
|
96 |
-
years += int(match[0])
|
97 |
-
|
98 |
-
return years
|
99 |
-
|
100 |
-
# Calculate the match percentage using TF-IDF and cosine similarity
|
101 |
def calculate_match_percentage(resume_text, job_description):
|
102 |
-
# Combine resume and job description into a list
|
103 |
documents = [resume_text, job_description]
|
104 |
-
|
105 |
-
# Use TF-IDF to convert the documents into vectors
|
106 |
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
|
107 |
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
|
108 |
-
|
109 |
-
# Compute cosine similarity between resume and job description
|
110 |
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
|
|
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
# Generate the detailed analysis from the Gemini model
|
117 |
-
def get_gemini_response(input_text, job_description):
|
118 |
-
prompt = f"""
|
119 |
-
Act as an Applicant Tracking System. Analyze the resume with respect to the job description.
|
120 |
-
Candidate Details: {input_text}
|
121 |
-
Job Description: {job_description}
|
122 |
-
Please extract the following:
|
123 |
-
1. Candidate Name
|
124 |
-
2. Relevant Skills
|
125 |
-
3. Educational Background
|
126 |
-
4. Direct Team Leadership Experience (in years)
|
127 |
-
5. Direct Management Experience (in years)
|
128 |
-
6. Match percentage with the job description
|
129 |
-
7. Provide a resume summary in 5 bullet points highlighting the key details about the candidate
|
130 |
-
"""
|
131 |
model = genai.GenerativeModel('gemini-1.5-flash')
|
132 |
response = model.generate_content(prompt)
|
133 |
return response.text.strip()
|
134 |
|
135 |
-
#
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
#
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
data['Direct_Team_Leadership_Experience_Years'] = team_leadership_years if team_leadership_years > 0 else "Not Available"
|
171 |
-
data['Direct_Management_Experience_Years'] = management_years if management_years > 0 else "Not Available"
|
172 |
-
|
173 |
-
# Extract skills using the NER model
|
174 |
-
relevant_skills = extract_skills(resume_text)
|
175 |
-
data['Relevant_Skills_and_Qualifications'] = relevant_skills if relevant_skills != "Not Available" else "Not Available"
|
176 |
-
|
177 |
-
# Extract education using the NER model or regex
|
178 |
-
educational_background = extract_education(resume_text)
|
179 |
-
data['Educational_Background'] = educational_background if educational_background != "Not Available" else "Not Available"
|
180 |
-
|
181 |
-
# Calculate match percentage dynamically
|
182 |
-
match_percentage = calculate_match_percentage(resume_text, job_description)
|
183 |
-
data['Match_Percentage'] = match_percentage
|
184 |
-
|
185 |
-
# Calculate Job Description Match Score dynamically (based on match percentage)
|
186 |
-
if match_percentage >= 80:
|
187 |
-
job_description_match_score = "High"
|
188 |
-
elif match_percentage >= 60:
|
189 |
-
job_description_match_score = "Medium"
|
190 |
-
else:
|
191 |
-
job_description_match_score = "Low"
|
192 |
-
data['Job_Description_Match_Score'] = job_description_match_score
|
193 |
-
|
194 |
-
# Extract resume summary from the model response (5 bullet points)
|
195 |
-
bullet_summary = "Resume Summary (5 Bullet Points):\n"
|
196 |
-
bullets = response_text.split("\n")[-5:] # Assuming the last 5 lines are the bullet points
|
197 |
-
for idx, bullet in enumerate(bullets):
|
198 |
-
bullet_summary += f"{idx+1}. {bullet}\n"
|
199 |
-
data['Resume_Summary'] = bullet_summary
|
200 |
-
|
201 |
-
# Prepare DataFrame for output
|
202 |
-
df = pd.DataFrame([data])
|
203 |
-
|
204 |
-
# Show the result table
|
205 |
-
st.write(df)
|
206 |
-
|
207 |
-
# Provide a download button for the CSV file
|
208 |
-
csv = df.to_csv(index=False)
|
209 |
-
st.download_button(
|
210 |
-
label="Download Analysis as CSV",
|
211 |
-
data=csv,
|
212 |
-
file_name="resume_analysis.csv",
|
213 |
-
mime="text/csv"
|
214 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import io
import os
import re

import google.generativeai as genai
import pandas as pd
import PyPDF2 as pdf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
|
9 |
|
10 |
+
# Initialize Google Gemini AI API for summarization.
# The key is read from the environment (e.g. a Hugging Face Space secret)
# rather than being hard-coded in source, which would leak the credential
# and ship a non-working placeholder.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set GOOGLE_API_KEY in your Hugging Face Space secrets.")
genai.configure(api_key=api_key)
|
13 |
|
14 |
+
# Set up NER pipelines for skill and education extraction
# NOTE(review): both models are standard CoNLL-style NER checkpoints, which
# emit PER/ORG/LOC/MISC entity groups; the 'SKILL' / 'EDUCATION' groups the
# extractors below filter on may never be produced — confirm model choice.
skill_extractor = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
education_extractor = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
|
17 |
|
|
|
24 |
text += page.extract_text()
|
25 |
return text
|
26 |
|
27 |
+
# Extract contact information using regular expressions (email, phone)
def extract_contact_info(resume_text):
    """Return (email, phone) found in *resume_text*.

    Either element is the string "Not Available" when no match is found.
    """
    def _first_match(pattern):
        # First occurrence of *pattern* in the resume, or the sentinel.
        found = re.search(pattern, resume_text)
        return found.group(0) if found else "Not Available"

    email = _first_match(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    phone = _first_match(r"\+?\(?\d{1,3}\)?[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{3}[-.\s]?\d{4}|\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}")
    return email, phone
|
34 |
|
35 |
+
# Extract skills using NER model or regex
def extract_skills(resume_text):
    """Return a comma-separated string of skill entities, or "Not Available".

    NOTE(review): dslim/bert-base-NER is trained on PER/ORG/LOC/MISC labels,
    so an entity_group of 'SKILL' may never appear — confirm model choice.
    """
    extracted = []
    for entity in skill_extractor(resume_text):
        if entity['entity_group'] == 'SKILL':
            extracted.append(entity['word'])
    if extracted:
        return ", ".join(extracted)
    return "Not Available"
|
40 |
|
41 |
+
# Extract education details using NER model
|
42 |
def extract_education(resume_text):
|
43 |
ner_results = education_extractor(resume_text)
|
44 |
education_entities = [entity['word'] for entity in ner_results if entity['entity_group'] == 'EDUCATION']
|
45 |
|
46 |
+
# If no direct education found, use regex patterns to capture education
|
47 |
if education_entities:
|
48 |
return ", ".join(education_entities)
|
49 |
else:
|
|
|
50 |
edu_patterns = [
|
51 |
+
r"(Bachelor of .+|Master of .+|PhD|BSc|MSc|MBA|B.A|M.A|B.Tech|M.Tech)",
|
52 |
r"(University of [A-Za-z]+.*)"
|
53 |
]
|
54 |
education = []
|
|
|
58 |
|
59 |
return ", ".join(education) if education else "Not Available"
|
60 |
|
61 |
+
# Calculate match percentage between resume and job description
def calculate_match_percentage(resume_text, job_description):
    """Score resume/job-description similarity on a 0-100 scale.

    Vectorizes both texts with TF-IDF (English stop words removed) and
    returns their cosine similarity scaled to a percentage.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform([resume_text, job_description])
    similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
    return similarity * 100  # Return percentage match
|
68 |
|
69 |
+
# Generate resume summary using Gemini API
def generate_resume_summary(resume_text):
    """Ask the Gemini model for a summary of *resume_text* and return it stripped."""
    gemini = genai.GenerativeModel('gemini-1.5-flash')
    reply = gemini.generate_content(
        f"Summarize the resume below in 5 bullet points:\n\n{resume_text}"
    )
    return reply.text.strip()
|
75 |
|
76 |
+
# Main function to process and analyze the resume and job description
def analyze_resume_and_job(resume_pdf, job_description):
    """Run the full analysis pipeline and return a one-row results DataFrame.

    Extracts contact info, skills and education from the resume PDF,
    scores it against *job_description*, and summarizes it via Gemini.
    """
    text = input_pdf_text(resume_pdf)
    email, phone = extract_contact_info(text)

    # Assemble every extracted/computed field; keys become DataFrame columns.
    row = {
        'Email': email,
        'Phone': phone,
        'Skills': extract_skills(text),
        'Education': extract_education(text),
        'Match_Percentage': calculate_match_percentage(text, job_description),
        'Resume_Summary': generate_resume_summary(text),
    }
    return pd.DataFrame([row])
|
105 |
+
|
106 |
+
# Example usage — guarded so the analysis (PDF read + API calls) only runs
# when this file is executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    resume_pdf = 'path_to_resume.pdf'  # Path to the uploaded resume PDF
    job_description = '''Insert job description here'''  # Job description text
    df = analyze_resume_and_job(resume_pdf, job_description)
    print(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|