chelscelis
committed on
Commit
·
3cdb53b
1
Parent(s):
f9848af
Upload 2 files
Browse files
app.py
CHANGED
@@ -59,6 +59,7 @@ with tab2:
|
|
59 |
st.divider()
|
60 |
st.header('Output')
|
61 |
resumeClf = pd.read_excel(uploadedResumeClf)
|
|
|
62 |
if 'Resume' in resumeClf.columns:
|
63 |
resumeClf = classifyResumes(resumeClf)
|
64 |
with st.expander('View Bar Chart'):
|
@@ -98,6 +99,7 @@ with tab3:
|
|
98 |
st.header('Output')
|
99 |
jobDescriptionRnk = uploadedJobDescriptionRnk.read().decode('utf-8')
|
100 |
resumeRnk = pd.read_excel(uploadedResumeRnk)
|
|
|
101 |
if 'Resume' in resumeRnk.columns:
|
102 |
resumeRnk = rankResumes(jobDescriptionRnk, resumeRnk)
|
103 |
with st.expander('View Job Description'):
|
|
|
59 |
st.divider()
|
60 |
st.header('Output')
|
61 |
resumeClf = pd.read_excel(uploadedResumeClf)
|
62 |
+
|
63 |
if 'Resume' in resumeClf.columns:
|
64 |
resumeClf = classifyResumes(resumeClf)
|
65 |
with st.expander('View Bar Chart'):
|
|
|
99 |
st.header('Output')
|
100 |
jobDescriptionRnk = uploadedJobDescriptionRnk.read().decode('utf-8')
|
101 |
resumeRnk = pd.read_excel(uploadedResumeRnk)
|
102 |
+
|
103 |
if 'Resume' in resumeRnk.columns:
|
104 |
resumeRnk = rankResumes(jobDescriptionRnk, resumeRnk)
|
105 |
with st.expander('View Job Description'):
|
utils.py
CHANGED
@@ -40,7 +40,6 @@ def addZeroFeatures(matrix):
|
|
40 |
|
41 |
@st.cache_data(max_entries = 1, show_spinner = False)
|
42 |
def classifyResumes(df):
|
43 |
-
# WITH PROGRESS BAR
|
44 |
progressBar = st.progress(0)
|
45 |
progressBar.progress(0, text = "Preprocessing data ...")
|
46 |
startTime = time.time()
|
@@ -72,29 +71,6 @@ def classifyResumes(df):
|
|
72 |
st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
|
73 |
return df
|
74 |
|
75 |
-
# NO LOADING WIDGET
|
76 |
-
# startTime = time.time()
|
77 |
-
# df['cleanedResume'] = df.Resume.apply(lambda x: performStemming(x))
|
78 |
-
# resumeText = df['cleanedResume'].values
|
79 |
-
# vectorizer = loadTfidfVectorizer()
|
80 |
-
# wordFeatures = vectorizer.transform(resumeText)
|
81 |
-
# wordFeaturesWithZeros = addZeroFeatures(wordFeatures)
|
82 |
-
# finalFeatures = dimensionalityReduction(wordFeaturesWithZeros)
|
83 |
-
# knn = loadKnnModel()
|
84 |
-
# predictedCategories = knn.predict(finalFeatures)
|
85 |
-
# le = loadLabelEncoder()
|
86 |
-
# df['Industry Category'] = le.inverse_transform(predictedCategories)
|
87 |
-
# df['Industry Category'] = pd.Categorical(df['Industry Category'])
|
88 |
-
# df.drop(columns = ['cleanedResume'], inplace = True)
|
89 |
-
# endTime = time.time()
|
90 |
-
# elapsedSeconds = endTime - startTime
|
91 |
-
# elapsedTime = datetime.timedelta(seconds = elapsedSeconds)
|
92 |
-
# hours, remainder = divmod(elapsedTime.seconds, 3600)
|
93 |
-
# minutes, seconds = divmod(remainder, 60)
|
94 |
-
# elapsedTimeStr = f"{hours} hr {minutes} min {seconds} sec"
|
95 |
-
# st.info(f'Finished in {elapsedTimeStr}')
|
96 |
-
# return df
|
97 |
-
|
98 |
def clickClassify():
|
99 |
st.session_state.processClf = True
|
100 |
|
@@ -283,7 +259,6 @@ model = loadModel()
|
|
283 |
|
284 |
@st.cache_data(max_entries = 1, show_spinner = False)
|
285 |
def rankResumes(text, df):
|
286 |
-
# WITH PROGRESS BAR
|
287 |
progressBar = st.progress(0)
|
288 |
progressBar.progress(0, text = "Preprocessing data ...")
|
289 |
startTime = time.time()
|
@@ -326,156 +301,6 @@ def rankResumes(text, df):
|
|
326 |
st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
|
327 |
return df
|
328 |
|
329 |
-
# NO LOADING WIDGET
|
330 |
-
# startTime = time.time()
|
331 |
-
# jobDescriptionText = performLemmatization(text)
|
332 |
-
# df['cleanedResume'] = df['Resume'].apply(lambda x: performLemmatization(x))
|
333 |
-
# documents = [jobDescriptionText] + df['cleanedResume'].tolist()
|
334 |
-
# dictionary = Dictionary(documents)
|
335 |
-
# tfidf = TfidfModel(dictionary = dictionary)
|
336 |
-
# similarityIndex = WordEmbeddingSimilarityIndex(model)
|
337 |
-
# similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
|
338 |
-
# query = tfidf[dictionary.doc2bow(jobDescriptionText)]
|
339 |
-
# index = SoftCosineSimilarity(
|
340 |
-
# tfidf[[dictionary.doc2bow(resume) for resume in df['cleanedResume']]],
|
341 |
-
# similarityMatrix
|
342 |
-
# )
|
343 |
-
# similarities = index[query]
|
344 |
-
# df['Similarity Score'] = similarities
|
345 |
-
# df.sort_values(by = 'Similarity Score', ascending = False, inplace = True)
|
346 |
-
# df.drop(columns = ['cleanedResume'], inplace = True)
|
347 |
-
# endTime = time.time()
|
348 |
-
# elapsedSeconds = endTime - startTime
|
349 |
-
# elapsedTime = datetime.timedelta(seconds = elapsedSeconds)
|
350 |
-
# hours, remainder = divmod(elapsedTime.seconds, 3600)
|
351 |
-
# minutes, seconds = divmod(remainder, 60)
|
352 |
-
# elapsedTimeStr = f"{hours} hr {minutes} min {seconds} sec"
|
353 |
-
# st.info(f'Finished in {elapsedTimeStr}')
|
354 |
-
# return df
|
355 |
-
|
356 |
-
# TF-IDF + LSA + COSSIM
|
357 |
-
# from sklearn.decomposition import TruncatedSVD
|
358 |
-
# import math
|
359 |
-
# def resumesRank(jobDescriptionRnk, resumeRnk):
|
360 |
-
# jobDescriptionRnk = preprocessing(jobDescriptionRnk)
|
361 |
-
# resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing(x))
|
362 |
-
# resumes = resumeRnk['cleanedResume'].values
|
363 |
-
# # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words = 'english')
|
364 |
-
# # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True)
|
365 |
-
# # tfidfVectorizer = TfidfVectorizer(stop_words = 'english')
|
366 |
-
# tfidfVectorizer = TfidfVectorizer()
|
367 |
-
# tfidfMatrix = tfidfVectorizer.fit_transform([jobDescriptionRnk] + list(resumes))
|
368 |
-
# num_features = len(tfidfVectorizer.get_feature_names_out())
|
369 |
-
# st.write(f"Number of TF-IDF Features: {num_features}")
|
370 |
-
# nComponents = math.ceil(len(resumes) * 0.55)
|
371 |
-
# # nComponents = math.ceil(num_features * 0.01)
|
372 |
-
# # nComponents = 5
|
373 |
-
# st.write(nComponents)
|
374 |
-
# # nComponents = len(resumes)
|
375 |
-
# lsa = TruncatedSVD(n_components=nComponents)
|
376 |
-
# lsaMatrix = lsa.fit_transform(tfidfMatrix)
|
377 |
-
# similarityScores = cosine_similarity(lsaMatrix[0:1], lsaMatrix[1:])
|
378 |
-
# resumeRnk['Similarity Score (%)'] = similarityScores[0] * 100
|
379 |
-
# resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
|
380 |
-
# del resumeRnk['cleanedResume']
|
381 |
-
# return resumeRnk
|
382 |
-
|
383 |
-
# 1 BY 1 SOFT COSSIM
|
384 |
-
# def resumesRank(jobDescriptionRnk, resumeRnk):
|
385 |
-
# jobDescriptionText = preprocessing2(jobDescriptionRnk)
|
386 |
-
# resumeRnk['cleanedResume'] = resumeRnk['Resume'].apply(lambda x: preprocessing2(x))
|
387 |
-
# similarityscore = []
|
388 |
-
# for resume in resumeRnk['cleanedResume']:
|
389 |
-
# documents = [jobDescriptionText, resume]
|
390 |
-
# dictionary = Dictionary(documents)
|
391 |
-
# documentBow = [dictionary.doc2bow(doc) for doc in documents]
|
392 |
-
# tfidf = TfidfModel(documentBow, dictionary=dictionary)
|
393 |
-
# similarityIndex = WordEmbeddingSimilarityIndex(model)
|
394 |
-
# similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
|
395 |
-
# # similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary)
|
396 |
-
# value = tfidf[dictionary.doc2bow(resume)]
|
397 |
-
# # value = dictionary.doc2bow(jobDescriptionText)
|
398 |
-
# index = SoftCosineSimilarity(
|
399 |
-
# # tfidf[[dictionary.doc2bow(resume)]],
|
400 |
-
# tfidf[[dictionary.doc2bow(jobDescriptionText)]],
|
401 |
-
# # [dictionary.doc2bow(resume) for resume in resumeRnk['cleanedResume']],
|
402 |
-
# similarityMatrix,
|
403 |
-
# )
|
404 |
-
# similarities = index[value]
|
405 |
-
# similarityscore.append(similarities)
|
406 |
-
# print(similarityscore)
|
407 |
-
# resumeRnk['Similarity Score'] = similarityscore
|
408 |
-
# resumeRnk.sort_values(by='Similarity Score', ascending=False, inplace=True)
|
409 |
-
# resumeRnk.drop(columns=['cleanedResume'], inplace=True)
|
410 |
-
# return resumeRnk
|
411 |
-
#
|
412 |
-
# TF-IDF SCORE + WORD EMBEDDINGS SCORE
|
413 |
-
# def resumesRank(jobDescriptionRnk, resumeRnk):
|
414 |
-
# def get_word_embedding(text):
|
415 |
-
# words = text.split()
|
416 |
-
# valid_words = [word for word in text.split() if word in model]
|
417 |
-
# if valid_words:
|
418 |
-
# return np.mean([model[word] for word in valid_words], axis=0)
|
419 |
-
# else:
|
420 |
-
# return np.zeros(model.vector_size)
|
421 |
-
# jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
|
422 |
-
# resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
|
423 |
-
# tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words='english')
|
424 |
-
# jobTfidf = tfidfVectorizer.fit_transform([jobDescriptionRnk])
|
425 |
-
# jobDescriptionEmbedding = get_word_embedding(jobDescriptionRnk)
|
426 |
-
# resumeSimilarities = []
|
427 |
-
# for resumeContent in resumeRnk['cleanedResume']:
|
428 |
-
# resumeEmbedding = get_word_embedding(resumeContent)
|
429 |
-
# similarityFastText = cosine_similarity([jobDescriptionEmbedding], [resumeEmbedding])[0][0]
|
430 |
-
# similarityTFIDF = cosine_similarity(jobTfidf, tfidfVectorizer.transform([resumeContent]))[0][0]
|
431 |
-
# similarity = (0.6 * similarityTFIDF) + (0.4 * similarityFastText)
|
432 |
-
# final_similarity = similarity * 100
|
433 |
-
# resumeSimilarities.append(final_similarity)
|
434 |
-
# resumeRnk['Similarity Score (%)'] = resumeSimilarities
|
435 |
-
# resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
|
436 |
-
# del resumeRnk['cleanedResume']
|
437 |
-
# return resumeRnk
|
438 |
-
|
439 |
-
# WORD EMBEDDINGS + COSSIM
|
440 |
-
# def resumesRank(jobDescriptionRnk, resumeRnk):
|
441 |
-
# def get_word_embedding(text):
|
442 |
-
# words = text.split()
|
443 |
-
# valid_words = [word for word in text.split() if word in model]
|
444 |
-
# if valid_words:
|
445 |
-
# return np.mean([model[word] for word in valid_words], axis=0)
|
446 |
-
# else:
|
447 |
-
# return np.zeros(model.vector_size)
|
448 |
-
# jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
|
449 |
-
# jobDescriptionEmbedding = get_word_embedding(jobDescriptionRnk)
|
450 |
-
# resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
|
451 |
-
# resumeSimilarities = []
|
452 |
-
# for resumeContent in resumeRnk['cleanedResume']:
|
453 |
-
# resumeEmbedding = get_word_embedding(resumeContent)
|
454 |
-
# similarity = cosine_similarity([jobDescriptionEmbedding], [resumeEmbedding])[0][0]
|
455 |
-
# percentageSimilarity = similarity * 100
|
456 |
-
# resumeSimilarities.append(percentageSimilarity)
|
457 |
-
# resumeRnk['Similarity Score (%)'] = resumeSimilarities
|
458 |
-
# resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
|
459 |
-
# del resumeRnk['cleanedResume']
|
460 |
-
# return resumeRnk
|
461 |
-
|
462 |
-
# TF-IDF + COSSIM
|
463 |
-
# def resumesRank(jobDescriptionRnk, resumeRnk):
|
464 |
-
# jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
|
465 |
-
# resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
|
466 |
-
# tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words='english')
|
467 |
-
# jobTfidf = tfidfVectorizer.fit_transform([jobDescriptionRnk])
|
468 |
-
# resumeSimilarities = []
|
469 |
-
# for resumeContent in resumeRnk['cleanedResume']:
|
470 |
-
# resumeTfidf = tfidfVectorizer.transform([resumeContent])
|
471 |
-
# similarity = cosine_similarity(jobTfidf, resumeTfidf)
|
472 |
-
# percentageSimilarity = (similarity[0][0] * 100)
|
473 |
-
# resumeSimilarities.append(percentageSimilarity)
|
474 |
-
# resumeRnk['Similarity Score (%)'] = resumeSimilarities
|
475 |
-
# resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
|
476 |
-
# del resumeRnk['cleanedResume']
|
477 |
-
# return resumeRnk
|
478 |
-
|
479 |
def writeGettingStarted():
|
480 |
st.write("""
|
481 |
## Hello, Welcome!
|
@@ -500,6 +325,11 @@ def writeGettingStarted():
|
|
500 |
The organization of columns is up to you but ensure that the "Resume" column is present.
|
501 |
The values under this column should include all the relevant details for each resume.
|
502 |
""")
|
|
|
|
|
|
|
|
|
|
|
503 |
st.divider()
|
504 |
st.write("""
|
505 |
## Demo Walkthrough
|
|
|
40 |
|
41 |
@st.cache_data(max_entries = 1, show_spinner = False)
|
42 |
def classifyResumes(df):
|
|
|
43 |
progressBar = st.progress(0)
|
44 |
progressBar.progress(0, text = "Preprocessing data ...")
|
45 |
startTime = time.time()
|
|
|
71 |
st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
|
72 |
return df
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
def clickClassify():
|
75 |
st.session_state.processClf = True
|
76 |
|
|
|
259 |
|
260 |
@st.cache_data(max_entries = 1, show_spinner = False)
|
261 |
def rankResumes(text, df):
|
|
|
262 |
progressBar = st.progress(0)
|
263 |
progressBar.progress(0, text = "Preprocessing data ...")
|
264 |
startTime = time.time()
|
|
|
301 |
st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
|
302 |
return df
|
303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
304 |
def writeGettingStarted():
|
305 |
st.write("""
|
306 |
## Hello, Welcome!
|
|
|
325 |
The organization of columns is up to you but ensure that the "Resume" column is present.
|
326 |
The values under this column should include all the relevant details for each resume.
|
327 |
""")
|
328 |
+
st.info("""
|
329 |
+
##### NOTE:
|
330 |
+
- If the "Resume" column is not present, the classification/ranking process will not be executed.
|
331 |
+
- If there are multiple "Resume" columns, the first occurrence will be taken into account while the remaining duplicates are given a different column name.
|
332 |
+
""")
|
333 |
st.divider()
|
334 |
st.write("""
|
335 |
## Demo Walkthrough
|