chelscelis committed on
Commit
3cdb53b
·
1 Parent(s): f9848af

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +2 -0
  2. utils.py +5 -175
app.py CHANGED
@@ -59,6 +59,7 @@ with tab2:
59
  st.divider()
60
  st.header('Output')
61
  resumeClf = pd.read_excel(uploadedResumeClf)
 
62
  if 'Resume' in resumeClf.columns:
63
  resumeClf = classifyResumes(resumeClf)
64
  with st.expander('View Bar Chart'):
@@ -98,6 +99,7 @@ with tab3:
98
  st.header('Output')
99
  jobDescriptionRnk = uploadedJobDescriptionRnk.read().decode('utf-8')
100
  resumeRnk = pd.read_excel(uploadedResumeRnk)
 
101
  if 'Resume' in resumeRnk.columns:
102
  resumeRnk = rankResumes(jobDescriptionRnk, resumeRnk)
103
  with st.expander('View Job Description'):
 
59
  st.divider()
60
  st.header('Output')
61
  resumeClf = pd.read_excel(uploadedResumeClf)
62
+
63
  if 'Resume' in resumeClf.columns:
64
  resumeClf = classifyResumes(resumeClf)
65
  with st.expander('View Bar Chart'):
 
99
  st.header('Output')
100
  jobDescriptionRnk = uploadedJobDescriptionRnk.read().decode('utf-8')
101
  resumeRnk = pd.read_excel(uploadedResumeRnk)
102
+
103
  if 'Resume' in resumeRnk.columns:
104
  resumeRnk = rankResumes(jobDescriptionRnk, resumeRnk)
105
  with st.expander('View Job Description'):
utils.py CHANGED
@@ -40,7 +40,6 @@ def addZeroFeatures(matrix):
40
 
41
  @st.cache_data(max_entries = 1, show_spinner = False)
42
  def classifyResumes(df):
43
- # WITH PROGRESS BAR
44
  progressBar = st.progress(0)
45
  progressBar.progress(0, text = "Preprocessing data ...")
46
  startTime = time.time()
@@ -72,29 +71,6 @@ def classifyResumes(df):
72
  st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
73
  return df
74
 
75
- # NO LOADING WIDGET
76
- # startTime = time.time()
77
- # df['cleanedResume'] = df.Resume.apply(lambda x: performStemming(x))
78
- # resumeText = df['cleanedResume'].values
79
- # vectorizer = loadTfidfVectorizer()
80
- # wordFeatures = vectorizer.transform(resumeText)
81
- # wordFeaturesWithZeros = addZeroFeatures(wordFeatures)
82
- # finalFeatures = dimensionalityReduction(wordFeaturesWithZeros)
83
- # knn = loadKnnModel()
84
- # predictedCategories = knn.predict(finalFeatures)
85
- # le = loadLabelEncoder()
86
- # df['Industry Category'] = le.inverse_transform(predictedCategories)
87
- # df['Industry Category'] = pd.Categorical(df['Industry Category'])
88
- # df.drop(columns = ['cleanedResume'], inplace = True)
89
- # endTime = time.time()
90
- # elapsedSeconds = endTime - startTime
91
- # elapsedTime = datetime.timedelta(seconds = elapsedSeconds)
92
- # hours, remainder = divmod(elapsedTime.seconds, 3600)
93
- # minutes, seconds = divmod(remainder, 60)
94
- # elapsedTimeStr = f"{hours} hr {minutes} min {seconds} sec"
95
- # st.info(f'Finished in {elapsedTimeStr}')
96
- # return df
97
-
98
  def clickClassify():
99
  st.session_state.processClf = True
100
 
@@ -283,7 +259,6 @@ model = loadModel()
283
 
284
  @st.cache_data(max_entries = 1, show_spinner = False)
285
  def rankResumes(text, df):
286
- # WITH PROGRESS BAR
287
  progressBar = st.progress(0)
288
  progressBar.progress(0, text = "Preprocessing data ...")
289
  startTime = time.time()
@@ -326,156 +301,6 @@ def rankResumes(text, df):
326
  st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
327
  return df
328
 
329
- # NO LOADING WIDGET
330
- # startTime = time.time()
331
- # jobDescriptionText = performLemmatization(text)
332
- # df['cleanedResume'] = df['Resume'].apply(lambda x: performLemmatization(x))
333
- # documents = [jobDescriptionText] + df['cleanedResume'].tolist()
334
- # dictionary = Dictionary(documents)
335
- # tfidf = TfidfModel(dictionary = dictionary)
336
- # similarityIndex = WordEmbeddingSimilarityIndex(model)
337
- # similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
338
- # query = tfidf[dictionary.doc2bow(jobDescriptionText)]
339
- # index = SoftCosineSimilarity(
340
- # tfidf[[dictionary.doc2bow(resume) for resume in df['cleanedResume']]],
341
- # similarityMatrix
342
- # )
343
- # similarities = index[query]
344
- # df['Similarity Score'] = similarities
345
- # df.sort_values(by = 'Similarity Score', ascending = False, inplace = True)
346
- # df.drop(columns = ['cleanedResume'], inplace = True)
347
- # endTime = time.time()
348
- # elapsedSeconds = endTime - startTime
349
- # elapsedTime = datetime.timedelta(seconds = elapsedSeconds)
350
- # hours, remainder = divmod(elapsedTime.seconds, 3600)
351
- # minutes, seconds = divmod(remainder, 60)
352
- # elapsedTimeStr = f"{hours} hr {minutes} min {seconds} sec"
353
- # st.info(f'Finished in {elapsedTimeStr}')
354
- # return df
355
-
356
- # TF-IDF + LSA + COSSIM
357
- # from sklearn.decomposition import TruncatedSVD
358
- # import math
359
- # def resumesRank(jobDescriptionRnk, resumeRnk):
360
- # jobDescriptionRnk = preprocessing(jobDescriptionRnk)
361
- # resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing(x))
362
- # resumes = resumeRnk['cleanedResume'].values
363
- # # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words = 'english')
364
- # # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True)
365
- # # tfidfVectorizer = TfidfVectorizer(stop_words = 'english')
366
- # tfidfVectorizer = TfidfVectorizer()
367
- # tfidfMatrix = tfidfVectorizer.fit_transform([jobDescriptionRnk] + list(resumes))
368
- # num_features = len(tfidfVectorizer.get_feature_names_out())
369
- # st.write(f"Number of TF-IDF Features: {num_features}")
370
- # nComponents = math.ceil(len(resumes) * 0.55)
371
- # # nComponents = math.ceil(num_features * 0.01)
372
- # # nComponents = 5
373
- # st.write(nComponents)
374
- # # nComponents = len(resumes)
375
- # lsa = TruncatedSVD(n_components=nComponents)
376
- # lsaMatrix = lsa.fit_transform(tfidfMatrix)
377
- # similarityScores = cosine_similarity(lsaMatrix[0:1], lsaMatrix[1:])
378
- # resumeRnk['Similarity Score (%)'] = similarityScores[0] * 100
379
- # resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
380
- # del resumeRnk['cleanedResume']
381
- # return resumeRnk
382
-
383
- # 1 BY 1 SOFT COSSIM
384
- # def resumesRank(jobDescriptionRnk, resumeRnk):
385
- # jobDescriptionText = preprocessing2(jobDescriptionRnk)
386
- # resumeRnk['cleanedResume'] = resumeRnk['Resume'].apply(lambda x: preprocessing2(x))
387
- # similarityscore = []
388
- # for resume in resumeRnk['cleanedResume']:
389
- # documents = [jobDescriptionText, resume]
390
- # dictionary = Dictionary(documents)
391
- # documentBow = [dictionary.doc2bow(doc) for doc in documents]
392
- # tfidf = TfidfModel(documentBow, dictionary=dictionary)
393
- # similarityIndex = WordEmbeddingSimilarityIndex(model)
394
- # similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
395
- # # similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary)
396
- # value = tfidf[dictionary.doc2bow(resume)]
397
- # # value = dictionary.doc2bow(jobDescriptionText)
398
- # index = SoftCosineSimilarity(
399
- # # tfidf[[dictionary.doc2bow(resume)]],
400
- # tfidf[[dictionary.doc2bow(jobDescriptionText)]],
401
- # # [dictionary.doc2bow(resume) for resume in resumeRnk['cleanedResume']],
402
- # similarityMatrix,
403
- # )
404
- # similarities = index[value]
405
- # similarityscore.append(similarities)
406
- # print(similarityscore)
407
- # resumeRnk['Similarity Score'] = similarityscore
408
- # resumeRnk.sort_values(by='Similarity Score', ascending=False, inplace=True)
409
- # resumeRnk.drop(columns=['cleanedResume'], inplace=True)
410
- # return resumeRnk
411
- #
412
- # TF-IDF SCORE + WORD EMBEDDINGS SCORE
413
- # def resumesRank(jobDescriptionRnk, resumeRnk):
414
- # def get_word_embedding(text):
415
- # words = text.split()
416
- # valid_words = [word for word in text.split() if word in model]
417
- # if valid_words:
418
- # return np.mean([model[word] for word in valid_words], axis=0)
419
- # else:
420
- # return np.zeros(model.vector_size)
421
- # jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
422
- # resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
423
- # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words='english')
424
- # jobTfidf = tfidfVectorizer.fit_transform([jobDescriptionRnk])
425
- # jobDescriptionEmbedding = get_word_embedding(jobDescriptionRnk)
426
- # resumeSimilarities = []
427
- # for resumeContent in resumeRnk['cleanedResume']:
428
- # resumeEmbedding = get_word_embedding(resumeContent)
429
- # similarityFastText = cosine_similarity([jobDescriptionEmbedding], [resumeEmbedding])[0][0]
430
- # similarityTFIDF = cosine_similarity(jobTfidf, tfidfVectorizer.transform([resumeContent]))[0][0]
431
- # similarity = (0.6 * similarityTFIDF) + (0.4 * similarityFastText)
432
- # final_similarity = similarity * 100
433
- # resumeSimilarities.append(final_similarity)
434
- # resumeRnk['Similarity Score (%)'] = resumeSimilarities
435
- # resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
436
- # del resumeRnk['cleanedResume']
437
- # return resumeRnk
438
-
439
- # WORD EMBEDDINGS + COSSIM
440
- # def resumesRank(jobDescriptionRnk, resumeRnk):
441
- # def get_word_embedding(text):
442
- # words = text.split()
443
- # valid_words = [word for word in text.split() if word in model]
444
- # if valid_words:
445
- # return np.mean([model[word] for word in valid_words], axis=0)
446
- # else:
447
- # return np.zeros(model.vector_size)
448
- # jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
449
- # jobDescriptionEmbedding = get_word_embedding(jobDescriptionRnk)
450
- # resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
451
- # resumeSimilarities = []
452
- # for resumeContent in resumeRnk['cleanedResume']:
453
- # resumeEmbedding = get_word_embedding(resumeContent)
454
- # similarity = cosine_similarity([jobDescriptionEmbedding], [resumeEmbedding])[0][0]
455
- # percentageSimilarity = similarity * 100
456
- # resumeSimilarities.append(percentageSimilarity)
457
- # resumeRnk['Similarity Score (%)'] = resumeSimilarities
458
- # resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
459
- # del resumeRnk['cleanedResume']
460
- # return resumeRnk
461
-
462
- # TF-IDF + COSSIM
463
- # def resumesRank(jobDescriptionRnk, resumeRnk):
464
- # jobDescriptionRnk = preprocessing2(jobDescriptionRnk)
465
- # resumeRnk['cleanedResume'] = resumeRnk.Resume.apply(lambda x: preprocessing2(x))
466
- # tfidfVectorizer = TfidfVectorizer(sublinear_tf = True, stop_words='english')
467
- # jobTfidf = tfidfVectorizer.fit_transform([jobDescriptionRnk])
468
- # resumeSimilarities = []
469
- # for resumeContent in resumeRnk['cleanedResume']:
470
- # resumeTfidf = tfidfVectorizer.transform([resumeContent])
471
- # similarity = cosine_similarity(jobTfidf, resumeTfidf)
472
- # percentageSimilarity = (similarity[0][0] * 100)
473
- # resumeSimilarities.append(percentageSimilarity)
474
- # resumeRnk['Similarity Score (%)'] = resumeSimilarities
475
- # resumeRnk = resumeRnk.sort_values(by='Similarity Score (%)', ascending=False)
476
- # del resumeRnk['cleanedResume']
477
- # return resumeRnk
478
-
479
  def writeGettingStarted():
480
  st.write("""
481
  ## Hello, Welcome!
@@ -500,6 +325,11 @@ def writeGettingStarted():
500
  The organization of columns is up to you but ensure that the "Resume" column is present.
501
  The values under this column should include all the relevant details for each resume.
502
  """)
 
 
 
 
 
503
  st.divider()
504
  st.write("""
505
  ## Demo Walkthrough
 
40
 
41
  @st.cache_data(max_entries = 1, show_spinner = False)
42
  def classifyResumes(df):
 
43
  progressBar = st.progress(0)
44
  progressBar.progress(0, text = "Preprocessing data ...")
45
  startTime = time.time()
 
71
  st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
72
  return df
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def clickClassify():
75
  st.session_state.processClf = True
76
 
 
259
 
260
  @st.cache_data(max_entries = 1, show_spinner = False)
261
  def rankResumes(text, df):
 
262
  progressBar = st.progress(0)
263
  progressBar.progress(0, text = "Preprocessing data ...")
264
  startTime = time.time()
 
301
  st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
302
  return df
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  def writeGettingStarted():
305
  st.write("""
306
  ## Hello, Welcome!
 
325
  The organization of columns is up to you but ensure that the "Resume" column is present.
326
  The values under this column should include all the relevant details for each resume.
327
  """)
328
+ st.info("""
329
+ ##### NOTE:
330
+ - If the "Resume" column is not present, the classification/ranking process will not be executed.
331
+ - If there are multiple "Resume" columns, the first occurrence will be taken into account while the remaining duplicates are given a different column name.
332
+ """)
333
  st.divider()
334
  st.write("""
335
  ## Demo Walkthrough