chelscelis committed on
Commit
0d28eff
·
1 Parent(s): 77e86cf

Upload 2 files

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -0
  2. utils.py +9 -6
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  altair==5.1.1
 
2
  attrs==23.1.0
3
  blinker==1.6.2
4
  cachetools==5.3.1
 
1
  altair==5.1.1
2
+ annoy==1.17.3
3
  attrs==23.1.0
4
  blinker==1.6.2
5
  cachetools==5.3.1
utils.py CHANGED
@@ -11,6 +11,7 @@ import time
11
  from gensim.corpora import Dictionary
12
  from gensim.models import KeyedVectors, TfidfModel
13
  from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
 
14
  from io import BytesIO
15
  from nltk import pos_tag, word_tokenize
16
  from nltk.corpus import stopwords, wordnet
@@ -274,9 +275,8 @@ def performStemming(text):
274
 
275
  @st.cache_data
276
  def loadModel():
277
- # model_path = '~/Projects/hau/csstudy/final-csstudy/wiki-news-300d-1M-subword.vec'
278
  model_path = 'wiki-news-300d-1M-subword.vec'
279
- model = KeyedVectors.load_word2vec_format(model_path, limit = 100000)
280
  return model
281
 
282
  model = loadModel()
@@ -295,7 +295,10 @@ def rankResumes(text, df):
295
  progressBar.progress(25, text = "Creating a TF-IDF model ...")
296
  tfidf = TfidfModel(dictionary = dictionary)
297
  progressBar.progress(38, text = "Creating a Similarity Index...")
298
- similarityIndex = WordEmbeddingSimilarityIndex(model)
 
 
 
299
  progressBar.progress(50, text = "Creating a Similarity Matrix...")
300
  similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
301
  progressBar.progress(63, text = "Setting up job description as the query ...")
@@ -307,8 +310,8 @@ def rankResumes(text, df):
307
  )
308
  similarities = index[query]
309
  progressBar.progress(88, text = "Finishing touches ...")
310
- df['Similarity Score'] = similarities
311
- df['Rank'] = df['Similarity Score'].rank(ascending=False, method='dense').astype(int)
312
  df.sort_values(by='Rank', inplace=True)
313
  df.drop(columns = ['cleanedResume'], inplace = True)
314
  endTime = time.time()
@@ -317,7 +320,7 @@ def rankResumes(text, df):
317
  minutes, _ = divmod(remainder, 60)
318
  secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
319
  elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'
320
- progressBar.progress(100, text = f'Classification Complete!')
321
  time.sleep(1)
322
  progressBar.empty()
323
  st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
 
11
  from gensim.corpora import Dictionary
12
  from gensim.models import KeyedVectors, TfidfModel
13
  from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
14
+ from gensim.similarities.annoy import AnnoyIndexer
15
  from io import BytesIO
16
  from nltk import pos_tag, word_tokenize
17
  from nltk.corpus import stopwords, wordnet
 
275
 
276
  @st.cache_data
277
  def loadModel():
 
278
  model_path = 'wiki-news-300d-1M-subword.vec'
279
+ model = KeyedVectors.load_word2vec_format(model_path)
280
  return model
281
 
282
  model = loadModel()
 
295
  progressBar.progress(25, text = "Creating a TF-IDF model ...")
296
  tfidf = TfidfModel(dictionary = dictionary)
297
  progressBar.progress(38, text = "Creating a Similarity Index...")
298
+ words = [word for word, count in dictionary.most_common()]
299
+ wordVectors = model.vectors_for_all(words, allow_inference = False)
300
+ indexer = AnnoyIndexer(wordVectors, num_trees = 10)
301
+ similarityIndex = WordEmbeddingSimilarityIndex(wordVectors, kwargs = {'indexer': indexer})
302
  progressBar.progress(50, text = "Creating a Similarity Matrix...")
303
  similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
304
  progressBar.progress(63, text = "Setting up job description as the query ...")
 
310
  )
311
  similarities = index[query]
312
  progressBar.progress(88, text = "Finishing touches ...")
313
+ df['Similarity Score (-1 to 1)'] = similarities
314
+ df['Rank'] = df['Similarity Score (-1 to 1)'].rank(ascending=False, method='dense').astype(int)
315
  df.sort_values(by='Rank', inplace=True)
316
  df.drop(columns = ['cleanedResume'], inplace = True)
317
  endTime = time.time()
 
320
  minutes, _ = divmod(remainder, 60)
321
  secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
322
  elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'
323
+ progressBar.progress(100, text = f'Ranking Complete!')
324
  time.sleep(1)
325
  progressBar.empty()
326
  st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')