chelscelis
committed on
Commit
·
0d28eff
1
Parent(s):
77e86cf
Upload 2 files
Browse files
- requirements.txt +1 -0
- utils.py +9 -6
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
altair==5.1.1
|
|
|
2 |
attrs==23.1.0
|
3 |
blinker==1.6.2
|
4 |
cachetools==5.3.1
|
|
|
1 |
altair==5.1.1
|
2 |
+
annoy==1.17.3
|
3 |
attrs==23.1.0
|
4 |
blinker==1.6.2
|
5 |
cachetools==5.3.1
|
utils.py
CHANGED
@@ -11,6 +11,7 @@ import time
|
|
11 |
from gensim.corpora import Dictionary
|
12 |
from gensim.models import KeyedVectors, TfidfModel
|
13 |
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
|
|
|
14 |
from io import BytesIO
|
15 |
from nltk import pos_tag, word_tokenize
|
16 |
from nltk.corpus import stopwords, wordnet
|
@@ -274,9 +275,8 @@ def performStemming(text):
|
|
274 |
|
275 |
@st.cache_data
|
276 |
def loadModel():
|
277 |
-
# model_path = '~/Projects/hau/csstudy/final-csstudy/wiki-news-300d-1M-subword.vec'
|
278 |
model_path = 'wiki-news-300d-1M-subword.vec'
|
279 |
-
model = KeyedVectors.load_word2vec_format(model_path
|
280 |
return model
|
281 |
|
282 |
model = loadModel()
|
@@ -295,7 +295,10 @@ def rankResumes(text, df):
|
|
295 |
progressBar.progress(25, text = "Creating a TF-IDF model ...")
|
296 |
tfidf = TfidfModel(dictionary = dictionary)
|
297 |
progressBar.progress(38, text = "Creating a Similarity Index...")
|
298 |
-
|
|
|
|
|
|
|
299 |
progressBar.progress(50, text = "Creating a Similarity Matrix...")
|
300 |
similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
|
301 |
progressBar.progress(63, text = "Setting up job description as the query ...")
|
@@ -307,8 +310,8 @@ def rankResumes(text, df):
|
|
307 |
)
|
308 |
similarities = index[query]
|
309 |
progressBar.progress(88, text = "Finishing touches ...")
|
310 |
-
df['Similarity Score'] = similarities
|
311 |
-
df['Rank'] = df['Similarity Score'].rank(ascending=False, method='dense').astype(int)
|
312 |
df.sort_values(by='Rank', inplace=True)
|
313 |
df.drop(columns = ['cleanedResume'], inplace = True)
|
314 |
endTime = time.time()
|
@@ -317,7 +320,7 @@ def rankResumes(text, df):
|
|
317 |
minutes, _ = divmod(remainder, 60)
|
318 |
secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
|
319 |
elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'
|
320 |
-
progressBar.progress(100, text = f'
|
321 |
time.sleep(1)
|
322 |
progressBar.empty()
|
323 |
st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
|
|
|
11 |
from gensim.corpora import Dictionary
|
12 |
from gensim.models import KeyedVectors, TfidfModel
|
13 |
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
|
14 |
+
from gensim.similarities.annoy import AnnoyIndexer
|
15 |
from io import BytesIO
|
16 |
from nltk import pos_tag, word_tokenize
|
17 |
from nltk.corpus import stopwords, wordnet
|
|
|
275 |
|
276 |
@st.cache_data
|
277 |
def loadModel():
|
|
|
278 |
model_path = 'wiki-news-300d-1M-subword.vec'
|
279 |
+
model = KeyedVectors.load_word2vec_format(model_path)
|
280 |
return model
|
281 |
|
282 |
model = loadModel()
|
|
|
295 |
progressBar.progress(25, text = "Creating a TF-IDF model ...")
|
296 |
tfidf = TfidfModel(dictionary = dictionary)
|
297 |
progressBar.progress(38, text = "Creating a Similarity Index...")
|
298 |
+
words = [word for word, count in dictionary.most_common()]
|
299 |
+
wordVectors = model.vectors_for_all(words, allow_inference = False)
|
300 |
+
indexer = AnnoyIndexer(wordVectors, num_trees = 10)
|
301 |
+
similarityIndex = WordEmbeddingSimilarityIndex(wordVectors, kwargs = {'indexer': indexer})
|
302 |
progressBar.progress(50, text = "Creating a Similarity Matrix...")
|
303 |
similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)
|
304 |
progressBar.progress(63, text = "Setting up job description as the query ...")
|
|
|
310 |
)
|
311 |
similarities = index[query]
|
312 |
progressBar.progress(88, text = "Finishing touches ...")
|
313 |
+
df['Similarity Score (-1 to 1)'] = similarities
|
314 |
+
df['Rank'] = df['Similarity Score (-1 to 1)'].rank(ascending=False, method='dense').astype(int)
|
315 |
df.sort_values(by='Rank', inplace=True)
|
316 |
df.drop(columns = ['cleanedResume'], inplace = True)
|
317 |
endTime = time.time()
|
|
|
320 |
minutes, _ = divmod(remainder, 60)
|
321 |
secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
|
322 |
elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'
|
323 |
+
progressBar.progress(100, text = f'Ranking Complete!')
|
324 |
time.sleep(1)
|
325 |
progressBar.empty()
|
326 |
st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
|