Elvan Selvano committed on
Commit
688b98f
•
1 Parent(s): 98d0028

Upload app.py

Files changed (1)
  1. app.py +96 -0
app.py ADDED
@@ -0,0 +1,96 @@
+ import pandas as pd
+ import os
+ import pickle
+ from sentence_transformers import SentenceTransformer, util
+ import streamlit as st
+ import io
+ import torch
+
+ @st.cache(allow_output_mutation=True)
+ def load_model():
+     return SentenceTransformer('all-MiniLM-L6-v2')
+
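+ # Note: the dot-product ranking below equals cosine similarity only if the
+ # pickled corpus embeddings were normalized when they were generated; this
+ # file assumes that but does not verify it.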
+ def find_top_similar(sentence, corpus_sentences, corpus_embeddings):
+
+     # preprocess the query
+     model = load_model()
+     query_embeddings = model.encode(sentence, convert_to_tensor=True)  # encode to tensor
+     # query_embeddings = query_embeddings.to('cuda')  # put into gpu
+     query_embeddings = util.normalize_embeddings(query_embeddings)  # normalize
+
+     # rank every corpus sentence against the query (dot product; equals cosine similarity for normalized embeddings)
+     hits = util.semantic_search(query_embeddings,
+                                 corpus_embeddings,
+                                 top_k=len(corpus_embeddings),
+                                 score_function=util.dot_score)
+     hits = hits[0]  # hits for the first (and only) query
+
+     # collect the matched corpus sentences in ranked order
+     records = []
+
+     for hit in hits:
+         records.append(corpus_sentences[hit['corpus_id']])
+
+     return records
+
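+ # The lookup below assumes each corpus sentence is a value of the
+ # 'Last job role' column in the spreadsheet loaded by main().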
+ def top_k_similarity(df, query, corpus_sentences, corpus_embeddings):
+     hits = find_top_similar([query], corpus_sentences, corpus_embeddings)
+
+     res = pd.DataFrame()
+
+     for h in hits:
+         s = df[df['Last job role'] == h]
+         res = pd.concat([res, s])
+
+     return res
+
+ def get_result(df, query, corpus_sentences, corpus_embeddings):
+     result = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
+     result.drop_duplicates(inplace=True)
+     return result
+
+ class CpuUnpickler(pickle.Unpickler):
+     """
+     Overrides the default behavior of the `Unpickler` class to load
+     a `torch.storage` object from a byte string onto the CPU.
+     """
+     def find_class(self, module, name):
+         if module == 'torch.storage' and name == '_load_from_bytes':
+             return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
+         return super().find_class(module, name)
+
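+ # corpus_embeddings.pkl is expected to be a dict with 'sentences' and
+ # 'embeddings' keys, pickled ahead of time (possibly on a GPU machine,
+ # which is why the CPU-mapping unpickler above is needed).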
+ @st.cache(allow_output_mutation=True)
+ def load_embedding():
+     """Loads the corpus sentences and their embeddings from the pickle file."""
+     with open('corpus_embeddings.pkl', 'rb') as file:
+         cache_data = CpuUnpickler(file).load()
+         corpus_sentences = cache_data['sentences']
+         corpus_embeddings = cache_data['embeddings']
+
+     return corpus_sentences, corpus_embeddings
+
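+ # The CSV export URL below assumes the Google Sheet is readable without
+ # authentication; only the first seven columns of the responses are kept.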
+ def main():
+     # get dataset
+     sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
+     sheet_name = 'Form Response 3'.replace(' ', '%20')
+     url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
+     print(url)
+     df = pd.read_csv(url)
+     df = df.iloc[:, :7]
+
+     # get embeddings
+     corpus_sentences, corpus_embeddings = load_embedding()
+
+     # streamlit form
+     st.title('Job Posting Similarity')
+     job_title = st.text_input('Insert the job title below:', '')
+     submitted = st.button('Submit')
+
+     if submitted:
+         result = get_result(df, job_title, corpus_sentences, corpus_embeddings)
+         result.reset_index(drop=True, inplace=True)
+         result.index += 1
+         st.table(result)
+
+ if __name__ == '__main__':
+     main()