Spaces:
Runtime error
Runtime error
Elvan Selvano
committed on
Commit
•
20f5c36
1
Parent(s):
55c3ecb
Update app.py
Browse files
app.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
-
from typing import List
|
|
|
|
|
2 |
import pandas as pd
|
3 |
-
from sentence_transformers import SentenceTransformer, util
|
4 |
import streamlit as st
|
|
|
5 |
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
|
6 |
-
|
|
|
7 |
st.set_page_config(layout='wide')
|
8 |
|
9 |
@st.cache(allow_output_mutation=True)
|
@@ -11,9 +14,13 @@ def load_model():
|
|
11 |
"""Load pretrained model from SentenceTransformer"""
|
12 |
return SentenceTransformer('minilm_sbert')
|
13 |
|
14 |
-
def semantic_search(model
|
|
|
|
|
15 |
"""Perform semantic search on the corpus"""
|
16 |
-
query_embeddings = model.encode(
|
|
|
|
|
17 |
convert_to_tensor=True,
|
18 |
normalize_embeddings=True)
|
19 |
|
@@ -24,24 +31,31 @@ def semantic_search(model, sentence, corpus_embeddings):
|
|
24 |
|
25 |
return pd.DataFrame(hits[0])
|
26 |
|
27 |
-
def get_similarity_score(model
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
30 |
result = pd.merge(data, hits, left_on='ID', right_on='corpus_id')
|
31 |
result['Last Day'] = pd.to_datetime(result['Last Day'], format='%d/%m/%Y').dt.date
|
32 |
result.sort_values(by=['score', 'Last Day'], ascending=[False, True], inplace=True)
|
33 |
return result
|
34 |
|
35 |
-
@st.cache(ttl=
|
36 |
-
def create_embedding(model: SentenceTransformer,
|
37 |
-
|
|
|
|
|
38 |
corpus_sentences = data[key].astype(str).tolist()
|
39 |
corpus_embeddings = model.encode(sentences=corpus_sentences,
|
|
|
|
|
40 |
convert_to_tensor=True,
|
41 |
normalize_embeddings=True)
|
42 |
return corpus_embeddings
|
43 |
|
44 |
-
def load_dataset(columns: List) -> pd.DataFrame:
|
45 |
"""Load real-time dataset from google sheets"""
|
46 |
sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
|
47 |
sheet_name = 'Form Response 3'.replace(' ', '%20')
|
@@ -106,8 +120,6 @@ def main():
|
|
106 |
columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
|
107 |
'Experience (months)', 'Last Day', 'LinkedIn Profile']
|
108 |
data = load_dataset(columns)
|
109 |
-
|
110 |
-
# Inference
|
111 |
model = load_model()
|
112 |
corpus_embeddings = create_embedding(model, data, 'Previous Role')
|
113 |
|
@@ -115,6 +127,7 @@ def main():
|
|
115 |
submitted = st.button('Submit')
|
116 |
|
117 |
if submitted:
|
|
|
118 |
st.info(f'Showing results for {job_title}')
|
119 |
result = get_similarity_score(model, data, job_title, corpus_embeddings)
|
120 |
result = result[columns]
|
|
|
1 |
+
from typing import List
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
import pandas as pd
|
|
|
5 |
import streamlit as st
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
|
8 |
+
from textblob import Sentence
|
9 |
+
|
10 |
st.set_page_config(layout='wide')
|
11 |
|
12 |
@st.cache(allow_output_mutation=True)
|
|
|
14 |
"""Load pretrained model from SentenceTransformer"""
|
15 |
return SentenceTransformer('minilm_sbert')
|
16 |
|
17 |
+
def semantic_search(model: SentenceTransformer,
|
18 |
+
query: str,
|
19 |
+
corpus_embeddings: List) -> pd.DataFrame:
|
20 |
"""Perform semantic search on the corpus"""
|
21 |
+
query_embeddings = model.encode(sentences=query,
|
22 |
+
batch_size=128,
|
23 |
+
show_progress_bar=False,
|
24 |
convert_to_tensor=True,
|
25 |
normalize_embeddings=True)
|
26 |
|
|
|
31 |
|
32 |
return pd.DataFrame(hits[0])
|
33 |
|
34 |
+
def get_similarity_score(model: SentenceTransformer,
                         data: pd.DataFrame,
                         query: str,
                         corpus_embeddings: List) -> pd.DataFrame:
    """Get similarity score for each data point and sort by similarity score and last day.

    Args:
        model: Sentence encoder forwarded to ``semantic_search`` to embed
            ``query``. This is the ``SentenceTransformer`` instance, not a
            ``textblob.Sentence``.
        data: Table to rank. It must contain an ``ID`` column (joined against
            the ``corpus_id`` of each hit) and a ``Last Day`` column holding
            dates written as ``dd/mm/YYYY``.
        query: Free-text query to compare against the corpus.
        corpus_embeddings: Precomputed corpus embeddings, passed through
            unchanged to ``semantic_search``.

    Returns:
        ``data`` joined with the search hits, with ``Last Day`` converted to
        ``datetime.date`` values, ordered by ``score`` descending and then by
        ``Last Day`` ascending.
    """
    hits = semantic_search(model, query, corpus_embeddings)
    # pd.merge defaults to an inner join, so rows of `data` whose ID has no
    # matching corpus_id among the hits are dropped from the result.
    result = pd.merge(data, hits, left_on='ID', right_on='corpus_id')
    # Parse day-first date strings so that the tie-break sort below is
    # chronological rather than lexicographic.
    result['Last Day'] = pd.to_datetime(result['Last Day'], format='%d/%m/%Y').dt.date
    # Best match first; among equal scores, the earliest last day first.
    result.sort_values(by=['score', 'Last Day'], ascending=[False, True], inplace=True)
    return result
|
44 |
|
45 |
+
@st.cache(ttl=2 * 3600)
def create_embedding(model: SentenceTransformer,
                     data: pd.DataFrame,
                     key: str) -> List:
    """Embed the ``key`` column of ``data`` with the sentence encoder.

    Every value in the column is cast to ``str`` and the resulting list is
    encoded in a single ``model.encode`` call that requests normalized
    embeddings returned as a tensor.  The original note describes the output
    as 384-dimensional vectors; that figure depends on the loaded model.
    The decorator caches the result with a TTL of ``2 * 3600``.
    """
    encode_options = dict(batch_size=128,
                          show_progress_bar=False,
                          convert_to_tensor=True,
                          normalize_embeddings=True)
    # Cast first so that non-string cells (numbers, NaN) are still encodable.
    titles = data[key].astype(str).tolist()
    return model.encode(sentences=titles, **encode_options)
|
57 |
|
58 |
+
def load_dataset(columns: List[str]) -> pd.DataFrame:
|
59 |
"""Load real-time dataset from google sheets"""
|
60 |
sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
|
61 |
sheet_name = 'Form Response 3'.replace(' ', '%20')
|
|
|
120 |
columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
|
121 |
'Experience (months)', 'Last Day', 'LinkedIn Profile']
|
122 |
data = load_dataset(columns)
|
|
|
|
|
123 |
model = load_model()
|
124 |
corpus_embeddings = create_embedding(model, data, 'Previous Role')
|
125 |
|
|
|
127 |
submitted = st.button('Submit')
|
128 |
|
129 |
if submitted:
|
130 |
+
print(job_title + ',' + str(pd.Timestamp.now()))
|
131 |
st.info(f'Showing results for {job_title}')
|
132 |
result = get_similarity_score(model, data, job_title, corpus_embeddings)
|
133 |
result = result[columns]
|