Elvan Selvano committed on
Commit
20f5c36
•
1 Parent(s): 55c3ecb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -14
app.py CHANGED
@@ -1,9 +1,12 @@
1
- from typing import List, Tuple
 
 
2
  import pandas as pd
3
- from sentence_transformers import SentenceTransformer, util
4
  import streamlit as st
 
5
  from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
6
- import numpy as np
 
7
  st.set_page_config(layout='wide')
8
 
9
  @st.cache(allow_output_mutation=True)
@@ -11,9 +14,13 @@ def load_model():
11
  """Load pretrained model from SentenceTransformer"""
12
  return SentenceTransformer('minilm_sbert')
13
 
14
- def semantic_search(model, sentence, corpus_embeddings):
 
 
15
  """Perform semantic search on the corpus"""
16
- query_embeddings = model.encode(sentence,
 
 
17
  convert_to_tensor=True,
18
  normalize_embeddings=True)
19
 
@@ -24,24 +31,31 @@ def semantic_search(model, sentence, corpus_embeddings):
24
 
25
  return pd.DataFrame(hits[0])
26
 
27
- def get_similarity_score(model, data, query, corpus_embeddings):
28
- """Get similarity score for each data point and sort by similarity score and day"""
29
- hits = semantic_search(model, [query], corpus_embeddings)
 
 
 
30
  result = pd.merge(data, hits, left_on='ID', right_on='corpus_id')
31
  result['Last Day'] = pd.to_datetime(result['Last Day'], format='%d/%m/%Y').dt.date
32
  result.sort_values(by=['score', 'Last Day'], ascending=[False, True], inplace=True)
33
  return result
34
 
35
- @st.cache(ttl=4*3600)
36
- def create_embedding(model: SentenceTransformer, data: pd.DataFrame, key: str) -> Tuple[list, list]:
37
- """Create vector embeddings from the dataset"""
 
 
38
  corpus_sentences = data[key].astype(str).tolist()
39
  corpus_embeddings = model.encode(sentences=corpus_sentences,
 
 
40
  convert_to_tensor=True,
41
  normalize_embeddings=True)
42
  return corpus_embeddings
43
 
44
- def load_dataset(columns: List) -> pd.DataFrame:
45
  """Load real-time dataset from google sheets"""
46
  sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
47
  sheet_name = 'Form Response 3'.replace(' ', '%20')
@@ -106,8 +120,6 @@ def main():
106
  columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
107
  'Experience (months)', 'Last Day', 'LinkedIn Profile']
108
  data = load_dataset(columns)
109
-
110
- # Inference
111
  model = load_model()
112
  corpus_embeddings = create_embedding(model, data, 'Previous Role')
113
 
@@ -115,6 +127,7 @@ def main():
115
  submitted = st.button('Submit')
116
 
117
  if submitted:
 
118
  st.info(f'Showing results for {job_title}')
119
  result = get_similarity_score(model, data, job_title, corpus_embeddings)
120
  result = result[columns]
 
1
+ from typing import List
2
+
3
+ import numpy as np
4
  import pandas as pd
 
5
  import streamlit as st
6
+ from sentence_transformers import SentenceTransformer, util
7
  from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
8
+ from textblob import Sentence
9
+
10
  st.set_page_config(layout='wide')
11
 
12
  @st.cache(allow_output_mutation=True)
 
14
  """Load pretrained model from SentenceTransformer"""
15
  return SentenceTransformer('minilm_sbert')
16
 
17
+ def semantic_search(model: SentenceTransformer,
18
+ query: str,
19
+ corpus_embeddings: List) -> pd.DataFrame:
20
  """Perform semantic search on the corpus"""
21
+ query_embeddings = model.encode(sentences=query,
22
+ batch_size=128,
23
+ show_progress_bar=False,
24
  convert_to_tensor=True,
25
  normalize_embeddings=True)
26
 
 
31
 
32
  return pd.DataFrame(hits[0])
33
 
34
+ def get_similarity_score(model: Sentence,
35
+ data: pd.DataFrame,
36
+ query: str,
37
+ corpus_embeddings: List) -> pd.DataFrame:
38
+ """Get similarity score for each data point and sort by similarity score and last day"""
39
+ hits = semantic_search(model, query, corpus_embeddings)
40
  result = pd.merge(data, hits, left_on='ID', right_on='corpus_id')
41
  result['Last Day'] = pd.to_datetime(result['Last Day'], format='%d/%m/%Y').dt.date
42
  result.sort_values(by=['score', 'Last Day'], ascending=[False, True], inplace=True)
43
  return result
44
 
45
+ @st.cache(ttl=2*3600)
46
+ def create_embedding(model: SentenceTransformer,
47
+ data: pd.DataFrame,
48
+ key: str) -> List:
49
+ "Maps job title from the corpus to a 384 dimensional vector embeddings"
50
  corpus_sentences = data[key].astype(str).tolist()
51
  corpus_embeddings = model.encode(sentences=corpus_sentences,
52
+ batch_size=128,
53
+ show_progress_bar=False,
54
  convert_to_tensor=True,
55
  normalize_embeddings=True)
56
  return corpus_embeddings
57
 
58
+ def load_dataset(columns: List[str]) -> pd.DataFrame:
59
  """Load real-time dataset from google sheets"""
60
  sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
61
  sheet_name = 'Form Response 3'.replace(' ', '%20')
 
120
  columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
121
  'Experience (months)', 'Last Day', 'LinkedIn Profile']
122
  data = load_dataset(columns)
 
 
123
  model = load_model()
124
  corpus_embeddings = create_embedding(model, data, 'Previous Role')
125
 
 
127
  submitted = st.button('Submit')
128
 
129
  if submitted:
130
+ print(job_title + ',' + str(pd.Timestamp.now()))
131
  st.info(f'Showing results for {job_title}')
132
  result = get_similarity_score(model, data, job_title, corpus_embeddings)
133
  result = result[columns]