Spaces:
Runtime error
Runtime error
File size: 7,791 Bytes
20f5c36 688b98f 20f5c36 9ab4f02 20f5c36 9ab4f02 688b98f 275562d 06a9e39 688b98f 20f5c36 275562d 20f5c36 fdb6b1e 688b98f fdb6b1e 688b98f fc363b7 20f5c36 275562d 059d601 502d5c9 688b98f 20f5c36 fdb6b1e 20f5c36 fdb6b1e 20f5c36 fdb6b1e 688b98f fdb6b1e 55c3ecb fdb6b1e 275562d fdb6b1e 114789e fdb6b1e 275562d fdb6b1e 275562d fdb6b1e 275562d fdb6b1e 688b98f 55c3ecb c755035 b23b643 c755035 fdb6b1e 8b89a72 55c3ecb fdb6b1e 752420c fdb6b1e 55c3ecb 55ff855 8b89a72 fdb6b1e 55c3ecb 688b98f 8b89a72 20f5c36 2a2977c 55c3ecb 688b98f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
from typing import List
import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
# Configure the Streamlit page to use the wide layout.
st.set_page_config(layout='wide')
@st.cache(allow_output_mutation=True)
def load_model():
    """Instantiate the sentence-embedding model used by the app.

    Returns:
        A ``SentenceTransformer`` constructed from the ``'minilm_sbert'``
        identifier. The call is wrapped in ``st.cache`` with
        ``allow_output_mutation=True``.
    """
    encoder = SentenceTransformer('minilm_sbert')
    return encoder
def semantic_search(model: SentenceTransformer,
                    query: str,
                    corpus_embeddings: List) -> pd.DataFrame:
    """Score every corpus entry against a single query string.

    Args:
        model: Encoder used to embed ``query``.
        query: Free-text search string.
        corpus_embeddings: Pre-computed embeddings of the corpus; its length
            is used as ``top_k`` so that no entry is dropped from the ranking.

    Returns:
        A DataFrame built from the hit list of the (only) query. Callers in
        this file rely on its ``corpus_id`` and ``score`` columns.
    """
    encode_options = {
        'batch_size': 128,
        'show_progress_bar': False,
        'convert_to_tensor': True,
        'normalize_embeddings': True,
    }
    query_vector = model.encode(sentences=query, **encode_options)

    corpus_size = len(corpus_embeddings)
    ranked = util.semantic_search(query_vector,
                                  corpus_embeddings,
                                  top_k=corpus_size,
                                  score_function=util.dot_score)

    # Only one query was submitted, so only the first hit list is relevant.
    first_query_hits = ranked[0]
    return pd.DataFrame(first_query_hits)
def get_similarity_score(model: SentenceTransformer,
                         data: pd.DataFrame,
                         query: str,
                         corpus_embeddings: List) -> pd.DataFrame:
    """Attach similarity scores to ``data`` and order the rows by relevance.

    Args:
        model: Encoder forwarded to ``semantic_search``.
        data: Talent table; must contain the ``ID`` and ``Last Day`` columns.
        query: Free-text search string.
        corpus_embeddings: Pre-computed corpus embeddings.

    Returns:
        ``data`` joined with the search hits (``ID`` matched to
        ``corpus_id``), with ``Last Day`` converted to ``date`` objects and
        rows sorted by descending ``score``, then ascending ``Last Day``.
    """
    scores = semantic_search(model, query, corpus_embeddings)
    merged = data.merge(scores, left_on='ID', right_on='corpus_id')

    # Values that do not match day/month/year are coerced to missing
    # instead of raising.
    parsed_last_day = pd.to_datetime(merged['Last Day'],
                                     format='%d/%m/%Y',
                                     errors='coerce')
    merged['Last Day'] = parsed_last_day.dt.date

    merged.sort_values(by=['score', 'Last Day'],
                       ascending=[False, True],
                       inplace=True)
    return merged
@st.cache(ttl=2 * 3600)
def create_embedding(model: SentenceTransformer,
                     data: pd.DataFrame,
                     key: str) -> List:
    """Embed one text column of ``data`` with ``model``.

    The original author describes the output as 384-dimensional vectors for
    the job titles in the corpus.

    Args:
        model: Encoder used to embed the column values.
        data: Table holding the text to embed.
        key: Name of the column to embed; values are cast to ``str`` first.

    Returns:
        The normalized embeddings returned by ``model.encode`` as a tensor.
    """
    texts = data[key].astype(str).tolist()
    return model.encode(sentences=texts,
                        batch_size=128,
                        show_progress_bar=False,
                        convert_to_tensor=True,
                        normalize_embeddings=True)
def load_dataset(columns: List[str]) -> pd.DataFrame:
    """Load the talent list from Google Sheets and normalize it.

    The sheet is fetched as CSV on every call. Only the first seven columns
    are kept and renamed to ``columns``; an ``ID`` column holding the row
    position is inserted at the front.

    Args:
        columns: Names for the first seven sheet columns. Must include
            ``'Full Name'`` and ``'LinkedIn Profile'``.

    Returns:
        The cleaned table: names are title-cased, LinkedIn URLs are
        lower-cased and given an ``https://`` (or ``https://www.``) prefix
        when they start with a bare ``www.linkedin.com`` / ``linkedin.com``.

    Raises:
        ValueError: If ``columns`` does not have one name per kept column
            (raised by pandas on the column assignment).
    """
    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    sheet_name = 'Form Response 3'.replace(' ', '%20')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

    # .copy() detaches the seven-column slice from the full sheet frame so the
    # assignments below operate on an independent frame instead of a slice
    # (avoids SettingWithCopyWarning).
    data = pd.read_csv(url).iloc[:, :7].copy()
    data.columns = columns
    data.insert(0, 'ID', range(len(data)))

    data['Full Name'] = data['Full Name'].str.title()
    data['LinkedIn Profile'] = data['LinkedIn Profile'].str.lower()

    # Order matters: once 'www.linkedin.com' rows are prefixed they start with
    # 'https://' and are no longer matched by the 'linkedin.com' rule.
    prefix_rules = (
        ('www.linkedin.com', 'https://'),
        ('linkedin.com', 'https://www.'),
    )
    for bare_prefix, scheme in prefix_rules:
        profile = data['LinkedIn Profile']
        # na=False keeps the mask strictly boolean; missing profiles are left
        # untouched rather than passing NaN to np.where as a condition.
        needs_scheme = profile.str.startswith(bare_prefix, na=False)
        data['LinkedIn Profile'] = np.where(needs_scheme,
                                            scheme + profile,
                                            profile)
    return data
def show_aggrid_table(result: pd.DataFrame):
    """Render ``result`` as an interactive AgGrid table.

    The grid is paginated, has a side bar and multi-row checkbox selection,
    and renders the ``LinkedIn Profile`` column as a link opening in a new
    tab.

    Args:
        result: Table to display; expected to contain a
            ``LinkedIn Profile`` column.
    """
    # NOTE(review): the cell value is interpolated into the anchor markup
    # without quoting or escaping, and unsafe JS is enabled below. Confirm
    # the profile values are trusted or sanitize them upstream.
    link_renderer = JsCode('''function(params) {return `<a href=${params.value} target="_blank">${params.value}</a>`}''')

    gb = GridOptionsBuilder.from_dataframe(result)
    gb.configure_pagination(paginationAutoPageSize=True)
    gb.configure_side_bar()
    gb.configure_default_column(min_column_width=200)
    gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children")
    gb.configure_column(field='LinkedIn Profile',
                        headerName='LinkedIn Profile',
                        cellRenderer=link_renderer)

    # The grid response is not used by the app, so it is not bound to a name.
    AgGrid(
        dataframe=result,
        gridOptions=gb.build(),
        height=1100,
        fit_columns_on_grid_load=True,
        data_return_mode='AS_INPUT',
        update_mode='VALUE_CHANGED',
        theme='light',
        enable_enterprise_modules=True,
        allow_unsafe_jscode=True,
    )
def show_heading():
    """Render the page title, the badge row and the one-line app description."""
    # Badge markup is passed through as raw HTML/markdown.
    badges = '''
<div align="left">
[![Maintainer](https://img.shields.io/badge/maintainer-temandata-blue)](https://temandata.com/)
[![Open Source? Yes!](https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github)](https://github.com/teman-data/ecommurz-talent-search-engine)
![visitor badge](https://visitor-badge.glitch.me/badge?page_id=temandata_ecommurz-talent-search-engine)
</div>
'''
    description = 'This app lets you search and sort talent by job title or relevant job descriptions from ecommurz talent list in real-time.'

    st.title('@ecommurz Talent Search Engine')
    st.markdown(badges, unsafe_allow_html=True)
    st.write(description)
def get_specific_category(model, data, category, corpus_embeddings,
                          threshold: float = 0.45) -> int:
    """Count the entries whose similarity to ``category`` exceeds ``threshold``.

    Args:
        model: Encoder forwarded to ``get_similarity_score``.
        data: Talent table forwarded to ``get_similarity_score``.
        category: Query string describing the category (e.g. ``'finance'``).
        corpus_embeddings: Pre-computed corpus embeddings.
        threshold: Exclusive lower bound on the ``score`` column. Defaults to
            0.45, the cut-off previously hard-coded here.

    Returns:
        Number of rows with ``score`` strictly greater than ``threshold``.
    """
    scored = get_similarity_score(model, data, category, corpus_embeddings)
    # int() turns the NumPy integer from .sum() into a plain Python int.
    return int((scored['score'] > threshold).sum())
def main():
    """Build the page: heading, category shortcuts, search box and results.

    The dataset is loaded and embedded, one shortcut button per category is
    drawn with its match count, and a search runs when either a shortcut or
    the Submit button is pressed. Results are shown in an AgGrid table
    restricted to the original sheet columns.
    """
    show_heading()
    columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
               'Experience (months)', 'Last Day', 'LinkedIn Profile']
    data = load_dataset(columns)
    model = load_model()
    corpus_embeddings = create_embedding(model, data, 'Previous Role')

    # (button label, search query) for each shortcut. One query per category
    # drives both the count on the button and the search it triggers, so the
    # two cannot drift apart (previously 'Computer Science' counted matches
    # for 'engineer' but searched for 'engineer and developer').
    categories = [
        ('Data', 'data'),
        ('Finance', 'finance'),
        ('Marketing', 'marketing'),
        ('Social Media', 'social media'),
        ('Arts & Design', 'design and creative'),
        ('Computer Science', 'engineer and developer'),
    ]

    # The trailing wide column is only a spacer that keeps the buttons left.
    *category_cols, _ = st.columns([0.8, 1, 1, 1, 1.2, 1.2, 9])

    clicked_query = None
    for col, (label, query) in zip(category_cols, categories):
        with col:
            count = get_specific_category(model, data, query, corpus_embeddings)
            if st.button(f'{label} ({count})'):
                clicked_query = query

    job_title = st.text_input('Insert the job title below:', '')
    submitted = st.button('Submit')

    # A shortcut button overrides whatever is typed in the text box.
    if clicked_query is not None:
        job_title = clicked_query

    if not submitted and clicked_query is None:
        return

    # An empty query would rank the corpus against an empty string.
    if not job_title.strip():
        st.warning('Please insert a job title before submitting.')
        return

    # Query log line written to stdout: "<query>,<timestamp>".
    print(job_title + ',' + str(pd.Timestamp.now()))
    st.info(f'Showing most similar results for {job_title}...')
    result = get_similarity_score(model, data, job_title, corpus_embeddings)
    # Drop the helper columns (ID, corpus_id, score) before display.
    result = result[columns]
    show_aggrid_table(result)
# Build the page when this file is executed as a script.
if __name__ == '__main__':
    main()
|