File size: 8,114 Bytes
20f5c36
 
 
688b98f
 
20f5c36
9ab4f02
20f5c36
9ab4f02
688b98f
 
 
275562d
06a9e39
688b98f
20f5c36
 
 
275562d
20f5c36
 
 
fdb6b1e
 
688b98f
 
 
 
 
 
fdb6b1e
688b98f
fc363b7
20f5c36
 
 
 
 
275562d
059d601
502d5c9
688b98f
 
20f5c36
 
 
 
 
fdb6b1e
 
20f5c36
 
fdb6b1e
 
 
 
20f5c36
fdb6b1e
688b98f
 
 
fdb6b1e
 
 
 
55c3ecb
 
 
 
 
 
 
 
fdb6b1e
 
 
275562d
fdb6b1e
 
 
114789e
fdb6b1e
275562d
 
fdb6b1e
 
275562d
fdb6b1e
 
 
275562d
fdb6b1e
 
 
 
 
 
 
 
688b98f
55c3ecb
 
c755035
b23b643
 
 
 
 
 
 
 
 
c755035
fdb6b1e
8b89a72
 
 
 
 
55c3ecb
 
 
 
fdb6b1e
752420c
fdb6b1e
55c3ecb
 
69aef0b
b075a88
8b89a72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69aef0b
 
 
fdb6b1e
55c3ecb
 
688b98f
8b89a72
 
 
69aef0b
 
 
8b89a72
 
 
 
 
 
 
 
 
69aef0b
20f5c36
2a2977c
55c3ecb
 
 
688b98f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from typing import List

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode

# Configure the Streamlit page with the 'wide' layout before anything else is drawn.
st.set_page_config(layout='wide')

@st.cache(allow_output_mutation=True)
def load_model():
    """Build the SentenceTransformer identified by 'minilm_sbert'.

    The call is wrapped in ``st.cache`` so Streamlit can reuse the returned
    object instead of constructing it again; ``allow_output_mutation=True`` is
    passed to the cache decorator for the returned model.

    Returns:
        The ``SentenceTransformer`` instance used for all encoding in this app.
    """
    # NOTE(review): 'minilm_sbert' is presumably a local model directory
    # shipped with the app -- confirm against the deployment layout.
    encoder = SentenceTransformer('minilm_sbert')
    return encoder

def semantic_search(model: SentenceTransformer,
                    query: str,
                    corpus_embeddings: List) -> pd.DataFrame:
    """Rank every corpus entry against ``query``.

    Args:
        model: Encoder used to embed the query.
        query: Free-text search string.
        corpus_embeddings: Pre-computed embeddings of the corpus; its length
            is used as ``top_k`` so that every entry is ranked.

    Returns:
        A DataFrame built from the hit list of the (single) query, as
        produced by ``util.semantic_search`` with ``util.dot_score``.
    """
    encode_options = {
        'batch_size': 128,
        'show_progress_bar': False,
        'convert_to_tensor': True,
        # Normalised vectors are requested so dot_score can be used below.
        'normalize_embeddings': True,
    }
    query_vector = model.encode(sentences=query, **encode_options)

    ranked_per_query = util.semantic_search(query_vector,
                                            corpus_embeddings,
                                            top_k=len(corpus_embeddings),
                                            score_function=util.dot_score)

    # Only one query was encoded, so only the first hit list is relevant.
    first_query_hits = ranked_per_query[0]
    return pd.DataFrame(first_query_hits)

def get_similarity_score(model: SentenceTransformer,
                         data: pd.DataFrame,
                         query: str,
                         corpus_embeddings: List) -> pd.DataFrame:
    """Attach a similarity score to each row of ``data`` and rank the rows.

    Args:
        model: Encoder forwarded to ``semantic_search``.
        data: Talent table; must contain the 'ID' and 'Last Day' columns.
        query: Free-text search string.
        corpus_embeddings: Pre-computed embeddings forwarded to
            ``semantic_search``.

    Returns:
        ``data`` joined with the search hits ('ID' matched to 'corpus_id'),
        with 'Last Day' converted to dates and rows ordered by highest score
        first, then earliest 'Last Day'.
    """
    search_hits = semantic_search(model, query, corpus_embeddings)
    ranked = data.merge(search_hits, left_on='ID', right_on='corpus_id')

    # Values that do not match day/month/year are coerced to NaT, not raised.
    parsed_last_day = pd.to_datetime(ranked['Last Day'],
                                     format='%d/%m/%Y',
                                     errors='coerce')
    ranked['Last Day'] = parsed_last_day.dt.date

    return ranked.sort_values(by=['score', 'Last Day'], ascending=[False, True])

@st.cache(ttl=2*3600)
def create_embedding(model: SentenceTransformer,
                     data: pd.DataFrame,
                     key: str) -> List:
    """Encode one column of ``data`` into normalised sentence embeddings.

    Args:
        model: Encoder used to embed the text.
        data: Table holding the text to embed.
        key: Name of the column in ``data`` whose values are embedded.

    Returns:
        The embeddings returned by ``model.encode`` (requested as a tensor),
        one per row of ``data`` in row order.
    """
    # Every value is stringified first so missing or numeric cells can still
    # be passed to the encoder.
    as_text = data[key].astype(str)
    sentences = as_text.tolist()

    return model.encode(sentences=sentences,
                        batch_size=128,
                        show_progress_bar=False,
                        convert_to_tensor=True,
                        normalize_embeddings=True)

def load_dataset(columns: List[str], source=None) -> pd.DataFrame:
    """Load the talent list and normalise names and LinkedIn links.

    Args:
        columns: Names assigned, in order, to the leading columns of the
            sheet. Columns beyond ``len(columns)`` are dropped. Must include
            'Full Name' and 'LinkedIn Profile'.
        source: Anything ``pd.read_csv`` accepts (URL, path or file-like
            object). When ``None`` (the default) the CSV export of the
            Google Sheet is read.

    Returns:
        A DataFrame whose first column is 'ID' (0-based row position)
        followed by ``columns``. 'Full Name' is title-cased and
        'LinkedIn Profile' is lower-cased with a missing ``https://`` /
        ``https://www.`` prefix added.

    Raises:
        ValueError: If the sheet has fewer columns than ``columns`` has names.
    """
    if source is None:
        sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
        sheet_name = 'Form Response 3'.replace(' ', '%20')
        source = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'

    raw = pd.read_csv(source)

    # Keep exactly as many leading columns as there are names, instead of a
    # hard-coded 7 that had to stay in sync with the caller's list. The copy
    # detaches the slice from ``raw`` before it is modified.
    data = raw.iloc[:, :len(columns)].copy()
    data.columns = columns
    data.insert(0, 'ID', range(len(data)))
    data['Full Name'] = data['Full Name'].str.title()

    profile = data['LinkedIn Profile'].str.lower()
    # Order matters: the 'www.' form is completed first, so the bare-domain
    # rule only sees links that had neither scheme nor 'www.'.
    for bare_prefix, missing_part in (('www.linkedin.com', 'https://'),
                                      ('linkedin.com', 'https://www.')):
        # na=False leaves rows without a link untouched explicitly rather
        # than relying on how NaN is coerced to a boolean.
        needs_fix = profile.str.startswith(bare_prefix, na=False)
        profile = profile.mask(needs_fix, missing_part + profile)
    data['LinkedIn Profile'] = profile

    return data

def show_aggrid_table(result: pd.DataFrame):
    """Render ``result`` as an interactive AgGrid table.

    The grid is paginated, has the side bar enabled, allows multi-row
    checkbox selection and renders the 'LinkedIn Profile' column through a
    custom JavaScript cell renderer that wraps the value in an anchor tag.

    Args:
        result: Rows to display; must contain a 'LinkedIn Profile' column.
    """
    # NOTE(review): the cell value is interpolated into the markup as-is
    # (unquoted href, no escaping). The values come from the loaded sheet,
    # so confirm they can be trusted or escape them in the renderer.
    link_renderer = JsCode('''function(params) {return `<a href=${params.value} target="_blank">${params.value}</a>`}''')

    builder = GridOptionsBuilder.from_dataframe(result)
    builder.configure_pagination(paginationAutoPageSize=True)
    builder.configure_side_bar()
    builder.configure_default_column(min_column_width=200)
    builder.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children")
    builder.configure_column(field='LinkedIn Profile',
                             headerName='LinkedIn Profile',
                             cellRenderer=link_renderer)

    # The grid's return value is not needed; the call is made for rendering.
    AgGrid(
        dataframe=result,
        gridOptions=builder.build(),
        height=1100,
        fit_columns_on_grid_load=True,
        data_return_mode='AS_INPUT',
        update_mode='VALUE_CHANGED',
        theme='light',
        enable_enterprise_modules=True,
        # Required because the column definition above carries JsCode.
        allow_unsafe_jscode=True,
    )

def show_heading():
    """Render the page title, the project badges and a one-line description."""
    badges_markdown = '''
        <div align="left">

        [![Maintainer](https://img.shields.io/badge/maintainer-temandata-blue)](https://temandata.com/)
        [![Open Source? Yes!](https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github)](https://github.com/teman-data/ecommurz-talent-search-engine)
        ![visitor badge](https://visitor-badge.glitch.me/badge?page_id=temandata_ecommurz-talent-search-engine)

        </div>
    '''
    description = 'This app lets you search and sort talent by job title or relevant job descriptions from ecommurz talent list in real-time.'

    st.title('@ecommurz Talent Search Engine')
    # unsafe_allow_html is needed for the raw <div> wrapper around the badges.
    st.markdown(badges_markdown, unsafe_allow_html=True)
    st.write(description)

def get_specific_category(model, data, category, corpus_embeddings, threshold=0.45):
    """Count the rows of ``data`` that match ``category`` closely enough.

    Args:
        model: Encoder forwarded to ``get_similarity_score``.
        data: Talent table forwarded to ``get_similarity_score``.
        category: Query text describing the category (e.g. 'marketing').
        corpus_embeddings: Pre-computed embeddings forwarded to
            ``get_similarity_score``.
        threshold: A row is counted when its similarity score is strictly
            greater than this value. Defaults to 0.45, the value that was
            previously hard-coded.

    Returns:
        int: Number of rows whose 'score' exceeds ``threshold``.
    """
    scored = get_similarity_score(model, data, category, corpus_embeddings)
    # Summing the boolean mask counts the matches without building a
    # filtered copy of the frame.
    return int((scored['score'] > threshold).sum())
    
def main():
    """Main Function: draw the page and show ranked results for a query.

    Loads the sheet, the model and the corpus embeddings, draws one shortcut
    button per category (labelled with its match count), then a free-text
    input with a Submit button. When a shortcut or Submit is pressed, the
    rows ranked against the chosen query are shown in the grid. Submitting a
    blank query shows a warning instead of searching for the empty string.
    """
    show_heading()

    columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
               'Experience (months)', 'Last Day', 'LinkedIn Profile']
    data = load_dataset(columns)
    model = load_model()
    corpus_embeddings = create_embedding(model, data, 'Previous Role')

    # One entry per shortcut button:
    # (button label, query used for the counter, query used for the search,
    #  relative column width).
    # NOTE(review): 'Finance' and 'Computer Science' are counted with a
    # shorter query than the one they search with. Kept as found -- confirm
    # whether the counter should use the search query instead.
    categories = [
        ('Data', 'data', 'data', 1.1),
        ('Finance', 'finance', 'finance and accounting', 1.3),
        ('Marketing', 'marketing', 'marketing', 1.6),
        ('Social Media', 'social media', 'social media', 1.65),
        ('Arts & Design', 'design and creative', 'design and creative', 1.7),
        ('Computer Science', 'engineer', 'engineer and developer', 2.1),
        ('Business and Management', 'business and management', 'business and management', 2.25),
    ]
    # The trailing 9 is an empty spacer column that keeps the buttons compact.
    widths = [width for *_, width in categories] + [9]
    *button_columns, _spacer = st.columns(widths)

    clicked_query = None
    for column, (label, count_query, search_query, _width) in zip(button_columns, categories):
        with column:
            count = get_specific_category(model, data, count_query, corpus_embeddings)
            if st.button(f'{label} ({count})'):
                clicked_query = search_query

    job_title = st.text_input('Insert the job title below:', '')
    submitted = st.button('Submit')

    if clicked_query is not None:
        # A shortcut button overrides whatever is typed in the text box.
        job_title = clicked_query
    elif not submitted:
        # Nothing was pressed on this run: show only the controls.
        return

    if not job_title.strip():
        st.warning('Please insert a job title before submitting.')
        return

    # Query log line: "<query>,<timestamp>" on stdout.
    print(job_title + ',' + str(pd.Timestamp.now()))
    st.info(f'Showing most similar results for {job_title}...')
    result = get_similarity_score(model, data, job_title, corpus_embeddings)
    # Show only the original sheet columns (drops ID, corpus_id and score).
    show_aggrid_table(result[columns])

# Build the page only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()