File size: 4,265 Bytes
fdb6b1e
688b98f
 
 
9ab4f02
 
688b98f
 
 
275562d
9ab4f02
688b98f
fdb6b1e
275562d
fdb6b1e
 
 
688b98f
 
 
 
 
 
fdb6b1e
688b98f
275562d
 
fdb6b1e
275562d
fdb6b1e
688b98f
 
 
fdb6b1e
 
 
 
 
 
 
 
 
 
 
688b98f
 
 
fdb6b1e
 
 
 
 
 
 
275562d
fdb6b1e
 
 
114789e
fdb6b1e
275562d
 
fdb6b1e
 
275562d
fdb6b1e
 
 
275562d
fdb6b1e
 
 
 
 
 
 
 
688b98f
fdb6b1e
275562d
688b98f
fdb6b1e
 
 
 
 
 
 
 
688b98f
 
 
 
fdb6b1e
275562d
fdb6b1e
9ab4f02
275562d
 
 
 
 
 
 
9ab4f02
fdb6b1e
688b98f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from typing import List, Tuple
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import streamlit as st
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode
# Configure the Streamlit page with the 'wide' layout at import time,
# before any of the UI calls in main() run.
st.set_page_config(layout='wide')

@st.cache(allow_output_mutation=True)
def load_model():
    """Build the sentence-embedding model used by the app.

    The result is wrapped in ``st.cache`` so the model object is created
    once and reused across Streamlit reruns.

    Returns:
        A ``SentenceTransformer`` constructed from the
        ``'all-MiniLM-L6-v2'`` checkpoint name.
    """
    checkpoint = 'all-MiniLM-L6-v2'
    return SentenceTransformer(checkpoint)

def semantic_search(model, sentence, corpus_embeddings):
    """Rank every corpus entry against a query.

    Args:
        model: Object exposing ``encode``; used to embed ``sentence``.
        sentence: Query text (the caller in this file passes a
            one-element list).
        corpus_embeddings: Pre-computed embeddings of the corpus.

    Returns:
        A ``pd.DataFrame`` built from the hit list of the first query.
    """
    # Normalised embeddings are requested so the dot-product scorer below
    # operates on unit-length query vectors.
    encoded_query = model.encode(
        sentence,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )

    # top_k spans the whole corpus: every entry receives a score.
    corpus_size = len(corpus_embeddings)
    ranked = util.semantic_search(
        encoded_query,
        corpus_embeddings,
        top_k=corpus_size,
        score_function=util.dot_score,
    )

    first_query_hits = ranked[0]
    return pd.DataFrame(first_query_hits)

def get_similarity_score(model, data, query, corpus_embeddings):
    """Attach similarity scores for ``query`` to the rows of ``data``.

    Args:
        model: Embedding model forwarded to ``semantic_search``.
        data: DataFrame with an ``'ID'`` column to join on.
        query: Single query string; wrapped in a list before searching.
        corpus_embeddings: Pre-computed embeddings of the corpus.

    Returns:
        ``data`` joined with the search hits (``ID`` == ``corpus_id``),
        ordered from highest to lowest ``score``.
    """
    scored_hits = semantic_search(model, [query], corpus_embeddings)
    joined = pd.merge(data, scored_hits, left_on='ID', right_on='corpus_id')
    return joined.sort_values(by='score', ascending=False)

@st.cache(allow_output_mutation=True)
def create_embedding(model: SentenceTransformer, data: pd.DataFrame, key: str):
    """Create vector embeddings for one column of the dataset.

    The result is wrapped in ``st.cache`` so embeddings are reused across
    Streamlit reruns instead of being recomputed.

    Args:
        model: Sentence-embedding model used to encode the text.
        data: Dataset containing the column to embed.
        key: Name of the column in ``data`` whose values are embedded.

    Returns:
        The single object returned by ``model.encode`` with
        ``convert_to_tensor=True`` and ``normalize_embeddings=True``
        (one embedding per row of ``data``), not a tuple of lists.
    """
    # Cast to str so non-string cells (e.g. NaN, numbers) can be encoded.
    corpus_sentences = data[key].astype(str).tolist()
    corpus_embeddings = model.encode(sentences=corpus_sentences,
                                     show_progress_bar=True,
                                     convert_to_tensor=True,
                                     normalize_embeddings=True)
    return corpus_embeddings

def load_dataset(columns: List) -> pd.DataFrame:
    """Load the dataset from the Google Sheets CSV export.

    Args:
        columns: Names to assign to the leading columns of the sheet. The
            first ``len(columns)`` sheet columns are kept, so the slice
            always matches the names supplied.

    Returns:
        A DataFrame whose first column is ``'ID'`` (0..n-1, the row
        position) followed by the renamed sheet columns.

    Raises:
        ValueError: Raised by pandas if the sheet has fewer columns than
            ``columns`` has names.
    """
    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    # Spaces must be percent-encoded to be valid inside the query string.
    sheet_name = 'Form Response 3'.replace(' ', '%20')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    raw = pd.read_csv(url)

    # Keep exactly as many leading columns as there are names; copy so the
    # rename/insert below act on an independent frame, not a slice of `raw`.
    data = raw.iloc[:, :len(columns)].copy()
    data.columns = columns
    # Positional ID used later to join rows with search hits.
    data.insert(0, 'ID', range(len(data)))
    return data

def show_aggrid_table(result: pd.DataFrame):
    """Render the similarity result as an interactive AgGrid table.

    Args:
        result: DataFrame to display; its ``'LinkedIn Profile'`` column is
            rendered as a clickable link.
    """
    # The cell value comes from user-submitted form data, so the href is
    # quoted (an unquoted attribute breaks on spaces, '>' etc.) and the new
    # tab is opened without a reference back to this page.
    link_renderer = JsCode(
        '''function(params) {return `<a href="${params.value}" target="_blank" rel="noopener noreferrer">${params.value}</a>`}'''
    )

    gb = GridOptionsBuilder.from_dataframe(result)
    gb.configure_pagination(paginationAutoPageSize=True)
    gb.configure_side_bar()
    gb.configure_default_column(min_column_width=200)
    gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children")
    gb.configure_column(field='LinkedIn Profile',
                        headerName='LinkedIn Profile',
                        cellRenderer=link_renderer)

    grid_options = gb.build()

    # The grid response is not used by this app, so it is not bound.
    AgGrid(
        dataframe=result,
        gridOptions=grid_options,
        height=1100,
        fit_columns_on_grid_load=True,
        data_return_mode='AS_INPUT',
        update_mode='VALUE_CHANGED',
        theme='light',
        enable_enterprise_modules=True,
        # Required because the column uses a JsCode cell renderer.
        allow_unsafe_jscode=True,
    )

def main():
    """Run the Streamlit app.

    Loads the dataset and model, embeds the ``'Previous Role'`` column,
    then, once the user submits a non-empty job title, shows the dataset
    rows ordered by similarity to that title.
    """
    st.title('Job Posting Similarity')
    st.write('This app will help you find similar job titles real-time from ecommurz google sheets.')

    columns = ['Timestamp', 'Full Name', 'Company', 'Previous Role',
               'Experience', 'Last Day', 'LinkedIn Profile']
    data = load_dataset(columns)
    model = load_model()
    corpus_embeddings = create_embedding(model, data, 'Previous Role')

    job_title = st.text_input('Insert the job title below:', '')
    submitted = st.button('Submit')

    if not submitted:
        return

    # An empty or whitespace-only query would rank the corpus against
    # nothing meaningful, so ask for input instead of searching.
    if not job_title.strip():
        st.warning('Please enter a job title before submitting.')
        return

    st.info(f'Showing results for {job_title}')
    result = get_similarity_score(model, data, job_title, corpus_embeddings)
    # Keep only the original sheet columns (drops ID / corpus_id / score);
    # row order still reflects the similarity ranking.
    result = result[columns]

    # NOTE(review): download button was left disabled in the original;
    # the reason is not visible in this file.
    # st.download_button(
    #     "Download Table",
    #     result.to_csv().encode('utf-8'),
    #     "result.csv",
    #     "text/csv",
    #     key='download-csv'
    # )

    show_aggrid_table(result)

# Run the app only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()