# Spaces: Runtime error  (page-header text captured along with the source)
import pandas as pd | |
from sentence_transformers import SentenceTransformer, util | |
import streamlit as st | |
from st_aggrid import AgGrid, GridOptionsBuilder, JsCode | |
from cpu_unpickler import cpu_unpickler | |
# Use the full browser width for the page; the results grid is wide.
st.set_page_config(layout="wide")
def load_model():
    """Build the sentence-embedding model used to encode search queries.

    Returns:
        A ``SentenceTransformer`` created from the ``all-MiniLM-L6-v2``
        checkpoint name. A new instance is constructed on every call.
    """
    checkpoint_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(checkpoint_name)
def find_top_similar(sentence, corpus_sentences, corpus_embeddings):
    """Rank the whole corpus by similarity to the query.

    Args:
        sentence: Query text (or list of texts) handed to the model's
            ``encode`` method.
        corpus_sentences: Sequence of corpus sentences, indexed by the
            ``corpus_id`` values that the search returns.
        corpus_embeddings: Embeddings of ``corpus_sentences`` to search.
            NOTE(review): scoring uses a plain dot product, so these are
            presumably already normalized -- confirm where they are built.

    Returns:
        A list of corpus sentences in the order the search returned them
        for the first query. ``top_k`` is the corpus size, so every corpus
        sentence can appear, not just a fixed handful.
    """
    encoder = load_model()

    # Encode the query as a tensor, then normalize it before scoring.
    query_vector = encoder.encode(sentence, convert_to_tensor=True)
    query_vector = util.normalize_embeddings(query_vector)

    per_query_hits = util.semantic_search(
        query_vector,
        corpus_embeddings,
        top_k=len(corpus_embeddings),
        score_function=util.dot_score,
    )
    # Only the first query's hits are used.
    first_query_hits = per_query_hits[0]

    return [corpus_sentences[match['corpus_id']] for match in first_query_hits]
def top_k_similarity(df, query, corpus_sentences, corpus_embeddings):
    """Collect the rows of ``df`` whose job role matches each ranked hit.

    Args:
        df: DataFrame with a ``'Last job role'`` column.
        query: Query string; it is wrapped in a one-element list before
            being passed to ``find_top_similar``.
        corpus_sentences: Corpus sentences forwarded to ``find_top_similar``.
        corpus_embeddings: Corpus embeddings forwarded to
            ``find_top_similar``.

    Returns:
        A DataFrame of the rows of ``df`` whose ``'Last job role'`` equals a
        ranked sentence, grouped in ranking order and keeping their original
        index labels. Rows are repeated if the same sentence is ranked more
        than once. An empty ``DataFrame`` is returned when there are no hits.
    """
    ranked_roles = find_top_similar([query], corpus_sentences, corpus_embeddings)
    if not ranked_roles:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()

    role_column = df['Last job role']
    # Gather all slices first and concatenate once: growing a frame with
    # pd.concat inside the loop re-copies the accumulated rows every time.
    matching_rows = [df[role_column == role] for role in ranked_roles]
    return pd.concat(matching_rows)
def get_result(df, query, corpus_sentences, corpus_embeddings):
    """Return the ranked matches for ``query`` with duplicate rows removed.

    Args:
        df: DataFrame forwarded to ``top_k_similarity``.
        query: Query string to search for.
        corpus_sentences: Corpus sentences forwarded to ``top_k_similarity``.
        corpus_embeddings: Corpus embeddings forwarded to
            ``top_k_similarity``.

    Returns:
        The DataFrame from ``top_k_similarity`` after ``drop_duplicates``,
        which keeps the first occurrence of each fully identical row.
    """
    ranked_rows = top_k_similarity(df, query, corpus_sentences, corpus_embeddings)
    return ranked_rows.drop_duplicates()
def load_embedding():
    """Read the stored corpus from ``corpus_embeddings.pkl``.

    Returns:
        A ``(corpus_sentences, corpus_embeddings)`` tuple taken from the
        ``'sentences'`` and ``'embeddings'`` entries of the loaded object.
    """
    # NOTE(review): cpu_unpickler presumably wraps pickle loading so that
    # stored tensors land on the CPU -- confirm in its module. Unpickling
    # should only ever be done on a trusted file.
    with open('corpus_embeddings.pkl', 'rb') as pickle_file:
        payload = cpu_unpickler(pickle_file).load()
    return payload['sentences'], payload['embeddings']
def _fetch_job_postings():
    """Download the form-response sheet as CSV and keep its first seven columns."""
    sheet_id = '1KeuPPVw9gueNmMrQXk1uGFlY9H1vvhErMLiX_ZVRv_Y'
    sheet_name = 'Form Response 3'.replace(' ', '%20')
    url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}'
    return pd.read_csv(url).iloc[:, :7]


def _render_results(result):
    """Show the CSV download button and the paginated grid for ``result``."""
    st.download_button(
        "Press to Download",
        result.to_csv().encode('utf-8'),
        "result.csv",
        "text/csv",
        key='download-csv'
    )

    builder = GridOptionsBuilder.from_dataframe(result)
    builder.configure_pagination(paginationAutoPageSize=True)
    # NOTE(review): the renderer interpolates the raw cell value into an HTML
    # string without escaping or quoting, and the grid runs with
    # allow_unsafe_jscode=True. The values come from the spreadsheet, so
    # confirm that data is trusted or escape it in the renderer.
    builder.configure_column(
        "LinkedIn Link",
        headerName="LinkedIn Link",
        cellRenderer=JsCode('''function(params) {return `<a href=${params.value} target="_blank">${params.value}</a>`}'''),
        width=300,
    )

    AgGrid(
        dataframe=result,
        gridOptions=builder.build(),
        height=1100,
        fit_columns_on_grid_load=True,
        data_return_mode='AS_INPUT',
        update_mode='VALUE_CHANGED',
        theme='light',
        enable_enterprise_modules=True,
        allow_unsafe_jscode=True,
    )


def main():
    """Run the Streamlit page: load data, read a job title, show matches.

    The dataset and the stored embeddings are loaded on every script run.
    Nothing is searched until the Submit button is pressed, and a blank job
    title produces a warning instead of a search.
    """
    df = _fetch_job_postings()
    corpus_sentences, corpus_embeddings = load_embedding()

    st.title('Job Posting Similarity')
    job_title = st.text_input('Insert the job title below:', '')
    if not st.button('Submit'):
        return

    if not job_title.strip():
        # An empty query would still be embedded and rank the whole corpus,
        # giving results that mean nothing; ask for input instead.
        st.warning('Please enter a job title before submitting.')
        return

    st.info(f'Showing results for { job_title}')
    result = get_result(df, job_title, corpus_sentences, corpus_embeddings)
    # Renumber the rows from 1 so the displayed and exported index is a rank.
    result.reset_index(drop=True, inplace=True)
    result.index += 1

    _render_results(result)
# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()