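# Streamlit app: match CAQH addresses against NDB addresses using ALBERT
# sentence embeddings and cosine similarity.
# To run locally (assuming the file is saved as app.py): streamlit run app.py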
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertModel
from sklearn.metrics.pairwise import cosine_similarity
# 'base' is the smaller ALBERT checkpoint; 'large' is also available.
model_size = 'base'
tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')


def get_embedding(input_text):
    """Return a mean-pooled ALBERT embedding of input_text as a plain Python list."""
    encoded_input = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded_input.input_ids
    list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
    print("Tokens: " + ' '.join(list_of_tokens))
    with torch.no_grad():
        outputs = model(**encoded_input)
    # Mean-pool the last hidden states across tokens to get one sentence vector.
    last_hidden_states = outputs[0]
    sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
    return sentence_embedding.tolist()
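
# Illustrative usage (not called by the app): for albert-base-v2 the returned
# list has 768 entries, e.g.
#   vec = get_embedding("1000 Vale Terrace, Vista, CA, 92084")  # len(vec) == 768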

st.set_page_config(layout="wide")

st.title('Upload the Address Dataset')
st.markdown('Upload an Excel file to view the data in a table.')

uploaded_file = st.file_uploader('Choose a file', type='xlsx')

if uploaded_file is not None:
    data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH', dtype=str)
    data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)

    # Data cleaning CAQH: normalize postal codes to the dashed ZIP+4 form.
    data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(
        lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and '-' not in x else x)
    # Build one comparable address string; address2 is optional and gets its own separator.
    data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
        + np.where(data_caqh['address2'].isnull(), '',
                   data_caqh['address2'].astype(str) + ', ') \
        + data_caqh['city'].astype(str) + ', ' \
        + data_caqh['state'].astype(str) + ', ' \
        + data_caqh['postalcode'].astype(str)

    # Data cleaning NDB: blank out zip_pls_4_cd values ending in '0' or '1',
    # then append any remaining suffix to zip_cd as ZIP+4.
    data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(
        lambda x: x if (x[-1] != '0' and x[-1] != '1') else '')
    data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) + \
        np.where(data_ndb['zip_pls_4_cd'] == '', '',
                 '-' + data_ndb['zip_pls_4_cd'].astype(str))
    data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
        + data_ndb['cty_nm'].astype(str) + ', ' \
        + data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']

    # Embed every address on both sheets.
    data_caqh['embedding'] = data_caqh['full-addr'].apply(get_embedding)
    data_ndb['embedding'] = data_ndb['full-addr'].apply(get_embedding)

    # For each CAQH address, find the closest NDB address by cosine similarity
    # and accept the match only above a 0.98 threshold.
    data_caqh['matched-addr'] = ''
    for i, row in data_caqh.iterrows():
        max_similarity = 0
        matched_row = None
        for j, ndb_row in data_ndb.iterrows():
            # cosine_similarity returns a 1x1 matrix here; take the scalar.
            sim = cosine_similarity([row['embedding']], [ndb_row['embedding']])[0][0]
            if sim > max_similarity:
                max_similarity = sim
                matched_row = ndb_row
        if max_similarity >= 0.98:
            data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
            data_caqh.at[i, 'similarity-score'] = max_similarity
        else:
            data_caqh.at[i, 'matched-addr'] = 'No Matches'
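
    # Optional optimization (a sketch, not part of the original app): the nested
    # loop above makes n*m pairwise calls; the full similarity matrix can be
    # computed in one call, assuming every 'embedding' entry is an equal-length list:
    #   sims = cosine_similarity(np.stack(data_caqh['embedding'].to_list()),
    #                            np.stack(data_ndb['embedding'].to_list()))
    #   best = sims.argmax(axis=1)   # index of the closest NDB row per CAQH row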

    # Drop the embedding columns, which are not needed for display.
    data_caqh.drop(columns=['embedding'], inplace=True)
    data_ndb.drop(columns=['embedding'], inplace=True)

    st.header('CAQH addresses and matches')
    st.dataframe(data_caqh, use_container_width=True)
    st.header('NDB data')
    st.dataframe(data_ndb, use_container_width=True)