import math
import streamlit as st
import pandas as pd
import numpy as np
import torch
from transformers import AlbertTokenizer, AlbertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from io import BytesIO
# 'base' is the smaller ALBERT variant; 'large' is also available.
model_size = 'base'
tokenizer = AlbertTokenizer.from_pretrained('albert-' + model_size + '-v2')
model = AlbertModel.from_pretrained('albert-' + model_size + '-v2')
model_sbert = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')

# For plain (non-SBERT) BERT-style embeddings a threshold around 0.98 worked;
# 0.8 suits the SBERT model.
similarity_threshold = 0.8
def get_sbert_embedding(input_text):
    # Encode the address with the SBERT model and return a plain list.
    embedding = model_sbert.encode(input_text)
    return embedding.tolist()
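
# encode() also accepts a list of strings and batches internally, which is
# considerably faster than one call per row. An unused sketch of a batched
# variant, assuming the same model object:
def get_sbert_embeddings_batched(texts, batch_size=64):
    # Returns one embedding list per input string, encoded in batches.
    vectors = model_sbert.encode(list(texts), batch_size=batch_size)
    return [v.tolist() for v in vectors]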
def get_embedding(input_text):
    # Alternative embedder: mean-pool the last hidden state of plain ALBERT.
    encoded_input = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded_input.input_ids
    # Debugging aids:
    # input_num_tokens = input_ids.shape[1]
    # print("Number of input tokens: " + str(input_num_tokens))
    # print("Length of input: " + str(len(input_text)))
    # list_of_tokens = tokenizer.convert_ids_to_tokens(input_ids.view(-1).tolist())
    # print("Tokens : " + ' '.join(list_of_tokens))
    with torch.no_grad():
        outputs = model(**encoded_input)
        last_hidden_states = outputs[0]
    sentence_embedding = torch.mean(last_hidden_states[0], dim=0)
    # Alternative: take the [CLS] token instead of the mean:
    # sentence_embedding = outputs.last_hidden_state[0][0]
    return sentence_embedding.tolist()
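
# get_embedding() mean-pools over every token position, including special
# tokens. For batched, padded inputs the usual refinement is a mask-aware
# mean; an unused sketch using the same tokenizer and model:
def get_embeddings_mask_aware(texts):
    encoded = tokenizer(list(texts), padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # Zero out padding positions before averaging.
    mask = encoded['attention_mask'].unsqueeze(-1).float()
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.tolist()
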
st.set_page_config(layout="wide")
st.title('Upload the Address Dataset')
st.markdown('Upload an Excel file with "CAQH" and "NDB" sheets to match addresses between the two.')
uploaded_file = st.file_uploader('Choose a file', type='xlsx')
if uploaded_file is not None:
    data_caqh = pd.read_excel(uploaded_file, sheet_name='CAQH', dtype=str)
    data_ndb = pd.read_excel(uploaded_file, sheet_name='NDB', dtype=str)

    # Data cleaning CAQH: insert the ZIP+4 hyphen if missing, then build one full address string.
    data_caqh['postalcode'] = data_caqh['postalcode'].astype(str).apply(
        lambda x: x[:5] + '-' + x[5:] if len(x) > 5 and '-' not in x else x)
    data_caqh['full-addr'] = data_caqh['address1'].astype(str) + ', ' \
        + np.where(data_caqh['address2'].isnull(), '', data_caqh['address2'].astype(str) + ', ') \
        + data_caqh['city'].astype(str) + ', ' \
        + data_caqh['state'].astype(str) + ', ' \
        + data_caqh['postalcode'].astype(str)

    st.write(f"CAQH before duplicate removal: {len(data_caqh)}")
    data_caqh.drop_duplicates(subset='full-addr', inplace=True)
    data_caqh = data_caqh.reset_index(drop=True)  # reset the index after dropping rows
    st.write(f"CAQH after duplicate removal: {len(data_caqh)}")
    # Data cleaning NDB: blank out ZIP+4 suffixes ending in 0 or 1, then build the full address.
    data_ndb['zip_pls_4_cd'] = data_ndb['zip_pls_4_cd'].astype(str).apply(
        lambda x: x if x and x[-1] not in ('0', '1') else '')
    data_ndb['zip_cd_zip_pls_4_cd'] = data_ndb['zip_cd'].astype(str) + \
        np.where(data_ndb['zip_pls_4_cd'] == '', '',
                 '-' + data_ndb['zip_pls_4_cd'].astype(str))
    data_ndb['full-addr'] = data_ndb['adr_ln_1_txt'].astype(str).str.strip() + ', ' \
        + data_ndb['cty_nm'].astype(str).str.strip() + ', ' \
        + data_ndb['st_cd'].astype(str) + ', ' + data_ndb['zip_cd_zip_pls_4_cd']
    # Compute embeddings for CAQH in batches so the progress bar can update.
    num_items = len(data_caqh)
    progress_bar = st.progress(0)
    total_steps = 100
    step_size = math.ceil(num_items / total_steps)
    data_caqh['embedding'] = None  # object column; each cell holds a list
    embedding_col_index = data_caqh.columns.get_loc('embedding')
    full_addr_col_index = data_caqh.columns.get_loc('full-addr')
    for i in range(total_steps):
        progress = (i + 1) / total_steps
        # Process one batch of rows.
        start = i * step_size
        end = start + step_size
        stop_iter = False
        if end >= num_items:
            end = num_items
            stop_iter = True
        data_caqh.iloc[start:end, embedding_col_index] = \
            data_caqh.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)
        progress_bar.progress(value=progress,
                              text=f"CAQH embeddings: {end} processed out of {num_items}")
        if stop_iter:
            break
    st.write("Embeddings for CAQH calculated")
    # Compute embeddings for NDB in batches.
    num_items = len(data_ndb)
    progress_bar = st.progress(0)
    total_steps = 100
    step_size = math.ceil(num_items / total_steps)
    data_ndb['embedding'] = None  # object column; each cell holds a list
    embedding_col_index = data_ndb.columns.get_loc('embedding')
    full_addr_col_index = data_ndb.columns.get_loc('full-addr')
    for i in range(total_steps):
        progress = (i + 1) / total_steps
        # Process one batch of rows.
        start = i * step_size
        end = start + step_size
        stop_iter = False
        if end >= num_items:
            end = num_items
            stop_iter = True
        # get_sbert_embedding can be swapped for get_embedding here.
        data_ndb.iloc[start:end, embedding_col_index] = \
            data_ndb.iloc[start:end, full_addr_col_index].apply(get_sbert_embedding)
        progress_bar.progress(value=progress,
                              text=f"NDB embeddings: {end} processed out of {num_items}")
        if stop_iter:
            break
    st.write("Embeddings for NDB calculated... matching")
    # Brute-force nearest-neighbor search: compare every CAQH address against
    # every NDB address. (A vectorized alternative is sketched after the loop.)
    progress_bar = st.progress(0)
    num_items = len(data_caqh)
    for i, row in data_caqh.iterrows():
        max_similarity = 0.0
        matched_row = None
        for j, ndb_row in data_ndb.iterrows():
            # cosine_similarity returns a 1x1 matrix; take the scalar.
            sim = cosine_similarity([row['embedding']], [ndb_row['embedding']])[0][0]
            if sim > max_similarity:
                max_similarity = sim
                matched_row = ndb_row
        if max_similarity >= similarity_threshold:
            data_caqh.at[i, 'matched-addr'] = matched_row['full-addr']
            data_caqh.at[i, 'similarity-score'] = max_similarity
        else:
            print(f"max similarity was {max_similarity}")
            data_caqh.at[i, 'matched-addr'] = 'No Matches'
        progress = min((i + 1) / num_items, 1.0)
        progress_bar.progress(value=progress,
                              text=f"matching similarities - {i + 1} done out of {num_items}")
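
    # The nested iterrows() search above makes one cosine_similarity call per
    # (CAQH, NDB) pair. A single matrix call computes all pair scores at once;
    # the helper below is an unused sketch of that idea with the same data.
    def best_matches_vectorized(caqh_df, ndb_df):
        # Stack per-row embedding lists into 2-D arrays.
        caqh_mat = np.array(caqh_df['embedding'].tolist())
        ndb_mat = np.array(ndb_df['embedding'].tolist())
        sims = cosine_similarity(caqh_mat, ndb_mat)  # shape: (n_caqh, n_ndb)
        best_idx = sims.argmax(axis=1)               # best NDB row per CAQH row
        best_score = sims.max(axis=1)
        return best_idx, best_score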
    # Drop columns not needed for display.
    data_caqh.drop(columns=['embedding'], inplace=True)
    data_ndb.drop(columns=['embedding'], inplace=True)
    st.header('CAQH addresses and matches')
    st.dataframe(data_caqh, use_container_width=True)
    # Calculate match statistics.
    total_items = len(data_caqh)
    items_without_matches = data_caqh['matched-addr'].value_counts().get('No Matches', 0)
    items_with_matches = total_items - items_without_matches
    percent_matched = (items_with_matches / total_items) * 100.0
    st.write(f"Of {total_items} addresses, {items_with_matches} matched and "
             f"{items_without_matches} did not ({percent_matched:.2f}% matched)")
    # Save the DataFrame to an in-memory binary stream as an Excel file.
    # The context manager finalizes the file, so no explicit writer.save()
    # is needed (that method was removed in pandas 2.x).
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        data_caqh.to_excel(writer, sheet_name='Sheet1', index=False)
    # Get the binary data from the stream.
    data = output.getvalue()
    # Add a download button for the Excel file.
    st.download_button(
        label='Download CAQH matches as Excel file',
        data=data,
        file_name='data.xlsx',
        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    )
    st.header('NDB data')
    st.dataframe(data_ndb, use_container_width=True)