# Source: spc_healthcareClaimAnomalies/uix/pages/lit_anom_unsuperv.py
# Revision 75660bd (kidcoconut): "merged github/demo_huggingFace into runner/main" - 9.75 kB
#--- anomaly detection - unsupervised page
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import lib.claims as libClaims
import lib.providers as libProviders
import lib.utils as libUtils
import sys
#--- human-readable page name; run() prints it in its page-load message
description = "Anomaly Detection - Unsupervised"
#--- when True, run() prints TRACE progress messages to stdout
m_kblnTraceOn = False #--- enable/disable module level tracing
def run():
    """Render the "Claims Anomalies - Unsupervised Approach (KMeans)" page.

    Steps:
      1. Offer a ``.pkl`` upload; use the uploaded dataset when one is given,
         otherwise fall back to ``libClaims.load_claims(False)``.
      2. Get cluster predictions from ``libClaims.get_kmeansPredict``.
      3. Take a random sample of at most 100 predicted claims and show it as
         a table, a cluster-size bar chart and a 3x3 grid of scatter plots.

    Returns:
        None. Everything is rendered through Streamlit.

    Errors raised while building the page are caught and printed to stdout
    (they are not re-raised), matching the page's existing best-effort style.
    """
    #--- note: `global` is only required to rebind a module-level name; kept to make the dependency explicit
    global m_kblnTraceOn
    print("\nINFO (litAnomUnsuperv.run) loading ", description, " page ...")

    try:
        #--- page settings
        if m_kblnTraceOn:
            print("TRACE (litAnomUnSuperv.run): Initialize Page Settings ...")
        st.header("Claims Anomalies - Unsupervised Approach (KMeans)")

        #--- provide file drag/drop capability
        m_blnDisableDragDrop = False
        pklDropped = None
        if not m_blnDisableDragDrop:
            pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])

        #--- load raw claims: the uploaded file wins, default dataset otherwise
        if m_kblnTraceOn:
            print("TRACE (litAnomUnSuperv.run): load raw claims data ...")
        if pklDropped is None:
            pdfClaims = libClaims.load_claims(False)
        else:
            #--- SECURITY: unpickling can execute arbitrary code; only accept
            #    uploads from trusted users, or move to a safer format (csv/parquet)
            pdfClaims = pd.read_pickle(pklDropped)

        #--- note: the claims are NOT reloaded here; doing so discarded the uploaded dataset
        if m_kblnTraceOn:
            print("TRACE (litAnomUnsuperv.run): Show Raw Claims Dataframe ...")

        #--- get unsupervised predictions
        pdfPred = libClaims.get_kmeansPredict(pdfClaims)

        #--- sample without replacement cannot exceed the row count, so cap it
        pdfSample = pdfPred.sample(min(100, len(pdfPred)))
        #--- numeric provider id: the 'Provider' value minus its first 3 characters
        pdfSample['providerId'] = pdfSample['Provider'].str[3:].astype(np.float64)

        #--- table of claims and clusters, sorted by InscClaimAmtReimbursed
        pdfTopClaims = pdfSample.sort_values(by=["cluster", "InscClaimAmtReimbursed"], ascending=False)
        if m_kblnTraceOn:
            print("TRACE (litAnomUnsuperv.run): Show $claims reimbursed by cluster ...")
        st.markdown("(Top) Ins Claim Reimbursed by Cluster")
        st.dataframe(pdfTopClaims)

        #--- chart cluster data distribution
        chart_clusterDistr(pdfSample)

        #--- chart KMeans clusters: one (x, y) feature pair per grid cell, row by row
        lstColumns = st.columns(3)
        lstScatterGrid = [
            [("providerId", "AdmittedDays"),
             ("providerId", "DeductibleAmtPaid"),
             ("providerId", "InscClaimAmtReimbursed")],
            [("providerId", "ChronicCond_KidneyDisease"),
             ("providerId", "ChronicCond_Heartfailure"),
             ("providerId", "ChronicCond_ObstrPulmonary")],
            [("AdmittedDays", "DeductibleAmtPaid"),
             ("AdmittedDays", "InscClaimAmtReimbursed"),
             ("DeductibleAmtPaid", "InscClaimAmtReimbursed")],
        ]
        for lstRow in lstScatterGrid:
            for stCol, (strXFeature, strYFeature) in zip(lstColumns, lstRow):
                chart_KMeansClusters(pdfSample, strXFeature, strYFeature, stCol)

        #--- note: per-cluster bar charts (chart_KMeansBars) are currently not rendered

    except TypeError as e:
        print("ERROR (litAnomUnsuperv.run_typeError): ", e)
    except Exception:
        #--- `Exception`, not a bare except, so KeyboardInterrupt/SystemExit and other
        #    BaseException-derived control-flow signals still propagate
        e = sys.exc_info()
        print("ERROR (litAnomUnsuperv.run_genError): ", e)
def chart_clusterDistr(pdfSample):
    """Render a bar chart of how many rows of ``pdfSample`` fall in each cluster.

    One bar (and one legend entry, named ``cluster<label>``) is drawn per
    distinct value of the ``cluster`` column, in ascending label order. The
    chart is written to the main Streamlit page at container width.

    Args:
        pdfSample: DataFrame with a ``cluster`` column.
            NOTE(review): labels are presumably small integers (0, 1, 2, ...);
            a float label would render as e.g. ``cluster0.0`` - confirm.

    Returns:
        None.
    """
    #--- count once; tolist() yields plain python scalars for plotly
    srsCounts = pdfSample['cluster'].value_counts().sort_index()
    lstLabels = srsCounts.index.tolist()
    lstCounts = srsCounts.tolist()

    #--- chart
    fig = go.Figure(
        layout=dict(
            legend=dict(groupclick="toggleitem"),
            xaxis=dict(title='cluster'),
            yaxis=dict(title='#data points')
        )
    )

    #--- one trace per cluster, with matching one-element x and y
    for varCluster, lngCount in zip(lstLabels, lstCounts):
        fig.add_trace(
            go.Bar(
                x=[varCluster],
                y=[lngCount],
                name=f"cluster{varCluster}"
            )
        )

    st.plotly_chart(fig, use_container_width=True)
def chart_KMeansClusters(pdfSample, strXFeature, strYFeature, stCol):
    """Render a scatter plot of two features, one marker series per cluster.

    Every distinct value of the ``cluster`` column becomes its own trace named
    ``cluster<label>``, and every point carries its claim id as hover text.

    Args:
        pdfSample: DataFrame with ``cluster`` and ``ClaimID`` columns plus the
            two feature columns named below.
        strXFeature: column plotted on the x axis (also used as the axis title).
        strYFeature: column plotted on the y axis (also used as the axis title).
        stCol: Streamlit container (e.g. a column) that receives the chart.

    Returns:
        None.
    """
    #--- chart
    fig = go.Figure(
        layout=dict(
            legend=dict(groupclick="toggleitem"),
            xaxis=dict(title=strXFeature),
            yaxis=dict(title=strYFeature)
        )
    )

    #--- groupby yields (label, rows) in ascending label order and skips empty clusters
    for varCluster, pdfCluster in pdfSample.groupby('cluster'):
        fig.add_trace(
            go.Scatter(
                x=pdfCluster[strXFeature],
                y=pdfCluster[strYFeature],
                #--- astype(str) keeps the concatenation valid for non-string ids
                text="claimId: " + pdfCluster['ClaimID'].astype(str),
                mode='markers',
                name=f"cluster{varCluster}"
            )
        )

    stCol.plotly_chart(fig, use_container_width=True)
def chart_KMeansBars(pdfSample, strXFeature, strYFeature, stCol):
    """Render a bar chart of one feature against another, one series per cluster.

    Only clusters 0, 1 and 2 are considered. The cluster 0 series is always
    added; the cluster 1 and cluster 2 series are added only when they have rows.

    Args:
        pdfSample: DataFrame with a ``cluster`` column plus the two feature
            columns named below.
        strXFeature: column used for the bar positions (and x axis title).
        strYFeature: column used for the bar heights (and y axis title).
        stCol: Streamlit container (e.g. a column) that receives the chart.

    Returns:
        None.
    """
    #--- (legend name, cluster label, always drawn?)
    tplSeries = (
        ('cluster0', 0, True),
        ('cluster1', 1, False),
        ('cluster2', 2, False),
    )
    lstSubsets = [
        (strName, pdfSample[pdfSample['cluster'] == lngCluster], blnAlways)
        for strName, lngCluster, blnAlways in tplSeries
    ]

    #--- chart
    dctLayout = dict(
        legend=dict(groupclick="toggleitem"),
        xaxis=dict(title=strXFeature),
        yaxis=dict(title=strYFeature),
    )
    figBars = go.Figure(layout=dctLayout)

    for strName, pdfSubset, blnAlways in lstSubsets:
        if not blnAlways and not (pdfSubset.shape[0] > 0):
            continue
        barTrace = go.Bar(
            x=pdfSubset[strXFeature],
            y=pdfSubset[strYFeature],
            name=strName,
        )
        figBars.add_trace(barTrace)

    stCol.plotly_chart(figBars, use_container_width=True)
def btnSave_testFile(pdfClaims, pdfPred):
    """Save a random 10% sample of the claims as a test dataset.

    Args:
        pdfClaims: claims DataFrame to sample from.
        pdfPred: predictions DataFrame; accepted but currently unused (an
            earlier, disabled draft weighted the sample by anomaly flag).

    Returns:
        None. The sample is handed to ``saveProviderTestData``.
    """
    #--- plain random sample, no weighting by anomaly
    pdfTenPct = pdfClaims.sample(frac=0.1)

    print("TRACE (lit_anom_unsuperv.btnSave_testFile) saving ... ")
    saveProviderTestData(pdfTenPct)
def saveProviderTestData(pdfTestData):
    """Pickle ``pdfTestData`` to a timestamped file under ``libUtils.pth_data``.

    The file name is ``<YYYYMMDD><HHMMSS>_claimsTestSample.pkl`` (local time),
    appended directly to ``libUtils.pth_data``.
    NOTE(review): the plain concatenation assumes ``pth_data`` already ends
    with a path separator - confirm against lib.utils.

    Args:
        pdfTestData: DataFrame to save.

    Returns:
        None.
    """
    #--- save the file
    from datetime import datetime
    import pickle

    #--- one clock read, so the date and time parts cannot straddle midnight
    strStampNow = datetime.now().strftime('%Y%m%d%H%M%S')
    strProvTestFile = libUtils.pth_data + strStampNow + "_claimsTestSample.pkl"
    pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)