Spaces:

kidcoconut
/

spc_healthcareClaimAnomalies

Runtime error

File size: 9,750 Bytes

75660bd

#--- anomaly detection - unsupervised page
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

import lib.claims as libClaims
import lib.providers as libProviders
import lib.utils as libUtils
import sys

#--- human-readable page name; run() includes it in its page-load log message
description = "Anomaly Detection - Unsupervised"
m_kblnTraceOn = False                                  #--- enable/disable module level tracing (gates the TRACE prints in run())

def run():
    """Render the "Anomaly Detection - Unsupervised" Streamlit page.

    Loads a claims dataset (the uploaded pickle when one is provided, otherwise
    the result of ``libClaims.load_claims(False)``), obtains KMeans predictions
    via ``libClaims.get_kmeansPredict`` and renders, for a random sample of up
    to 100 predicted rows:  a table sorted by cluster and reimbursed amount,
    the cluster size distribution, and a grid of per-cluster scatter charts.

    Errors are logged to stdout and reported on the page; nothing is returned
    and nothing is re-raised.
    """
    #--- note:  `global` is only needed to rebind a module-level name; reading one needs no declaration
    print("\nINFO (litAnomUnsuperv.run)  loading ", description, " page ...")

    try:

        #--- page settings
        if (m_kblnTraceOn):  print("TRACE (litAnomUnSuperv.run):  Initialize Page Settings ...")
        st.header("Claims Anomalies - Unsupervised Approach (KMeans)")


        #--- provide file drag/drop capability
        pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])


        #--- load claims data:  an uploaded dataset takes precedence over the default one
        if (m_kblnTraceOn):  print("TRACE (litAnomUnSuperv.run):  load raw claims data ...")
        if (pklDropped is None):
            pdfClaims = libClaims.load_claims(False)
        else:
            #--- SECURITY:  unpickling can execute arbitrary code; only expose this uploader to trusted users
            pdfClaims = pd.read_pickle(pklDropped)


        #--- get unsupervised predictions
        pdfPred = libClaims.get_kmeansPredict(pdfClaims)

        #--- cap the sample size so datasets with fewer than 100 rows do not raise
        pdfSample = pdfPred.sample(min(100, len(pdfPred)))
        pdfSample['providerId'] = pdfSample['Provider'].str[3:].astype(np.float64)


        #--- table of claims and clusters, sorted by cluster then InscClaimAmtReimbursed (both descending)
        pdfTopClaims = pdfSample.sort_values(by=["cluster", "InscClaimAmtReimbursed"], ascending=False)
        if (m_kblnTraceOn):  print("TRACE (litAnomUnsuperv.run):  Show $claims reimbursed by cluster ...")
        st.markdown("(Top) Ins Claim Reimbursed by Cluster")
        st.dataframe(pdfTopClaims)


        #--- chart cluster data distribution
        chart_clusterDistr(pdfSample)


        #--- chart KMeans clusters:  one scatter per (x, y) feature pair, filled left-to-right, three per row
        lstCols = st.columns(3)
        lstFeaturePairs = [
            ("providerId", "AdmittedDays"),
            ("providerId", "DeductibleAmtPaid"),
            ("providerId", "InscClaimAmtReimbursed"),

            ("providerId", "ChronicCond_KidneyDisease"),
            ("providerId", "ChronicCond_Heartfailure"),
            ("providerId", "ChronicCond_ObstrPulmonary"),

            ("AdmittedDays", "DeductibleAmtPaid"),
            ("AdmittedDays", "InscClaimAmtReimbursed"),
            ("DeductibleAmtPaid", "InscClaimAmtReimbursed"),
        ]
        for idx, (strXFeature, strYFeature) in enumerate(lstFeaturePairs):
            chart_KMeansClusters(pdfSample, strXFeature, strYFeature, lstCols[idx % len(lstCols)])

    except TypeError as e:
        print("ERROR (litAnomUnsuperv.run_typeError):  ", e)
        st.error("Unable to render the page:  " + str(e))

    except Exception:
        #--- Exception rather than a bare except, so KeyboardInterrupt/SystemExit are not swallowed
        e = sys.exc_info()
        print("ERROR (litAnomUnsuperv.run_genError):  ", e)
        st.error("Unable to render the page:  " + str(e[1]))



def chart_clusterDistr(pdfSample):
    """Render a bar chart of the number of data points in each cluster.

    One bar (and legend entry) is drawn for every cluster label that occurs in
    the data, so the chart is not limited to a fixed number of clusters.

    Args:
        pdfSample:  dataframe with a 'cluster' column holding each row's cluster label
    """
    #--- chart
    fig = go.Figure(
        layout=dict(
            legend=dict(groupclick="toggleitem"),
            xaxis=dict(title='cluster'),
            yaxis=dict(title='#data points')
        )
    )

    #--- row count per cluster label, in ascending label order
    pdsCounts = pdfSample['cluster'].value_counts().sort_index()
    for clusterLabel, lngCount in pdsCounts.items():
        fig.add_trace(
            go.Bar(
                x=[clusterLabel],                     #--- a single bar positioned at the cluster label
                y=[int(lngCount)],
                name="cluster" + str(clusterLabel)
            )
        )
    st.plotly_chart(fig, use_container_width=True)


def chart_KMeansClusters(pdfSample, strXFeature, strYFeature, stCol):
    """Render a scatter chart of two features with one marker trace per cluster.

    Every cluster label present in the data gets its own trace, and every
    trace carries the claim id as hover text.

    Args:
        pdfSample:  dataframe with 'cluster' and 'ClaimID' columns plus the two feature columns
        strXFeature:  name of the column plotted on the x axis
        strYFeature:  name of the column plotted on the y axis
        stCol:  container exposing ``plotly_chart`` (a Streamlit column at the visible call sites)
    """
    #--- chart
    fig = go.Figure(
        layout=dict(
            legend=dict(groupclick="toggleitem"),
            xaxis=dict(title=strXFeature),
            yaxis=dict(title=strYFeature)
        )
    )

    #--- groupby yields (label, rows) pairs in ascending label order
    for clusterLabel, pdfCluster in pdfSample.groupby('cluster'):
        fig.add_trace(
            go.Scatter(
                x=pdfCluster[strXFeature],
                y=pdfCluster[strYFeature],
                #--- astype(str) keeps the concatenation valid when claim ids are not strings
                text="claimId: " + pdfCluster['ClaimID'].astype(str),
                mode='markers',
                name="cluster" + str(clusterLabel)
            )
        )
    stCol.plotly_chart(fig, use_container_width=True)


def chart_KMeansBars(pdfSample, strXFeature, strYFeature, stCol):
    """Render a bar chart of two features with one bar trace per cluster.

    Every cluster label present in the data gets its own trace, so the chart
    is not limited to a fixed number of clusters.

    Args:
        pdfSample:  dataframe with a 'cluster' column plus the two feature columns
        strXFeature:  name of the column plotted on the x axis
        strYFeature:  name of the column plotted on the y axis
        stCol:  container exposing ``plotly_chart`` (presumably a Streamlit column, as for the sibling chart functions)
    """
    #--- chart
    fig = go.Figure(
        layout=dict(
            legend=dict(groupclick="toggleitem"),
            xaxis=dict(title=strXFeature),
            yaxis=dict(title=strYFeature)
        )
    )

    #--- groupby yields (label, rows) pairs in ascending label order
    for clusterLabel, pdfCluster in pdfSample.groupby('cluster'):
        fig.add_trace(
            go.Bar(
                x=pdfCluster[strXFeature],
                y=pdfCluster[strYFeature],
                name="cluster" + str(clusterLabel)
            )
        )
    stCol.plotly_chart(fig, use_container_width=True)



def btnSave_testFile(pdfClaims, pdfPred):
    """Save a random 10% sample of the claims as a test dataset.

    The sample is handed to ``saveProviderTestData`` for persistence.

    NOTE:  a disabled earlier draft used the predictions to build a stratified
    sample instead (60% of the claims from providers flagged as anomalous plus
    10% of the remaining claims); that logic is not active.

    Args:
        pdfClaims:  claims dataframe to sample from
        pdfPred:  predictions dataframe; currently unused
    """
    #--- draw the sample before logging, so a sampling failure is not reported as a save
    pdfTenPct = pdfClaims.sample(frac=0.1)

    print("TRACE (lit_anom_unsuperv.btnSave_testFile)  saving ... ")
    saveProviderTestData(pdfTenPct)


def saveProviderTestData(pdfTestData):
    """Pickle the given data to a timestamped file under ``libUtils.pth_data``.

    The file name is ``<YYYYmmdd><HHMMSS>_claimsTestSample.pkl`` (local time),
    appended directly to ``libUtils.pth_data``.

    Args:
        pdfTestData:  object exposing ``to_pickle`` (a pandas dataframe at the visible call site)
    """
    from datetime import datetime
    import pickle

    #--- one clock reading for both parts, so the date and time cannot straddle midnight
    strTimestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    strProvTestFile = libUtils.pth_data + strTimestamp + "_claimsTestSample.pkl"

    #--- save the file
    pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)