Spaces:

kidcoconut
/

spc_healthcareClaimAnomalies

Runtime error

App Files Files Community

spc_healthcareClaimAnomalies / uix /pages /lit_anom_unsuperv.py

kidcoconut

merged github/demo_huggingFace into runner/main

75660bd over 1 year ago

raw

history blame

9.75 kB

	#--- anomaly detection - unsupervised page
	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go

	import lib.claims as libClaims
	import lib.providers as libProviders
	import lib.utils as libUtils
	import sys

	description = "Anomaly Detection - Unsupervised"
	m_kblnTraceOn = False #--- enable/disable module level tracing

	def run():
	#--- note: in python, you need to specify global scope for fxns to access module-level variables
	global m_kblnTraceOn
	print("\nINFO (lit_about.run) loading ", description, " page ...")


	try:

	#--- page settings
	if (m_kblnTraceOn): print("TRACE (litAnomUnSuperv.run): Initialize Page Settings ...")
	st.header("Claims Anomalies - Unsupervised Approach (KMeans)")


	#--- provide file drag/drop capability
	m_blnDisableDragDrop = False
	if(not m_blnDisableDragDrop):
	#btnSave = st.button("Save")
	pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])
	m_blnDisableDragDrop = (pklDropped is None)


	#--- show: raw claims data analysis
	if (m_kblnTraceOn): print("TRACE (litAnomUnSuperv.run): load raw claims data ...")
	if (m_blnDisableDragDrop):
	pdfClaims = libClaims.load_claims(False)
	else:
	pdfClaims = pd.read_pickle(pklDropped)

	#--- show: raw claims data analysis
	if (m_kblnTraceOn): print("TRACE (litAnomUnsuperv.run): Show Raw Claims Dataframe ...")
	pdfClaims = libClaims.load_claims(False)


	#--- get unsupervised predictions
	#pdfFeatEng = libClaims.do_featEng(pdfClaims)
	pdfPred = libClaims.get_kmeansPredict(pdfClaims)
	pdfSample = pdfPred.sample(100)
	pdfSample['providerId'] = pdfSample['Provider'].str[3:].astype(np.float64)


	#--- save this file locally as a pkl
	#btnSave_testFile(pdfClaims, pdfPred)


	#--- table of claims and clusters, sorted by InscClaimAmt Reimbursed
	pdfTopClaims = pdfSample.sort_values(by=["cluster", "InscClaimAmtReimbursed"], ascending=False)
	if (m_kblnTraceOn): print("TRACE (litAnomUnsuperv.run): Show $claims reimbursed by cluster ...")
	st.markdown("(Top) Ins Claim Reimbursed by Cluster")
	st.dataframe(pdfTopClaims)


	#--- chart cluster data distribution
	chart_clusterDistr(pdfSample)


	col1, col2, col3 = st.columns(3)


	#--- chart KMeans clusters": InscClaimAmtReimbursed
	#chart_KMeansClusters(pdfSample, "Age", "InscClaimAmtReimbursed", col1)
	#chart_KMeansClusters(pdfSample, "providerId", "InscClaimAmtReimbursed", col2)

	chart_KMeansClusters(pdfSample, "providerId", "AdmittedDays", col1)
	chart_KMeansClusters(pdfSample, "providerId", "DeductibleAmtPaid", col2)
	chart_KMeansClusters(pdfSample, "providerId", "InscClaimAmtReimbursed", col3)

	chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_KidneyDisease", col1)
	chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_Heartfailure", col2)
	chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_ObstrPulmonary", col3)

	chart_KMeansClusters(pdfSample, "AdmittedDays", "DeductibleAmtPaid", col1)
	chart_KMeansClusters(pdfSample, "AdmittedDays", "InscClaimAmtReimbursed", col2)
	chart_KMeansClusters(pdfSample, "DeductibleAmtPaid", "InscClaimAmtReimbursed", col3)



	#--- chart cluster bars
	#chart_KMeansBars(pdfSample, "cluster", "InscClaimAmtReimbursed", col1)
	#chart_KMeansBars(pdfSample, "cluster", "DeductibleAmtPaid", col2)

	#chart_KMeansBars(pdfSample, "cluster", "IPAnnualReimbursementAmt", col1)
	#chart_KMeansBars(pdfSample, "cluster", "IPAnnualDeductibleAmt", col2)

	#chart_KMeansBars(pdfSample, "cluster", "OPAnnualReimbursementAmt", col1)
	#chart_KMeansBars(pdfSample, "cluster", "OPAnnualDeductibleAmt", col2)

	#chart_KMeansBars(pdfSample, "cluster", "ChronicCond_Heartfailure", col1)
	#chart_KMeansBars(pdfSample, "cluster", "ChronicCond_KidneyDisease", col2)

	except TypeError as e:
	print("ERROR (litAnomUnsuperv.run_typeError): ", e)

	except:
	e = sys.exc_info()
	print("ERROR (litAnomUnsuperv.run_genError): ", e)



	def chart_clusterDistr(pdfSample):
	#pdfClustDistr = pdfSample['cluster'].value_counts()
	pdfBar = pdfSample
	pdfCluster0 = pdfBar[pdfBar['cluster'] == 0]
	pdfCluster1 = pdfBar[pdfBar['cluster'] == 1]
	pdfCluster2 = pdfBar[pdfBar['cluster'] == 2]

	kstrTitle = "(KMeans Clusters) Claims data"
	#--- chart
	fig = go.Figure(
	layout=dict(
	legend=dict(groupclick="toggleitem"),
	xaxis=dict(title='cluster'),
	yaxis=dict(title='#data points')
	)
	)

	fig.add_trace(
	go.Bar(
	x=pdfCluster0['cluster'],
	y=pdfCluster0['cluster'].value_counts(),
	name='cluster0'
	)
	)

	if (pdfCluster1.shape[0]>0):
	fig.add_trace(
	go.Bar(
	x=pdfCluster1['cluster'],
	y=pdfCluster1['cluster'].value_counts(),
	name='cluster1'
	))

	if (pdfCluster2.shape[0]>0):
	fig.add_trace(
	go.Bar(
	x=pdfCluster2['cluster'],
	y=pdfCluster2['cluster'].value_counts(),
	name='cluster2'
	))
	st.plotly_chart(fig, use_container_width=True)


	def chart_KMeansClusters(pdfSample, strXFeature, strYFeature, stCol):
	pdfScatter = pdfSample
	pdfCluster0 = pdfScatter[pdfScatter['cluster'] == 0]
	pdfCluster1 = pdfScatter[pdfScatter['cluster'] == 1]
	pdfCluster2 = pdfScatter[pdfScatter['cluster'] == 2]

	kstrTitle = "(KMeans Clusters) Claims data"
	#--- chart
	fig = go.Figure(
	layout=dict(
	legend=dict(groupclick="toggleitem"),
	xaxis=dict(title=strXFeature),
	yaxis=dict(title=strYFeature)
	)
	)

	fig.add_trace(
	go.Scatter(
	x=pdfCluster0[strXFeature],
	y=pdfCluster0[strYFeature],
	text="claimId: " + pdfCluster0['ClaimID'],
	mode='markers',
	name='cluster0'
	)
	)

	if (pdfCluster1.shape[0]>0):
	fig.add_trace(
	go.Scatter(
	x=pdfCluster1[strXFeature],
	y=pdfCluster1[strYFeature],
	mode='markers',
	name='cluster1'
	))

	if (pdfCluster2.shape[0]>0):
	fig.add_trace(
	go.Scatter(
	x=pdfCluster2[strXFeature],
	y=pdfCluster2[strYFeature],
	mode='markers',
	name='cluster2'
	))
	stCol.plotly_chart(fig, use_container_width=True)


	def chart_KMeansBars(pdfSample, strXFeature, strYFeature, stCol):
	pdfBar = pdfSample
	pdfCluster0 = pdfBar[pdfBar['cluster'] == 0]
	pdfCluster1 = pdfBar[pdfBar['cluster'] == 1]
	pdfCluster2 = pdfBar[pdfBar['cluster'] == 2]

	kstrTitle = "(KMeans Clusters) Claims data"
	#--- chart
	fig = go.Figure(
	layout=dict(
	legend=dict(groupclick="toggleitem"),
	xaxis=dict(title=strXFeature),
	yaxis=dict(title=strYFeature)
	)
	)

	fig.add_trace(
	go.Bar(
	x=pdfCluster0[strXFeature],
	y=pdfCluster0[strYFeature],
	name='cluster0'
	)
	)

	if (pdfCluster1.shape[0]>0):
	fig.add_trace(
	go.Bar(
	x=pdfCluster1[strXFeature],
	y=pdfCluster1[strYFeature],
	name='cluster1'
	))

	if (pdfCluster2.shape[0]>0):
	fig.add_trace(
	go.Bar(
	x=pdfCluster2[strXFeature],
	y=pdfCluster2[strYFeature],
	name='cluster2'
	))
	stCol.plotly_chart(fig, use_container_width=True)



	def btnSave_testFile(pdfClaims, pdfPred):
	#--- get all claims for all anoms
	""" print("TRACE (lit_anom_unsuperv.btnSave_testFile) query anoms ... ", pdfPred.head(10))
	pdfAnomClaims = pdfPred[pdfPred['hasAnom?'] > 0]
	#pdfAnomProv = pdfAnomProv['Provider']

	#--- filter claims by anomProviders
	print("TRACE (lit_anom_unsuperv.btnSave_testFile) filter claims ... ")
	pdfClaimAnom = pdfClaims[pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
	pdfClaimNoAnom = pdfClaims[~pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
	lngNumAnoms = len(pdfClaimAnom.index)
	lngNumOk = len(pdfClaimNoAnom.index)
	print("TRACE (lit_anom_unsuperv.btnSave_testFile) #anoms: ", lngNumAnoms, ", !anoms: ", lngNumOk)

	#--- get a sample for remaining records
	print("TRACE (lit_anom_unsuperv.btnSave_testFile) sampling claims ... ")
	pdfSave = pd.concat([pdfClaimAnom.sample(frac=0.6), pdfClaimNoAnom.sample(frac=0.1)]) """

	pdfSave = pdfClaims.sample(frac=0.1)

	print("TRACE (lit_anom_unsuperv.btnSave_testFile) saving ... ")
	saveProviderTestData(pdfSave)


	def saveProviderTestData(pdfTestData):

	#--- save the file
	from datetime import date
	import time
	import pickle
	strDteNow = date.today().strftime('%Y%m%d')
	strTimeNow = time.strftime('%H%M%S')
	strProvTestFile = libUtils.pth_data + strDteNow + strTimeNow + "_claimsTestSample.pkl"
	#pd.to_pickle(pdfClaims.sample(200), strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
	pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)