Spaces: Runtime error
Commit 75660bd
Parent(s): 7a90042
merged github/demo_huggingFace into runner/main
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- __init__.py +0 -0
- bin/models/__init__.py +0 -0
- bin/models/claims_kmn_py3816_sk111hp_22cols.pkl +3 -0
- bin/models/claims_stdScl_py3816_sk111hp_27cols.pkl +3 -0
- bin/models/gbc_trainVal_confusionMatrix_colab.png +0 -0
- bin/models/kmn_elbow.png +0 -0
- bin/models/lgr_precisionRecallCurve_colab.png +0 -0
- bin/models/lgr_rocCurve_colab.png +0 -0
- bin/models/lgr_trainVal_confusionMatrix_colab.png +0 -0
- bin/models/lgr_trainVal_probPred_colab.png +0 -0
- bin/models/prov_gbc_py3816_sk111hp_32cols.pkl +3 -0
- bin/models/prov_stdScl_py3816_sk111hp_32cols.pkl +3 -0
- bin/models/svm_trainVal_confusionMatrix_colab.png +0 -0
- data/demo_data/20230210165948_provTestSample.pkl +3 -0
- data/demo_data/20230210170628_claimsTestSample.pkl +3 -0
- data/test_claims.pkl +3 -0
- lib/__init__.py +0 -0
- lib/claims.py +258 -0
- lib/models/__init__.py +0 -0
- lib/models/mdl_autoenc.py +55 -0
- lib/models/mdl_kmeans.py +155 -0
- lib/models/mdl_logR.py +41 -0
- lib/models/mdl_svm.py +40 -0
- lib/models/mdl_utils.py +256 -0
- lib/models/mdl_xgb.py +66 -0
- lib/providers.py +170 -0
- lib/utils.py +23 -0
- lit_index.py +25 -0
- main.py +97 -0
- routes/__init__.py +0 -0
- routes/api/__init__.py +0 -0
- routes/api/rte_api.py +67 -0
- routes/qa/__init__.py +0 -0
- routes/qa/rte_claims.py +139 -0
- routes/qa/rte_providers.py +188 -0
- routes/qa/rte_qa.py +17 -0
- templ/templ_results.html +4 -0
- templ/templ_showDataframe.html +15 -0
- uix/__init__.py +0 -0
- uix/images/image1.jpg +0 -0
- uix/images/image1.jpg:Zone.Identifier +3 -0
- uix/lit_packages.py +36 -0
- uix/lit_sidebar.py +99 -0
- uix/pages/__init__.py +0 -0
- uix/pages/lit_about.py +24 -0
- uix/pages/lit_anom_superv.py +368 -0
- uix/pages/lit_anom_unsuperv.py +280 -0
- uix/pages/lit_claimAnalysis.py +75 -0
- uix/pages/lit_home.py +41 -0
- uix/pages/lit_modelPerf.py +6 -0
__init__.py
ADDED
File without changes
bin/models/__init__.py
ADDED
File without changes
bin/models/claims_kmn_py3816_sk111hp_22cols.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a3f3e93e08cc64a169e199232261b55250ca7c6599522cea2c2821d99edb554
+size 2234618
bin/models/claims_stdScl_py3816_sk111hp_27cols.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84d5379c031897e5df1ecbc6b07dca005873955818285f98cb1c8ec5d291c581
+size 1779
bin/models/gbc_trainVal_confusionMatrix_colab.png
ADDED
bin/models/kmn_elbow.png
ADDED
bin/models/lgr_precisionRecallCurve_colab.png
ADDED
bin/models/lgr_rocCurve_colab.png
ADDED
bin/models/lgr_trainVal_confusionMatrix_colab.png
ADDED
bin/models/lgr_trainVal_probPred_colab.png
ADDED
bin/models/prov_gbc_py3816_sk111hp_32cols.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a740ba487ec674f9eafdc68f360e98be7b4b834fac0a6a79f9b82bac583d710f
+size 45135
bin/models/prov_stdScl_py3816_sk111hp_32cols.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2137e7adba591cb760d4b73c561f84da5cc2aa49235000b274304f37d98582b7
+size 2094
bin/models/svm_trainVal_confusionMatrix_colab.png
ADDED
data/demo_data/20230210165948_provTestSample.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb380a5e4cfed980e8514bcee519f45de67556ffdd09e3eaf9d1f635c1c77d79
+size 7419701
data/demo_data/20230210170628_claimsTestSample.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5b35e7014ab77ba73140f875637903691bb4f22019d2b956a320b5d0b5c8aa2
+size 6418423
data/test_claims.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:478c2854368471f7db414171bb5c1fc8904fbea49fb5aa3091a58f41443f8bed
+size 61997959
lib/__init__.py
ADDED
File without changes
lib/claims.py
ADDED
@@ -0,0 +1,258 @@
+import pandas as pd
+import lib.utils as libPaths
+
+from lib.models import mdl_utils, mdl_xgb, mdl_logR, mdl_svm
+from lib.models import mdl_autoenc, mdl_kmeans
+
+
+m_blnTraceOn = True
+m_blnTrace2On = False
+
+#--- load, merge data from file
+m_kstrDataPath = libPaths.pth_data
+m_kstrModelPath = libPaths.pth_model
+m_kstrBinModelPath = libPaths.pth_binModels
+
+
+def getPath_defPklClaims(blnIsTrain=False):
+    global m_kstrDataPath
+    strPrefix = "test_"
+    if (blnIsTrain): strPrefix = "train_"
+    strPth_pklClaims = m_kstrDataPath + strPrefix + 'claims.pkl'
+    return strPth_pklClaims
+
+
+def load_claims(blnIsTrain=False, blnForceCsv=False):
+    if (blnForceCsv):
+        pdfClaims = loadCsv_claims(blnIsTrain)
+    else:
+        pdfClaims = loadPkl_claims(blnIsTrain)
+    return pdfClaims
+
+
+def loadCsv_claims(blnIsTrain=False):
+    global m_kstrDataPath
+    #--- load all csv test data
+    if (blnIsTrain):
+        print("INFO (loadCsv_claimsData): load train data ...")
+        strPthProvider = m_kstrDataPath + 'Train-1542865627584.csv'
+        strPthBenef = m_kstrDataPath + 'Train_Beneficiarydata-1542865627584.csv'
+        strPthInpat = m_kstrDataPath + 'Train_Inpatientdata-1542865627584.csv'
+        strPthOutpat = m_kstrDataPath + 'Train_Outpatientdata-1542865627584.csv'
+    else:
+        print("INFO (loadCsv_claimsData): load test data ...")
+        strPthProvider = m_kstrDataPath + 'Test-1542969243754.csv'
+        strPthBenef = m_kstrDataPath + 'Test_Beneficiarydata-1542969243754.csv'
+        strPthInpat = m_kstrDataPath + 'Test_Inpatientdata-1542969243754.csv'
+        strPthOutpat = m_kstrDataPath + 'Test_Outpatientdata-1542969243754.csv'
+
+    #--- output: pandas data frame
+    pdfProvider = pd.read_csv(strPthProvider)
+    pdfBenef = pd.read_csv(strPthBenef)
+    pdfInpat = pd.read_csv(strPthInpat)
+    pdfOutpat = pd.read_csv(strPthOutpat)
+
+    #--- data engineering
+    pdfBenef = prep_benefData(pdfBenef)
+    pdfInpat = prep_inpatData(pdfInpat)
+
+    #--- merge inpatient and outpatient data (assert: 31 cols)
+    aryMergeCols = list(pdfOutpat.columns)
+    pdfAllpat = pdfInpat.merge(pdfOutpat, on=aryMergeCols, how='outer')
+
+    #--- +merge beneficiary data
+    pdfAllPatBenef = pdfAllpat.merge(pdfBenef, on='BeneID', how='inner')
+
+    #--- +merge provider data
+    pdfAllPatBenefProv = pdfAllPatBenef.merge(pdfProvider, on='Provider', how='inner')
+
+    #--- export data
+    strPth_pklClaims = getPath_defPklClaims(blnIsTrain)
+    print("TRACE (claims.loadCsv_claims): pkl claim data file path ... ", strPth_pklClaims)
+    pdfAllPatBenefProv.to_pickle(strPth_pklClaims)
+
+    #print("INFO (csvClaims.shape): ", pdfTest_allPatBenefProv.shape)
+    return pdfAllPatBenefProv
+
+
+def loadCsv_testClaims():
+    #--- TODO: make optional arg test or train data
+    return loadCsv_claims(False)
+
+
+def loadPkl_claims(blnIsTrain=False):
+    strPth_pklClaims = getPath_defPklClaims(blnIsTrain)
+    try:
+        pdfClaims = pd.read_pickle(strPth_pklClaims)
+    except FileNotFoundError:
+        #--- catch: there is no pickle file
+        #--- load from csv instead; will create pkl files for next time
+        pdfClaims = loadCsv_claims(blnIsTrain)
+    return pdfClaims
+
+
+#--- feat eng
+def do_featEng(pdfLoaded, blnIsTrain=False):
+    if (m_blnTrace2On): print("INFO (claims.doFeatEng): blnIsTrain, ", blnIsTrain)
+
+    #--- remove cols
+    aryColsToDrop = ['BeneID', 'ClaimID', 'ClaimStartDt','ClaimEndDt','AttendingPhysician',
+        'OperatingPhysician', 'OtherPhysician', 'ClmDiagnosisCode_1',
+        'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
+        'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7',
+        'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
+        'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
+        'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
+        'ClmAdmitDiagnosisCode', 'AdmissionDt',
+        'DischargeDt', 'DiagnosisGroupCode','DOB', 'DOD',
+        'State', 'County']
+    pdfFeatEng = pdfLoaded.drop(columns=aryColsToDrop, axis=1)
+
+    #--- flag categorical cols
+    pdfFeatEng.Gender = pdfFeatEng.Gender.astype('category')
+    pdfFeatEng.Race = pdfFeatEng.Race.astype('category')
+
+    #--- one-hot-encoding
+    pdfFeatEng = pd.get_dummies(pdfFeatEng, columns=['Gender', 'Race'], drop_first=True)
+    if (blnIsTrain):
+        #--- one-hot encode the potential fraud column (for training data only)
+        try:
+            #print("INFO (claims.doFeatEng): one-hot encoding potential fraud")
+            pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'Yes', 'PotentialFraud'] = 1
+            pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'No', 'PotentialFraud'] = 0
+        except KeyError:
+            #--- likely column not found; invalid fxn call
+            print("ERROR (claims.doFeatEng): Potential Fraud col not found")
+
+    pdfFeatEng.loc[pdfFeatEng['RenalDiseaseIndicator'] == 'Y', 'RenalDiseaseIndicator'] = 1
+    pdfFeatEng['DeductibleAmtPaid'].fillna(0, inplace=True)
+    pdfFeatEng['AdmittedDays'].fillna(0, inplace=True)
+
+    #--- check for correlated cols
+
+    #--- add new features to assist with predictions
+    pdfFeatEng['InscClaimReimbursement_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['InscClaimAmtReimbursed'].transform('mean')
+    pdfFeatEng['DeductibleAmtPaid_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['DeductibleAmtPaid'].transform('mean')
+
+    pdfFeatEng['IPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualReimbursementAmt'].transform('mean')
+    pdfFeatEng['IPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualDeductibleAmt'].transform('mean')
+
+    pdfFeatEng['OPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualReimbursementAmt'].transform('mean')
+    pdfFeatEng['OPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualDeductibleAmt'].transform('mean')
+    return pdfFeatEng
+
+
+#--- data eng on inpatient data
+def prep_inpatData(pdfInpat):
+    #--- calc admitted days
+    pdfInpat['AdmissionDt'] = pd.to_datetime(pdfInpat['AdmissionDt'], format='%Y-%m-%d')
+    pdfInpat['DischargeDt'] = pd.to_datetime(pdfInpat['DischargeDt'], format='%Y-%m-%d')
+    pdfInpat['AdmittedDays'] = round((pdfInpat['DischargeDt'] - pdfInpat['AdmissionDt']).dt.days + 1)
+    return pdfInpat
+
+
+#--- data eng on beneficiary data
+def prep_benefData(pdfBenef):
+    #--- chronic condition cols; change any vals of 2 to 0
+    aryCols = ['ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
+        'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
+        'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
+        'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
+        'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
+        'ChronicCond_stroke']
+
+    for strVal in aryCols:
+        pdfBenef.replace({strVal: 2}, 0, inplace=True)
+
+    #--- fill missing data: persons age
+    kstrDatetime = '2019-12-01'  #--- the est datetime for the dataset
+    pdfBenef['DOB'] = pd.to_datetime(pdfBenef['DOB'], format='%Y-%m-%d')
+    pdfBenef['DOD'] = pd.to_datetime(pdfBenef['DOD'], format='%Y-%m-%d')
+    pdfBenef['Age'] = round((pdfBenef['DOD'] - pdfBenef['DOB']).dt.days/365)
+    pdfBenef['Age'].fillna(round(((pd.to_datetime(kstrDatetime, format='%Y-%m-%d') - pdfBenef['DOB']).dt.days)/365), inplace=True)
+
+    #--- add an isDead flag column
+    pdfBenef.loc[pdfBenef['DOD'].isna(), 'DeadOrNot'] = 0
+    pdfBenef.loc[pdfBenef['DOD'].notna(), 'DeadOrNot'] = 1
+
+    return pdfBenef
+
+
+def get_kmeansPredict(pdfTestClaims):
+
+    #--- load test data
+    pdfClaims = pdfTestClaims
+    #print("INFO (claims.get_kmeansPredict) pdfClaims.shape): ", pdfClaims.shape)
+
+    #--- perform featEng, std scaling
+    print("TRACE: claims.kmeansPredict perform featEng, stdScaling ...")
+    pdfFeatEng = mdl_kmeans.do_featEng(pdfClaims, False, False)
+    npaScaled = mdl_utils.doClaims_stdScaler(pdfFeatEng, False)
+    pdfScaled = mdl_utils.doClaims_stdScaler_toPdf(npaScaled)
+    #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+    #--- get the pre-fit kmeans clusters
+    #--- predict/label clusters against data points
+    print("TRACE: claims.kmeansPredict perform kmeans predict ...")
+    ndaPredict = mdl_kmeans.predict(pdfScaled)
+    #print("INFO (predict.npaPredict.shape): ", ndaPredict.shape)
+
+    pdfPredict = pd.DataFrame(ndaPredict)
+    #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+    #--- stitch the data with the labels
+    print("TRACE: claims.kmeansPredict stitch labels with results ...")
+    pdfResults = pdfTestClaims
+    #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+    pdfResults.insert(0, "cluster", pdfPredict[0])
+    return pdfResults
+
+
+def get_kmeansFit(pdfTestClaims):
+
+    pdfClaims = pdfTestClaims
+    pdfFeatEng = do_featEng(pdfClaims, False)  #--- not grouped by provider
+
+    #--- perform standard scaling; get fit then transform
+    npaScaled = mdl_utils.do_stdScaler(pdfFeatEng, False)  #--- grouped by provider
+    pdfScaled = mdl_utils.do_stdScaler_toPdf(npaScaled)
+    #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+    #--- SKIP: perform PCA; then kmeans fit (this was done to determine the KMeans params)
+    #--- get Kmeans object, instantiated with trained args, and fit to test/prod scaled data
+    #--- OR ... assume that the kmeans is already fit, and we now want to predict which cluster each data point appears in
+    mdlKmeans = mdl_kmeans.fit(pdfScaled)
+    """
+
+    pdfPredict = pd.DataFrame(ndaPredict)
+    #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+    #--- stitch the grouped data with the labels
+    pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+    #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+    pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+
+    Notes:
+    - train_final = trainAllPatientDetailsProvider  #--- ungrouped data (558211, 27); has PotentialFraud cols
+    - train_final = pd.get_dummies  #--- post one-hot encoding (558211, 25=27-2+4); -Gender-Race + 4*(Gender+Race one-hot encoding)
+    - y, X: X.shape = (558211, 27); y.shape=(558211,1)  #--- X popped PotentialFraud, and dropped Provider
+    - train_final[cluster_labels] = mdlKMeans.labels
+    """
+
+    return mdlKmeans
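For orientation, here is a minimal usage sketch of lib/claims.py (not part of this commit). It assumes the repo layout above, that data/test_claims.pkl (or the Test-*.csv files) is present, and that the pickled scaler and KMeans models under bin/models/ were built with a compatible scikit-learn version.

# sketch: load the test claims and label each claim with a KMeans cluster
import lib.claims as libClaims

pdfClaims = libClaims.load_claims(blnIsTrain=False)    # reads data/test_claims.pkl; falls back to the CSVs
pdfLabeled = libClaims.get_kmeansPredict(pdfClaims)    # prepends a "cluster" column
print(pdfLabeled[['cluster', 'Provider', 'InscClaimAmtReimbursed']].head())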
lib/models/__init__.py
ADDED
File without changes
lib/models/mdl_autoenc.py
ADDED
@@ -0,0 +1,55 @@
+import pandas as pd
+import numpy as np
+from sklearn.decomposition import PCA
+import lib.utils as libPaths
+import pickle
+
+
+m_kstrFile = __file__
+m_kstrDataPath = libPaths.pth_data
+m_kstrBinModelPath = libPaths.pth_binModels
+m_kstrPcaModelPath = m_kstrBinModelPath + 'pca_unsuperv_colab.pkl'
+m_kstrEncModelPath = m_kstrBinModelPath + 'enc_keras_seq/'
+
+
+#--- Unsupervised: autoencoder - Principal Component Analysis
+def load_encFromKeras():
+    from tensorflow import keras
+    mdlAnoms = keras.models.load_model(m_kstrEncModelPath)
+    return mdlAnoms
+
+
+def load_pcaFromPkl():
+    with open(m_kstrPcaModelPath, 'rb') as filPkl:
+        # load using pickle de-serializer
+        mdlAnoms = pickle.load(filPkl)
+    return mdlAnoms
+
+
+def save_encToKeras(mdlAnoms):
+    mdlAnoms.save(m_kstrEncModelPath)
+
+
+def predict(pdfScaled):
+
+    #--- Pre: transforming train and test dataframes based on PCA
+    mdlPCA = load_pcaFromPkl()  #--- this is a pre-fit model based on training
+    npaPca = mdlPCA.transform(pdfScaled)
+    print("INFO (" + m_kstrFile + ".predict) npaPca.shape: ", npaPca.shape)
+
+    #--- predict on unseen data
+    mdlEnc = load_encFromKeras()
+    npaPredict = mdlEnc.predict(npaPca[:,:29])
+    print("INFO (" + m_kstrFile + ".predict) npaPredict.shape: ", npaPredict.shape)
+    #--- expected: 297, 29?
+    return npaPredict
+
+
+"""
+def train(pdfTrainData):
+    mdlAnoms = PCA()  #---- TODO: this is Keras Sequential
+    mdlAnoms.fit(pdfTrainData.values)
+    save_encToKeras(mdlAnoms)
+    return mdlAnoms """
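load_encFromKeras/save_encToKeras expect a Keras model saved under bin/models/enc_keras_seq/, and load_pcaFromPkl expects a pickled, pre-fit PCA. A self-contained sketch of the same save/load round-trip, using a toy autoencoder and a hypothetical local path rather than the project's artifacts:

# sketch: persist and reload a tiny Keras autoencoder, mirroring save_encToKeras/load_encFromKeras
import numpy as np
from tensorflow import keras

npaToy = np.random.rand(100, 29)                       # toy stand-in for the PCA-reduced features
mdlEnc = keras.Sequential([
    keras.layers.Dense(8, activation='relu', input_shape=(29,)),   # bottleneck
    keras.layers.Dense(29, activation='linear'),                   # reconstruction
])
mdlEnc.compile(optimizer='adam', loss='mse')
mdlEnc.fit(npaToy, npaToy, epochs=2, verbose=0)

mdlEnc.save('enc_keras_seq_demo/')                     # same call shape as save_encToKeras (hypothetical path)
mdlReloaded = keras.models.load_model('enc_keras_seq_demo/')
print(mdlReloaded.predict(npaToy[:5]).shape)           # (5, 29) reconstructions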
lib/models/mdl_kmeans.py
ADDED
@@ -0,0 +1,155 @@
+from sklearn.cluster import KMeans
+import lib.utils as libPaths
+import pickle
+import pandas as pd
+
+
+m_kstrFile = __file__
+m_kstrDataPath = libPaths.pth_data
+m_kstrBinModelPath = libPaths.pth_binModels
+
+#m_kstrPcaModelPath = m_kstrBinModelPath + 'pca_kmeans_unsuperv_colab.pkl'
+#m_kstrPcaModelPath = m_kstrBinModelPath + 'pca_kmeans_unsuperv_colab_v1.2.1.pkl'
+m_kstrPcaModelPath_111 = m_kstrBinModelPath + 'claims_pca_v1.1.1_27cols.pkl'  #--- ERROR: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given
+m_kstrPcaModelPath_121 = m_kstrBinModelPath + 'claims_pca_v1.2.1_27cols.pkl'
+m_kstrPcaModelPath_claims_py3816_sk111hp = m_kstrBinModelPath + 'claims_pca_py3816_sk111hp_27cols.pkl'
+m_kstrPcaModelPath = m_kstrPcaModelPath_claims_py3816_sk111hp
+
+#m_kstrKmeansModelPath = m_kstrBinModelPath + 'kmeans_unsuperv_colab.pkl'
+#m_kstrKmeansModelPath = m_kstrBinModelPath + 'kmn_unsuperv_colab_v1.2.1.pkl'
+m_kstrModelPath_111 = m_kstrBinModelPath + 'claims_kmn_v1.1.1_22cols.pkl'  #--- ERROR: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given
+m_kstrModelPath_121 = m_kstrBinModelPath + 'claims_kmn_v1.2.1_22cols.pkl'
+m_kstrModelPath_claims_py3816_sk111hp = m_kstrBinModelPath + 'claims_kmn_py3816_sk111hp_22cols.pkl'
+m_kstrKmeansModelPath = m_kstrModelPath_claims_py3816_sk111hp
+
+m_blnTraceOn = True
+
+
+#--- unsupervised: PCA
+def load_pcaFromPkl():
+    with open(m_kstrPcaModelPath, 'rb') as filPkl:
+        mdlAnoms = pickle.load(filPkl)
+    return mdlAnoms
+
+
+#--- unsupervised: KMeans
+def load_kmeansFromPkl():
+    with open(m_kstrKmeansModelPath, 'rb') as filPkl:
+        mdlAnoms = pickle.load(filPkl)
+    return mdlAnoms
+
+
+def save_pcaToPkl(mdlAnoms):
+    with open(m_kstrPcaModelPath, 'wb') as filPkl:
+        pickle.dump(mdlAnoms, filPkl)
+    return mdlAnoms
+
+
+def save_kmeansToPkl(mdlAnoms):
+    with open(m_kstrKmeansModelPath, 'wb') as filPkl:
+        pickle.dump(mdlAnoms, filPkl)
+    return mdlAnoms
+
+
+#--- determine which points can be labelled against which clusters
+def predict(pdfScaled):
+    #--- load a persisted fit kmeans model
+    #--- predict will assign labels onto a similarly scaled data frame
+
+    #--- Note: reverse chron through the code ...
+    #--- 4. KMeans was fit on X-reduced (22 cols)
+    #--- 3. X_reduced was a reduced column set of X-scaled (27 -> 22; dropped 5 cols: DeadOrNot; and hotEncoded Gender and Race)
+    #--- 2. X_scaled was transformed through stdScaler
+    #--- 1. StdScaler was fit on X to produce X-scaled (X has 27 cols)
+    pdfReduced = pdfScaled[['InscClaimAmtReimbursed', 'DeductibleAmtPaid',
+        'AdmittedDays', 'RenalDiseaseIndicator', 'NoOfMonths_PartACov',
+        'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
+        'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
+        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
+        'ChronicCond_Depression', 'ChronicCond_Diabetes',
+        'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
+        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
+        'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
+        'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age']]
+
+    #--- prefit Kmeans clustering - was fit on trained pdfReduced
+    #--- Note: if we want to understand how kmeans performs on test/prod data, we need to predict
+    mdlKMeans = load_kmeansFromPkl()
+    #ndaPredict = mdlKMeans.predict(pdfScaled)  #20230208: ValueError: X has 27 features, but KMeans is expecting 22 features as input.
+    ndaPredict = mdlKMeans.predict(pdfReduced)  #ValueError: X has 22 features, but KMeans is expecting 27 features as input.
+    return ndaPredict
+
+
+#--- feat eng
+def do_featEng(pdfLoaded, blnIsTrain=False, hasGroupByProviderCols=True):
+    print("INFO (mdl_kmeans.doFeatEng): blnIsTrain, ", blnIsTrain)
+
+    #--- columns_to_remove
+    aryColsToDrop = ['BeneID', 'ClaimID', 'ClaimStartDt','ClaimEndDt','AttendingPhysician',
+        'OperatingPhysician', 'OtherPhysician', 'ClmDiagnosisCode_1',
+        'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
+        'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7',
+        'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
+        'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
+        'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
+        'ClmAdmitDiagnosisCode', 'AdmissionDt',
+        'DischargeDt', 'DiagnosisGroupCode','DOB', 'DOD',
+        'State', 'County']
+    pdfFeatEng = pdfLoaded.drop(columns=aryColsToDrop, axis=1)
+
+    #--- flag categorical cols
+    pdfFeatEng.Gender = pdfFeatEng.Gender.astype('category')
+    pdfFeatEng.Race = pdfFeatEng.Race.astype('category')
+
+    #--- one-hot-encoding
+    pdfFeatEng = pd.get_dummies(pdfFeatEng, columns=['Gender', 'Race'], drop_first=True)
+    if (blnIsTrain):
+        #--- one-hot encode the potential fraud column (for training data only)
+        try:
+            #print("INFO (claims.doFeatEng): one-hot encoding potential fraud")
+            pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'Yes', 'PotentialFraud'] = 1
+            pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'No', 'PotentialFraud'] = 0
+        except KeyError:
+            #--- likely column not found; invalid fxn call
+            print("ERROR (claims.doFeatEng): Potential Fraud col not found")
+
+    pdfFeatEng.loc[pdfFeatEng['RenalDiseaseIndicator'] == 'Y', 'RenalDiseaseIndicator'] = 1
+    pdfFeatEng['DeductibleAmtPaid'].fillna(0, inplace=True)
+    pdfFeatEng['AdmittedDays'].fillna(0, inplace=True)
+
+    #--- check for correlated cols
+
+    #--- add new features to assist with predictions
+    if (hasGroupByProviderCols):
+        pdfFeatEng['InscClaimReimbursement_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['InscClaimAmtReimbursed'].transform('mean')
+        pdfFeatEng['DeductibleAmtPaid_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['DeductibleAmtPaid'].transform('mean')
+
+        pdfFeatEng['IPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualReimbursementAmt'].transform('mean')
+        pdfFeatEng['IPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualDeductibleAmt'].transform('mean')
+
+        pdfFeatEng['OPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualReimbursementAmt'].transform('mean')
+        pdfFeatEng['OPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualDeductibleAmt'].transform('mean')
+    return pdfFeatEng
+
+
+def fit(pdfScaled):
+    #--- determine the centroids of the kmeans clusters
+    #--- refit kmeans clustering according to the pre-scaled data provided
+    #--- note: this all assumes that the nature of the data and the number of clusters remain unchanged
+    m_klngNumClusters = 3
+    if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit) instantiate KMeans ...")
+    mdlKMeans = KMeans(n_clusters=m_klngNumClusters, max_iter=50, random_state=2022)  #--- #clusters was learned from training
+
+    if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit) fitting data (scaled) ...")
+    mdlKMeans.fit(pdfScaled)  #--- fit on test/prod data
+
+    return mdlKMeans  #--- this object will give us all results based on kmeans
+
+
+def train(pdfTrainData):
+    mdlAnoms = KMeans(n_clusters=3, max_iter=50, random_state=2022)
+    mdlAnoms.fit(pdfTrainData.values)
+    save_kmeansToPkl(mdlAnoms)
+    return mdlAnoms
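A self-contained sketch of the pickle round-trip this module relies on (train → save_kmeansToPkl → load_kmeansFromPkl → predict), using toy data and a hypothetical file name in place of the 22-column claims frame and the committed .pkl:

# sketch: fit, pickle, reload and predict with KMeans, mirroring train()/predict()
import pickle
import numpy as np
from sklearn.cluster import KMeans

npaToy = np.random.rand(200, 22)                       # toy stand-in for the reduced feature set
mdlKMeans = KMeans(n_clusters=3, max_iter=50, random_state=2022).fit(npaToy)

with open('claims_kmn_demo.pkl', 'wb') as filPkl:      # analogous to save_kmeansToPkl
    pickle.dump(mdlKMeans, filPkl)

with open('claims_kmn_demo.pkl', 'rb') as filPkl:      # analogous to load_kmeansFromPkl
    mdlReloaded = pickle.load(filPkl)

ndaLabels = mdlReloaded.predict(npaToy)                # predict expects the same 22 columns it was fit on
print(ndaLabels[:10])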
lib/models/mdl_logR.py
ADDED
@@ -0,0 +1,41 @@
+from sklearn.linear_model import LogisticRegressionCV
+import lib.utils as libPaths
+import pickle
+
+
+m_kstrFile = __file__
+m_kstrDataPath = libPaths.pth_data
+m_kstrBinModelPath = libPaths.pth_binModels
+m_kstrModelPath = m_kstrBinModelPath + 'lgr_model_colab.pkl'
+
+
+#--- Supervised: Logistic Regression
+def load_fromPkl():
+    with open(m_kstrModelPath, 'rb') as filPkl:
+        mdlAnoms = pickle.load(filPkl)
+    return mdlAnoms
+
+
+def save_toPkl(mdlAnoms):
+    with open(m_kstrModelPath, 'wb') as filPkl:
+        pickle.dump(mdlAnoms, filPkl)
+    return mdlAnoms
+
+
+def predict(npaData):
+    #--- input: numpy.ndarray of feature eng, and scaled data
+    mdlAnoms = load_fromPkl()
+    npaPredict = mdlAnoms.predict(npaData)
+
+    print("INFO (npaPredict.shape): ", npaPredict.shape)
+    return npaPredict
+
+
+def train(pdfTrainData):
+    mdlAnoms = LogisticRegressionCV()
+    mdlAnoms.fit(pdfTrainData.values)
+    save_toPkl(mdlAnoms)
+    return mdlAnoms
lib/models/mdl_svm.py
ADDED
@@ -0,0 +1,40 @@
+from sklearn.svm import LinearSVC
+import lib.utils as libPaths
+import pickle
+
+
+m_kstrFile = __file__
+m_kstrDataPath = libPaths.pth_data
+m_kstrBinModelPath = libPaths.pth_binModels
+m_kstrModelPath = m_kstrBinModelPath + 'svm_model_colab.pkl'
+
+
+#--- Supervised: Support Vector Machines
+def load_fromPkl():
+    with open(m_kstrModelPath, 'rb') as filPkl:
+        mdlAnoms = pickle.load(filPkl)
+    return mdlAnoms
+
+
+def save_toPkl(mdlAnoms):
+    with open(m_kstrModelPath, 'wb') as filPkl:
+        pickle.dump(mdlAnoms, filPkl)
+    return mdlAnoms
+
+
+def predict(npaData):
+    #--- input: numpy.ndarray of feature eng, and scaled data
+    mdlAnoms = load_fromPkl()
+    npaPredict = mdlAnoms.predict(npaData)
+    print("INFO (" + m_kstrFile + ".predict) npaPredict.shape: ", npaPredict.shape)
+    return npaPredict
+
+
+def train(pdfTrainData):
+    mdlAnoms = LinearSVC()
+    mdlAnoms.fit(pdfTrainData.values)
+    save_toPkl(mdlAnoms)
+    return mdlAnoms
lib/models/mdl_utils.py
ADDED
@@ -0,0 +1,256 @@
+import pandas as pd
+import pickle
+import lib.utils as libPaths
+
+m_blnTraceOn = False
+
+#--- load, merge data from file
+m_kstrDataPath = libPaths.pth_data
+m_kstrModelPath = libPaths.pth_model
+m_kstrBinModelPath = libPaths.pth_binModels
+
+#m_kstrScalerPath_claims = m_kstrBinModelPath + 'stdClaims_scaler_colab.pkl'  #--- does not work for scaling claims data; from v1.0.2; using 1.1.1
+#m_kstrScalerPath_claims2 = m_kstrBinModelPath + 'std_scaler_unsuperv_colab.pkl'  #--- does not work; expects 32 features
+#m_kstrScalerPath_claims = m_kstrBinModelPath + 'stdClaims_scaler_colab_v1.2.1.pkl'
+m_kstrScalerPath_claims111 = m_kstrBinModelPath + 'claims_stdScaler_v1.1.1_27cols.pkl'
+m_kstrScalerPath_claims121 = m_kstrBinModelPath + 'claims_stdScaler_v1.2.1_27cols.pkl'
+m_kstrScalerPath_claims_py3816_sk111hp = m_kstrBinModelPath + 'claims_stdScl_py3816_sk111hp_27cols.pkl'
+m_kstrScalerPath_claims = m_kstrScalerPath_claims_py3816_sk111hp
+
+m_kstrScalerPath_providers111 = m_kstrBinModelPath + 'prov_stdScaler_v1.1.1_32cols.pkl'
+m_kstrScalerPath_providers121 = m_kstrBinModelPath + 'prov_stdScaler_v1.2.1_32cols.pkl'
+m_kstrScalerPath_prov_py3816_sk111 = m_kstrBinModelPath + 'prov_stdScl_py3816_sk111_32cols.pkl'
+m_kstrScalerPath_prov_py3816_sk111hp = m_kstrBinModelPath + 'prov_stdScl_py3816_sk111hp_32cols.pkl'
+m_kstrScalerPath_prov = m_kstrScalerPath_prov_py3816_sk111hp
+
+m_kstrScalerPath_providers_superv = m_kstrBinModelPath + 'gbc_scaler.pkl'
+m_kstrScalerPath_providers_train = m_kstrBinModelPath + "stdProvider_scaler.pkl"
+
+
+def doProviders_stdScaler(pdfFeatEng, blnIsTrain=False, hasGroupByProviderCols=True):
+    print("INFO (claims.do_stdScaler): blnIsTrain, ", blnIsTrain)
+
+    #--- Note: prediction runs on X_val
+    '''
+    #--- WARN: The default value of numeric_only in DataFrameGroupBy.sum is deprecated.
+    #    In a future version, numeric_only will default to False. Either specify
+    #    numeric_only or select only columns which should be valid for the function.
+    '''
+
+    #--- WARN: this code groups all data by provider; any predictions will also be by provider
+    pdfGroupBy = pdfFeatEng
+    if (hasGroupByProviderCols):
+        pdfGroupBy = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+
+    X = pdfGroupBy
+
+    try:
+        X = X.drop(columns=['Provider'], axis=1)  #--- cannot scale; text
+    except KeyError:
+        #--- likely column not found; invalid fxn call
+        print("ERROR (mdlUtils.doProviders_stdScaler): Provider col not found")
+
+    try:
+        X = X.drop(columns=['PotentialFraud'], axis=1)
+    except KeyError:
+        #--- likely column not found; invalid fxn call
+        if (blnIsTrain): print("ERROR (mdlUtils.doProviders_stdScaler): Potential Fraud col not found")
+
+    #--- apply std scaler
+    #--- WARN: scaling is also grouped by provider
+    if (m_blnTraceOn): print("INFO (mdlUtils.doProviders_stdScaler) cols: ", X.columns)  #--- 32cols
+    X_std = fitProviders_txfStdScaler(X, blnIsTrain)
+    return X_std
+
+
+def doClaims_stdScaler(pdfFeatEng, blnIsTrain=False):
+    print("INFO (mdlUtils.doClaims_stdScaler): blnIsTrain, ", blnIsTrain)
+
+    #--- Note: prediction runs on X_val
+    '''
+    #--- WARN: The default value of numeric_only in DataFrameGroupBy.sum is deprecated.
+    #    In a future version, numeric_only will default to False. Either specify
+    #    numeric_only or select only columns which should be valid for the function.
+    '''
+
+    #--- WARN: this code groups all data by provider; any predictions will also be by provider
+    X = pdfFeatEng
+
+    try:
+        X = X.drop(columns=['Provider'], axis=1)  #--- cannot scale; text
+    except KeyError:
+        #--- likely column not found; invalid fxn call
+        print("ERROR (mdlUtils.do_stdScaler): Provider col not found")
+
+    try:
+        X = X.drop(columns=['PotentialFraud'], axis=1)
+    except KeyError:
+        #--- likely column not found; invalid fxn call
+        if (blnIsTrain): print("ERROR (mdlUtils.do_stdScaler): Potential Fraud col not found")
+
+    #--- apply std scaler
+    #--- WARN: scaling is also grouped by provider
+    #print("INFO (mdlUtils.doClaims_stdScaler) cols: ", X.columns)
+    X_std = fitClaims_txfStdScaler(X, blnIsTrain)
+    return X_std
+
+
+def doProviders_stdScaler_toPdf(npaScaled):
+    #--- NOTE: the list of cols came from doProvider_stdScaler; print(X.columns)
+    aryCols = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'AdmittedDays',
+        'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
+        'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
+        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
+        'ChronicCond_Depression', 'ChronicCond_Diabetes',
+        'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
+        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
+        'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
+        'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age', 'DeadOrNot',
+        'Gender_2', 'Race_2', 'Race_3', 'Race_5',
+        'ClaimReimbursement_ProviderAvg',
+        'ClaimReimbursement_AttendingPhysician',
+        'ClaimReimbursement_OperatingPhysician',
+        'DeductibleAmtPaid_ProviderAvg', 'DeductibleAmtPaid_AttendingPhysician',
+        'DeductibleAmtPaid_OperatingPhysician']
+
+    #npaScaled = do_stdScaler(pdfFeatEng)
+    pdfScaled = pd.DataFrame(npaScaled, columns=aryCols)
+    return pdfScaled
+
+
+def doClaims_stdScaler_toPdf(npaScaled):
+    #--- NOTE: the list of cols came from doClaims_stdScaler; print(X.columns)
+    aryCols = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'AdmittedDays',
+        'RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
+        'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
+        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
+        'ChronicCond_Depression', 'ChronicCond_Diabetes',
+        'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
+        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
+        'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
+        'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age', 'DeadOrNot',
+        'Gender_2', 'Race_2', 'Race_3', 'Race_5']
+
+    #npaScaled = do_stdScaler(pdfFeatEng)
+    pdfScaled = pd.DataFrame(npaScaled, columns=aryCols)
+    return pdfScaled
+
+
+def fitClaims_stdScaler(pdfData, blnIsTrain=False):
+    #--- apply scaler
+    #--- WARN: scaling is not grouped by provider
+    from sklearn.preprocessing import StandardScaler
+
+    #--- note: this is a numpy.ndarray
+    #--- we need to fit the scaler, and then save as a pkl file
+    #strScalerPath = m_kstrScalerPath_claims
+    strScalerPath = m_kstrScalerPath_claims
+    # strScalerPath = m_kstrBinModelPath + "stdClaims_scaler_colab.pkl"
+    if (m_blnTraceOn): print("INFO (lib.model.fitClaims_stdScalar): ", strScalerPath)
+    if (blnIsTrain):
+        scaler = StandardScaler()
+        sclFit = scaler.fit(pdfData)
+        #--- if we train locally; write out to gbc_scalar.pkl
+        #--- we do not want to overwrite the colab version used for test
+        strScalerPath = m_kstrBinModelPath + "stdClaims_scaler.pkl"
+        if (m_blnTraceOn): print("INFO (lib.model.fit_stdScalar) Using local pkl for Train: ", strScalerPath)
+        with open(strScalerPath, 'wb') as filPkl:
+            pickle.dump(sclFit, filPkl)
+    else:
+        #--- we need to load the pkl file
+        import sklearn
+        if (m_blnTraceOn): print("INFO (lib.model.fit_stdScalar) Using colab pkl for Test: ", strScalerPath)
+        with open(strScalerPath, 'rb') as filPkl:
+            sclFit = pickle.load(filPkl)
+        if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScalar) sclFit.type: ", type(sclFit))
+
+        #--- testing
+        scaler = StandardScaler()
+        if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScalar) StdScaler.version: ", scaler.__getstate__()['_sklearn_version'])
+        if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScalar) sclFit.version: ", sclFit.__getstate__()['_sklearn_version'])
+        if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScalar) sklearn.version: ", sklearn.__version__)
+    return sclFit
+
+
+def fitProviders_stdScaler(pdfData, blnIsTrain=False):
+    #--- apply scaler
+    #--- WARN: scaling is also grouped by provider
+    from sklearn.preprocessing import StandardScaler
+
+    #--- note: this is a numpy.ndarray
+    #--- we need to fit the scaler, and then save as a pkl file
+    #strScalerPath = m_kstrScalerPath_providers
+    #strScalerPath = m_kstrScalerPath_providers_train
+    strScalerPath = m_kstrScalerPath_prov
+    print("INFO (libModel.fitProviders_stdScalar): ", strScalerPath)
+    if (blnIsTrain):
+        scaler = StandardScaler()
+        sclFit = scaler.fit(pdfData)
+        #--- if we train locally; write out to gbc_scalar.pkl
+        #--- we do not want to overwrite the colab version used for test
+        strScalerPath = m_kstrScalerPath_providers_train  #--- works for provider training
+        if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar) Using local pkl for Train: ", strScalerPath)
+        with open(strScalerPath, 'wb') as filPkl:
+            pickle.dump(sclFit, filPkl)
+    else:
+        #--- we need to load the pkl file
+        if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar) Using colab pkl for Test: ", strScalerPath)
+        with open(strScalerPath, 'rb') as filPkl:
+            sclFit = pickle.load(filPkl)
+        if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar) sclFit.type: ", type(sclFit))
+    return sclFit
+
+
+def fitProviders_stdScalerSuperv(pdfData, blnIsTrain=False):
+    #--- apply scaler
+    #--- WARN: scaling is also grouped by provider
+    from sklearn.preprocessing import StandardScaler
+
+    #--- note: this is a numpy.ndarray
+    #--- we need to fit the scaler, and then save as a pkl file
+    strScalerPath = m_kstrScalerPath_prov
+    if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar): ", strScalerPath)
+    if (blnIsTrain):
+        scaler = StandardScaler()
+        sclFit = scaler.fit(pdfData)
+        #--- if we train locally; write out to gbc_scalar.pkl
+        #--- we do not want to overwrite the colab version used for test
+        strScalerPath = m_kstrBinModelPath + "stdProvider_scaler.pkl"
+        if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar) Using local pkl for Train: ", strScalerPath)
+        with open(strScalerPath, 'wb') as filPkl:
+            pickle.dump(sclFit, filPkl)
+    else:
+        #--- we need to load the pkl file
+        if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar) Using colab pkl for Test: ", strScalerPath)
+        with open(strScalerPath, 'rb') as filPkl:
+            sclFit = pickle.load(filPkl)
+        if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalar) sclFit.type: ", type(sclFit))
+    return sclFit
+
+
+def fitProviders_txfStdScaler(pdfData, blnIsTrain=False):
+    from sklearn.preprocessing import StandardScaler
+    sclFit = fitProviders_stdScaler(pdfData, blnIsTrain)
+    X_std = sclFit.transform(pdfData)
+    return X_std
+
+
+def fitClaims_txfStdScaler(pdfData, blnIsTrain=False):
+    from sklearn.preprocessing import StandardScaler
+    sclFit = fitClaims_stdScaler(pdfData, blnIsTrain)
+
+    X_std = sclFit.transform(pdfData)
+    return X_std
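The scaler helpers above follow a fit-on-train, transform-on-test contract: the StandardScaler is fit once (offline, in Colab), pickled, and only reloaded at inference time so that test data is scaled with the training statistics. A minimal standalone sketch of that contract, with toy columns and a hypothetical file name:

# sketch: persist a fitted StandardScaler and reuse it on unseen data
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler

pdfTrain = pd.DataFrame({'InscClaimAmtReimbursed': [100, 250, 80], 'Age': [67, 72, 59]})
pdfTest = pd.DataFrame({'InscClaimAmtReimbursed': [300], 'Age': [80]})

sclFit = StandardScaler().fit(pdfTrain)                # fit only on training data
with open('std_scaler_demo.pkl', 'wb') as filPkl:
    pickle.dump(sclFit, filPkl)

with open('std_scaler_demo.pkl', 'rb') as filPkl:      # inference side: load, never refit
    sclLoaded = pickle.load(filPkl)
print(sclLoaded.transform(pdfTest))                    # test rows scaled with the train mean/std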
lib/models/mdl_xgb.py
ADDED
@@ -0,0 +1,66 @@
+import pandas as pd
+from sklearn.ensemble import GradientBoostingClassifier
+import lib.utils as libPaths
+import pickle
+import sys
+
+
+m_kstrFile = __file__
+m_kstrDataPath = libPaths.pth_data
+m_kstrBinModelPath = libPaths.pth_binModels
+m_kstrModelPath_gbc = m_kstrBinModelPath + 'gbc_model_colab.pkl'
+m_kstrModelPath_prov111 = m_kstrBinModelPath + 'prov_gbc_v1.1.1_32cols.pkl'  #--- ERROR: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given
+m_kstrModelPath_prov121 = m_kstrBinModelPath + 'prov_gbc_v1.2.1_32cols.pkl'
+m_kstrModelPath_prov_py3816_sk111hp = m_kstrBinModelPath + 'prov_gbc_py3816_sk111hp_32cols.pkl'
+m_kstrModelPath = m_kstrModelPath_prov_py3816_sk111hp
+
+m_blnTraceOn = True
+
+
+#--- Supervised: xg boost; gradient boosting classifier
+def load_fromPkl():
+    try:
+        with open(m_kstrModelPath, 'rb') as filPkl:
+            mdlAnoms = pickle.load(filPkl)
+        return mdlAnoms
+
+    except:
+        e = sys.exc_info()
+        print("ERROR (mdl_xgb.load_fromPkl_genError): ", e)
+
+
+def save_toPkl(mdlAnoms):
+    with open(m_kstrModelPath, 'wb') as filPkl:
+        pickle.dump(mdlAnoms, filPkl)
+    return mdlAnoms
+
+
+def predict(npaData):
+
+    try:
+        #--- input: numpy.ndarray of feature eng, and scaled data
+        mdlAnoms = load_fromPkl()
+        if (m_blnTraceOn): print("TRACE (mdl_xgb.predict): data loaded ... ")
+        npaPredict = mdlAnoms.predict(npaData)
+
+    except:
+        e = sys.exc_info()
+        print("ERROR (mdl_xgb.predict_genError1): ", e)
+
+        #--- AttributeError: 'GradientBoostingClassifier' object has no attribute '_loss'
+        #--- version of scikit-learn?  Monika: ?.?.? ; Iain: 1.2.0
+
+    #print("INFO (type.npaPredict): ", type(npaPredict))
+    #if (m_blnTraceOn): print("TRACE (mdl_xgb.predict) npaPredict.shape: ", npaPredict.shape)
+    return npaPredict
+
+
+def train(pdfTrainData):
+    mdlAnoms = GradientBoostingClassifier()
+    mdlAnoms.fit(pdfTrainData.values)
+    save_toPkl(mdlAnoms)
+    return mdlAnoms
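For context, GradientBoostingClassifier.fit takes a feature matrix and a label vector; the pickled model loaded above was trained offline against the provider-level PotentialFraud label. A hedged, standalone sketch of that training-and-pickling step with toy data (the real feature engineering lives in lib/providers.py and lib/models/mdl_utils.py):

# sketch: supervised training of a provider-level classifier, then pickling it
import pickle
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

npaX = np.random.rand(300, 32)                         # toy stand-in for the 32 provider features
npaY = np.random.randint(0, 2, size=300)               # toy stand-in for the PotentialFraud label (0/1)

mdlGbc = GradientBoostingClassifier().fit(npaX, npaY)

with open('prov_gbc_demo.pkl', 'wb') as filPkl:        # analogous to save_toPkl (hypothetical file name)
    pickle.dump(mdlGbc, filPkl)

print(mdlGbc.predict(npaX[:5]))                        # 0/1 anomaly flags, as consumed by providers.get_xgbPredict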
lib/providers.py
ADDED
@@ -0,0 +1,170 @@
+import pandas as pd
+import lib.utils as libPaths
+import lib.claims as libClaims
+
+from lib.models import mdl_utils, mdl_xgb, mdl_logR, mdl_svm
+from lib.models import mdl_autoenc, mdl_kmeans
+import sys
+
+m_blnTraceOn = True
+m_blnTrace2On = False
+
+#--- load, merge data from file
+m_kstrDataPath = libPaths.pth_data
+m_kstrModelPath = libPaths.pth_model
+m_kstrBinModelPath = libPaths.pth_binModels
+
+
+def load_providers(blnIsTrain=False):
+
+    pdfClaims = libClaims.loadPkl_claims(blnIsTrain)
+    pdfClaims = pdfClaims.drop(['ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
+        'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
+        'Gender', 'Race', 'County'], axis=1)
+    pdfProviders = pdfClaims.groupby(['Provider'], as_index=False).agg('sum')
+    return pdfProviders
+
+
+#--- feat eng
+def do_featEng(pdfClaimsFeatEng, blnIsTrain=False):
+    if (m_blnTraceOn): print("TRACE (providers.doFeatEng): blnIsTrain, ", blnIsTrain)
+    pdfFeatEng = pdfClaimsFeatEng
+
+    #--- add new features to assist with predictions
+    pdfFeatEng['InscClaimReimbursement_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['InscClaimAmtReimbursed'].transform('mean')
+    pdfFeatEng['DeductibleAmtPaid_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['DeductibleAmtPaid'].transform('mean')
+
+    pdfFeatEng['IPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualReimbursementAmt'].transform('mean')
+    pdfFeatEng['IPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualDeductibleAmt'].transform('mean')
+
+    pdfFeatEng['OPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualReimbursementAmt'].transform('mean')
+    pdfFeatEng['OPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualDeductibleAmt'].transform('mean')
+    return pdfFeatEng
+
+
+def get_logrPredict(pdfTestClaims):
+
+    #--- logistic regression predictions; load test data
+    pdfClaims = pdfTestClaims
+    #print("INFO (providers.get_logrPredict) pdfClaims.shape): ", pdfClaims.shape)
+
+    pdfFeatEng = do_featEng(pdfClaims, False)
+    npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)
+    pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+    #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+    ndaPredict = mdl_logR.predict(npaScaled)
+    #print("INFO (predict.npaPredict.shape): ", ndaPredict.shape)
+
+    pdfPredict = pd.DataFrame(ndaPredict)
+    #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+    #--- stitch the grouped data with the labels
+    pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+    #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+    pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+    return pdfResults
+
+
+def get_svmPredict(pdfTestClaims):
+
+    #--- support vector machine predictions; load test data
+    pdfClaims = pdfTestClaims
+    if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) pdfClaims.shape: ", pdfClaims.shape)
+
+    pdfFeatEng = do_featEng(pdfClaims, False)
+    npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)
+    pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+    if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) npaScaled.shape: ", npaScaled.shape)
+
+    ndaPredict = mdl_svm.predict(npaScaled)
+    if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) npaPredict.shape: ", ndaPredict.shape)
+
+    pdfPredict = pd.DataFrame(ndaPredict)
+    if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) pdfPredict.shape: ", pdfPredict.shape)
+
+    #--- stitch the grouped data with the labels
+    pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+    if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) pdfResults.shape: ", pdfResults.shape)
+
+    pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+    return pdfResults
+
+
+def get_xgbPredict(pdfTestClaims):
+
+    try:
+        #--- load test data
+        pdfClaims = pdfTestClaims
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) pdfClaims.shape): ", pdfClaims.shape)
+
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) doFeatEng (provider) ... ")
+        pdfFeatEng = do_featEng(pdfClaims, False)
+
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) doStdScaler ... ")
+        npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)
+
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) doStdScaler_toPdf ... ")
+        pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+        #if (m_blnTraceOn): print("TRACE (predict.npaScaled.shape1): ", npaScaled.shape)
+
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) run prediction ... ")
+        ndaPredict = mdl_xgb.predict(npaScaled)
+        #if (m_blnTraceOn): print("TRACE (predict.npaPredict.shape2): ", ndaPredict.shape)
+
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) convert to dataframe ... ")
+        pdfPredict = pd.DataFrame(ndaPredict)
+        pdfAnoms = pdfPredict[pdfPredict[0] > 0]
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) pdfPredict.shape: ", pdfPredict.shape)
+        if (m_blnTraceOn): print("TRACE (providers.get_xgbPredict) #anoms: ", len(pdfAnoms.index))
+
+        #--- group data by provider
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) group claims by provider ... ")
+        pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) pdfResults.shape: ", pdfResults.shape)
+
+        #--- stitch the grouped data with the labels
+        if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) merge labels into dataset ... ")
+        pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+
+    except:
+        e = sys.exc_info()
+        print("ERROR (providers.get_xgbPredict_genError): ", e)
+
+    if (m_blnTraceOn): print("TRACE (providers.get_xgbPredict) proc complete; return ... ")
+    return pdfResults
+
+
+def get_encPredict(pdfTestClaims):
+
+    #--- principal component analysis predictions; load test data
+    pdfClaims = pdfTestClaims
+    if (m_blnTraceOn): print("TRACE (providers.get_encPredict) ppdfClaims.shape: ", pdfClaims.shape)
+
+    pdfFeatEng = do_featEng(pdfClaims, False)  #--- not grouped by provider
+
+    #--- perform standard scaling; get fit then transform
+    npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)  #--- grouped by provider
+    pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+    #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+    #--- perform PCA; then autoencode predict
+    ndaPredict = mdl_autoenc.predict(pdfScaled)
+    #print("INFO (predict.npaPredict.shape): ", ndaPredict.shape)
+
+    pdfPredict = pd.DataFrame(ndaPredict)
+    #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+    #--- stitch the grouped data with the labels
+    pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+    #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+    pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+    return pdfResults
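A minimal end-to-end sketch of the provider path (not part of this commit), assuming data/test_claims.pkl and the provider scaler/GBC pickles in bin/models/ are present and version-compatible:

# sketch: flag provider-level anomalies with the supervised GBC path
import lib.claims as libClaims
import lib.providers as libProviders

pdfClaims = libClaims.loadPkl_claims(blnIsTrain=False)     # claim-level test data
pdfResults = libProviders.get_xgbPredict(pdfClaims)        # grouped by Provider, "hasAnom?" column prepended
print(pdfResults[pdfResults['hasAnom?'] > 0][['hasAnom?', 'Provider']].head())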
lib/utils.py
ADDED
@@ -0,0 +1,23 @@
+#--- note: this file is loaded by fastapi and streamlit,
+#    so keep it independent of those libs
+
+from pathlib import Path
+
+pth_pwd = Path(__file__).resolve().parent
+pth_appRoot = pth_pwd.parent
+
+pth_root = str(pth_appRoot) + "/"
+pth_api = pth_root + "api/"
+pth_bin = pth_root + "bin/"
+pth_binModels = pth_root + "bin/models/"
+pth_data = pth_root + "data/"
+pth_lib = pth_root + "lib/"
+pth_libModels = pth_root + "models/"
+pth_model = pth_root + "model/"
+pth_qa = pth_root + "qa/"
+pth_routes = pth_root + "routes/"
+pth_templ = pth_root + "templ/"
+pth_uix = pth_root + "uix/"
+
+m_klngMaxRecords = 100
+m_klngSampleSize = 25
lit_index.py
ADDED
@@ -0,0 +1,25 @@
'''
toExecute:  (from root app folder) ... streamlit run lit_index.py
'''
import streamlit as st
#from uix import lit_sidebar as lit_sideBar
import uix.lit_sidebar as litSideBar


#--- streamlit:  specify title and logo
st.set_page_config(
    page_title='Healthcare Claims - ML Anomaly Detection',
    #page_icon='https://cdn.freebiesupply.com/logos/thumbs/1x/nvidia-logo.png',
    layout="wide")
st.header("Healthcare ML Claims Anomaly Detection")
st.markdown('---')


#--- streamlit:  add a sidebar
litSideBar.init()


#if __name__ == '__main__':
#    st.run("main:app", host="0.0.0.0", port=48300, reload=True)

#aryPkg[moduleNames.index(page)].run()
main.py
ADDED
@@ -0,0 +1,97 @@
'''
purpose:  fastAPI application entry point and route configuration
'''

from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastapi import APIRouter, Request, Response
from fastapi.templating import Jinja2Templates
import uvicorn


from lib import claims as libClaims, providers as libProviders
import lib.utils as libUtils
from lib.models import mdl_utils as libMdlUtils


#--- imported route handlers
from routes.api.rte_api import rteApi
from routes.qa.rte_qa import rteQa
from routes.qa.rte_claims import rteClaims
from routes.qa.rte_providers import rteProv


#--- fastAPI self doc descriptors
description = """
Fourthbrain Capstone:  MLE10 Cohort

The Healthcare Claims Anomaly API is provided to assist with:

## Claims Analysis
## Supervised Provider Predictions - Anomaly Detection (XGBoost)
## Unsupervised Claim Predictions - Anomaly Detection (KMeans Cluster)

You will be able to:
* Analyze Claims data
* Identify potential Provider Anomalies
* Identify potential Claim Anomalies
"""

app = FastAPI(
    title="App:  Healthcare Claims - Anomaly Detection",
    description=description,
    version="0.0.1",
    terms_of_service="http://example.com/terms/",
    contact={
        "name": "Iain McKone",
        "email": "iain.mckone@gmail.com",
    },
    license_info={
        "name": "Apache 2.0",
        "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
    },
)


#--- configure route handlers
app.include_router(rteApi, prefix="/api")
app.include_router(rteQa, prefix="/qa")
app.include_router(rteClaims, prefix="/claims")
app.include_router(rteProv, prefix="/providers")


m_kstrPath_templ = libUtils.pth_templ
m_templRef = Jinja2Templates(directory=str(m_kstrPath_templ))


def get_jinja2Templ(request: Request, pdfResults, strParamTitle, lngNumRecords, blnIsTrain=False, blnIsSample=False):
    lngNumRecords = min(lngNumRecords, libUtils.m_klngMaxRecords)
    if (blnIsTrain):  strParamTitle = strParamTitle + " - Training Data"
    if (not blnIsTrain):  strParamTitle = strParamTitle + " - Test Data"
    if (blnIsSample):  lngNumRecords = libUtils.m_klngSampleSize
    strParamTitle = strParamTitle + " - max " + str(lngNumRecords) + " rows"

    pdfClaims = pdfResults.sample(lngNumRecords)
    htmlClaims = pdfClaims.to_html(classes='table table-striped')
    kstrTempl = 'templ_showDataframe.html'
    jsonContext = {'request': request,
                   'paramTitle': strParamTitle,
                   'paramDataframe': htmlClaims
                   }
    result = m_templRef.TemplateResponse(kstrTempl, jsonContext)
    return result


#--- get main ui/ux entry point
@app.get('/')
def index():
    return {
        "message": "Landing page:  Capstone Healthcare Anomaly Detection"
    }


if __name__ == '__main__':
    uvicorn.run("main:app", host="0.0.0.0", port=48300, reload=True)
    #CMD ["uvicorn", "main:app", "--host=0.0.0.0", "--reload"]
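A short usage sketch (not part of the commit) for exercising the app once it is started with `python main.py` or an equivalent uvicorn command; the host/port come from the uvicorn.run() call above, and the `requests` package is an assumption (it is not listed in this diff):

import requests

kstrBase = "http://localhost:48300"                 # host/port configured in main.py
print(requests.get(kstrBase + "/").json())          # landing page message
print(requests.get(kstrBase + "/qa/").json())       # qa routing message
# FastAPI's generated OpenAPI UI is served at kstrBase + "/docs"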
routes/__init__.py
ADDED
File without changes
routes/api/__init__.py
ADDED
File without changes
routes/api/rte_api.py
ADDED
@@ -0,0 +1,67 @@
from fastapi import APIRouter, Request, Response
from fastapi.responses import JSONResponse

import pandas as pd
import json

import lib.claims as libClaims
from lib.models import mdl_utils, mdl_xgb


rteApi = APIRouter()


#--- return json for claims data (merged)
#--- note:  current is kaggle, but future could include from yyyymm filter
@rteApi.get('/claims', response_class = JSONResponse)
def api_getClaims(request: Request, response: Response):
    pdfClaims = libClaims.load_claims()
    jsonSample = pdfClaims.head(50).to_json(orient="records", indent=4)
    result = json.loads(jsonSample)
    return result


#--- return json for featEng
@rteApi.get('/claims/doFeatEng/', response_class = JSONResponse)
def tst_claims_featEng():
    pdfClaims = libClaims.load_claims()
    pdfFeatEng = libClaims.do_featEng(pdfClaims)
    jsonSample = pdfFeatEng.head(50).to_json(orient="records", indent=4)
    result = json.loads(jsonSample)
    return result


@rteApi.get('/claims/doStdScaling/', response_class = JSONResponse)
def tst_claims_stdScaling():
    pdfClaims = libClaims.load_claims()
    pdfFeatEng = libClaims.do_featEng(pdfClaims)
    pdfScaled = mdl_utils.doClaims_stdScaler_toPdf(pdfFeatEng)

    jsonSample = pdfScaled.head(50).to_json(orient="records", indent=4)
    result = json.loads(jsonSample)
    return result


@rteApi.get('/claims/predict/superv', response_class = JSONResponse)
@rteApi.get('/claims/predict/xgb', response_class = JSONResponse)
def predict_xgb():
    #--- load test data
    pdfClaims = libClaims.load_claims()
    pdfFeatEng = libClaims.do_featEng(pdfClaims)

    npaScaled = mdl_utils.do_stdScaler(pdfFeatEng)
    pdfScaled = mdl_utils.do_stdScaler_toPdf(npaScaled)

    ndaPredict = mdl_xgb.predict(npaScaled)
    pdfPredict = pd.DataFrame(ndaPredict)

    #--- stitch the grouped data with the labels
    pdfResults = pdfScaled.copy()
    pdfResults.insert(0, "hasAnom?", pdfPredict[0])

    #--- filter to only those rows that are flagged with an anomaly
    pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]

    jsonSample = pdfResults.head(50).to_json(orient="records", indent=4)
    result = json.loads(jsonSample)
    return result
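A small sketch (not part of the commit) that pulls the JSON prediction endpoint back into pandas; it assumes the router is mounted under the /api prefix as configured in main.py and that the service is reachable locally, and uses the `requests` package as an assumption:

import pandas as pd
import requests

resp = requests.get("http://localhost:48300/api/claims/predict/xgb")
pdfAnoms = pd.DataFrame(resp.json())        # records already filtered to "hasAnom?" > 0
print(pdfAnoms.shape)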
routes/qa/__init__.py
ADDED
File without changes
routes/qa/rte_claims.py
ADDED
@@ -0,0 +1,139 @@
from fastapi import APIRouter, Request, Response
from fastapi.responses import HTMLResponse


import main as libMain
from lib import utils as libUtils, claims as libClaims
from lib.models import mdl_utils as libMdlUtils


m_kstrFile = __file__
m_blnTraceOn = True

m_kstrPath_templ = libUtils.pth_templ


rteClaims = APIRouter()


#--- get claims data
def claims_loadData(request: Request, response: Response, blnIsTrain=False, blnIsSample=False, blnForceCsv=False):

    pdfClaims = libClaims.load_claims(blnIsTrain)
    lngNumRecords = libUtils.m_klngMaxRecords
    strParamTitle = "Claims"

    return libMain.get_jinja2Templ(request, pdfClaims, strParamTitle, lngNumRecords, blnIsTrain, blnIsSample)


@rteClaims.get('/data/loadCsv/', response_class = HTMLResponse)
def claims_loadCsv(request: Request, response: Response):
    #--- forces a reload of csv's in case a refresh is required
    pdfClaims = libClaims.load_claims(False, True)
    pdfClaims = libClaims.load_claims(True, True)
    return claims_loadData(request, response, True, False)


@rteClaims.get('/data/train/', response_class = HTMLResponse)
def claims_loadTrainData(request: Request, response: Response, blnIsSample=False):
    return claims_loadData(request, response, True, blnIsSample)


@rteClaims.get('/data/train/sample', response_class = HTMLResponse)
def claims_loadTrainSample(request: Request, response: Response):
    return claims_loadTrainData(request, response, True)


@rteClaims.get('/data/test/', response_class = HTMLResponse)
def claims_loadTestData(request: Request, response: Response, blnIsSample=False):
    return claims_loadData(request, response, False, blnIsSample)


@rteClaims.get('/data/test/sample', response_class = HTMLResponse)
def claims_loadTestSample(request: Request, response: Response):
    return claims_loadTestData(request, response, True)


@rteClaims.get('/doStdScaling/', response_class = HTMLResponse)
def claims_stdScaling(request: Request, response: Response, blnIsTrain=False):
    pdfClaims = libClaims.load_claims(blnIsTrain)
    pdfFeatEng = libClaims.do_featEng(pdfClaims, blnIsTrain, False)
    npaScaled = libMdlUtils.doClaims_stdScaler(pdfFeatEng, blnIsTrain)
    pdfScaled = libMdlUtils.doClaims_stdScaler_toPdf(npaScaled)

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True

    strParamTitle = "Std Scaled Claims"
    return libMain.get_jinja2Templ(request, pdfScaled, strParamTitle, lngNumRecords, blnIsTrain, blnIsSample)


@rteClaims.get('/doStdScaling/train', response_class = HTMLResponse)
def claims_stdScalingTrain(request: Request, response: Response):
    return claims_stdScaling(request, response, True)


@rteClaims.get('/doStdScaling/test', response_class = HTMLResponse)
def claims_stdScalingTest(request: Request, response: Response):
    return claims_stdScaling(request, response, False)


@rteClaims.get('/doFeatEng/', response_class = HTMLResponse)
def claims_doFeatEng(request: Request, response: Response, blnIsTrain=False):
    pdfClaims = libClaims.load_claims(blnIsTrain)
    pdfFeatEng_claims = libClaims.do_featEng(pdfClaims, blnIsTrain)

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True

    strParamTitle = "Feature Engineered Claims"

    return libMain.get_jinja2Templ(request, pdfFeatEng_claims, strParamTitle,
                                   lngNumRecords, blnIsTrain, True)


@rteClaims.get('/predict/kmeans', response_class = HTMLResponse)
def predict_kmeans(request: Request, response: Response):

    #--- load test data, perform featEng, stdScaling, and fit to Kmeans args
    pdfClaims = libClaims.load_claims(False)
    print("TRACE: claims.predict.kmeans getting prediction ...")
    pdfResults = libClaims.get_kmeansPredict(pdfClaims)
    print("TRACE: claims.predict.kmeans prepping response ...")

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = False
    strParamTitle = "Predictions (KMeans Clusters)"

    return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
                                   lngNumRecords, False, blnIsSample)


@rteClaims.get('/fit/kmeans', response_class = HTMLResponse)
def fit_kmeans(request: Request, response: Response):

    #--- load test data, perform featEng, stdScaling, and fit to Kmeans args
    pdfClaims = libClaims.load_claims(False)
    mdlKMeans = libClaims.get_kmeansFit(pdfClaims)

    #--- inspect KMeans data;  clusters, centers, sizes
    #lstCenters = mdlKMeans.cluster_centers_
    lstIdx = range(len(mdlKMeans.cluster_centers_))
    if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit_kmeans) lstIdx: ", lstIdx)

    lstSize = [sum(mdlKMeans.labels_ == idx) for idx, _ in enumerate(lstIdx)]
    if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit_kmeans) lstSize: ", lstSize)

    return
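These QA routes return rendered HTML rather than JSON; a minimal test sketch (not part of the commit) using FastAPI's TestClient against the /claims prefix configured in main.py; it assumes the app's data and model files are importable locally and that the httpx dependency required by TestClient is installed:

from fastapi.testclient import TestClient
import main

client = TestClient(main.app)
resp = client.get("/claims/data/test/sample")   # sampled test claims rendered via templ_showDataframe.html
print(resp.status_code, resp.text[:120])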
routes/qa/rte_providers.py
ADDED
@@ -0,0 +1,188 @@
from fastapi import APIRouter, Request, Response
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
import pandas as pd

import main as libMain
from lib import utils as libUtils, claims as libClaims, providers as libProviders
from lib.models import mdl_utils as libMdlUtils


m_kstrFile = __file__
m_blnTraceOn = True

m_kstrPath_templ = libUtils.pth_templ
m_templRef = Jinja2Templates(directory=str(m_kstrPath_templ))


rteProv = APIRouter()


#--- get claims data
def providers_loadData(request: Request, response: Response, blnIsTrain=False, blnIsSample=False):

    pdfProviders = libProviders.load_providers(blnIsTrain)

    lngNumRecords = libUtils.m_klngMaxRecords
    strParamTitle = "Providers"

    return libMain.get_jinja2Templ(request, pdfProviders, strParamTitle, lngNumRecords, blnIsTrain, blnIsSample)


@rteProv.get('/data/train/', response_class = HTMLResponse)
def providers_loadTrainData(request: Request, response: Response, blnIsSample=False):
    return providers_loadData(request, response, True, blnIsSample)


@rteProv.get('/data/train/sample', response_class = HTMLResponse)
def providers_loadTrainSample(request: Request, response: Response):
    return providers_loadTrainData(request, response, True)


@rteProv.get('/data/test/', response_class = HTMLResponse)
def providers_loadTestData(request: Request, response: Response, blnIsSample=False):
    return providers_loadData(request, response, False, blnIsSample)


@rteProv.get('/data/test/sample', response_class = HTMLResponse)
def providers_loadTestSample(request: Request, response: Response):
    return providers_loadTestData(request, response, True)


@rteProv.get('/doFeatEng/', response_class = HTMLResponse)
def providers_featEng(request: Request, response: Response, blnIsTrain=False):
    pdfClaims = libClaims.load_claims(blnIsTrain)
    pdfFeatEng_claims = libClaims.do_featEng(pdfClaims, blnIsTrain)
    pdfFeatEng_providers = libProviders.do_featEng(pdfFeatEng_claims)

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True

    strParamTitle = "Feature Engineered Claims Grouped by Provider"

    return libMain.get_jinja2Templ(request, pdfFeatEng_providers, strParamTitle,
                                   lngNumRecords, blnIsTrain, True)


@rteProv.get('/doFeatEng/train', response_class = HTMLResponse)
def providers_featEngTrain(request: Request, response: Response):
    return providers_featEng(request, response, True)


@rteProv.get('/doFeatEng/test', response_class = HTMLResponse)
def providers_featEngTest(request: Request, response: Response):
    return providers_featEng(request, response, False)


@rteProv.get('/doStdScaling/', response_class = HTMLResponse)
def providers_stdScaling(request: Request, response: Response, blnIsTrain=False):
    pdfClaims = libClaims.load_claims(blnIsTrain)
    pdfFeatEng = libClaims.do_featEng(pdfClaims, blnIsTrain)
    npaScaled = libMdlUtils.doProviders_stdScaler(pdfFeatEng, blnIsTrain)
    pdfScaled = libMdlUtils.doProviders_stdScaler_toPdf(npaScaled)

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True

    strParamTitle = "Std Scaled Claims Grouped by Provider"
    return libMain.get_jinja2Templ(request, pdfScaled, strParamTitle,
                                   lngNumRecords, blnIsTrain, blnIsSample)


@rteProv.get('/doStdScaling/train', response_class = HTMLResponse)
def providers_stdScalingTrain(request: Request, response: Response):
    return providers_stdScaling(request, response, True)


@rteProv.get('/doStdScaling/test', response_class = HTMLResponse)
def providers_stdScalingTest(request: Request, response: Response):
    return providers_stdScaling(request, response, False)


@rteProv.get('/predict/superv', response_class = HTMLResponse)
@rteProv.get('/predict/xgb', response_class = HTMLResponse)
def predict_supervised_xgb(request: Request, response: Response):

    #--- load test data
    #--- filter to only those rows that are flagged with an anomaly
    pdfClaims = libClaims.load_claims(False)
    pdfFeatEng = libClaims.do_featEng(pdfClaims)
    pdfResults = libProviders.get_xgbPredict(pdfFeatEng)
    pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True
    strParamTitle = "Provider Predictions (Gradient Boosting Classifier)"

    return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
                                   lngNumRecords, False, blnIsSample)


@rteProv.get('/predict/logr', response_class = HTMLResponse)
def predict_supervised_logr(request: Request, response: Response):

    #--- load test data
    #--- filter to only those rows that are flagged with an anomaly
    pdfClaims = libClaims.load_claims(False)
    pdfFeatEng = libClaims.do_featEng(pdfClaims)
    pdfResults = libProviders.get_logrPredict(pdfFeatEng)
    pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True
    strParamTitle = "Provider Predictions (Logistic Regression)"

    return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
                                   lngNumRecords, False, blnIsSample)


@rteProv.get('/predict/svm', response_class = HTMLResponse)
def predict_supervised_svm(request: Request, response: Response):

    #--- load test data
    #--- filter to only those rows that are flagged with an anomaly
    pdfClaims = libClaims.load_claims(False)
    pdfFeatEng = libClaims.do_featEng(pdfClaims)
    pdfResults = libProviders.get_svmPredict(pdfFeatEng)
    pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True
    strParamTitle = "Provider Predictions (Support Vector Machines)"

    return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
                                   lngNumRecords, False, blnIsSample)


@rteProv.get('/predict/enc', response_class = HTMLResponse)
def predict_kerasSeq(request: Request, response: Response):

    #--- load test data
    #--- filter to only those rows that are flagged with an anomaly
    pdfClaims = libClaims.load_claims(False)
    pdfFeatEng = libClaims.do_featEng(pdfClaims)
    pdfResults = libProviders.get_encPredict(pdfFeatEng)
    pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]

    lngNumRecords = libUtils.m_klngMaxRecords
    blnIsSample = True
    strParamTitle = "Claims Predictions (Transformer/Encoder - Keras Sequential)"

    return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
                                   lngNumRecords, False, blnIsSample)
routes/qa/rte_qa.py
ADDED
@@ -0,0 +1,17 @@
from fastapi import APIRouter


m_kstrFile = __file__
m_blnTraceOn = True


rteQa = APIRouter()


@rteQa.get('/')
@rteQa.get('/verif')
@rteQa.get('/valid')
def qa_entry():
    return {
        "message": "qa routing - For verification, validation"
    }
templ/templ_results.html
ADDED
@@ -0,0 +1,4 @@
<!DOCTYPE html>
<html>
<body>{{ dataframe | safe }}</body>
</html>
templ/templ_showDataframe.html
ADDED
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Fourthbrain Capstone: Healthcare Anomalies</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
</head>
<body>

    <h2>{{ paramTitle }}:</h2>

    <!-- Mark data as safe, otherwise it will be rendered as a string -->
    {{ paramDataframe | safe }}
</body>
</html>
uix/__init__.py
ADDED
File without changes
uix/images/image1.jpg
ADDED
uix/images/image1.jpg:Zone.Identifier
ADDED
@@ -0,0 +1,3 @@
[ZoneTransfer]
LastWriterPackageFamilyName=Microsoft.Windows.Photos_8wekyb3d8bbwe
ZoneId=3
uix/lit_packages.py
ADDED
@@ -0,0 +1,36 @@
import importlib


#--- return a list of streamlit packages/pages to render
def packages():
    #---
    ary_pkg = []
    ary_pkg.extend(['lit_continentData',
                    'lit_countryData'
                    ])
    '''
    ary_pkg.extend(['lit_claimAnalysis',
                    'lit_claimAnomalies'
                    ])
    '''
    return ary_pkg


def get_aryPkgDescr():
    #--- load list of pages to display
    aryDescr = []
    aryPkgs = []

    aryModules = packages()
    for modname in aryModules:
        m = importlib.import_module('.' + modname, 'uix')
        aryPkgs.append(m)

        #--- use the module description attribute if it exists
        #--- otherwise use the module name
        try:
            aryDescr.append(m.description)
        except:
            aryDescr.append(modname)
    return [aryDescr, aryPkgs]
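Each page module loaded this way (and listed in m_aryPages in lit_sidebar.py) is expected to expose a `description` string and a `run()` callable; a minimal sketch of such a page follows, where the module name `lit_example` is hypothetical and not part of this commit:

# uix/pages/lit_example.py   (hypothetical page module, shown only as a sketch)
import streamlit as st

description = "Example Page"      # used as the sidebar label instead of the module name

def run():
    #--- called by lit_sidebar.writePage() when this page is selected
    st.markdown('### Example Page')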
uix/lit_sidebar.py
ADDED
@@ -0,0 +1,99 @@
import streamlit as st
import importlib
from uix import lit_packages

from uix.pages import lit_home, lit_about
from uix.pages import lit_anom_superv, lit_anom_unsuperv


#--- alt define sidebar pages
m_aryPages = {
    "Home": lit_home,                                #--- TODO:  update
    #"Provider Analysis": lit_providerAnalysis,
    #"Claims Analysis": lit_claimAnalysis,
    "Provider Anoms - Supervised": lit_anom_superv,
    "Claim Anoms - UnSupervised": lit_anom_unsuperv,
    #"MLE Model Performance": lit_about,             #--- TODO:  update
    "About": lit_about
}


#--- define module-level vars
m_aryModNames = lit_packages.packages()
m_aryDescr = []
m_aryMods = []


def init():
    #--- upper panel
    with st.sidebar:
kstrUrl_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAf4AAABjCAMAAABNPpI+AAAAxlBMVEX////rAIvrAIrqAIfqAITqAIP/9/zsAI/1iMfqAIHwZqrtDZfyb7Pzbbf+8vr3p9L4rtf6zeX95vLtI5X0dr7wX6vyWrH/+/72mcv1jsXvNaH61ub5uNz4q9n6x+X1pcruSp794vPwT6v96/X92/D4tNntMJn81ez6weLzebrwVKj7zef95fPvQaLzi7z82e/3otDwRKjuKZv2lcryYbP1mMb0gMPzg7zuPZzuTqL5wt70nMbygrfvWqXsJ5L2r9DxdrH3ttMtxwOlAAAZMklEQVR4nO1de2OaPheGJIj+VNRSi1rninhFWa06q93mtu//pd6cJNyDYl/b2c3nj80KQsKTnJxbDooih9N60tFy2rAyjoc4fsYVHw31EUGqijB6dg6eN259/fX1k/FOrbrifVDDlHwAws0D3DpTFVMgc/F+TbvizTFWKfuYaDAItOfM0+Ye4aMEkWLhHZt3xZuiUMUqQuvG/YoOA6QPMk4zmhjEA2b/tt61hVe8IW50pOI6fLJHlP/i3DEoUipeHXg3m9+bMEjIYR3hio+DsabiPv84pMyao12TovKdor2iWAy7w+Fk8IRVvBwXFKNG+SftP9vmK86GGlHJC/9owcxGIOGjQAC2MMz5D6im8OMPNviKc6KnBfQroAQiAVDy1ACgHQq10NkitL1K/78EDl36p/zjDIje7/ch6SgQBlTi/xS/+HGl/+/BEFS/BnwyXLq+N+nyblPMuhSfWhSrLxTfdYTFgk9nP27+yRZfcTZYKx0EvrkwlJnLhP3uRnrihiCT+4S6OkKj4Xs28oo3wrhMEHh7ESaEYPikYrUmc+w/mgh3bEspdNl4IZ3x1f3/MTGf1IaP8MFq6dSax/pUxUzXQ/tbHcbAVLKyW0VCLb/p1ypTD9mvruv/B8Sguqfz13Rnit3HYOi5A2Xs7RmrW6exBFEwmkl+eIeZ1w9swD38i0e1q/P3o6GmY7DiEdamJpv6CybE7UltjZE5VoxnWABwK81sC4u4EG7N2yA2kOa+pM664pLRBf8NuHIo8+C99cJ5XidI78L/sAAgNynaLZ39kqA7SAsYPxG2Atxew78fCLYJ3rvmL1dnbh2zGDnmlFT8jX2o0NGB9WFct2vTQaHfPM79v7s7pjXqi6sK8GGwIJTDGTXthyAF9r3YQboiTBnlhRbV/5E6jfI/34buIQ5npTMB4k7evt1XnAVPCOlj9mlIrXczHt19wsgTstz2wN3nNcKDX+mAMB/jV7PXKqgPeM1cBc7N+P7mqgxeMkx/givODvGlPsSUTmbf5+OsmAugzk6e92Yzumrgdep63b4GKoC5Muz2kmiaXr2KggsGEss7RYVO3Frs4AtRtfvwL5OlgTjKYL01TRNYHqcvaPAVAD+MWMYY1QXaf5c7CHpjXWSXjNWu2T5N2oaz33Dp7I9P1YaOSCSPxwFpgE3uEwL219KnMK8wI9CPDiKSnS/2AbEpAdD8+JnvDWekUTtsJHfRZwBse96VHl371U+xg4MlwnfRL2o69wNz+gPFIAGr19HYzCelEhgDROYz+qgogrMDlU96yO+DhQazjRSPnxliTOks25ZizZjlx7z4AYwmRp3Y6Y7LJrT548cWJIGXJQRbYE+OFgWlPqKnHQoHFo7isgTtKfRLOvOGLeszL1yCsCOwIK/TnH6b6oHnPqL9g8MvPsMhwRP3Z5ZiT+EH9YzLgh7JzYiBidDoMeM0iq+VY8i6x5/BKfTfpjvz9VutK9GYzoHX0K8UPIyE5x6Z4P0Frd0/2KJfxyW342JUZsetCiQDZPj46EpCRPb/giD9gCv4P4IPQ9uc1J+3xin0l2X9Qbq5XU/eQKRVGf1+mmZeFNjEh5W6PVgxpY0su6J1Q7rGxyefXQ60wTGVG2VbftEXVdXEIYfqAYfox+ph4A9MP5J1CBwoRF9lPLnX44XvzOoePzMG6x7415Zg0o/7JRgAqssb90gvt4qdPNdRYAuC0/dekeJFRZrvMNKQ3pOfBfj36BdxstHZF7U2oXiFmUWbSfzBOHGZ2Y5v6ReFAR2q+tMwkPA3jTU9VYyvG5y9CaQXpoK1MXo4EAUQ9KNMkD9Df6FfLusSEX06/UGitMiX9QdA6fbcK4Dx6dNr4i20WXrwO+cb+G1UMqrZFaYPYuzy9X/c8hDIBo+f2KL0L+UizGmCecA8xA0Tkan0JA5Ov17OhH6SJXM2OPShlM5BPzL7Ah3am5JGcOARuZC9USXan8iwsasq89uYfkOxObOcuqdiLMQxFQdWHTJCn6UD2KbqJIiG4v1NEbwJh8wdRj9uOtaFGX4toqpnoR/fKZYA9GbccvfiuSKUtYHuXVGg9C9jKnzXJdwORISOVtqFXUUnPNOfRfW9r5+rLMtHuqhPYNxUQMPRNDoO9IPKiKD/4rIEXHw++hOw21zZlsVM/gDYrq748y8UhRdoaNc9tomTO/BxR/zJBIFcK+uaGOt1q8h+Q+3Iw2buhdI/h5jGG9HPvOls+ptnV/9fgRml/yn5/AdhRK8qZD7ynimV812gvyBPItZXVOsbAeWP7T1CR/XbC6W/Dv17M/qVHt8whWqSY++NCVFxNUlkI/TW3YxAFcR3Ex4cMHZgGhANFoVV4leK1cKIbMWEb+Lj2z8ulP5naFZJcuA89IPPDI59kxx7b1AtB0+TA31IKf4lPn/BKm6FumETI726GkOyl5pY140pZb/qn3qL0dGtn5dJvwMjXtUKvkLqOP7zOQ/99PEy+isXkA0jod9YeFT2V8RflH4SHnMe+JY+y4NwQaz9Dh0ZEUFSxGh7LDJ6Iv32rF6vfavXJ+dVmgs9etUQn78IdqbMUT+t9Dt+nlOC/jltT73e7UkX8QP090x0kH5jUq99pt1MPL5BbwL3m8xeMV1s2sfPtfpwlrgn0P8l+kWhuOUBXSHtXYxQeHRODz7Ah0c9ke1n7yj7m3AgtQgaHaPpFPqdlTsyoe4MUvX9wyY1sh7dB4BsQe2xI26gho7hz63wWdL+mmp8G7uweMO/fb81p19n9BcWrD2qqpuj3Sbd0wP0D0YJ+ofQID+COqhudW5mh04Py1hUH0amzu+3dVOZ90af9TGstNDd0j//46K4UHNHOtfgze06JrNvcbhZl8JugZEHNxc7uFc41gfQCh749cGPFdZ1ui/TVSHqyugSZB6L9eenvzElBPt+M7bRuJ8wKhomeNbSCglrKkS2wtY0IGelxOgvrIRRewDCh6VEZ79Dn5NoD9v3rFWTfT1OfygpazjwMzkb5PeT+AkYdrePSOA35GGDRNk1owzHyW3wRV3zu2zUypp/SdbUTkQjp/Sj3bTNM33mzyO2w68/JWDZ2dRKhSzvSNBmbAYbequEPpjeavqr2OMPX40p+jMtmT+URm76W2YqOoD1uIe7wSSq1JnW5SSG9IPpRaC1TgUf4R5+GUgxTr/nsOTXRHPMhNw5LvxDoVtjygas
IeA08y+pcceKU9ziVCtRYvgb/HYR+gnIqQkPvyeaioIUPMfllry2nBQGU8JmSXWsWPA15m4fFFUNJtSyq/KPVoeOHBaxJZ02oQZf3A1klVT12M7fnPTPXRLam+En0onOgFPph7DE3CNHyadnun4DBf3GRMV+a8Lm4HiNmwP0d7nlH44XRj+hXM2iw7zEb7sIOg9zN7ghjoVcpPSr5DMdapKmkiqn1FpjLr3o1Sqw4YvoUxhV1lR8jxLuCaDf9/c0RkhcFdEVopP08Ggq/qkcRj76HZeIB0wNznIZBZ5z3I/wfzL9a8W6I/y6sKM5hHheRBMoBQYsox9XByMxZ7Ryuaxpvhs37uE8QD8zLJEa5sFw+heKw7gKCurwgwbvGJXaernT75RLfu+jjhc5/fj5Zon5b0u0qThoKuardhuEvLndqnxjN9oX+QM1YA/X3tT1Ef0yupz+pqqC//cjHNxvR8xBvEypsWWEP8s6H0Eu+gsV4XlS3bYDhtjNp74Y0jiS3XIy/XewVQkuu78rticRLNjFSXdmCESsGbhQ02PbId3iHFpjzKp+a5bx7mfRb/NDEacvo59KD14uT91WixT9J3F0Q7VzbO6KY26H3rT3gsTIciOnH3lP0FTVaw/YTxsbnoKt8hxPB1imsl7pMm/uaOFP9K5GfzWfTXoGLI6RUf2bhBoJHGrNFaNO+ZcU91uitEMhgVz0t0VUOLp/aCzW7IjtcSr9qDxm+xR39eTt71naZKbbh60/qBlZ6mx/gEaX/0z6oWwKTP6ItOT0Vxcs1pLaHGEj4q2isvWGl96ICj85/aKpkQs6U74GML5qiNVvgStC3DdUXu8Cfd+h9pwZPqCv2NdIFJuKC/6woQRcOdXNPk67ExPIQ/+cE03i17JaPAgVJpOcTL8JrgvSTt9c0J/l9GWtiYfrDZFrVYl8m0W/3eTCJaopMPrREiop6vXUMytsuolIPnPF0l+Ey3IG/UzKxHpoPfOmgj7zhT6AcdAzLdDw5/tQ34dCHk0r8mNN/GJF0E7YlbT36clyi9PBhATy0C+ebDX5/YJw8e837VT6VVafThaVOE5/qjVzjxMYSWuV0G859ktVY3OfxKLlNWHRqcdiZEGPeF53aHdn0p9K/yvsuPC5B/oDl06DRK5WQxFrp6Yh1T8C7mr/uVRx4Bn8RgdFyg/TxolIcho56B9zViUOxA6fRb5kO5l+eN7SetRH6UejlJ9vw49EtkHydI9Oy0f7+5df7pK5cxDWV7Gr13yHhplzV1xBT6gPWfT7mzgjaLEjUL4PZr949kMSzn5QtsJVlVoBCIv+FvpU9RMtp7ObLxxMJqBl8T5O4096m4VzcPUX9B9aIngZCSxx5onNCa748xX0R55WFEfpl/iWbD4dI80sBz4hFFbE41bSvpoYPj79sm7KcceGUahrZtEf9+hycLFJ7Teqa/h3pCpisJTY4NAN9T3Hw9gTYv4JobJ4LqvArceWImqWeV8mwSJldUEeErVyKKrN6Efbb59T8Ce71eFLvISLAltEkS8uT6dfeG9TOEa/NFEHCeU9QEaqJzJX3dQjEfSjZfJAJtr56Jd63jpIrF+2iZAJypMFvt3Au0ElAVIj51MlT3SM3iWg38HMAaYwH5FI79C3VZ7FXpgi/qjI6MBqxnP9knVjoWiIL4gcPlSl5vOCkyicC6fTL3UQKznol22mKOekX1Xd1Sx5aUE/yZ+n/ZvZdXrwdxb9I8lv+1jQT2ewirTbn5/hG9z0B/USJeRijVJUF3fBfdF4G07T2y+tJWV/29SJcDCo1d8D447Qg8yDgrO2AyjZid7h3hCu4GnS4MGA+06FEnW66qc3JOcqx+mX5jk9sWca0bMy6aePqLyKb30Sql8nf6ruC58Wwd8Z9Eszypgyzcaw4fo0gdopNJo5+ILic5aa2cz5Z8AuL/5cIMLHUj9YbvjQGNefCOEyAI9c5pWuN6ZUbUwr7QGO03/L3WDyyDFzf/ipaqfTv8tQOY/RT2QKI1uMo1KK049LUWgiTISwGYvZcfozcmelyEu/LFO6HdDPhDQr7QV79nlipkE7ifrxgWh5XM8zSr6A60KET8Qi0J5LrcKiueWrAPJ9UjNYXjLF/3H6uYtkKV+kuf4j8pRPph9npaAfo1+TtUZOf3zVciaLyoMuFkU3MqTF7D+q+FmFwo3jzG17/hnnol+6nETopwt9hRK5nDYaEMHRu4/PUOgJuYn51uBbNxqaCAR3YQTX562ODvM7jL3Zw6muRZ8EeFAzU/7E2k+S0IIg4wPj1JNLRS4aRA2Sk+knGUv/ceH/WvoBTnfNQzjEDYUPd/scDpDak8W350rV87ylrpeF9hIczaBfk61vMfoVS7yv4wa8YKopgov7xLipUwnRAKWQ/IainqqI8BUM0AA7ETFqGb1b6J248Vw7kNTGNX+v3k1B8G0kEyNi2EQn4+n0f5KcCjhCv9xgyEs/fUA1NSnrOf1m5nY4yy5u9zoK3qfgR++O0y+bN3H6w0tUSKQuR9IlMiVo1KY2A/41m9MlHXu+fAAHcUK7o18FmwGP0n/I7rcP0s8dGO9OvzTXLz/9LJWSPeFgUTxMf6HWKWEkS0o5Tr9Mvcmgn8fWkB/m1+Mjx1kinvCPzS1CJFQOqA2S1DCmONCPxuTAmnbU6zc/SH83KuA+EP3KOuHIPkj/zI1kF/iWMX4D+qFZSF0NlEEFpY2GKcsAwZiF+CPHxpAVHj/1Bfsve4FUwX1mgYej9B+e/fUPOvuFNyOMkxyif4FEjBbTeef++jUtFouL+iaf6ncS/Ybm+x0h3QNtY7+FHf24+nvRh4o91cizKfxH2xf3g9149ByvaymzprQCnI+j9DsH6S/mXPvF070c+rnjTcU+3wfoF2/VRKQ8/d0YBE8qp+F3Ev2w42fNn7S9RfAml6d+v7rZtIb1XoFOY2YXWi0Uj2yybNCvsQvZOxa53u+p2MIZrlXA8ZAP0/yzzsir+rUujn4RdfVtj2z6ea/gDRvxGfAW9NOfBAk6P7gPj680zBYLXJp31FCMzvZHnJifjsfcSDyJLFufzUP/7rjhp39A+lecfl9lyqTfuuMn9pNRht9vQH8v1NKtH4jX64+YGf61XkjCX1pGsb3+N1ABarnkQ6dzaKvHcfrX2Q9cUXiQ/bjbZ4Uvjf6vnFU/RJBJv8MdI7okmn5++i0SlOubmQi7lWbTG41MXVeZLeBrKramohj9kCoSJvXO6dzHo4EyWX1/Xv2/O3xFcFpeRoY7fe+44Mmm3/p1cfS3c9Lf5t1Pm05vQT/EgrhAKrhILdmWYcwH9rhB8dIM03ygXFeM1nF0KwikjuNlvqSV4/Q/llj/pWkZM2Y++xHtWSb9jntx9N/mFP5c+OmpoJlVfQv62c6t/vAedlUkXOI/6ZEdm2cOCIbYYuxQKezvA7R3UPItZ93L4/QXRNKM7BhfQP3NBFBjPh5y9cEjg+FenZz0S46ci35L+Gx9b3gm/Vk+byuv0/ck+pU6lOskGuzzStwTHDCoObaUCfCbmGP0giK44MDRDE0tjeP0sw0Hasq
tCBAZs35SmsONhF/pScuDxqfOftmTOxf9E3YHlQT5dIfp76ee55hf4Nz0w7tb2FY5lMrSHIK9b263ICCSSQTwMr+O+7C7nYDW5+YuW5Ej1+8lMzrX5QPD34NREEZCmpslehX9EovlXPS7QqPz/z559q/fxOtHMW81R6ZXkQQKV6qw5dA2aYaAd4blsxFgP3/SQg76xRxP1wbl2Y4qCbQMUTQhdWKNz5QT6OdRfUlA8Ez0F7klFZbhy6SfD2kzecOumjfkcyr99Er2YC7zslkvusayFeNOH8CEjQwc3RieC3kSvYdM+cOj5KASnpMwv2ZFEl9wcL/xKfSLXTiS2iTnob8oiruEZTEz6ecaHklEgh3RpTeh/wB6xU0RFveEXg8aI9Ldyk5HMhs1G7l2+TxxnuM5g05FLOihIHL4LNeKscH5WPbjJblVP2EpSDaonoF+Z7IUWzYjClQm/UPWJ1SOTUdb97Nk3pl+ADW3kp26hUDxxFCcLp3/p9SUz0X/2Pd7RvKjJh5/ArG9Mvw7pEZeKHqzYq8nPM3rJwQL2qboOJ3+/o0dYjJcfPf8/bo48kKEbLeP2JJ1F97SWUG9/LcT/kcAdf5RvKFU8RNr81BNRf8OId8O36HIy0bmunvjOE53MxIh0Likb6j+edW64TjG/eKOFVHEUxhAJ9A/E2aVensPqTC9jZ/beTL9EDcJISps8KYvI6tZts+fj0R4yZ7Ndpt216zyuv6JaQV/gH4o6xFXxAehzx9Kvu/zS/+c+/vr/q4capNqWAvKfCS0TLF9zU9Z0MRGcLy2RqfR7yvWbE91uUQ0P7n5FfTHaxQL7lWkraNPKZv+QHNhTSmLZFq9zhXdP0G/0sHxl771IhG/7/iUcoV5q3sMdUkRDkSSO0itDkmdhnCnwDLbT6Hf9hUGXr4gEDKvoF8GhNX4W5cOBHwb0eIOYtRDYYjiH6Mf6I4mb9s4KA5WoCvDPn/hqdzFXZymn/ESkr9NZ5Bam+RpWN0UeIdPoV9pjKLXOSv9COPtVLbJKyPdoztKdmkEhsCE/Cn6DSjaHH0EsPZzcdAwEfbyX4m9zYPkKe1U6D1pYZ4jiMK2bI2xJh0iiqezzCjNe4Fx+VKi9wnpn7GqXaUD9Ct2n4R3056E8N+A5StPYOizrkS3eaTf5QFx89Ly7ndKPC7Y4Yw3H9jrSFPossaLmlnQJS04ydDZy08i9JdYH6WpnqwX+TeUJTFM1HSD8lDlG0uxBmV0wjZFRWlVAat85Q3tors1TV01zdGumr0danzrn7Z1NyLNzFnT29wGT91mt10frjvWqO7EZXZFf6kbsh/eyp6p6Er4xW01hdaqlayrx9Hj182qgzfY7KComw5dCl62s4GfBKcU+O3CET1bsz7KZhbvxetL5Rseiu1xBv0Em9PPUx2cfm9Yg93ozbrD3uxYucB5bzahp/2/r9uzxj24zEW8lHrQ6w27vcZl1D+tkVh6lwOpa/yVUPjQjs4r/hIso68AgaQEJCJFyd1BV/yNqOOIw80G1a+2pmZp9eUypNMVb4s5Xf19A89qYmmWxRV/L1ZU2YcMLNuBiLz/KuAr/hEUKOfeJ3e033tU7ZfkIl7xV6NFVFFlGmoDXtablq94c/DKgohtB4/s77/i3wCUclSR+V/lgQ4AWV7cFX8zGiD0+2NLuXnGYXG9K/4RMD8/X/KfcbQ4+RX/Ajoo2HtjY1le3BV/M5Yo3HmHMorjXvHX4gkH1v4jRodf1XvFXwcI84iEkRUV/lev378FVvYZXtlpLKj1fxFvo77i/cDSanHny1eXBKleV/w7gGI//GXuWL8qfv8erIrKygGT0VXv+xdhjasdVX9aXNW+D4n/AeKsA1opSbg6AAAAAElFTkSuQmCC"
        st.sidebar.image(kstrUrl_image, width=200)
        st.sidebar.markdown('Visualize Provider and Claims anomalies.')


    #--- init checkboxes
    strKey = st.sidebar.radio("Go to", list(m_aryPages.keys()))
    pagSel = m_aryPages[strKey]
    writePage(pagSel)


def init_selectBox():
    #--- init module array of page names, and descr
    init_modDescrAry()

    # Display the sidebar with a menu of apps
    kstrMsg = """
        __Claims Anomaly Views__
    """
    with st.sidebar:
        st.markdown('---')
        st.markdown(kstrMsg)
        page = st.selectbox('Select:', m_aryModNames, format_func=fmt_modName)

    #--- display sidebar footer
    with st.sidebar:
        st.markdown('---')
        st.write('Developed by Chavarria, McKone, Sharma')
        st.write('Contact at iain.mckone@gmail.com')

    # Run the chosen app
    m_aryMods[m_aryModNames.index(page)].run()


def init_modDescrAry():
    #--- init global array of page names, and descr
    #--- note:  you need to specify global scope for fxns to access module-level variables
    global m_aryMods
    global m_aryDescr

    m_aryMods = []
    m_aryDescr = []
    for modName in m_aryModNames:
        modTemp = importlib.import_module('.' + modName, 'uix')
        m_aryMods.append(modTemp)

        #--- If the module has a description attribute use that in the
        #--- select box otherwise use the module name
        try:
            m_aryDescr.append(modTemp.description)
        except:
            m_aryDescr.append(modName)


#--- display the app descriptions instead of the module names in the selectbox
def fmt_modName(strName):
    global m_aryModNames
    global m_aryDescr
    return m_aryDescr[m_aryModNames.index(strName)]


def writePage(uixFile):
    #--- writes out the page for the selected combo

    # _reload_module(page)
    uixFile.run()
uix/pages/__init__.py
ADDED
File without changes
uix/pages/lit_about.py
ADDED
@@ -0,0 +1,24 @@
#--- about page
import streamlit as st

description = "About"
def run():

    print("\nINFO (lit_about.run) loading ", description, " page ...")

    #---
    #st.experimental_memo.clear()   #--- try to clear cache each time this page is hit
    #st.cache_data.clear()

    st.markdown('### About')
    st.markdown('### MLE10 Capstone:  Healthcare Anomaly Detection')
    st.markdown('#### Team:  McKone, Sharma, Chavarria, Lederer')

    st.markdown('Kaggle Claims Data:')
    st.markdown('https://www.kaggle.com/code/rohitrox/medical-provider-fraud-detection/data')
    st.markdown(
        """
        About page
        """,
        unsafe_allow_html=True,
    )
uix/pages/lit_anom_superv.py
ADDED
@@ -0,0 +1,368 @@
#--- anomaly detection - supervised page
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

import lib.claims as libClaims
import lib.providers as libProviders
import lib.utils as libUtils

import sys

description = "Anomaly Detection - Supervised"
m_kblnTraceOn = True    #--- enable/disable module level tracing

def run():
    #--- note:  in python, you need to specify global scope for fxns to access module-level variables
    global m_kblnTraceOn
    print("\nINFO (litAnomSuperv.run) loading ", description, " page ...")


    #--- page settings
    if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run):  Initialize Page Settings ...")
    st.header("Provider Anomalies - Supervised Approach (XG Boost)")

    #--- provide file drag/drop capability
    m_blnDisableDragDrop = False
    if (not m_blnDisableDragDrop):
        #btnSave = st.button("Save")
        pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])
        m_blnDisableDragDrop = (pklDropped is None)


    #if (True):
    try:

        #--- show:  raw claims data analysis
        if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run):  load raw claims data ...")
        if (m_blnDisableDragDrop):
            pdfClaims = libClaims.load_claims(False)
        else:
            pdfClaims = pd.read_pickle(pklDropped)

        #--- get supervised predictions
        if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run):  doFeatEng (claims) ...")
        pdfFeatEng = libClaims.do_featEng(pdfClaims)

        if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run):  perform xgb prediction ...")
        pdfPred = libProviders.get_xgbPredict(pdfFeatEng)

        if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run):  get sample ...")
        lngSampleSize = min(50, len(pdfPred.index))
        pdfSample = pdfPred.sample(lngSampleSize)

        #--- save a test file
        #if (btnSave):
        #    btnSave_testFile(pdfClaims, pdfPred)

    except TypeError as e:
        print("ERROR (litAnomSuperv.run_typeError1): ", e)

    except:
        e = sys.exc_info()
        print("ERROR (litAnomSuperv.run_genError1): ", e)


    try:
        #--- save this file locally as a pkl
        #btnSave_testFile(pdfClaims, pdfPred)


        #--- table sorted $insClaims reimbursed by provider
        #--- display providers with predictions, sorted by InscClaimAmt Reimbursed
        pdfTopClaims = pdfSample.sort_values(by=["InscClaimAmtReimbursed"], ascending=False)
        if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run):  Show $claims reimbursed by provider ...")
        st.markdown("(Top) Ins Reimbursed by Provider")
        st.dataframe(pdfTopClaims)


        #--- chart Top Insurance claims ($) by Provider
        chart_topInsClaimsByProvider(pdfSample)


        #--- chart Top deductible amts ($) by Provider
        chart_topDeductiblePaidByProvider(pdfSample)


        #--- chart Top IP Annual Reimbursement amts ($) by Provider
        chart_topIPAnnualReimbAmtByProvider(pdfSample)


        #--- chart Top IP Annual Deductible amts ($) by Provider
        chart_topIPAnnualDeductAmtByProvider(pdfSample)


        #--- chart Top OP Annual Reimbursement amts ($) by Provider
        chart_topOPAnnualReimbAmtByProvider(pdfSample)


        #--- chart Top OP Annual Deductible amts ($) by Provider
        chart_topOPAnnualDeductAmtByProvider(pdfSample)


    except TypeError as e:
        print("ERROR (litAnomSuperv.run_typeError2): ", e)

    except:
        e = sys.exc_info()
        print("ERROR (litAnomSuperv.run_genError2): ", e)


def chart_topOPAnnualReimbAmtByProvider(pdfSample):
    pdfBar = pdfSample.sort_values(by=["OPAnnualReimbursementAmt"], ascending=False)
    pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]

    #--- chart
    fig = go.Figure(
        layout=dict(
            title="(Sample Anomalies) Top OP Reimb Paid ($) by Provider",
            legend=dict(groupclick="toggleitem"),
        )
    )

    fig.add_trace(
        go.Bar(
            x=pdfBar.Provider,
            y=pdfBar.OPAnnualReimbursementAmt,
            name="OP Reimb Paid",
            marker_color="LightBlue",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=pdfAnoms.Provider,
            y=pdfAnoms.OPAnnualReimbursementAmt,
            mode="markers",
            marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
            name="Anomalies"
        ))

    st.plotly_chart(fig, use_container_width=True)


def chart_topOPAnnualDeductAmtByProvider(pdfSample):
    pdfBar = pdfSample.sort_values(by=["OPAnnualDeductibleAmt"], ascending=False)
    pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]

    #--- chart
    fig = go.Figure(
        layout=dict(
            title="(Sample Anomalies) Top OP Deduct Amt ($) by Provider",
            legend=dict(groupclick="toggleitem"),
        )
    )

    fig.add_trace(
        go.Bar(
            x=pdfBar.Provider,
            y=pdfBar.OPAnnualDeductibleAmt,
            name="OP Deductible Paid",
            marker_color="LightBlue",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=pdfAnoms.Provider,
            y=pdfAnoms.OPAnnualDeductibleAmt,
            mode="markers",
            marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
            name="Anomalies"
        ))

    st.plotly_chart(fig, use_container_width=True)


def chart_topIPAnnualReimbAmtByProvider(pdfSample):
    pdfBar = pdfSample.sort_values(by=["IPAnnualReimbursementAmt"], ascending=False)
    pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]

    #--- chart
    fig = go.Figure(
        layout=dict(
            title="(Sample Anomalies) Top IP Reimb Paid ($) by Provider",
            legend=dict(groupclick="toggleitem"),
        )
    )

    fig.add_trace(
        go.Bar(
            x=pdfBar.Provider,
            y=pdfBar.IPAnnualReimbursementAmt,
            name="IP Reimb Paid",
            marker_color="LightBlue",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=pdfAnoms.Provider,
            y=pdfAnoms.IPAnnualReimbursementAmt,
            mode="markers",
            marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
            name="Anomalies"
        ))

    st.plotly_chart(fig, use_container_width=True)


def chart_topIPAnnualDeductAmtByProvider(pdfSample):
    pdfBar = pdfSample.sort_values(by=["IPAnnualDeductibleAmt"], ascending=False)
    pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]

    #--- chart
    fig = go.Figure(
        layout=dict(
            title="(Sample Anomalies) Top IP Deduct Amt ($) by Provider",
            legend=dict(groupclick="toggleitem"),
        )
    )

    fig.add_trace(
        go.Bar(
            x=pdfBar.Provider,
            y=pdfBar.IPAnnualDeductibleAmt,
            name="IP Deductible Paid",
            marker_color="LightBlue",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=pdfAnoms.Provider,
            y=pdfAnoms.IPAnnualDeductibleAmt,
            mode="markers",
            marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
            name="Anomalies"
        ))

    st.plotly_chart(fig, use_container_width=True)


def chart_topDeductiblePaidByProvider(pdfSample):
    pdfBar = pdfSample.sort_values(by=["DeductibleAmtPaid"], ascending=False)
    pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]

    #--- chart
    fig = go.Figure(
        layout=dict(
            title="(Sample Anomalies) Top Deductibles Paid ($) by Provider",
            legend=dict(groupclick="toggleitem"),
        )
    )

    fig.add_trace(
        go.Bar(
            x=pdfBar.Provider,
            y=pdfBar.DeductibleAmtPaid,
            name="Deductibles Paid",
            marker_color="LightBlue",
            #offsetgroup="anoms",
            #legendgroup="anoms",
            #legendgrouptitle_text="Anoms",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=pdfAnoms.Provider,
            y=pdfAnoms.DeductibleAmtPaid,
            mode="markers",
            marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
            #offsetgroup="anoms",
            #legendgroup="anoms",
            name="Anomalies"
        ))

    st.plotly_chart(fig, use_container_width=True)


def chart_topInsClaimsByProvider(pdfSample):
    pdfTopClaims = pdfSample.sort_values(by=["InscClaimAmtReimbursed"], ascending=False)
    pdfAnoms = pdfTopClaims[pdfTopClaims['hasAnom?'] > 0]

    #--- chart
    #st.markdown("(Sample Anomalies) Top Insurance claims ($) by Provider")
    fig = go.Figure(
        layout=dict(
            #xaxis=dict(categoryorder="category descending"),
            #yaxis=dict(range=[0, 7]),
            #scattermode="group",
            title="(Sample Anomalies) Top Insurance claims ($) by Provider",
            legend=dict(groupclick="toggleitem"),
        )
    )

    fig.add_trace(
        go.Bar(
            x=pdfTopClaims.Provider,
            y=pdfTopClaims.InscClaimAmtReimbursed,
            name="Ins Claims Reimbursed",
            marker_color="LightBlue",
            #offsetgroup="anoms",
            #legendgroup="anoms",
            #legendgrouptitle_text="Anoms",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=pdfAnoms.Provider,
            y=pdfAnoms.InscClaimAmtReimbursed,
            mode="markers",
            marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
            #offsetgroup="anoms",
            #legendgroup="anoms",
            name="Anomalies"
        ))

    st.plotly_chart(fig, use_container_width=True)


def btnSave_testFile(pdfClaims, pdfPred):
    #--- get all providers for all anoms
    #print("TRACE (lit_anom_superv.btnSave_testFile) query anoms ... ", pdfPred.head(10))
    pdfAnomProv = pdfPred[pdfPred['hasAnom?'] > 0]
    #pdfAnomProv = pdfAnomProv['Provider']

    #--- filter claims by anomProviders
    print("TRACE (lit_anom_superv.btnSave_testFile) filter claims ... ")
    pdfClaimAnom = pdfClaims[pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
    pdfClaimNoAnom = pdfClaims[~pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
    lngNumAnoms = len(pdfClaimAnom.index)
    lngNumOk = len(pdfClaimNoAnom.index)
    print("TRACE (lit_anom_superv.btnSave_testFile) #anoms: ", lngNumAnoms, ", !anoms: ", lngNumOk)

    #--- get a sample for remaining records
    print("TRACE (lit_anom_superv.btnSave_testFile) sampling claims ... ")
    pdfSave = pd.concat([pdfClaimAnom.sample(frac=0.6), pdfClaimNoAnom.sample(frac=0.1)])

    print("TRACE (lit_anom_superv.btnSave_testFile) saving ... ")
    saveProviderTestData(pdfSave)


def saveProviderTestData(pdfTestData):

    #--- save the file
    from datetime import date
    import time
    import pickle
    strDteNow = date.today().strftime('%Y%m%d')
    strTimeNow = time.strftime('%H%M%S')
    strProvTestFile = libUtils.pth_data + strDteNow + strTimeNow + "_provTestSample.pkl"
    #pd.to_pickle(pdfClaims.sample(200), strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
    pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
uix/pages/lit_anom_unsuperv.py
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#--- anomaly detection - unsupervised page
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import plotly.express as px
|
6 |
+
import plotly.graph_objects as go
|
7 |
+
|
8 |
+
import lib.claims as libClaims
|
9 |
+
import lib.providers as libProviders
|
10 |
+
import lib.utils as libUtils
|
11 |
+
import sys
|
12 |
+
|
13 |
+
description = "Anomaly Detection - Unsupervised"
|
14 |
+
m_kblnTraceOn = False #--- enable/disable module level tracing
|
15 |
+
|
16 |
+
def run():
|
17 |
+
#--- note: in python, you need to specify global scope for fxns to access module-level variables
|
18 |
+
global m_kblnTraceOn
|
19 |
+
print("\nINFO (lit_about.run) loading ", description, " page ...")
|
+
+
+    try:
+
+        #--- page settings
+        if (m_kblnTraceOn): print("TRACE (litAnomUnSuperv.run): Initialize Page Settings ...")
+        st.header("Claims Anomalies - Unsupervised Approach (KMeans)")
+
+
+        #--- provide file drag/drop capability
+        m_blnDisableDragDrop = False
+        if(not m_blnDisableDragDrop):
+            #btnSave = st.button("Save")
+            pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])
+            m_blnDisableDragDrop = (pklDropped is None)
+
+
+        #--- load raw claims data
+        if (m_kblnTraceOn): print("TRACE (litAnomUnSuperv.run): load raw claims data ...")
+        if (m_blnDisableDragDrop):
+            pdfClaims = libClaims.load_claims(False)
+        else:
+            pdfClaims = pd.read_pickle(pklDropped)
+
+        #--- show: raw claims data analysis
+        if (m_kblnTraceOn): print("TRACE (litAnomUnsuperv.run): Show Raw Claims Dataframe ...")
+        #pdfClaims = libClaims.load_claims(False)    #--- removed: this unconditional reload discarded any uploaded dataset
+
+
+        #--- get unsupervised predictions
+        #pdfFeatEng = libClaims.do_featEng(pdfClaims)
+        pdfPred = libClaims.get_kmeansPredict(pdfClaims)
+        pdfSample = pdfPred.sample(100)
+        pdfSample['providerId'] = pdfSample['Provider'].str[3:].astype(np.float64)
+
+
+        #--- save this file locally as a pkl
+        #btnSave_testFile(pdfClaims, pdfPred)
+
+
+        #--- table of claims and clusters, sorted by InscClaimAmt Reimbursed
+        pdfTopClaims = pdfSample.sort_values(by=["cluster", "InscClaimAmtReimbursed"], ascending=False)
+        if (m_kblnTraceOn): print("TRACE (litAnomUnsuperv.run): Show $claims reimbursed by cluster ...")
+        st.markdown("(Top) Ins Claim Reimbursed by Cluster")
+        st.dataframe(pdfTopClaims)
+
+
+        #--- chart cluster data distribution
+        chart_clusterDistr(pdfSample)
+
+
+        col1, col2, col3 = st.columns(3)
+
+
+        #--- chart KMeans clusters: InscClaimAmtReimbursed
+        #chart_KMeansClusters(pdfSample, "Age", "InscClaimAmtReimbursed", col1)
+        #chart_KMeansClusters(pdfSample, "providerId", "InscClaimAmtReimbursed", col2)
+
+        chart_KMeansClusters(pdfSample, "providerId", "AdmittedDays", col1)
+        chart_KMeansClusters(pdfSample, "providerId", "DeductibleAmtPaid", col2)
+        chart_KMeansClusters(pdfSample, "providerId", "InscClaimAmtReimbursed", col3)
+
+        chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_KidneyDisease", col1)
+        chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_Heartfailure", col2)
+        chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_ObstrPulmonary", col3)
+
+        chart_KMeansClusters(pdfSample, "AdmittedDays", "DeductibleAmtPaid", col1)
+        chart_KMeansClusters(pdfSample, "AdmittedDays", "InscClaimAmtReimbursed", col2)
+        chart_KMeansClusters(pdfSample, "DeductibleAmtPaid", "InscClaimAmtReimbursed", col3)
+
+
+
+        #--- chart cluster bars
+        #chart_KMeansBars(pdfSample, "cluster", "InscClaimAmtReimbursed", col1)
+        #chart_KMeansBars(pdfSample, "cluster", "DeductibleAmtPaid", col2)
+
+        #chart_KMeansBars(pdfSample, "cluster", "IPAnnualReimbursementAmt", col1)
+        #chart_KMeansBars(pdfSample, "cluster", "IPAnnualDeductibleAmt", col2)
+
+        #chart_KMeansBars(pdfSample, "cluster", "OPAnnualReimbursementAmt", col1)
+        #chart_KMeansBars(pdfSample, "cluster", "OPAnnualDeductibleAmt", col2)
+
+        #chart_KMeansBars(pdfSample, "cluster", "ChronicCond_Heartfailure", col1)
+        #chart_KMeansBars(pdfSample, "cluster", "ChronicCond_KidneyDisease", col2)
+
+    except TypeError as e:
+        print("ERROR (litAnomUnsuperv.run_typeError): ", e)
+
+    except:
+        e = sys.exc_info()
+        print("ERROR (litAnomUnsuperv.run_genError): ", e)
+
+
+
+def chart_clusterDistr(pdfSample):
+    #pdfClustDistr = pdfSample['cluster'].value_counts()
+    pdfBar = pdfSample
+    pdfCluster0 = pdfBar[pdfBar['cluster'] == 0]
+    pdfCluster1 = pdfBar[pdfBar['cluster'] == 1]
+    pdfCluster2 = pdfBar[pdfBar['cluster'] == 2]
+
+    kstrTitle = "(KMeans Clusters) Claims data"
+    #--- chart
+    fig = go.Figure(
+        layout=dict(
+            legend=dict(groupclick="toggleitem"),
+            xaxis=dict(title='cluster'),
+            yaxis=dict(title='#data points')
+        )
+    )
+
+    fig.add_trace(
+        go.Bar(
+            x=pdfCluster0['cluster'],
+            y=pdfCluster0['cluster'].value_counts(),
+            name='cluster0'
+        )
+    )
+
+    if (pdfCluster1.shape[0]>0):
+        fig.add_trace(
+            go.Bar(
+                x=pdfCluster1['cluster'],
+                y=pdfCluster1['cluster'].value_counts(),
+                name='cluster1'
+            ))
+
+    if (pdfCluster2.shape[0]>0):
+        fig.add_trace(
+            go.Bar(
+                x=pdfCluster2['cluster'],
+                y=pdfCluster2['cluster'].value_counts(),
+                name='cluster2'
+            ))
+    st.plotly_chart(fig, use_container_width=True)
+
+
+def chart_KMeansClusters(pdfSample, strXFeature, strYFeature, stCol):
+    pdfScatter = pdfSample
+    pdfCluster0 = pdfScatter[pdfScatter['cluster'] == 0]
+    pdfCluster1 = pdfScatter[pdfScatter['cluster'] == 1]
+    pdfCluster2 = pdfScatter[pdfScatter['cluster'] == 2]
+
+    kstrTitle = "(KMeans Clusters) Claims data"
+    #--- chart
+    fig = go.Figure(
+        layout=dict(
+            legend=dict(groupclick="toggleitem"),
+            xaxis=dict(title=strXFeature),
+            yaxis=dict(title=strYFeature)
+        )
+    )
+
+    fig.add_trace(
+        go.Scatter(
+            x=pdfCluster0[strXFeature],
+            y=pdfCluster0[strYFeature],
+            text="claimId: " + pdfCluster0['ClaimID'],
+            mode='markers',
+            name='cluster0'
+        )
+    )
+
+    if (pdfCluster1.shape[0]>0):
+        fig.add_trace(
+            go.Scatter(
+                x=pdfCluster1[strXFeature],
+                y=pdfCluster1[strYFeature],
+                mode='markers',
+                name='cluster1'
+            ))
+
+    if (pdfCluster2.shape[0]>0):
+        fig.add_trace(
+            go.Scatter(
+                x=pdfCluster2[strXFeature],
+                y=pdfCluster2[strYFeature],
+                mode='markers',
+                name='cluster2'
+            ))
+    stCol.plotly_chart(fig, use_container_width=True)
+
+
+def chart_KMeansBars(pdfSample, strXFeature, strYFeature, stCol):
+    pdfBar = pdfSample
+    pdfCluster0 = pdfBar[pdfBar['cluster'] == 0]
+    pdfCluster1 = pdfBar[pdfBar['cluster'] == 1]
+    pdfCluster2 = pdfBar[pdfBar['cluster'] == 2]
+
+    kstrTitle = "(KMeans Clusters) Claims data"
+    #--- chart
+    fig = go.Figure(
+        layout=dict(
+            legend=dict(groupclick="toggleitem"),
+            xaxis=dict(title=strXFeature),
+            yaxis=dict(title=strYFeature)
+        )
+    )
+
+    fig.add_trace(
+        go.Bar(
+            x=pdfCluster0[strXFeature],
+            y=pdfCluster0[strYFeature],
+            name='cluster0'
+        )
+    )
+
+    if (pdfCluster1.shape[0]>0):
+        fig.add_trace(
+            go.Bar(
+                x=pdfCluster1[strXFeature],
+                y=pdfCluster1[strYFeature],
+                name='cluster1'
+            ))
+
+    if (pdfCluster2.shape[0]>0):
+        fig.add_trace(
+            go.Bar(
+                x=pdfCluster2[strXFeature],
+                y=pdfCluster2[strYFeature],
+                name='cluster2'
+            ))
+    stCol.plotly_chart(fig, use_container_width=True)
+
+
+
+def btnSave_testFile(pdfClaims, pdfPred):
+    #--- get all claims for all anoms
+    """ print("TRACE (lit_anom_unsuperv.btnSave_testFile) query anoms ... ", pdfPred.head(10))
+    pdfAnomClaims = pdfPred[pdfPred['hasAnom?'] > 0]
+    #pdfAnomProv = pdfAnomProv['Provider']
+
+    #--- filter claims by anomProviders
+    print("TRACE (lit_anom_unsuperv.btnSave_testFile) filter claims ... ")
+    pdfClaimAnom = pdfClaims[pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
+    pdfClaimNoAnom = pdfClaims[~pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
+    lngNumAnoms = len(pdfClaimAnom.index)
+    lngNumOk = len(pdfClaimNoAnom.index)
+    print("TRACE (lit_anom_unsuperv.btnSave_testFile) #anoms: ", lngNumAnoms, ", !anoms: ", lngNumOk)
+
+    #--- get a sample for remaining records
+    print("TRACE (lit_anom_unsuperv.btnSave_testFile) sampling claims ... ")
+    pdfSave = pd.concat([pdfClaimAnom.sample(frac=0.6), pdfClaimNoAnom.sample(frac=0.1)]) """
+
+    pdfSave = pdfClaims.sample(frac=0.1)
+
+    print("TRACE (lit_anom_unsuperv.btnSave_testFile) saving ... ")
+    saveProviderTestData(pdfSave)
+
+
+def saveProviderTestData(pdfTestData):
+
+    #--- save the file
+    from datetime import date
+    import time
+    import pickle
+    strDteNow = date.today().strftime('%Y%m%d')
+    strTimeNow = time.strftime('%H%M%S')
+    strProvTestFile = libUtils.pth_data + strDteNow + strTimeNow + "_claimsTestSample.pkl"
+    #pd.to_pickle(pdfClaims.sample(200), strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
+    pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
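The three chart_* helpers above repeat the same per-cluster trace pattern. A possible simplification (a sketch only, not part of this commit) is to let plotly.express, already imported as px in this module, split the traces by the cluster label; it assumes pdfSample carries the cluster and ClaimID columns produced by libClaims.get_kmeansPredict:

def chart_KMeansClusters_px(pdfSample, strXFeature, strYFeature, stCol):
    #--- one px.scatter call renders one trace (and legend entry) per cluster value
    fig = px.scatter(
        pdfSample,
        x=strXFeature,
        y=strYFeature,
        color=pdfSample['cluster'].astype(str),     # discrete color per cluster
        hover_data=['ClaimID'],
        title="(KMeans Clusters) Claims data"
    )
    stCol.plotly_chart(fig, use_container_width=True)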
uix/pages/lit_claimAnalysis.py
ADDED
@@ -0,0 +1,75 @@
+#--- claim analysis page
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+
+import lib.claims as libClaims
+
+description = "Claim Analysis"
+m_kbln_traceOn = False          #--- enable/disable module level tracing
+
+
+def run():
+    #--- note: in python, you need to specify global scope for fxns to access module-level variables
+    global m_kbln_traceOn
+
+    try:
+
+        #--- page settings
+        if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Initialize Page Settings ...")
+        st.header("Claims Analysis")
+
+
+        #--- show: raw claims data analysis
+        if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show Raw Claims Dataframe ...")
+        dfClaims = libClaims.load_claims(False)
+        #dfClaims = libClaims.loadPkl_testClaims()     #--- note: a large dataset; reduce before render
+        dfRaw = dfClaims.sample(25)
+        st.markdown("(Sample) Raw Claims Data: Providers, Beneficiaries, Physicians, Procedures, etc")
+        st.dataframe(dfRaw)
+
+
+        #--- show: data grouped by provider
+        pdfClaimsByProvider = dfClaims.groupby(
+            by=["Provider"], as_index=False).agg(
+            {"ClaimID": "count", "InscClaimAmtReimbursed": "sum", "DeductibleAmtPaid": "sum"}
+        )
+        st.markdown("(Sample) Raw Claims Data: Grouped by Provider")
+        st.dataframe(pdfClaimsByProvider.sample(25))
+
+        #--- show: bar charts
+        col1, col2 = st.columns(2)
+
+        #--- show $claims reimbursed by provider
+        if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show $claims reimbursed by provider ...")
+        pdfTopClaimsByProv = dfClaims.nlargest(10, "InscClaimAmtReimbursed")
+        fig = px.bar(pdfTopClaimsByProv,
+            x="Provider", y="InscClaimAmtReimbursed", title="$ Claims by Provider")
+        #col1.markdown("(Sample) $Claims Reimbursed by Provider")
+        col1.plotly_chart(fig, use_container_width=True)
+
+        #--- #claims reimbursed by provider
+        if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show #claims reimbursed by provider ...")
+        #pdfMaxClaimsByProv = dfClaims.groupby(['Provider'])['ClaimID'].count()
+        pdfClaimCountByProv = dfClaims.groupby(
+            by=["Provider"], as_index=False).agg(
+            {"ClaimID": "count"}
+        )
+        pdfClaimCountByProv = pdfClaimCountByProv.nlargest(10, "ClaimID")
+        fig = px.bar(pdfClaimCountByProv,
+            x="Provider", y="ClaimID", title="# Claims by Provider", barmode="group")
+        #col2.markdown("(Sample) #Claims Reimbursed by Provider")     #--- just to even out the display
+        col2.plotly_chart(fig, use_container_width=True)
+
+
+        #--- TODO: (optimization) create a single group by dataframe; try not to recreate for each chart
+        if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show top $deductible paid by provider ...")
+        pdfDedAmtPaid = dfClaims.nlargest(10, "DeductibleAmtPaid")
+        fig = px.bar(pdfDedAmtPaid,
+            x="Provider", y="DeductibleAmtPaid", title="Deductible Paid by Provider")
+        col1.plotly_chart(fig, use_container_width=True)
+
+        if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): end of fxn ...")
+
+    except TypeError as e:
+        print("ERROR (litClaimAnalysis.run): ", e)
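One possible shape for the TODO above (build the provider-level aggregate once and reuse it for every chart); this is a sketch only, not part of the commit, and note that it charts per-provider totals rather than the top individual claims used by two of the charts in run():

#--- aggregate once, then slice per chart instead of re-grouping dfClaims
pdfByProv = dfClaims.groupby(by=["Provider"], as_index=False).agg(
    {"ClaimID": "count", "InscClaimAmtReimbursed": "sum", "DeductibleAmtPaid": "sum"})

figAmt = px.bar(pdfByProv.nlargest(10, "InscClaimAmtReimbursed"),
                x="Provider", y="InscClaimAmtReimbursed", title="$ Claims by Provider")
figCnt = px.bar(pdfByProv.nlargest(10, "ClaimID"),
                x="Provider", y="ClaimID", title="# Claims by Provider")
figDed = px.bar(pdfByProv.nlargest(10, "DeductibleAmtPaid"),
                x="Provider", y="DeductibleAmtPaid", title="Deductible Paid by Provider")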
uix/pages/lit_home.py
ADDED
@@ -0,0 +1,41 @@
+#--- home page
+import streamlit as st
+
+description = "Home"
+def run():
+
+    print("\nINFO (lit_home.run) loading ", description, " page ...")
+
+
+    st.markdown('### Home')
+    st.markdown('### MLE10 Capstone: Healthcare Anomaly Detection')
+    st.markdown('\
+        Healthcare fraud is an expensive white-collar crime in the US and leads to an \
+        increase in healthcare premiums, and a reduction in quality and access to care. \
+        The National Health Care Anti-Fraud Association conservatively estimates that \
+        about 3 percent of US healthcare spending is lost to fraud per year ($300 billion \
+        approximately).')
+
+    st.markdown('\
+        Machine Learning techniques can identify current and evolving anomalies in claims \
+        data. As fraud becomes more sophisticated across an increasing number of annual \
+        transactions, an ML solution provides an opportunity to greatly reduce the effort, \
+        time and associated cost spent in identifying claims anomalies, and recouping any \
+        misappropriated funds.')
+
+    st.markdown('\
+        To illustrate the capabilities of Machine Learning to identify claims anomalies, \
+        this capstone project team has developed two solutions: \
+        \n\t - a supervised Logistic Regression Model to identify potential anomalies at \
+        the provider level \
+        \n\t - an unsupervised KMeans Clustering Model to identify potential anomalies \
+        at the claim level.')
+
+    st.markdown(
+        """
+
+        Home page
+
+        """,
+        unsafe_allow_html=True,
+    )
uix/pages/lit_modelPerf.py
ADDED
@@ -0,0 +1,6 @@
+description = "Model Performance"
+
+def run():
+    import streamlit as st
+    import pandas as pd
+    import plotly.express as px
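Each page module above exposes the same two things: a module-level description string and a run() entry point. A minimal sketch of how such pages can be wired to a Streamlit sidebar; the radio-based dispatcher below is illustrative only and is not taken from this commit:

import streamlit as st

import uix.pages.lit_home as lit_home
import uix.pages.lit_claimAnalysis as lit_claimAnalysis
import uix.pages.lit_anom_unsuperv as lit_anom_unsuperv
import uix.pages.lit_modelPerf as lit_modelPerf

#--- hypothetical dispatcher: map each page's description to its run() entry point
pages = [lit_home, lit_claimAnalysis, lit_anom_unsuperv, lit_modelPerf]
strChoice = st.sidebar.radio("Pages", [pg.description for pg in pages])

for pg in pages:
    if (pg.description == strChoice):
        pg.run()        # the selected page renders itself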