kidcoconut committed
Commit 75660bd · 1 Parent(s): 7a90042

merged github/demo_huggingFace into runner/main

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. __init__.py +0 -0
  2. bin/models/__init__.py +0 -0
  3. bin/models/claims_kmn_py3816_sk111hp_22cols.pkl +3 -0
  4. bin/models/claims_stdScl_py3816_sk111hp_27cols.pkl +3 -0
  5. bin/models/gbc_trainVal_confusionMatrix_colab.png +0 -0
  6. bin/models/kmn_elbow.png +0 -0
  7. bin/models/lgr_precisionRecallCurve_colab.png +0 -0
  8. bin/models/lgr_rocCurve_colab.png +0 -0
  9. bin/models/lgr_trainVal_confusionMatrix_colab.png +0 -0
  10. bin/models/lgr_trainVal_probPred_colab.png +0 -0
  11. bin/models/prov_gbc_py3816_sk111hp_32cols.pkl +3 -0
  12. bin/models/prov_stdScl_py3816_sk111hp_32cols.pkl +3 -0
  13. bin/models/svm_trainVal_confusionMatrix_colab.png +0 -0
  14. data/demo_data/20230210165948_provTestSample.pkl +3 -0
  15. data/demo_data/20230210170628_claimsTestSample.pkl +3 -0
  16. data/test_claims.pkl +3 -0
  17. lib/__init__.py +0 -0
  18. lib/claims.py +258 -0
  19. lib/models/__init__.py +0 -0
  20. lib/models/mdl_autoenc.py +55 -0
  21. lib/models/mdl_kmeans.py +155 -0
  22. lib/models/mdl_logR.py +41 -0
  23. lib/models/mdl_svm.py +40 -0
  24. lib/models/mdl_utils.py +256 -0
  25. lib/models/mdl_xgb.py +66 -0
  26. lib/providers.py +170 -0
  27. lib/utils.py +23 -0
  28. lit_index.py +25 -0
  29. main.py +97 -0
  30. routes/__init__.py +0 -0
  31. routes/api/__init__.py +0 -0
  32. routes/api/rte_api.py +67 -0
  33. routes/qa/__init__.py +0 -0
  34. routes/qa/rte_claims.py +139 -0
  35. routes/qa/rte_providers.py +188 -0
  36. routes/qa/rte_qa.py +17 -0
  37. templ/templ_results.html +4 -0
  38. templ/templ_showDataframe.html +15 -0
  39. uix/__init__.py +0 -0
  40. uix/images/image1.jpg +0 -0
  41. uix/images/image1.jpg:Zone.Identifier +3 -0
  42. uix/lit_packages.py +36 -0
  43. uix/lit_sidebar.py +99 -0
  44. uix/pages/__init__.py +0 -0
  45. uix/pages/lit_about.py +24 -0
  46. uix/pages/lit_anom_superv.py +368 -0
  47. uix/pages/lit_anom_unsuperv.py +280 -0
  48. uix/pages/lit_claimAnalysis.py +75 -0
  49. uix/pages/lit_home.py +41 -0
  50. uix/pages/lit_modelPerf.py +6 -0
__init__.py ADDED
File without changes
bin/models/__init__.py ADDED
File without changes
bin/models/claims_kmn_py3816_sk111hp_22cols.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a3f3e93e08cc64a169e199232261b55250ca7c6599522cea2c2821d99edb554
+ size 2234618
bin/models/claims_stdScl_py3816_sk111hp_27cols.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84d5379c031897e5df1ecbc6b07dca005873955818285f98cb1c8ec5d291c581
+ size 1779
bin/models/gbc_trainVal_confusionMatrix_colab.png ADDED
bin/models/kmn_elbow.png ADDED
bin/models/lgr_precisionRecallCurve_colab.png ADDED
bin/models/lgr_rocCurve_colab.png ADDED
bin/models/lgr_trainVal_confusionMatrix_colab.png ADDED
bin/models/lgr_trainVal_probPred_colab.png ADDED
bin/models/prov_gbc_py3816_sk111hp_32cols.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a740ba487ec674f9eafdc68f360e98be7b4b834fac0a6a79f9b82bac583d710f
+ size 45135
bin/models/prov_stdScl_py3816_sk111hp_32cols.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2137e7adba591cb760d4b73c561f84da5cc2aa49235000b274304f37d98582b7
+ size 2094
bin/models/svm_trainVal_confusionMatrix_colab.png ADDED
data/demo_data/20230210165948_provTestSample.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb380a5e4cfed980e8514bcee519f45de67556ffdd09e3eaf9d1f635c1c77d79
+ size 7419701
data/demo_data/20230210170628_claimsTestSample.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d5b35e7014ab77ba73140f875637903691bb4f22019d2b956a320b5d0b5c8aa2
+ size 6418423
data/test_claims.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:478c2854368471f7db414171bb5c1fc8904fbea49fb5aa3091a58f41443f8bed
+ size 61997959
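
Note: the .pkl binaries above are tracked with Git LFS, so the commit records only a three-line pointer per file (spec version, sha256 oid, byte size) rather than the binary content. As a minimal hypothetical sketch (this helper is not part of the commit), such a pointer can be parsed as:

#--- hypothetical helper: parse a Git LFS pointer file into a dict
def read_lfs_pointer(strPath):
    #--- a pointer holds "key value" lines: version, oid, size
    dctPointer = {}
    with open(strPath, 'r') as filPtr:
        for strLine in filPtr:
            strKey, strVal = strLine.strip().split(' ', 1)
            dctPointer[strKey] = strVal
    return dctPointer

#--- e.g. read_lfs_pointer('bin/models/claims_kmn_py3816_sk111hp_22cols.pkl')
#---      -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:8a3f...', 'size': '2234618'}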
lib/__init__.py ADDED
File without changes
lib/claims.py ADDED
@@ -0,0 +1,258 @@
+ import pandas as pd
+ import lib.utils as libPaths
+
+ from lib.models import mdl_utils, mdl_xgb, mdl_logR, mdl_svm
+ from lib.models import mdl_autoenc, mdl_kmeans
+
+
+ m_blnTraceOn = True
+ m_blnTrace2On = False
+
+ #--- load, merge data from file
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrModelPath = libPaths.pth_model
+ m_kstrBinModelPath = libPaths.pth_binModels
+
+
+ def getPath_defPklClaims(blnIsTrain=False):
+     global m_kstrDataPath
+     strPrefix = "test_"
+     if (blnIsTrain): strPrefix = "train_"
+     strPth_pklClaims = m_kstrDataPath + strPrefix + 'claims.pkl'
+     return strPth_pklClaims
+
+
+ def load_claims(blnIsTrain=False, blnForceCsv=False):
+     if (blnForceCsv):
+         pdfClaims = loadCsv_claims(blnIsTrain)
+     else:
+         pdfClaims = loadPkl_claims(blnIsTrain)
+     return pdfClaims
+
+
+ def loadCsv_claims(blnIsTrain=False):
+     global m_kstrDataPath
+     #--- load all csv test data
+     if (blnIsTrain):
+         print("INFO (loadCsv_claimsData): load train data ...")
+         strPthProvider = m_kstrDataPath + 'Train-1542865627584.csv'
+         strPthBenef = m_kstrDataPath + 'Train_Beneficiarydata-1542865627584.csv'
+         strPthInpat = m_kstrDataPath + 'Train_Inpatientdata-1542865627584.csv'
+         strPthOutpat = m_kstrDataPath + 'Train_Outpatientdata-1542865627584.csv'
+     else:
+         print("INFO (loadCsv_claimsData): load test data ...")
+         strPthProvider = m_kstrDataPath + 'Test-1542969243754.csv'
+         strPthBenef = m_kstrDataPath + 'Test_Beneficiarydata-1542969243754.csv'
+         strPthInpat = m_kstrDataPath + 'Test_Inpatientdata-1542969243754.csv'
+         strPthOutpat = m_kstrDataPath + 'Test_Outpatientdata-1542969243754.csv'
+
+     #--- output: pandas data frame
+     pdfProvider = pd.read_csv(strPthProvider)
+     pdfBenef = pd.read_csv(strPthBenef)
+     pdfInpat = pd.read_csv(strPthInpat)
+     pdfOutpat = pd.read_csv(strPthOutpat)
+
+     #--- data engineering
+     pdfBenef = prep_benefData(pdfBenef)
+     pdfInpat = prep_inpatData(pdfInpat)
+
+     #--- merge inpatient and outpatient data (assert: 31 cols)
+     aryMergeCols = list(pdfOutpat.columns)
+     pdfAllpat = pdfInpat.merge(pdfOutpat, on=aryMergeCols, how='outer')
+
+     #--- +merge beneficiary data
+     pdfAllPatBenef = pdfAllpat.merge(pdfBenef, on='BeneID', how='inner')
+
+     #--- +merge provider data
+     pdfAllPatBenefProv = pdfAllPatBenef.merge(pdfProvider, on='Provider', how='inner')
+
+     #--- export data
+     strPth_pklClaims = getPath_defPklClaims(blnIsTrain)
+     print("TRACE (claims.loadCsv_claims): pkl claim data file path ... ", strPth_pklClaims)
+     pdfAllPatBenefProv.to_pickle(strPth_pklClaims)
+
+     #print("INFO (csvClaims.shape): ", pdfTest_allPatBenefProv.shape)
+     return pdfAllPatBenefProv
+
+
+ def loadCsv_testClaims():
+     #--- TODO: make optional arg test or train data
+     return loadCsv_claims(False)
+
+
+ def loadPkl_claims(blnIsTrain=False):
+     strPth_pklClaims = getPath_defPklClaims(blnIsTrain)
+     try:
+         pdfClaims = pd.read_pickle(strPth_pklClaims)
+     except FileNotFoundError:
+         #--- catch: there is no pickle file
+         #--- load from csv instead; will create pkl files for next time
+         pdfClaims = loadCsv_claims(blnIsTrain)
+     return pdfClaims
+
+
+ #--- feat eng
+ def do_featEng(pdfLoaded, blnIsTrain=False):
+     if (m_blnTrace2On): print("INFO (claims.doFeatEng): blnIsTrain, ", blnIsTrain)
+
+     #--- remove cols
+     aryColsToDrop = ['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'AttendingPhysician',
+                      'OperatingPhysician', 'OtherPhysician', 'ClmDiagnosisCode_1',
+                      'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
+                      'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7',
+                      'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
+                      'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
+                      'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
+                      'ClmAdmitDiagnosisCode', 'AdmissionDt',
+                      'DischargeDt', 'DiagnosisGroupCode', 'DOB', 'DOD',
+                      'State', 'County']
+     pdfFeatEng = pdfLoaded.drop(columns=aryColsToDrop, axis=1)
+
+     #--- flag categorical cols
+     pdfFeatEng.Gender = pdfFeatEng.Gender.astype('category')
+     pdfFeatEng.Race = pdfFeatEng.Race.astype('category')
+
+     #--- one-hot-encoding
+     pdfFeatEng = pd.get_dummies(pdfFeatEng, columns=['Gender', 'Race'], drop_first=True)
+     if (blnIsTrain):
+         #--- one-hot encode the potential fraud column (for training data only)
+         try:
+             #print("INFO (claims.doFeatEng): one-hot encoding potential fraud")
+             pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'Yes', 'PotentialFraud'] = 1
+             pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'No', 'PotentialFraud'] = 0
+         except KeyError:
+             #--- likely column not found; invalid fxn call
+             print("ERROR (claims.doFeatEng): Potential Fraud col not found")
+
+     pdfFeatEng.loc[pdfFeatEng['RenalDiseaseIndicator'] == 'Y', 'RenalDiseaseIndicator'] = 1
+     pdfFeatEng['DeductibleAmtPaid'].fillna(0, inplace=True)
+     pdfFeatEng['AdmittedDays'].fillna(0, inplace=True)
+
+     #--- check for correlated cols
+
+     #--- add new features to assist with predictions
+     pdfFeatEng['InscClaimReimbursement_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['InscClaimAmtReimbursed'].transform('mean')
+     pdfFeatEng['DeductibleAmtPaid_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['DeductibleAmtPaid'].transform('mean')
+
+     pdfFeatEng['IPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualReimbursementAmt'].transform('mean')
+     pdfFeatEng['IPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualDeductibleAmt'].transform('mean')
+
+     pdfFeatEng['OPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualReimbursementAmt'].transform('mean')
+     pdfFeatEng['OPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualDeductibleAmt'].transform('mean')
+     return pdfFeatEng
+
+
+ #--- data eng on inpatient data
+ def prep_inpatData(pdfInpat):
+     #--- calc admitted days
+     pdfInpat['AdmissionDt'] = pd.to_datetime(pdfInpat['AdmissionDt'], format='%Y-%m-%d')
+     pdfInpat['DischargeDt'] = pd.to_datetime(pdfInpat['DischargeDt'], format='%Y-%m-%d')
+     pdfInpat['AdmittedDays'] = round((pdfInpat['DischargeDt'] - pdfInpat['AdmissionDt']).dt.days + 1)
+     return pdfInpat
+
+
+ #--- data eng on beneficiary data
+ def prep_benefData(pdfBenef):
+     #--- chronic condition cols; change any vals of 2 to 0
+     aryCols = ['ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
+                'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
+                'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
+                'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
+                'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
+                'ChronicCond_stroke']
+
+     for strVal in aryCols:
+         pdfBenef.replace({strVal: 2}, 0, inplace=True)
+
+     #--- fill missing data: person's age
+     kstrDatetime = '2019-12-01'  #--- the est datetime for the dataset
+     pdfBenef['DOB'] = pd.to_datetime(pdfBenef['DOB'], format='%Y-%m-%d')
+     pdfBenef['DOD'] = pd.to_datetime(pdfBenef['DOD'], format='%Y-%m-%d')
+     pdfBenef['Age'] = round((pdfBenef['DOD'] - pdfBenef['DOB']).dt.days / 365)
+     pdfBenef['Age'].fillna(round(((pd.to_datetime(kstrDatetime, format='%Y-%m-%d') - pdfBenef['DOB']).dt.days) / 365), inplace=True)
+
+     #--- add an isDead flag column
+     pdfBenef.loc[pdfBenef['DOD'].isna(), 'DeadOrNot'] = 0
+     pdfBenef.loc[pdfBenef['DOD'].notna(), 'DeadOrNot'] = 1
+
+     return pdfBenef
+
+
+ def get_kmeansPredict(pdfTestClaims):
+
+     #--- load test data
+     pdfClaims = pdfTestClaims
+     #print("INFO (claims.get_kmeansPredict) pdfClaims.shape): ", pdfClaims.shape)
+
+     #--- perform featEng, std scaling
+     print("TRACE: claims.kmeansPredict perform featEng, stdScaling ...")
+     pdfFeatEng = mdl_kmeans.do_featEng(pdfClaims, False, False)
+     npaScaled = mdl_utils.doClaims_stdScaler(pdfFeatEng, False)
+     pdfScaled = mdl_utils.doClaims_stdScaler_toPdf(npaScaled)
+     #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+     #--- get the pre-fit kmeans clusters
+     #--- predict/label clusters against data points
+     print("TRACE: claims.kmeansPredict perform kmeans predict ...")
+     ndaPredict = mdl_kmeans.predict(pdfScaled)
+     #print("INFO (predict.npaPredict.shape): ", ndaPredict.shape)
+
+     pdfPredict = pd.DataFrame(ndaPredict)
+     #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+     #--- stitch the data with the labels
+     print("TRACE: claims.kmeansPredict stitch labels with results ...")
+     pdfResults = pdfTestClaims
+     #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+     pdfResults.insert(0, "cluster", pdfPredict[0])
+     return pdfResults
+
+
+ def get_kmeansFit(pdfTestClaims):
+
+     pdfClaims = pdfTestClaims
+     pdfFeatEng = do_featEng(pdfClaims, False)  #--- not grouped by provider
+
+     #--- perform standard scaling; get fit then transform
+     #--- note: mdl_utils exposes doClaims_stdScaler; the original call to do_stdScaler does not exist there
+     npaScaled = mdl_utils.doClaims_stdScaler(pdfFeatEng, False)
+     pdfScaled = mdl_utils.doClaims_stdScaler_toPdf(npaScaled)
+     #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+     #--- SKIP: perform PCA; then kmeans fit (this was done to determine the KMeans params)
+     #--- get KMeans object, instantiated with trained args, and fit to test/prod scaled data
+     #--- OR ... assume that the kmeans is already fit, and we now want to predict which cluster each data point appears in
+     mdlKmeans = mdl_kmeans.fit(pdfScaled)
+     """
+     pdfPredict = pd.DataFrame(ndaPredict)
+     #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+     #--- stitch the grouped data with the labels
+     pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+     #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+     pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+
+     Notes:
+     - train_final = trainAllPatientDetailsProvider  #--- ungrouped data (558211, 27); has PotentialFraud cols
+     - train_final = pd.get_dummies  #--- post one-hot encoding (558211, 25=27-2+4); -Gender-Race + 4*(Gender+Race one-hot encoding)
+     - y, X: X.shape = (558211, 27); y.shape = (558211, 1)  #--- X popped PotentialFraud, and dropped Provider
+     - train_final[cluster_labels] = mdlKMeans.labels
+     """
+
+     return mdlKmeans
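
For orientation, the intended call flow through lib/claims.py is load → feature-engineer → scale → cluster. A hypothetical usage sketch (assumes the repo layout and the pkl/csv data are in place):

#--- hypothetical usage sketch for lib/claims.py
import lib.claims as libClaims

pdfClaims = libClaims.load_claims(blnIsTrain=False)  #--- reads data/test_claims.pkl, falling back to csv
pdfLabeled = libClaims.get_kmeansPredict(pdfClaims)  #--- prepends a "cluster" label column
print(pdfLabeled[['Provider', 'cluster']].head())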
lib/models/__init__.py ADDED
File without changes
lib/models/mdl_autoenc.py ADDED
@@ -0,0 +1,55 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.decomposition import PCA
+ import lib.utils as libPaths
+ import pickle
+
+
+ m_kstrFile = __file__
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrBinModelPath = libPaths.pth_binModels
+ m_kstrPcaModelPath = m_kstrBinModelPath + 'pca_unsuperv_colab.pkl'
+ m_kstrEncModelPath = m_kstrBinModelPath + 'enc_keras_seq/'
+
+
+ #--- Unsupervised: autoencoder with Principal Component Analysis
+ def load_encFromKeras():
+     from tensorflow import keras
+     mdlAnoms = keras.models.load_model(m_kstrEncModelPath)
+     return mdlAnoms
+
+
+ def load_pcaFromPkl():
+     with open(m_kstrPcaModelPath, 'rb') as filPkl:
+         #--- load using pickle de-serializer
+         mdlAnoms = pickle.load(filPkl)
+     return mdlAnoms
+
+
+ def save_encToKeras(mdlAnoms):
+     mdlAnoms.save(m_kstrEncModelPath)
+
+
+ def predict(pdfScaled):
+
+     #--- Pre: transforming train and test dataframes based on PCA
+     mdlPCA = load_pcaFromPkl()  #--- this is a pre-fit model based on training
+     npaPca = mdlPCA.transform(pdfScaled)
+     print("INFO (" + m_kstrFile + ".predict) npaPca.shape: ", npaPca.shape)
+
+     #--- predict on unseen data
+     mdlEnc = load_encFromKeras()
+     npaPredict = mdlEnc.predict(npaPca[:, :29])
+     print("INFO (" + m_kstrFile + ".predict) npaPredict.shape: ", npaPredict.shape)
+     #--- expected: 297, 29?
+     return npaPredict
+
+
+ """
+ def train(pdfTrainData):
+     mdlAnoms = PCA()  #--- TODO: this is Keras Sequential
+     mdlAnoms.fit(pdfTrainData.values)
+     save_encToKeras(mdlAnoms)
+     return mdlAnoms """
lib/models/mdl_kmeans.py ADDED
@@ -0,0 +1,155 @@
+ from sklearn.cluster import KMeans
+ import lib.utils as libPaths
+ import pickle
+ import pandas as pd
+
+
+ m_kstrFile = __file__
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrBinModelPath = libPaths.pth_binModels
+
+ #m_kstrPcaModelPath = m_kstrBinModelPath + 'pca_kmeans_unsuperv_colab.pkl'
+ #m_kstrPcaModelPath = m_kstrBinModelPath + 'pca_kmeans_unsuperv_colab_v1.2.1.pkl'
+ m_kstrPcaModelPath_111 = m_kstrBinModelPath + 'claims_pca_v1.1.1_27cols.pkl'  #--- ERROR: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given
+ m_kstrPcaModelPath_121 = m_kstrBinModelPath + 'claims_pca_v1.2.1_27cols.pkl'
+ m_kstrPcaModelPath_claims_py3816_sk111hp = m_kstrBinModelPath + 'claims_pca_py3816_sk111hp_27cols.pkl'
+ m_kstrPcaModelPath = m_kstrPcaModelPath_claims_py3816_sk111hp
+
+ #m_kstrKmeansModelPath = m_kstrBinModelPath + 'kmeans_unsuperv_colab.pkl'
+ #m_kstrKmeansModelPath = m_kstrBinModelPath + 'kmn_unsuperv_colab_v1.2.1.pkl'
+ m_kstrModelPath_111 = m_kstrBinModelPath + 'claims_kmn_v1.1.1_22cols.pkl'  #--- ERROR: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given
+ m_kstrModelPath_121 = m_kstrBinModelPath + 'claims_kmn_v1.2.1_22cols.pkl'
+ m_kstrModelPath_claims_py3816_sk111hp = m_kstrBinModelPath + 'claims_kmn_py3816_sk111hp_22cols.pkl'
+ m_kstrKmeansModelPath = m_kstrModelPath_claims_py3816_sk111hp
+
+ m_blnTraceOn = True
+
+
+ #--- unsupervised: PCA
+ def load_pcaFromPkl():
+     with open(m_kstrPcaModelPath, 'rb') as filPkl:
+         mdlAnoms = pickle.load(filPkl)
+     return mdlAnoms
+
+
+ #--- unsupervised: KMeans
+ def load_kmeansFromPkl():
+     with open(m_kstrKmeansModelPath, 'rb') as filPkl:
+         mdlAnoms = pickle.load(filPkl)
+     return mdlAnoms
+
+
+ def save_pcaToPkl(mdlAnoms):
+     with open(m_kstrPcaModelPath, 'wb') as filPkl:
+         pickle.dump(mdlAnoms, filPkl)
+     return mdlAnoms
+
+
+ def save_kmeansToPkl(mdlAnoms):
+     with open(m_kstrKmeansModelPath, 'wb') as filPkl:
+         pickle.dump(mdlAnoms, filPkl)
+     return mdlAnoms
+
+
+ #--- determine which points can be labelled against which clusters
+ def predict(pdfScaled):
+     #--- load a persisted fit kmeans model
+     #--- predict will assign labels onto a similarly scaled data frame
+
+     #--- Note: reverse chron through the code ...
+     #--- 4. KMeans was fit on X_reduced (22 cols)
+     #--- 3. X_reduced was a reduced column set of X_scaled (27 -> 22; dropped 5 cols: DeadOrNot; and hot-encoded Gender and Race)
+     #--- 2. X_scaled was transformed through stdScaler
+     #--- 1. StdScaler was fit on X to produce X_scaled (X has 27 cols)
+     pdfReduced = pdfScaled[['InscClaimAmtReimbursed', 'DeductibleAmtPaid',
+                             'AdmittedDays', 'RenalDiseaseIndicator', 'NoOfMonths_PartACov',
+                             'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
+                             'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
+                             'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
+                             'ChronicCond_Depression', 'ChronicCond_Diabetes',
+                             'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
+                             'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
+                             'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
+                             'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age']]
+
+     #--- prefit KMeans clustering - was fit on trained pdfReduced
+     #--- Note: if we want to understand how kmeans performs on test/prod data, we need to predict
+     mdlKMeans = load_kmeansFromPkl()
+     #ndaPredict = mdlKMeans.predict(pdfScaled)  #--- 20230208: ValueError: X has 27 features, but KMeans is expecting 22 features as input.
+     ndaPredict = mdlKMeans.predict(pdfReduced)
+     return ndaPredict
+
+
+ #--- feat eng
+ def do_featEng(pdfLoaded, blnIsTrain=False, hasGroupByProviderCols=True):
+     print("INFO (mdl_kmeans.doFeatEng): blnIsTrain, ", blnIsTrain)
+
+     #--- columns to remove
+     aryColsToDrop = ['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'AttendingPhysician',
+                      'OperatingPhysician', 'OtherPhysician', 'ClmDiagnosisCode_1',
+                      'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4',
+                      'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7',
+                      'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10',
+                      'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
+                      'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
+                      'ClmAdmitDiagnosisCode', 'AdmissionDt',
+                      'DischargeDt', 'DiagnosisGroupCode', 'DOB', 'DOD',
+                      'State', 'County']
+     pdfFeatEng = pdfLoaded.drop(columns=aryColsToDrop, axis=1)
+
+     #--- flag categorical cols
+     pdfFeatEng.Gender = pdfFeatEng.Gender.astype('category')
+     pdfFeatEng.Race = pdfFeatEng.Race.astype('category')
+
+     #--- one-hot-encoding
+     pdfFeatEng = pd.get_dummies(pdfFeatEng, columns=['Gender', 'Race'], drop_first=True)
+     if (blnIsTrain):
+         #--- one-hot encode the potential fraud column (for training data only)
+         try:
+             #print("INFO (claims.doFeatEng): one-hot encoding potential fraud")
+             pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'Yes', 'PotentialFraud'] = 1
+             pdfFeatEng.loc[pdfFeatEng['PotentialFraud'] == 'No', 'PotentialFraud'] = 0
+         except KeyError:
+             #--- likely column not found; invalid fxn call
+             print("ERROR (claims.doFeatEng): Potential Fraud col not found")
+
+     pdfFeatEng.loc[pdfFeatEng['RenalDiseaseIndicator'] == 'Y', 'RenalDiseaseIndicator'] = 1
+     pdfFeatEng['DeductibleAmtPaid'].fillna(0, inplace=True)
+     pdfFeatEng['AdmittedDays'].fillna(0, inplace=True)
+
+     #--- check for correlated cols
+
+     #--- add new features to assist with predictions
+     if (hasGroupByProviderCols):
+         pdfFeatEng['InscClaimReimbursement_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['InscClaimAmtReimbursed'].transform('mean')
+         pdfFeatEng['DeductibleAmtPaid_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['DeductibleAmtPaid'].transform('mean')
+
+         pdfFeatEng['IPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualReimbursementAmt'].transform('mean')
+         pdfFeatEng['IPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualDeductibleAmt'].transform('mean')
+
+         pdfFeatEng['OPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualReimbursementAmt'].transform('mean')
+         pdfFeatEng['OPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualDeductibleAmt'].transform('mean')
+     return pdfFeatEng
+
+
+ def fit(pdfScaled):
+     #--- determine the centroids of the kmeans clusters
+     #--- refit kmeans clustering according to the pre-scaled data provided
+     #--- note: this all assumes that the nature of the data and the number of clusters remain unchanged
+     m_klngNumClusters = 3
+     if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit) instantiate KMeans ...")
+     mdlKMeans = KMeans(n_clusters=m_klngNumClusters, max_iter=50, random_state=2022)  #--- n_clusters was learned from training
+
+     if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit) fitting data (scaled) ...")
+     mdlKMeans.fit(pdfScaled)  #--- fit on test/prod data
+
+     return mdlKMeans  #--- this object will give us all results based on kmeans
+
+
+ def train(pdfTrainData):
+     mdlAnoms = KMeans(n_clusters=3, max_iter=50, random_state=2022)
+     mdlAnoms.fit(pdfTrainData.values)
+     save_kmeansToPkl(mdlAnoms)
+     return mdlAnoms
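
The module separates a one-off fit/train (persisting centroids to pkl) from predict (loading the persisted model and labelling new rows). A hypothetical sketch of refitting and inspecting the clusters, where pdfScaled is assumed to be a stdScaled claims frame as produced above:

#--- hypothetical sketch: refit KMeans and inspect the resulting clusters
from lib.models import mdl_kmeans

mdlKMeans = mdl_kmeans.fit(pdfScaled)    #--- pdfScaled: assumed stdScaled claims frame
print(mdlKMeans.cluster_centers_.shape)  #--- (3, n_features)
print(mdlKMeans.inertia_)                #--- within-cluster sum of squares (cf. bin/models/kmn_elbow.png)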
lib/models/mdl_logR.py ADDED
@@ -0,0 +1,41 @@
+ from sklearn.linear_model import LogisticRegressionCV
+ import lib.utils as libPaths
+ import pickle
+
+
+ m_kstrFile = __file__
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrBinModelPath = libPaths.pth_binModels
+ m_kstrModelPath = m_kstrBinModelPath + 'lgr_model_colab.pkl'
+
+
+ #--- Supervised: Logistic Regression
+ def load_fromPkl():
+     with open(m_kstrModelPath, 'rb') as filPkl:
+         mdlAnoms = pickle.load(filPkl)
+     return mdlAnoms
+
+
+ def save_toPkl(mdlAnoms):
+     with open(m_kstrModelPath, 'wb') as filPkl:
+         pickle.dump(mdlAnoms, filPkl)
+     return mdlAnoms
+
+
+ def predict(npaData):
+     #--- input: numpy.ndarray of feature eng, and scaled data
+     mdlAnoms = load_fromPkl()
+     npaPredict = mdlAnoms.predict(npaData)
+
+     print("INFO (npaPredict.shape): ", npaPredict.shape)
+     return npaPredict
+
+
+ def train(pdfTrainData, pdfLabels):
+     #--- note: LogisticRegressionCV is supervised; fit requires labels (the original call omitted y)
+     mdlAnoms = LogisticRegressionCV()
+     mdlAnoms.fit(pdfTrainData.values, pdfLabels.values)
+     save_toPkl(mdlAnoms)
+     return mdlAnoms
lib/models/mdl_svm.py ADDED
@@ -0,0 +1,40 @@
+ from sklearn.svm import LinearSVC
+ import lib.utils as libPaths
+ import pickle
+
+
+ m_kstrFile = __file__
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrBinModelPath = libPaths.pth_binModels
+ m_kstrModelPath = m_kstrBinModelPath + 'svm_model_colab.pkl'
+
+
+ #--- Supervised: Support Vector Machines
+ def load_fromPkl():
+     with open(m_kstrModelPath, 'rb') as filPkl:
+         mdlAnoms = pickle.load(filPkl)
+     return mdlAnoms
+
+
+ def save_toPkl(mdlAnoms):
+     with open(m_kstrModelPath, 'wb') as filPkl:
+         pickle.dump(mdlAnoms, filPkl)
+     return mdlAnoms
+
+
+ def predict(npaData):
+     #--- input: numpy.ndarray of feature eng, and scaled data
+     mdlAnoms = load_fromPkl()
+     npaPredict = mdlAnoms.predict(npaData)
+     print("INFO (" + m_kstrFile + ".predict) npaPredict.shape: ", npaPredict.shape)
+     return npaPredict
+
+
+ def train(pdfTrainData, pdfLabels):
+     #--- note: LinearSVC is supervised; fit requires labels (the original call omitted y)
+     mdlAnoms = LinearSVC()
+     mdlAnoms.fit(pdfTrainData.values, pdfLabels.values)
+     save_toPkl(mdlAnoms)
+     return mdlAnoms
lib/models/mdl_utils.py ADDED
@@ -0,0 +1,256 @@
+ import pandas as pd
+ import pickle
+ import lib.utils as libPaths
+
+ m_blnTraceOn = False
+
+ #--- load, merge data from file
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrModelPath = libPaths.pth_model
+ m_kstrBinModelPath = libPaths.pth_binModels
+
+ #m_kstrScalerPath_claims = m_kstrBinModelPath + 'stdClaims_scaler_colab.pkl'  #--- does not work for scaling claims data; from v1.0.2; using 1.1.1
+ #m_kstrScalerPath_claims2 = m_kstrBinModelPath + 'std_scaler_unsuperv_colab.pkl'  #--- does not work; expects 32 features
+ #m_kstrScalerPath_claims = m_kstrBinModelPath + 'stdClaims_scaler_colab_v1.2.1.pkl'
+ m_kstrScalerPath_claims111 = m_kstrBinModelPath + 'claims_stdScaler_v1.1.1_27cols.pkl'
+ m_kstrScalerPath_claims121 = m_kstrBinModelPath + 'claims_stdScaler_v1.2.1_27cols.pkl'
+ m_kstrScalerPath_claims_py3816_sk111hp = m_kstrBinModelPath + 'claims_stdScl_py3816_sk111hp_27cols.pkl'
+ m_kstrScalerPath_claims = m_kstrScalerPath_claims_py3816_sk111hp
+
+ m_kstrScalerPath_providers111 = m_kstrBinModelPath + 'prov_stdScaler_v1.1.1_32cols.pkl'
+ m_kstrScalerPath_providers121 = m_kstrBinModelPath + 'prov_stdScaler_v1.2.1_32cols.pkl'
+ m_kstrScalerPath_prov_py3816_sk111 = m_kstrBinModelPath + 'prov_stdScl_py3816_sk111_32cols.pkl'
+ m_kstrScalerPath_prov_py3816_sk111hp = m_kstrBinModelPath + 'prov_stdScl_py3816_sk111hp_32cols.pkl'
+ m_kstrScalerPath_prov = m_kstrScalerPath_prov_py3816_sk111hp
+
+ m_kstrScalerPath_providers_superv = m_kstrBinModelPath + 'gbc_scaler.pkl'
+ m_kstrScalerPath_providers_train = m_kstrBinModelPath + "stdProvider_scaler.pkl"
+
+
+ def doProviders_stdScaler(pdfFeatEng, blnIsTrain=False, hasGroupByProviderCols=True):
+     print("INFO (mdlUtils.doProviders_stdScaler): blnIsTrain, ", blnIsTrain)
+
+     #--- Note: prediction runs on X_val
+     '''
+     #--- WARN: The default value of numeric_only in DataFrameGroupBy.sum is deprecated.
+     #          In a future version, numeric_only will default to False.  Either specify
+     #          numeric_only or select only columns which should be valid for the function.
+     '''
+
+     #--- WARN: this code groups all data by provider; any predictions will also be by provider
+     pdfGroupBy = pdfFeatEng
+     if (hasGroupByProviderCols):
+         pdfGroupBy = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+
+     X = pdfGroupBy
+
+     try:
+         X = X.drop(columns=['Provider'], axis=1)  #--- cannot scale; text
+     except KeyError:
+         #--- likely column not found; invalid fxn call
+         print("ERROR (mdlUtils.doProviders_stdScaler): Provider col not found")
+
+     try:
+         X = X.drop(columns=['PotentialFraud'], axis=1)
+     except KeyError:
+         #--- likely column not found; invalid fxn call
+         if (blnIsTrain): print("ERROR (mdlUtils.doProviders_stdScaler): Potential Fraud col not found")
+
+     #--- apply std scaler
+     #--- WARN: scaling is also grouped by provider
+     if (m_blnTraceOn): print("INFO (mdlUtils.doProviders_stdScaler) cols: ", X.columns)  #--- 32 cols
+     X_std = fitProviders_txfStdScaler(X, blnIsTrain)
+     return X_std
+
+
+ def doClaims_stdScaler(pdfFeatEng, blnIsTrain=False):
+     print("INFO (mdlUtils.doClaims_stdScaler): blnIsTrain, ", blnIsTrain)
+
+     #--- Note: prediction runs on X_val
+     '''
+     #--- WARN: The default value of numeric_only in DataFrameGroupBy.sum is deprecated.
+     #          In a future version, numeric_only will default to False.  Either specify
+     #          numeric_only or select only columns which should be valid for the function.
+     '''
+
+     #--- note: claims-level scaling; unlike doProviders_stdScaler, the data is not grouped by provider
+     X = pdfFeatEng
+
+     try:
+         X = X.drop(columns=['Provider'], axis=1)  #--- cannot scale; text
+     except KeyError:
+         #--- likely column not found; invalid fxn call
+         print("ERROR (mdlUtils.doClaims_stdScaler): Provider col not found")
+
+     try:
+         X = X.drop(columns=['PotentialFraud'], axis=1)
+     except KeyError:
+         #--- likely column not found; invalid fxn call
+         if (blnIsTrain): print("ERROR (mdlUtils.doClaims_stdScaler): Potential Fraud col not found")
+
+     #--- apply std scaler (claims level; not grouped by provider)
+     #print("INFO (mdlUtils.doClaims_stdScaler) cols: ", X.columns)
+     X_std = fitClaims_txfStdScaler(X, blnIsTrain)
+     return X_std
+
+
+ def doProviders_stdScaler_toPdf(npaScaled):
+     #--- NOTE: the list of cols came from doProviders_stdScaler; print(X.columns)
+     aryCols = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'AdmittedDays',
+                'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
+                'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
+                'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
+                'ChronicCond_Depression', 'ChronicCond_Diabetes',
+                'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
+                'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
+                'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
+                'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age', 'DeadOrNot',
+                'Gender_2', 'Race_2', 'Race_3', 'Race_5',
+                'ClaimReimbursement_ProviderAvg',
+                'ClaimReimbursement_AttendingPhysician',
+                'ClaimReimbursement_OperatingPhysician',
+                'DeductibleAmtPaid_ProviderAvg', 'DeductibleAmtPaid_AttendingPhysician',
+                'DeductibleAmtPaid_OperatingPhysician']
+
+     #npaScaled = do_stdScaler(pdfFeatEng)
+     pdfScaled = pd.DataFrame(npaScaled, columns=aryCols)
+     return pdfScaled
+
+
+ def doClaims_stdScaler_toPdf(npaScaled):
+     #--- NOTE: the list of cols came from doClaims_stdScaler; print(X.columns)
+     aryCols = ['InscClaimAmtReimbursed', 'DeductibleAmtPaid', 'AdmittedDays',
+                'RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
+                'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
+                'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
+                'ChronicCond_Depression', 'ChronicCond_Diabetes',
+                'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
+                'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke',
+                'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
+                'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt', 'Age', 'DeadOrNot',
+                'Gender_2', 'Race_2', 'Race_3', 'Race_5']
+
+     #npaScaled = do_stdScaler(pdfFeatEng)
+     pdfScaled = pd.DataFrame(npaScaled, columns=aryCols)
+     return pdfScaled
+
+
+ def fitClaims_stdScaler(pdfData, blnIsTrain=False):
+     #--- apply scaler
+     #--- WARN: scaling is not grouped by provider
+     from sklearn.preprocessing import StandardScaler
+
+     #--- note: this is a numpy.ndarray
+     #--- we need to fit the scaler, and then save as a pkl file
+     strScalerPath = m_kstrScalerPath_claims
+     # strScalerPath = m_kstrBinModelPath + "stdClaims_scaler_colab.pkl"
+     if (m_blnTraceOn): print("INFO (lib.model.fitClaims_stdScaler): ", strScalerPath)
+     if (blnIsTrain):
+         scaler = StandardScaler()
+         sclFit = scaler.fit(pdfData)
+         #--- if we train locally, write out to stdClaims_scaler.pkl
+         #--- we do not want to overwrite the colab version used for test
+         strScalerPath = m_kstrBinModelPath + "stdClaims_scaler.pkl"
+         if (m_blnTraceOn): print("INFO (lib.model.fitClaims_stdScaler) Using local pkl for Train: ", strScalerPath)
+         with open(strScalerPath, 'wb') as filPkl:
+             pickle.dump(sclFit, filPkl)
+     else:
+         #--- we need to load the pkl file
+         import sklearn
+         if (m_blnTraceOn): print("INFO (lib.model.fitClaims_stdScaler) Using colab pkl for Test: ", strScalerPath)
+         with open(strScalerPath, 'rb') as filPkl:
+             sclFit = pickle.load(filPkl)
+         if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScaler) sclFit.type: ", type(sclFit))
+
+         #--- testing: compare the pickled scaler's sklearn version against the runtime's
+         scaler = StandardScaler()
+         if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScaler) StdScaler.version: ", scaler.__getstate__()['_sklearn_version'])
+         if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScaler) sclFit.version: ", sclFit.__getstate__()['_sklearn_version'])
+         if (m_blnTraceOn): print("TRACE (libModel.fitClaims_stdScaler) sklearn.version: ", sklearn.__version__)
+     return sclFit
+
+
+ def fitProviders_stdScaler(pdfData, blnIsTrain=False):
+     #--- apply scaler
+     #--- WARN: scaling is also grouped by provider
+     from sklearn.preprocessing import StandardScaler
+
+     #--- note: this is a numpy.ndarray
+     #--- we need to fit the scaler, and then save as a pkl file
+     #strScalerPath = m_kstrScalerPath_providers
+     #strScalerPath = m_kstrScalerPath_providers_train
+     strScalerPath = m_kstrScalerPath_prov
+     print("INFO (libModel.fitProviders_stdScaler): ", strScalerPath)
+     if (blnIsTrain):
+         scaler = StandardScaler()
+         sclFit = scaler.fit(pdfData)
+         #--- if we train locally, write out to stdProvider_scaler.pkl
+         #--- we do not want to overwrite the colab version used for test
+         strScalerPath = m_kstrScalerPath_providers_train  #--- works for provider training
+         if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScaler) Using local pkl for Train: ", strScalerPath)
+         with open(strScalerPath, 'wb') as filPkl:
+             pickle.dump(sclFit, filPkl)
+     else:
+         #--- we need to load the pkl file
+         if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScaler) Using colab pkl for Test: ", strScalerPath)
+         with open(strScalerPath, 'rb') as filPkl:
+             sclFit = pickle.load(filPkl)
+         if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScaler) sclFit.type: ", type(sclFit))
+     return sclFit
+
+
+ def fitProviders_stdScalerSuperv(pdfData, blnIsTrain=False):
+     #--- apply scaler
+     #--- WARN: scaling is also grouped by provider
+     from sklearn.preprocessing import StandardScaler
+
+     #--- note: this is a numpy.ndarray
+     #--- we need to fit the scaler, and then save as a pkl file
+     strScalerPath = m_kstrScalerPath_prov
+     if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalerSuperv): ", strScalerPath)
+     if (blnIsTrain):
+         scaler = StandardScaler()
+         sclFit = scaler.fit(pdfData)
+         #--- if we train locally, write out to stdProvider_scaler.pkl
+         #--- we do not want to overwrite the colab version used for test
+         strScalerPath = m_kstrBinModelPath + "stdProvider_scaler.pkl"
+         if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalerSuperv) Using local pkl for Train: ", strScalerPath)
+         with open(strScalerPath, 'wb') as filPkl:
+             pickle.dump(sclFit, filPkl)
+     else:
+         #--- we need to load the pkl file
+         if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalerSuperv) Using colab pkl for Test: ", strScalerPath)
+         with open(strScalerPath, 'rb') as filPkl:
+             sclFit = pickle.load(filPkl)
+         if (m_blnTraceOn): print("TRACE (libModel.fitProviders_stdScalerSuperv) sclFit.type: ", type(sclFit))
+     return sclFit
+
+
+ def fitProviders_txfStdScaler(pdfData, blnIsTrain=False):
+     from sklearn.preprocessing import StandardScaler
+     sclFit = fitProviders_stdScaler(pdfData, blnIsTrain)
+     X_std = sclFit.transform(pdfData)
+     return X_std
+
+
+ def fitClaims_txfStdScaler(pdfData, blnIsTrain=False):
+     from sklearn.preprocessing import StandardScaler
+     sclFit = fitClaims_stdScaler(pdfData, blnIsTrain)
+     X_std = sclFit.transform(pdfData)
+     return X_std
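
The pattern throughout this module: fit the StandardScaler once on training data and pickle it, then unpickle and transform at inference so that test/prod data is scaled identically; the version-check traces above exist because unpickling across scikit-learn versions can fail. A minimal hypothetical sketch of that round trip:

#--- hypothetical sketch of the fit-once / transform-later scaler pattern
import pickle
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)  #--- X_train: assumed training feature frame
with open('bin/models/example_scaler.pkl', 'wb') as filPkl:
    pickle.dump(scaler, filPkl)  #--- persist alongside the model pkl

with open('bin/models/example_scaler.pkl', 'rb') as filPkl:
    sclFit = pickle.load(filPkl)  #--- at inference time
X_test_std = sclFit.transform(X_test)  #--- X_test: assumed; must carry the same columns, in order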
lib/models/mdl_xgb.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+ from sklearn.ensemble import GradientBoostingClassifier
+ import lib.utils as libPaths
+ import pickle
+ import sys
+
+
+ m_kstrFile = __file__
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrBinModelPath = libPaths.pth_binModels
+ m_kstrModelPath_gbc = m_kstrBinModelPath + 'gbc_model_colab.pkl'
+ m_kstrModelPath_prov111 = m_kstrBinModelPath + 'prov_gbc_v1.1.1_32cols.pkl'  #--- ERROR: __randomstate_ctor() takes from 0 to 1 positional arguments but 2 were given
+ m_kstrModelPath_prov121 = m_kstrBinModelPath + 'prov_gbc_v1.2.1_32cols.pkl'
+ m_kstrModelPath_prov_py3816_sk111hp = m_kstrBinModelPath + 'prov_gbc_py3816_sk111hp_32cols.pkl'
+ m_kstrModelPath = m_kstrModelPath_prov_py3816_sk111hp
+
+ m_blnTraceOn = True
+
+
+ #--- Supervised: xg boost; gradient boosting classifier
+ def load_fromPkl():
+     try:
+         with open(m_kstrModelPath, 'rb') as filPkl:
+             mdlAnoms = pickle.load(filPkl)
+         return mdlAnoms
+
+     except Exception:
+         e = sys.exc_info()
+         print("ERROR (mdl_xgb.load_fromPkl_genError): ", e)
+
+
+ def save_toPkl(mdlAnoms):
+     with open(m_kstrModelPath, 'wb') as filPkl:
+         pickle.dump(mdlAnoms, filPkl)
+     return mdlAnoms
+
+
+ def predict(npaData):
+
+     try:
+         #--- input: numpy.ndarray of feature eng, and scaled data
+         mdlAnoms = load_fromPkl()
+         if (m_blnTraceOn): print("TRACE (mdl_xgb.predict): data loaded ... ")
+         npaPredict = mdlAnoms.predict(npaData)
+
+     except Exception:
+         e = sys.exc_info()
+         print("ERROR (mdl_xgb.predict_genError1): ", e)
+         raise
+
+     #--- AttributeError: 'GradientBoostingClassifier' object has no attribute '_loss'
+     #--- version of scikit-learn?  Monika: ?.?.? ; Iain: 1.2.0
+
+     #print("INFO (type.npaPredict): ", type(npaPredict))
+     #if (m_blnTraceOn): print("TRACE (mdl_xgb.predict) npaPredict.shape: ", npaPredict.shape)
+     return npaPredict
+
+
+ def train(pdfTrainData, pdfLabels):
+     #--- note: GradientBoostingClassifier is supervised; fit requires labels (the original call omitted y)
+     mdlAnoms = GradientBoostingClassifier()
+     mdlAnoms.fit(pdfTrainData.values, pdfLabels.values)
+     save_toPkl(mdlAnoms)
+     return mdlAnoms
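
The persisted model here is the provider-level classifier (32 cols, with PotentialFraud as the target). Training would look roughly as follows; this is a hypothetical sketch, and the label/column handling is assumed rather than taken from this commit:

#--- hypothetical sketch: supervised training of the provider-level classifier
import lib.claims as libClaims
import lib.providers as libProviders
from lib.models import mdl_xgb

pdfClaimsFE = libClaims.do_featEng(libClaims.load_claims(blnIsTrain=True), True)
pdfGrouped = libProviders.do_featEng(pdfClaimsFE, True).groupby(['Provider'], as_index=False).agg('sum')
y = (pdfGrouped.pop('PotentialFraud') > 0).astype(int)  #--- provider flagged if any claim was
X = pdfGrouped.drop(columns=['Provider'], axis=1)
mdlGbc = mdl_xgb.train(X, y)  #--- uses the amended train(X, y) above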
lib/providers.py ADDED
@@ -0,0 +1,170 @@
+ import pandas as pd
+ import lib.utils as libPaths
+ import lib.claims as libClaims
+
+ from lib.models import mdl_utils, mdl_xgb, mdl_logR, mdl_svm
+ from lib.models import mdl_autoenc, mdl_kmeans
+ import sys
+
+ m_blnTraceOn = True
+ m_blnTrace2On = False
+
+ #--- load, merge data from file
+ m_kstrDataPath = libPaths.pth_data
+ m_kstrModelPath = libPaths.pth_model
+ m_kstrBinModelPath = libPaths.pth_binModels
+
+
+ def load_providers(blnIsTrain=False):
+
+     pdfClaims = libClaims.loadPkl_claims(blnIsTrain)
+     pdfClaims = pdfClaims.drop(['ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
+                                 'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
+                                 'Gender', 'Race', 'County'], axis=1)
+     pdfProviders = pdfClaims.groupby(['Provider'], as_index=False).agg('sum')
+     return pdfProviders
+
+
+ #--- feat eng
+ def do_featEng(pdfClaimsFeatEng, blnIsTrain=False):
+     if (m_blnTraceOn): print("TRACE (providers.doFeatEng): blnIsTrain, ", blnIsTrain)
+     pdfFeatEng = pdfClaimsFeatEng
+
+     #--- add new features to assist with predictions
+     pdfFeatEng['InscClaimReimbursement_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['InscClaimAmtReimbursed'].transform('mean')
+     pdfFeatEng['DeductibleAmtPaid_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['DeductibleAmtPaid'].transform('mean')
+
+     pdfFeatEng['IPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualReimbursementAmt'].transform('mean')
+     pdfFeatEng['IPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['IPAnnualDeductibleAmt'].transform('mean')
+
+     pdfFeatEng['OPAnnualReimbursementAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualReimbursementAmt'].transform('mean')
+     pdfFeatEng['OPAnnualDeductibleAmt_ProviderAvg'] = pdfFeatEng.groupby(['Provider'])['OPAnnualDeductibleAmt'].transform('mean')
+     return pdfFeatEng
+
+
+ def get_logrPredict(pdfTestClaims):
+
+     #--- logistic regression predictions; load test data
+     pdfClaims = pdfTestClaims
+     #print("INFO (providers.get_logrPredict) pdfClaims.shape): ", pdfClaims.shape)
+
+     pdfFeatEng = do_featEng(pdfClaims, False)
+     npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)
+     pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+     #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+     ndaPredict = mdl_logR.predict(npaScaled)
+     #print("INFO (predict.npaPredict.shape): ", ndaPredict.shape)
+
+     pdfPredict = pd.DataFrame(ndaPredict)
+     #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+     #--- stitch the grouped data with the labels
+     pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+     #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+     pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+     return pdfResults
+
+
+ def get_svmPredict(pdfTestClaims):
+
+     #--- support vector machine predictions; load test data
+     pdfClaims = pdfTestClaims
+     if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) pdfClaims.shape: ", pdfClaims.shape)
+
+     pdfFeatEng = do_featEng(pdfClaims, False)
+     npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)
+     pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+     if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) npaScaled.shape: ", npaScaled.shape)
+
+     ndaPredict = mdl_svm.predict(npaScaled)
+     if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) npaPredict.shape: ", ndaPredict.shape)
+
+     pdfPredict = pd.DataFrame(ndaPredict)
+     if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) pdfPredict.shape: ", pdfPredict.shape)
+
+     #--- stitch the grouped data with the labels
+     pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+     if (m_blnTraceOn): print("TRACE (providers.get_svmPredict) pdfResults.shape: ", pdfResults.shape)
+
+     pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+     return pdfResults
+
+
+ def get_xgbPredict(pdfTestClaims):
+
+     try:
+         #--- load test data
+         pdfClaims = pdfTestClaims
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) pdfClaims.shape): ", pdfClaims.shape)
+
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) doFeatEng (provider) ... ")
+         pdfFeatEng = do_featEng(pdfClaims, False)
+
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) doStdScaler ... ")
+         npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)
+
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) doStdScaler_toPdf ... ")
+         pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+         #if (m_blnTraceOn): print("TRACE (predict.npaScaled.shape1): ", npaScaled.shape)
+
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) run prediction ... ")
+         ndaPredict = mdl_xgb.predict(npaScaled)
+         #if (m_blnTraceOn): print("TRACE (predict.npaPredict.shape2): ", ndaPredict.shape)
+
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) convert to dataframe ... ")
+         pdfPredict = pd.DataFrame(ndaPredict)
+         pdfAnoms = pdfPredict[pdfPredict[0] > 0]
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) pdfPredict.shape: ", pdfPredict.shape)
+         if (m_blnTraceOn): print("TRACE (providers.get_xgbPredict) #anoms: ", len(pdfAnoms.index))
+
+         #--- group data by provider
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) group claims by provider ... ")
+         pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) pdfResults.shape: ", pdfResults.shape)
+
+         #--- stitch the grouped data with the labels
+         if (m_blnTrace2On): print("TRACE (providers.get_xgbPredict) merge labels into dataset ... ")
+         pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+
+     except Exception:
+         e = sys.exc_info()
+         print("ERROR (providers.get_xgbPredict_genError): ", e)
+         raise  #--- note: without re-raising, pdfResults would be unbound below
+
+     if (m_blnTraceOn): print("TRACE (providers.get_xgbPredict) proc complete; return ... ")
+     return pdfResults
+
+
+ def get_encPredict(pdfTestClaims):
+
+     #--- autoencoder (PCA-based) predictions; load test data
+     pdfClaims = pdfTestClaims
+     if (m_blnTraceOn): print("TRACE (providers.get_encPredict) pdfClaims.shape: ", pdfClaims.shape)
+
+     pdfFeatEng = do_featEng(pdfClaims, False)  #--- not grouped by provider
+
+     #--- perform standard scaling; get fit then transform
+     npaScaled = mdl_utils.doProviders_stdScaler(pdfFeatEng, False)  #--- grouped by provider
+     pdfScaled = mdl_utils.doProviders_stdScaler_toPdf(npaScaled)
+     #print("INFO (predict.npaScaled.shape): ", npaScaled.shape)
+
+     #--- perform PCA; then autoencode predict
+     ndaPredict = mdl_autoenc.predict(pdfScaled)
+     #print("INFO (predict.npaPredict.shape): ", ndaPredict.shape)
+
+     pdfPredict = pd.DataFrame(ndaPredict)
+     #print("INFO (predict.pdfPredict.shape): ", pdfPredict.shape)
+
+     #--- stitch the grouped data with the labels
+     pdfResults = pdfFeatEng.groupby(['Provider'], as_index=False).agg('sum')
+     #print("INFO (predict.pdfGrpFeatEng.shape): ", pdfResults.shape)
+
+     pdfResults.insert(0, "hasAnom?", pdfPredict[0])
+     return pdfResults
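
End to end, a hypothetical call path for the supervised provider check:

#--- hypothetical sketch: flag anomalous providers in the test claims
import lib.claims as libClaims
import lib.providers as libProviders

pdfClaims = libClaims.load_claims(blnIsTrain=False)
pdfResults = libProviders.get_xgbPredict(pdfClaims)
print(pdfResults[pdfResults['hasAnom?'] > 0][['Provider', 'hasAnom?']].head())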
lib/utils.py ADDED
@@ -0,0 +1,23 @@
+ #--- note: this file is loaded by fastapi and streamlit,
+ #          so keep it independent of those libs
+
+ from pathlib import Path
+
+ pth_pwd = Path(__file__).resolve().parent
+ pth_appRoot = pth_pwd.parent
+
+ pth_root = str(pth_appRoot) + "/"
+ pth_api = pth_root + "api/"
+ pth_bin = pth_root + "bin/"
+ pth_binModels = pth_root + "bin/models/"
+ pth_data = pth_root + "data/"
+ pth_lib = pth_root + "lib/"
+ pth_libModels = pth_root + "models/"
+ pth_model = pth_root + "model/"
+ pth_qa = pth_root + "qa/"
+ pth_routes = pth_root + "routes/"
+ pth_templ = pth_root + "templ/"
+ pth_uix = pth_root + "uix/"
+
+ m_klngMaxRecords = 100
+ m_klngSampleSize = 25
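
Because pth_appRoot is derived from this file's own location (lib/utils.py), the constants resolve relative to the repo root regardless of the current working directory; a hypothetical sanity check:

#--- hypothetical sanity check of the path constants
import lib.utils as libPaths

print(libPaths.pth_data)       #--- <repo_root>/data/
print(libPaths.pth_binModels)  #--- <repo_root>/bin/models/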
lit_index.py ADDED
@@ -0,0 +1,25 @@
+ '''
+ toExecute:  (from root app folder) ... streamlit run lit_index.py
+ '''
+ import streamlit as st
+ #from uix import lit_sidebar as lit_sideBar
+ import uix.lit_sidebar as litSideBar
+
+
+ #--- streamlit: specify title and logo
+ st.set_page_config(
+     page_title='Healthcare Claims - ML Anomaly Detection',
+     #page_icon='https://cdn.freebiesupply.com/logos/thumbs/1x/nvidia-logo.png',
+     layout="wide")
+ st.header("Healthcare ML Claims Anomaly Detection")
+ st.markdown('---')
+
+
+ #--- streamlit: add a sidebar
+ litSideBar.init()
+
+
+ #if __name__ == '__main__':
+ #    st.run("main:app", host="0.0.0.0", port=48300, reload=True)
+
+ #aryPkg[moduleNames.index(page)].run()
main.py ADDED
@@ -0,0 +1,97 @@
+ '''
+ purpose:  fastAPI entry point; configures the app and its route handlers
+ '''
+
+ from fastapi import FastAPI
+ from fastapi.responses import HTMLResponse
+ from fastapi import APIRouter, Request, Response
+ from fastapi.templating import Jinja2Templates
+ import uvicorn
+
+
+ from lib import claims as libClaims, providers as libProviders
+ import lib.utils as libUtils
+ from lib.models import mdl_utils as libMdlUtils
+
+
+ #--- imported route handlers
+ from routes.api.rte_api import rteApi
+ from routes.qa.rte_qa import rteQa
+ from routes.qa.rte_claims import rteClaims
+ from routes.qa.rte_providers import rteProv
+
+
+ #--- fastAPI self doc descriptors
+ description = """
+ Fourthbrain Capstone:  MLE10 Cohort
+
+ The Healthcare Claims Anomaly API is provided to assist with:
+
+ ## Claims Analysis
+ ## Supervised Provider Predictions - Anomaly Detection (XGBoost)
+ ## Unsupervised Claim Predictions - Anomaly Detection (KMeans Cluster)
+
+ You will be able to:
+ * Analyze Claims data
+ * Identify potential Provider Anomalies
+ * Identify potential Claim Anomalies
+ """
+
+ app = FastAPI(
+     title="App: Healthcare Claims - Anomaly Detection",
+     description=description,
+     version="0.0.1",
+     terms_of_service="http://example.com/terms/",
+     contact={
+         "name": "Iain McKone",
+         "email": "iain.mckone@gmail.com",
+     },
+     license_info={
+         "name": "Apache 2.0",
+         "url": "https://www.apache.org/licenses/LICENSE-2.0.html",
+     },
+ )
+
+
+ #--- configure route handlers
+ app.include_router(rteApi, prefix="/api")
+ app.include_router(rteQa, prefix="/qa")
+ app.include_router(rteClaims, prefix="/claims")
+ app.include_router(rteProv, prefix="/providers")
+
+
+ m_kstrPath_templ = libUtils.pth_templ
+ m_templRef = Jinja2Templates(directory=str(m_kstrPath_templ))
+
+
+ def get_jinja2Templ(request: Request, pdfResults, strParamTitle, lngNumRecords, blnIsTrain=False, blnIsSample=False):
+     lngNumRecords = min(lngNumRecords, libUtils.m_klngMaxRecords)
+     if (blnIsTrain): strParamTitle = strParamTitle + " - Training Data"
+     if (not blnIsTrain): strParamTitle = strParamTitle + " - Test Data"
+     if (blnIsSample): lngNumRecords = libUtils.m_klngSampleSize
+     strParamTitle = strParamTitle + " - max " + str(lngNumRecords) + " rows"
+
+     pdfClaims = pdfResults.sample(lngNumRecords)
+     htmlClaims = pdfClaims.to_html(classes='table table-striped')
+     kstrTempl = 'templ_showDataframe.html'
+     jsonContext = {'request': request,
+                    'paramTitle': strParamTitle,
+                    'paramDataframe': htmlClaims
+                    }
+     result = m_templRef.TemplateResponse(kstrTempl, jsonContext)
+     return result
+
+
+ #--- get main ui/ux entry point
+ @app.get('/')
+ def index():
+     return {
+         "message": "Landing page: Capstone Healthcare Anomaly Detection"
+     }
+
+
+ if __name__ == '__main__':
+     uvicorn.run("main:app", host="0.0.0.0", port=48300, reload=True)
+     #CMD ["uvicorn", "main:app", "--host=0.0.0.0", "--reload"]
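
With the service running (uvicorn main:app on port 48300, as above), the routes can be exercised from Python; a hypothetical smoke test using the requests library:

#--- hypothetical smoke test against the running service
import requests

kstrBase = 'http://localhost:48300'
print(requests.get(kstrBase + '/').json())                 #--- landing message
aryClaims = requests.get(kstrBase + '/api/claims').json()  #--- first 50 merged claim records
print(len(aryClaims))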
routes/__init__.py ADDED
File without changes
routes/api/__init__.py ADDED
File without changes
routes/api/rte_api.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Request, Response
2
+ from fastapi.responses import JSONResponse
3
+
4
+ import pandas as pd
5
+ import json
6
+
7
+ import lib.claims as libClaims
8
+ from lib.models import mdl_utils, mdl_xgb
9
+
10
+
11
+ rteApi = APIRouter()
12
+
13
+
14
+ #--- return json for claims data (merged)
15
+ #--- note: current is kaggle, but future could include from yyyymm filter
16
+ @rteApi.get('/claims', response_class = JSONResponse)
17
+ def api_getClaims(request: Request, response: Response):
18
+ pdfClaims = libClaims.load_claims()
19
+ jsonSample = pdfClaims.head(50).to_json(orient="records", indent=4)
20
+ result = json.loads(jsonSample)
21
+ return result
22
+
23
+
24
+ #--- return json for featEng
25
+ @rteApi.get('/claims/doFeatEng/', response_class = JSONResponse)
26
+ def tst_claims_featEng():
27
+ pdfClaims = libClaims.load_claims()
28
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
29
+ jsonSample = pdfClaims.head(50).to_json(orient="records", indent=4)
30
+ result = json.loads(jsonSample)
31
+ return result
32
+
33
+
34
+ @rteApi.get('/claims/doStdScaling/', response_class = JSONResponse)
35
+ def tst_claims_stdScaling():
36
+ pdfClaims = libClaims.load_claims()
37
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
38
+ pdfScaled = mdl_utils.doClaims_stdScaler_toPdf(pdfFeatEng)
39
+
40
+ jsonSample = pdfScaled.head(50).to_json(orient="records", indent=4)
41
+ result = json.loads(jsonSample)
42
+ return result
43
+
44
+
45
+ @rteApi.get('/claims/predict/superv', response_class = JSONResponse)
46
+ @rteApi.get('/claims/predict/xgb', response_class = JSONResponse)
47
+ def predict_xgb():
48
+ #--- load test data
49
+ pdfClaims = libClaims.load_claims()
50
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
51
+
52
+ npaScaled = mdl_utils.do_stdScaler(pdfFeatEng)
53
+ pdfScaled = mdl_utils.do_stdScaler_toPdf(npaScaled)
54
+
55
+ ndaPredict = mdl_xgb.predict(npaScaled)
56
+ pdfPredict = pd.DataFrame(ndaPredict)
57
+
58
+ #--- stitch the grouped data with the labels
59
+ pdfResults = pdfScaled.copy()
60
+ pdfResults.insert(0, "hasAnom?", pdfPredict[0])
61
+
62
+ #--- filter to only those rows that are flagged with an anomaly
63
+ pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]
64
+
65
+ jsonSample = pdfResults.head(50).to_json(orient="records", indent=4)
66
+ result = json.loads(jsonSample)
67
+ return result
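For a running instance the same endpoints can be hit over HTTP; a hedged sketch using requests, assuming uvicorn was started on port 48300 as in main.py:

```python
# sketch: call the JSON endpoints of a running instance (host/port per main.py)
import requests

BASE = "http://localhost:48300/api"

claims = requests.get(f"{BASE}/claims", timeout=60).json()
print(len(claims), "claim records returned; sample keys:", list(claims[0])[:5])

# the supervised prediction route returns only rows with hasAnom? > 0
anoms = requests.get(f"{BASE}/claims/predict/xgb", timeout=300).json()
print(len(anoms), "anomalous rows")
```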
routes/qa/__init__.py ADDED
File without changes
routes/qa/rte_claims.py ADDED
@@ -0,0 +1,139 @@
1
+ from fastapi import APIRouter, Request, Response
2
+ from fastapi.responses import HTMLResponse
3
+
4
+
5
+ import main as libMain
6
+ from lib import utils as libUtils, claims as libClaims
7
+ from lib.models import mdl_utils as libMdlUtils
8
+
9
+
10
+ m_kstrFile = __file__
11
+ m_blnTraceOn = True
12
+
13
+ m_kstrPath_templ = libUtils.pth_templ
14
+
15
+
16
+ rteClaims = APIRouter()
17
+
18
+
19
+
20
+ #--- get claims data
21
+ def claims_loadData(request: Request, response: Response, blnIsTrain=False, blnIsSample=False, blnForceCsv=False):
22
+
23
+ pdfClaims = libClaims.load_claims(blnIsTrain)
24
+ lngNumRecords = libUtils.m_klngMaxRecords
25
+ strParamTitle = "Claims"
26
+
27
+ return libMain.get_jinja2Templ(request, pdfClaims, strParamTitle, lngNumRecords, blnIsTrain, blnIsSample)
28
+
29
+
30
+
31
+ @rteClaims.get('/data/loadCsv/', response_class = HTMLResponse)
32
+ def claims_loadCsv(request: Request, response: Response):
33
+ #--- forces a reload of csv's in case a refresh is required
34
+ pdfClaims = libClaims.load_claims(False, True)
35
+ pdfClaims = libClaims.load_claims(True, True)
36
+ return claims_loadData(request, response, True, False)
37
+
38
+
39
+
40
+ @rteClaims.get('/data/train/', response_class = HTMLResponse)
41
+ def claims_loadTrainData(request: Request, response: Response, blnIsSample=False):
42
+ return claims_loadData(request, response, True, blnIsSample)
43
+
44
+
45
+
46
+ @rteClaims.get('/data/train/sample', response_class = HTMLResponse)
47
+ def claims_loadTrainSample(request: Request, response: Response):
48
+ return claims_loadTrainData(request, response, True)
49
+
50
+
51
+
52
+ @rteClaims.get('/data/test/', response_class = HTMLResponse)
53
+ def claims_loadTestData(request: Request, response: Response, blnIsSample=False):
54
+ return claims_loadData(request, response, False, blnIsSample)
55
+
56
+
57
+
58
+ @rteClaims.get('/data/test/sample', response_class = HTMLResponse)
59
+ def claims_loadTestSample(request: Request, response: Response):
60
+ return claims_loadTestData(request, response, True)
61
+
62
+
63
+
64
+ @rteClaims.get('/doStdScaling/', response_class = HTMLResponse)
65
+ def claims_stdScaling(request: Request, response: Response, blnIsTrain=False):
66
+ pdfClaims = libClaims.load_claims(blnIsTrain)
67
+ pdfFeatEng = libClaims.do_featEng(pdfClaims, blnIsTrain, False)
68
+ npaScaled = libMdlUtils.doClaims_stdScaler(pdfFeatEng, blnIsTrain)
69
+ pdfScaled = libMdlUtils.doClaims_stdScaler_toPdf(npaScaled)
70
+
71
+ lngNumRecords = libUtils.m_klngMaxRecords
72
+ blnIsSample = True
73
+
74
+ strParamTitle = "Std Scaled Claims"
75
+ return libMain.get_jinja2Templ(request, pdfScaled, strParamTitle, lngNumRecords, blnIsTrain, blnIsSample)
76
+
77
+
78
+
79
+ @rteClaims.get('/doStdScaling/train', response_class = HTMLResponse)
80
+ def claims_stdScalingTrain(request: Request, response: Response):
81
+ return claims_stdScaling(request, response, True)
82
+
83
+
84
+
85
+ @rteClaims.get('/doStdScaling/test', response_class = HTMLResponse)
86
+ def claims_stdScalingTest(request: Request, response: Response):
87
+ return claims_stdScaling(request, response, False)
88
+
89
+
90
+
91
+ @rteClaims.get('/doFeatEng/', response_class = HTMLResponse)
92
+ def claims_doFeatEng(request: Request, response: Response, blnIsTrain=False):
93
+ pdfClaims = libClaims.load_claims(blnIsTrain)
94
+ pdfFeatEng_claims = libClaims.do_featEng(pdfClaims, blnIsTrain)
95
+
96
+ lngNumRecords = libUtils.m_klngMaxRecords
97
+ blnIsSample = True
98
+
99
+ strParamTitle = "Feature Engineered Claims"
100
+
101
+ return libMain.get_jinja2Templ(request, pdfFeatEng_claims, strParamTitle,
102
+ lngNumRecords, blnIsTrain, True)
103
+
104
+
105
+
106
+ @rteClaims.get('/predict/kmeans', response_class = HTMLResponse)
107
+ def predict_kmeans(request: Request, response: Response):
108
+
109
+ #--- load test data, perform featEng, stdScaling, and fit to Kmeans args
110
+ pdfClaims = libClaims.load_claims(False)
111
+ print("TRACE: claims.predict.kmeans getting prediction ...")
112
+ pdfResults = libClaims.get_kmeansPredict(pdfClaims)
113
+ print("TRACE: claims.predict.kmeans prepping response ...")
114
+
115
+ lngNumRecords = libUtils.m_klngMaxRecords
116
+ blnIsSample = False
117
+ strParamTitle = "Predictions (KMeans Clusters)"
118
+
119
+ return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
120
+ lngNumRecords, False, blnIsSample)
121
+
122
+
123
+
124
+ @rteClaims.get('/fit/kmeans', response_class = HTMLResponse)
125
+ def fit_kmeans(request: Request, response: Response):
126
+
127
+ #--- load test data, perform featEng, stdScaling, and fit to Kmeans args
128
+ pdfClaims = libClaims.load_claims(False)
129
+ mdlKMeans = libClaims.get_kmeansFit(pdfClaims)
130
+
131
+ #--- inspect KMeans data; clusters, centers, sizes
132
+ #lstCenters = mdlKMeans.cluster_centers_
133
+ lstIdx = range(len(mdlKMeans.cluster_centers_))
134
+ if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit_kmeans) lstIdx: ", lstIdx)
135
+
136
+ lstSize = [sum(mdlKMeans.labels_ == idx) for idx in lstIdx]
137
+ if (m_blnTraceOn): print("TRACE (" + m_kstrFile + ".fit_kmeans) lstSize: ", lstSize)
138
+
139
+ return
routes/qa/rte_providers.py ADDED
@@ -0,0 +1,188 @@
1
+ from fastapi import APIRouter, Request, Response
2
+ from fastapi.responses import HTMLResponse
3
+ from fastapi.templating import Jinja2Templates
4
+ import pandas as pd
5
+
6
+ import main as libMain
7
+ from lib import utils as libUtils, claims as libClaims, providers as libProviders
8
+ from lib.models import mdl_utils as libMdlUtils
9
+
10
+
11
+
12
+ m_kstrFile = __file__
13
+ m_blnTraceOn = True
14
+
15
+ m_kstrPath_templ = libUtils.pth_templ
16
+ m_templRef = Jinja2Templates(directory=str(m_kstrPath_templ))
17
+
18
+
19
+ rteProv = APIRouter()
20
+
21
+
22
+
23
+ #--- get claims data
24
+ def providers_loadData(request: Request, response: Response, blnIsTrain=False, blnIsSample=False):
25
+
26
+ pdfProviders = libProviders.load_providers(blnIsTrain)
27
+
28
+ lngNumRecords = libUtils.m_klngMaxRecords
29
+ strParamTitle = "Providers"
30
+
31
+ return libMain.get_jinja2Templ(request, pdfProviders, strParamTitle, lngNumRecords, blnIsTrain, blnIsSample)
32
+
33
+
34
+
35
+ @rteProv.get('/data/train/', response_class = HTMLResponse)
36
+ def providers_loadTrainData(request: Request, response: Response, blnIsSample=False):
37
+ return providers_loadData(request, response, True, blnIsSample)
38
+
39
+
40
+
41
+ @rteProv.get('/data/train/sample', response_class = HTMLResponse)
42
+ def providers_loadTrainSample(request: Request, response: Response):
43
+ return providers_loadTrainData(request, response, True)
44
+
45
+
46
+
47
+ @rteProv.get('/data/test/', response_class = HTMLResponse)
48
+ def providers_loadTestData(request: Request, response: Response, blnIsSample=False):
49
+ return providers_loadData(request, response, False, blnIsSample)
50
+
51
+
52
+
53
+ @rteProv.get('/data/test/sample', response_class = HTMLResponse)
54
+ def providers_loadTestSample(request: Request, response: Response):
55
+ return providers_loadTestData(request, response, True)
56
+
57
+
58
+
59
+ @rteProv.get('/doFeatEng/', response_class = HTMLResponse)
60
+ def providers_featEng(request: Request, response: Response, blnIsTrain=False):
61
+ pdfClaims = libClaims.load_claims(blnIsTrain)
62
+ pdfFeatEng_claims = libClaims.do_featEng(pdfClaims, blnIsTrain)
63
+ pdfFeatEng_providers = libProviders.do_featEng(pdfFeatEng_claims)
64
+
65
+ lngNumRecords = libUtils.m_klngMaxRecords
66
+ blnIsSample = True
67
+
68
+ strParamTitle = "Feature Engineered Claims Grouped by Provider"
69
+
70
+ return libMain.get_jinja2Templ(request, pdfFeatEng_providers, strParamTitle,
71
+ lngNumRecords, blnIsTrain, True)
72
+
73
+
74
+
75
+ @rteProv.get('/doFeatEng/train', response_class = HTMLResponse)
76
+ def providers_featEngTrain(request: Request, response: Response):
77
+ return providers_featEng(request, response, True)
78
+
79
+
80
+
81
+ @rteProv.get('/doFeatEng/test', response_class = HTMLResponse)
82
+ def providers_featEngTest(request: Request, response: Response):
83
+ return providers_featEng(request, response, False)
84
+
85
+
86
+
87
+ @rteProv.get('/doStdScaling/', response_class = HTMLResponse)
88
+ def providers_stdScaling(request: Request, response: Response, blnIsTrain=False):
89
+ pdfClaims = libClaims.load_claims(blnIsTrain)
90
+ pdfFeatEng = libClaims.do_featEng(pdfClaims, blnIsTrain)
91
+ npaScaled = libMdlUtils.doProviders_stdScaler(pdfFeatEng, blnIsTrain)
92
+ pdfScaled = libMdlUtils.doProviders_stdScaler_toPdf(npaScaled)
93
+
94
+ lngNumRecords = libUtils.m_klngMaxRecords
95
+ blnIsSample = True
96
+
97
+ strParamTitle = "Std Scaled Claims Grouped by Provider"
98
+ return libMain.get_jinja2Templ(request, pdfScaled, strParamTitle,
99
+ lngNumRecords, blnIsTrain, blnIsSample)
100
+
101
+
102
+
103
+ @rteProv.get('/doStdScaling/train', response_class = HTMLResponse)
104
+ def providers_stdScalingTrain(request: Request, response: Response):
105
+ return providers_stdScaling(request, response, True)
106
+
107
+
108
+
109
+ @rteProv.get('/doStdScaling/test', response_class = HTMLResponse)
110
+ def providers_stdScalingTest(request: Request, response: Response):
111
+ return providers_stdScaling(request, response, False)
112
+
113
+
114
+
115
+ @rteProv.get('/predict/superv', response_class = HTMLResponse)
116
+ @rteProv.get('/predict/xgb', response_class = HTMLResponse)
117
+ def predict_supervised_xgb(request: Request, response: Response):
118
+
119
+ #--- load test data
120
+ #--- filter to only those rows that are flagged with an anomaly
121
+ pdfClaims = libClaims.load_claims(False)
122
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
123
+ pdfResults = libProviders.get_xgbPredict(pdfFeatEng)
124
+ pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]
125
+
126
+ lngNumRecords = libUtils.m_klngMaxRecords
127
+ blnIsSample = True
128
+ strParamTitle = "Provider Predictions (Gradient Boosting Classifier)"
129
+
130
+ return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
131
+ lngNumRecords, False, blnIsSample)
132
+
133
+
134
+
135
+ @rteProv.get('/predict/logr', response_class = HTMLResponse)
136
+ def predict_supervised_logr(request: Request, response: Response):
137
+
138
+ #--- load test data
139
+ #--- filter to only those rows that are flagged with an anomaly
140
+ pdfClaims = libClaims.load_claims(False)
141
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
142
+ pdfResults = libProviders.get_logrPredict(pdfFeatEng)
143
+ pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]
144
+
145
+ lngNumRecords = libUtils.m_klngMaxRecords
146
+ blnIsSample = True
147
+ strParamTitle = "Provider Predictions (Logistic Regression)"
148
+
149
+ return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
150
+ lngNumRecords, False, blnIsSample)
151
+
152
+
153
+
154
+ @rteProv.get('/predict/svm', response_class = HTMLResponse)
155
+ def predict_supervised_svm(request: Request, response: Response):
156
+
157
+ #--- load test data
158
+ #--- filter to only those rows that are flagged with an anomaly
159
+ pdfClaims = libClaims.load_claims(False)
160
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
161
+ pdfResults = libProviders.get_svmPredict(pdfFeatEng)
162
+ pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]
163
+
164
+ lngNumRecords = libUtils.m_klngMaxRecords
165
+ blnIsSample = True
166
+ strParamTitle = "Provider Predictions (Support Vector Machines)"
167
+
168
+ return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
169
+ lngNumRecords, False, blnIsSample)
170
+
171
+
172
+
173
+ @rteProv.get('/predict/enc', response_class = HTMLResponse)
174
+ def predict_kerasSeq(request: Request, response: Response):
175
+
176
+ #--- load test data
177
+ #--- filter to only those rows that are flagged with an anomaly
178
+ pdfClaims = libClaims.load_claims(False)
179
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
180
+ pdfResults = libProviders.get_encPredict(pdfFeatEng)
181
+ pdfResults = pdfResults[pdfResults['hasAnom?'] > 0]
182
+
183
+ lngNumRecords = libUtils.m_klngMaxRecords
184
+ blnIsSample = True
185
+ strParamTitle = "Claims Predictions (Transformer/Encoder - Keras Sequential)"
186
+
187
+ return libMain.get_jinja2Templ(request, pdfResults, strParamTitle,
188
+ lngNumRecords, False, blnIsSample)
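The provider routes render HTML tables rather than JSON, so a browser (or saving the response body to disk) is the natural way to inspect them; a minimal sketch against the same assumed local instance:

```python
# sketch: fetch one rendered provider-prediction table (assumes uvicorn on port 48300)
import requests

html = requests.get("http://localhost:48300/providers/predict/logr", timeout=300).text
with open("logr_predictions.html", "w", encoding="utf-8") as f:
    f.write(html)    # open in a browser; the Bootstrap CSS is linked from a CDN
```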
routes/qa/rte_qa.py ADDED
@@ -0,0 +1,17 @@
1
+ from fastapi import APIRouter
2
+
3
+
4
+ m_kstrFile = __file__
5
+ m_blnTraceOn = True
6
+
7
+
8
+ rteQa = APIRouter()
9
+
10
+
11
+ @rteQa.get('/')
12
+ @rteQa.get('/verif')
13
+ @rteQa.get('/valid')
14
+ def qa_entry():
15
+ return {
16
+ "message": "QA routing - for verification and validation"
17
+ }
templ/templ_results.html ADDED
@@ -0,0 +1,4 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <body>{{ dataframe | safe }}</body>
4
+ </html>
templ/templ_showDataframe.html ADDED
@@ -0,0 +1,15 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Fourthbrain Capstone: Healthcare Anomalies</title>
6
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">
7
+ </head>
8
+ <body>
9
+
10
+ <h2>{{ paramTitle }}:</h2>
11
+
12
+ <!-- Mark data as safe, otherwise it will be rendered as a string -->
13
+ {{ paramDataframe | safe }}
14
+ </body>
15
+ </html>
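The `| safe` filter is what keeps the pandas table from being autoescaped into literal markup; a standalone sketch of the same render path, assuming only jinja2 and pandas:

```python
# sketch: how DataFrame.to_html() and Jinja2's `safe` filter combine
import pandas as pd
from jinja2 import Template

pdf = pd.DataFrame({"Provider": ["PRV51001"], "InscClaimAmtReimbursed": [26000]})
templ = Template("<h2>{{ paramTitle }}:</h2>\n{{ paramDataframe | safe }}")
html = templ.render(paramTitle="Claims",
                    paramDataframe=pdf.to_html(classes="table table-striped"))
print(html)   # without `safe`, the <table> tags would render as visible text
```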
uix/__init__.py ADDED
File without changes
uix/images/image1.jpg ADDED
uix/images/image1.jpg:Zone.Identifier ADDED
@@ -0,0 +1,3 @@
1
+ [ZoneTransfer]
2
+ LastWriterPackageFamilyName=Microsoft.Windows.Photos_8wekyb3d8bbwe
3
+ ZoneId=3
uix/lit_packages.py ADDED
@@ -0,0 +1,36 @@
1
+ import importlib
2
+
3
+
4
+ #--- return a list of streamlit packages/pages to render
5
+ def packages():
6
+ #---
7
+ ary_pkg = []
8
+ ary_pkg.extend(['lit_continentData',
9
+ 'lit_countryData'
10
+ ])
11
+ '''
12
+ ary_pkg.extend(['lit_claimAnalysis',
13
+ 'lit_claimAnomalies'
14
+ ])
15
+ '''
16
+ return ary_pkg
17
+
18
+
19
+
20
+ def get_aryPkgDescr():
21
+ #--- load list of pages to display
22
+ aryDescr = []
23
+ aryPkgs = []
24
+
25
+ aryModules = packages()
26
+ for modname in aryModules:
27
+ m = importlib.import_module('.'+ modname,'uix')
28
+ aryPkgs.append(m)
29
+
30
+ #--- use the module description attribute if it exists
31
+ #--- otherwise use the module name
32
+ try:
33
+ aryDescr.append(m.description)
34
+ except AttributeError:
35
+ aryDescr.append(modname)
36
+ return [aryDescr, aryPkgs]
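The contract here is that each page module under `uix` may expose an optional module-level `description` used as its display name. `getattr` with a default is an equivalent, slightly tighter way to express the try/except fallback; a sketch (module names hypothetical):

```python
# sketch: the dynamic page-loading contract, with getattr instead of try/except
import importlib

def load_pages(aryModNames, strPkg="uix"):
    aryPages = []
    for strMod in aryModNames:
        mod = importlib.import_module("." + strMod, strPkg)
        strDescr = getattr(mod, "description", strMod)   # fall back to module name
        aryPages.append((strDescr, mod))
    return aryPages

# each (descr, mod) pair can then drive a selectbox; mod.run() renders the page
```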
uix/lit_sidebar.py ADDED
@@ -0,0 +1,99 @@
1
+ import streamlit as st
2
+ import importlib
3
+ from uix import lit_packages
4
+
5
+ from uix.pages import lit_home, lit_about
6
+ from uix.pages import lit_anom_superv, lit_anom_unsuperv
7
+
8
+
9
+
10
+ #--- alt define sidebar pages
11
+ m_aryPages = {
12
+ "Home": lit_home, #--- TODO: update
13
+ #"Provider Analysis": lit_providerAnalysis,
14
+ #"Claims Analysis": lit_claimAnalysis,
15
+ "Provider Anoms - Supervised": lit_anom_superv,
16
+ "Claim Anoms - UnSupervised": lit_anom_unsuperv,
17
+ #"MLE Model Performance": lit_about, #--- TODO: update
18
+ "About": lit_about
19
+ }
20
+
21
+
22
+ #--- define module-level vars
23
+ m_aryModNames = lit_packages.packages()
24
+ m_aryDescr = []
25
+ m_aryMods = []
26
+
27
+ def init():
28
+ #--- upper panel
29
+ with st.sidebar:
30
+ kstrUrl_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAf4AAABjCAMAAABNPpI+AAAAxlBMVEX////rAIvrAIrqAIfqAITqAIP/9/zsAI/1iMfqAIHwZqrtDZfyb7Pzbbf+8vr3p9L4rtf6zeX95vLtI5X0dr7wX6vyWrH/+/72mcv1jsXvNaH61ub5uNz4q9n6x+X1pcruSp794vPwT6v96/X92/D4tNntMJn81ez6weLzebrwVKj7zef95fPvQaLzi7z82e/3otDwRKjuKZv2lcryYbP1mMb0gMPzg7zuPZzuTqL5wt70nMbygrfvWqXsJ5L2r9DxdrH3ttMtxwOlAAAZMklEQVR4nO1de2OaPheGJIj+VNRSi1rninhFWa06q93mtu//pd6cJNyDYl/b2c3nj80KQsKTnJxbDooih9N60tFy2rAyjoc4fsYVHw31EUGqijB6dg6eN259/fX1k/FOrbrifVDDlHwAws0D3DpTFVMgc/F+TbvizTFWKfuYaDAItOfM0+Ye4aMEkWLhHZt3xZuiUMUqQuvG/YoOA6QPMk4zmhjEA2b/tt61hVe8IW50pOI6fLJHlP/i3DEoUipeHXg3m9+bMEjIYR3hio+DsabiPv84pMyao12TovKdor2iWAy7w+Fk8IRVvBwXFKNG+SftP9vmK86GGlHJC/9owcxGIOGjQAC2MMz5D6im8OMPNviKc6KnBfQroAQiAVDy1ACgHQq10NkitL1K/78EDl36p/zjDIje7/ch6SgQBlTi/xS/+HGl/+/BEFS/BnwyXLq+N+nyblPMuhSfWhSrLxTfdYTFgk9nP27+yRZfcTZYKx0EvrkwlJnLhP3uRnrihiCT+4S6OkKj4Xs28oo3wrhMEHh7ESaEYPikYrUmc+w/mgh3bEspdNl4IZ3x1f3/MTGf1IaP8MFq6dSax/pUxUzXQ/tbHcbAVLKyW0VCLb/p1ypTD9mvruv/B8Sguqfz13Rnit3HYOi5A2Xs7RmrW6exBFEwmkl+eIeZ1w9swD38i0e1q/P3o6GmY7DiEdamJpv6CybE7UltjZE5VoxnWABwK81sC4u4EG7N2yA2kOa+pM664pLRBf8NuHIo8+C99cJ5XidI78L/sAAgNynaLZ39kqA7SAsYPxG2Atxew78fCLYJ3rvmL1dnbh2zGDnmlFT8jX2o0NGB9WFct2vTQaHfPM79v7s7pjXqi6sK8GGwIJTDGTXthyAF9r3YQboiTBnlhRbV/5E6jfI/34buIQ5npTMB4k7evt1XnAVPCOlj9mlIrXczHt19wsgTstz2wN3nNcKDX+mAMB/jV7PXKqgPeM1cBc7N+P7mqgxeMkx/givODvGlPsSUTmbf5+OsmAugzk6e92Yzumrgdep63b4GKoC5Muz2kmiaXr2KggsGEss7RYVO3Frs4AtRtfvwL5OlgTjKYL01TRNYHqcvaPAVAD+MWMYY1QXaf5c7CHpjXWSXjNWu2T5N2oaz33Dp7I9P1YaOSCSPxwFpgE3uEwL219KnMK8wI9CPDiKSnS/2AbEpAdD8+JnvDWekUTtsJHfRZwBse96VHl371U+xg4MlwnfRL2o69wNz+gPFIAGr19HYzCelEhgDROYz+qgogrMDlU96yO+DhQazjRSPnxliTOks25ZizZjlx7z4AYwmRp3Y6Y7LJrT548cWJIGXJQRbYE+OFgWlPqKnHQoHFo7isgTtKfRLOvOGLeszL1yCsCOwIK/TnH6b6oHnPqL9g8MvPsMhwRP3Z5ZiT+EH9YzLgh7JzYiBidDoMeM0iq+VY8i6x5/BKfTfpjvz9VutK9GYzoHX0K8UPIyE5x6Z4P0Frd0/2KJfxyW342JUZsetCiQDZPj46EpCRPb/giD9gCv4P4IPQ9uc1J+3xin0l2X9Qbq5XU/eQKRVGf1+mmZeFNjEh5W6PVgxpY0su6J1Q7rGxyefXQ60wTGVG2VbftEXVdXEIYfqAYfox+ph4A9MP5J1CBwoRF9lPLnX44XvzOoePzMG6x7415Zg0o/7JRgAqssb90gvt4qdPNdRYAuC0/dekeJFRZrvMNKQ3pOfBfj36BdxstHZF7U2oXiFmUWbSfzBOHGZ2Y5v6ReFAR2q+tMwkPA3jTU9VYyvG5y9CaQXpoK1MXo4EAUQ9KNMkD9Df6FfLusSEX06/UGitMiX9QdA6fbcK4Dx6dNr4i20WXrwO+cb+G1UMqrZFaYPYuzy9X/c8hDIBo+f2KL0L+UizGmCecA8xA0Tkan0JA5Ov17OhH6SJXM2OPShlM5BPzL7Ah3am5JGcOARuZC9USXan8iwsasq89uYfkOxObOcuqdiLMQxFQdWHTJCn6UD2KbqJIiG4v1NEbwJh8wdRj9uOtaFGX4toqpnoR/fKZYA9GbccvfiuSKUtYHuXVGg9C9jKnzXJdwORISOVtqFXUUnPNOfRfW9r5+rLMtHuqhPYNxUQMPRNDoO9IPKiKD/4rIEXHw++hOw21zZlsVM/gDYrq748y8UhRdoaNc9tomTO/BxR/zJBIFcK+uaGOt1q8h+Q+3Iw2buhdI/h5jGG9HPvOls+ptnV/9fgRml/yn5/AdhRK8qZD7ynimV812gvyBPItZXVOsbAeWP7T1CR/XbC6W/Dv17M/qVHt8whWqSY++NCVFxNUlkI/TW3YxAFcR3Ex4cMHZgGhANFoVV4leK1cKIbMWEb+Lj2z8ulP5naFZJcuA89IPPDI59kxx7b1AtB0+TA31IKf4lPn/BKm6FumETI726GkOyl5pY140pZb/qn3qL0dGtn5dJvwMjXtUKvkLqOP7zOQ/99PEy+isXkA0jod9YeFT2V8RflH4SHnMe+JY+y4NwQaz9Dh0ZEUFSxGh7LDJ6Iv32rF6vfavXJ+dVmgs9etUQn78IdqbMUT+t9Dt+nlOC/jltT73e7UkX8QP090x0kH5jUq99pt1MPL5BbwL3m8xeMV1s2sfPtfpwlrgn0P8l+kWhuOUBXSHtXYxQeHRODz7Ah0c9ke1n7yj7m3AgtQgaHaPpFPqdlTsyoe4MUvX9wyY1sh7dB4BsQe2xI26gho7hz63wWdL+mmp8G7uweMO/fb81p19n9BcWrD2qqpuj3Sbd0wP0D0YJ+ofQID+COqhudW5mh04Py1hUH0amzu+3dVOZ90af9TGstNDd0j//46K4UHNHOtfgze06JrNvcbhZl8JugZEHNxc7uFc41gfQCh749cGPFdZ1ui/TVSHqyugSZB6L9eenvzElBPt+M7bRuJ8wKhomeNbSCglrKkS2wtY0IGelxOgvrIRRewDCh6VEZ79Dn5NoD9v3rFWTfT1OfygpazjwMzkb5PeT+AkYdrePSOA35GGDRNk1owzHyW3wRV3zu2zUypp/SdbUTkQjp/Sj3bTNM33mzyO2w68/JWDZ2dRKhSzvSNBmbAYbequEPpjeavqr2OMPX40p+jMtmT+URm76W2YqOoD1uIe7wSSq1JnW5SSG9IPpRaC1TgUf4R5+GUgxTr/nsOTXRHPMhNw5LvxDoVtjyg
asIeA08y+pcceKU9ziVCtRYvgb/HYR+gnIqQkPvyeaioIUPMfllry2nBQGU8JmSXWsWPA15m4fFFUNJtSyq/KPVoeOHBaxJZ02oQZf3A1klVT12M7fnPTPXRLam+En0onOgFPph7DE3CNHyadnun4DBf3GRMV+a8Lm4HiNmwP0d7nlH44XRj+hXM2iw7zEb7sIOg9zN7ghjoVcpPSr5DMdapKmkiqn1FpjLr3o1Sqw4YvoUxhV1lR8jxLuCaDf9/c0RkhcFdEVopP08Ggq/qkcRj76HZeIB0wNznIZBZ5z3I/wfzL9a8W6I/y6sKM5hHheRBMoBQYsox9XByMxZ7Ryuaxpvhs37uE8QD8zLJEa5sFw+heKw7gKCurwgwbvGJXaernT75RLfu+jjhc5/fj5Zon5b0u0qThoKuardhuEvLndqnxjN9oX+QM1YA/X3tT1Ef0yupz+pqqC//cjHNxvR8xBvEypsWWEP8s6H0Eu+gsV4XlS3bYDhtjNp74Y0jiS3XIy/XewVQkuu78rticRLNjFSXdmCESsGbhQ02PbId3iHFpjzKp+a5bx7mfRb/NDEacvo59KD14uT91WixT9J3F0Q7VzbO6KY26H3rT3gsTIciOnH3lP0FTVaw/YTxsbnoKt8hxPB1imsl7pMm/uaOFP9K5GfzWfTXoGLI6RUf2bhBoJHGrNFaNO+ZcU91uitEMhgVz0t0VUOLp/aCzW7IjtcSr9qDxm+xR39eTt71naZKbbh60/qBlZ6mx/gEaX/0z6oWwKTP6ItOT0Vxcs1pLaHGEj4q2isvWGl96ICj85/aKpkQs6U74GML5qiNVvgStC3DdUXu8Cfd+h9pwZPqCv2NdIFJuKC/6woQRcOdXNPk67ExPIQ/+cE03i17JaPAgVJpOcTL8JrgvSTt9c0J/l9GWtiYfrDZFrVYl8m0W/3eTCJaopMPrREiop6vXUMytsuolIPnPF0l+Ey3IG/UzKxHpoPfOmgj7zhT6AcdAzLdDw5/tQ34dCHk0r8mNN/GJF0E7YlbT36clyi9PBhATy0C+ebDX5/YJw8e837VT6VVafThaVOE5/qjVzjxMYSWuV0G859ktVY3OfxKLlNWHRqcdiZEGPeF53aHdn0p9K/yvsuPC5B/oDl06DRK5WQxFrp6Yh1T8C7mr/uVRx4Bn8RgdFyg/TxolIcho56B9zViUOxA6fRb5kO5l+eN7SetRH6UejlJ9vw49EtkHydI9Oy0f7+5df7pK5cxDWV7Gr13yHhplzV1xBT6gPWfT7mzgjaLEjUL4PZr949kMSzn5QtsJVlVoBCIv+FvpU9RMtp7ObLxxMJqBl8T5O4096m4VzcPUX9B9aIngZCSxx5onNCa748xX0R55WFEfpl/iWbD4dI80sBz4hFFbE41bSvpoYPj79sm7KcceGUahrZtEf9+hycLFJ7Teqa/h3pCpisJTY4NAN9T3Hw9gTYv4JobJ4LqvArceWImqWeV8mwSJldUEeErVyKKrN6Efbb59T8Ce71eFLvISLAltEkS8uT6dfeG9TOEa/NFEHCeU9QEaqJzJX3dQjEfSjZfJAJtr56Jd63jpIrF+2iZAJypMFvt3Au0ElAVIj51MlT3SM3iWg38HMAaYwH5FI79C3VZ7FXpgi/qjI6MBqxnP9knVjoWiIL4gcPlSl5vOCkyicC6fTL3UQKznol22mKOekX1Xd1Sx5aUE/yZ+n/ZvZdXrwdxb9I8lv+1jQT2ewirTbn5/hG9z0B/USJeRijVJUF3fBfdF4G07T2y+tJWV/29SJcDCo1d8D447Qg8yDgrO2AyjZid7h3hCu4GnS4MGA+06FEnW66qc3JOcqx+mX5jk9sWca0bMy6aePqLyKb30Sql8nf6ruC58Wwd8Z9Eszypgyzcaw4fo0gdopNJo5+ILic5aa2cz5Z8AuL/5cIMLHUj9YbvjQGNefCOEyAI9c5pWuN6ZUbUwr7QGO03/L3WDyyDFzf/ipaqfTv8tQOY/RT2QKI1uMo1KK049LUWgiTISwGYvZcfozcmelyEu/LFO6HdDPhDQr7QV79nlipkE7ifrxgWh5XM8zSr6A60KET8Qi0J5LrcKiueWrAPJ9UjNYXjLF/3H6uYtkKV+kuf4j8pRPph9npaAfo1+TtUZOf3zVciaLyoMuFkU3MqTF7D+q+FmFwo3jzG17/hnnol+6nETopwt9hRK5nDYaEMHRu4/PUOgJuYn51uBbNxqaCAR3YQTX562ODvM7jL3Zw6muRZ8EeFAzU/7E2k+S0IIg4wPj1JNLRS4aRA2Sk+knGUv/ceH/WvoBTnfNQzjEDYUPd/scDpDak8W350rV87ylrpeF9hIczaBfk61vMfoVS7yv4wa8YKopgov7xLipUwnRAKWQ/IainqqI8BUM0AA7ETFqGb1b6J248Vw7kNTGNX+v3k1B8G0kEyNi2EQn4+n0f5KcCjhCv9xgyEs/fUA1NSnrOf1m5nY4yy5u9zoK3qfgR++O0y+bN3H6w0tUSKQuR9IlMiVo1KY2A/41m9MlHXu+fAAHcUK7o18FmwGP0n/I7rcP0s8dGO9OvzTXLz/9LJWSPeFgUTxMf6HWKWEkS0o5Tr9Mvcmgn8fWkB/m1+Mjx1kinvCPzS1CJFQOqA2S1DCmONCPxuTAmnbU6zc/SH83KuA+EP3KOuHIPkj/zI1kF/iWMX4D+qFZSF0NlEEFpY2GKcsAwZiF+CPHxpAVHj/1Bfsve4FUwX1mgYej9B+e/fUPOvuFNyOMkxyif4FEjBbTeef++jUtFouL+iaf6ncS/Ybm+x0h3QNtY7+FHf24+nvRh4o91cizKfxH2xf3g9149ByvaymzprQCnI+j9DsH6S/mXPvF070c+rnjTcU+3wfoF2/VRKQ8/d0YBE8qp+F3Ev2w42fNn7S9RfAml6d+v7rZtIb1XoFOY2YXWi0Uj2yybNCvsQvZOxa53u+p2MIZrlXA8ZAP0/yzzsir+rUujn4RdfVtj2z6ea/gDRvxGfAW9NOfBAk6P7gPj680zBYLXJp31FCMzvZHnJifjsfcSDyJLFufzUP/7rjhp39A+lecfl9lyqTfuuMn9pNRht9vQH8v1NKtH4jX64+YGf61XkjCX1pGsb3+N1ABarnkQ6dzaKvHcfrX2Q9cUXiQ/bjbZ4Uvjf6vnFU/RJBJv8MdI7okmn5++i0SlOubmQi7lWbTG41MXVeZLeBrKramohj9kCoSJvXO6dzHo4EyWX1/Xv2/O3xFcFpeRoY7fe+44Mmm3/p1cfS3c9Lf5t1Pm05vQT/EgrhAKrhILdmWYcwH9rhB8dIM03ygXFeM1nF0KwikjuNlvqSV4/Q/llj/pWkZM2Y++xHtWSb9jntx9N/mFP5c+OmpoJlVfQv62c6t/vAedlUkXOI/6ZEdm2cOCIbYYuxQKezvA7R3UPItZ93L4/QXRNKM7BhfQP3NBFBjPh5y9cEjg+FenZz0S46ci35L+Gx9b3gm/Vk+byuv0/ck+pU6lOskGuzzStwTHDCoObaUCfCbmGP0giK44MDRDE0tjeP0sw0Ha
sqtCBAZs35SmsONhF/pScuDxqfOftmTOxf9E3YHlQT5dIfp76ee55hf4Nz0w7tb2FY5lMrSHIK9b263ICCSSQTwMr+O+7C7nYDW5+YuW5Ej1+8lMzrX5QPD34NREEZCmpslehX9EovlXPS7QqPz/z559q/fxOtHMW81R6ZXkQQKV6qw5dA2aYaAd4blsxFgP3/SQg76xRxP1wbl2Y4qCbQMUTQhdWKNz5QT6OdRfUlA8Ez0F7klFZbhy6SfD2kzecOumjfkcyr99Er2YC7zslkvusayFeNOH8CEjQwc3RieC3kSvYdM+cOj5KASnpMwv2ZFEl9wcL/xKfSLXTiS2iTnob8oiruEZTEz6ecaHklEgh3RpTeh/wB6xU0RFveEXg8aI9Ldyk5HMhs1G7l2+TxxnuM5g05FLOihIHL4LNeKscH5WPbjJblVP2EpSDaonoF+Z7IUWzYjClQm/UPWJ1SOTUdb97Nk3pl+ADW3kp26hUDxxFCcLp3/p9SUz0X/2Pd7RvKjJh5/ArG9Mvw7pEZeKHqzYq8nPM3rJwQL2qboOJ3+/o0dYjJcfPf8/bo48kKEbLeP2JJ1F97SWUG9/LcT/kcAdf5RvKFU8RNr81BNRf8OId8O36HIy0bmunvjOE53MxIh0Likb6j+edW64TjG/eKOFVHEUxhAJ9A/E2aVensPqTC9jZ/beTL9EDcJISps8KYvI6tZts+fj0R4yZ7Ndpt216zyuv6JaQV/gH4o6xFXxAehzx9Kvu/zS/+c+/vr/q4capNqWAvKfCS0TLF9zU9Z0MRGcLy2RqfR7yvWbE91uUQ0P7n5FfTHaxQL7lWkraNPKZv+QHNhTSmLZFq9zhXdP0G/0sHxl771IhG/7/iUcoV5q3sMdUkRDkSSO0itDkmdhnCnwDLbT6Hf9hUGXr4gEDKvoF8GhNX4W5cOBHwb0eIOYtRDYYjiH6Mf6I4mb9s4KA5WoCvDPn/hqdzFXZymn/ESkr9NZ5Bam+RpWN0UeIdPoV9pjKLXOSv9COPtVLbJKyPdoztKdmkEhsCE/Cn6DSjaHH0EsPZzcdAwEfbyX4m9zYPkKe1U6D1pYZ4jiMK2bI2xJh0iiqezzCjNe4Fx+VKi9wnpn7GqXaUD9Ct2n4R3056E8N+A5StPYOizrkS3eaTf5QFx89Ly7ndKPC7Y4Yw3H9jrSFPossaLmlnQJS04ydDZy08i9JdYH6WpnqwX+TeUJTFM1HSD8lDlG0uxBmV0wjZFRWlVAat85Q3tors1TV01zdGumr0danzrn7Z1NyLNzFnT29wGT91mt10frjvWqO7EZXZFf6kbsh/eyp6p6Er4xW01hdaqlayrx9Hj182qgzfY7KComw5dCl62s4GfBKcU+O3CET1bsz7KZhbvxetL5Rseiu1xBv0Em9PPUx2cfm9Yg93ozbrD3uxYucB5bzahp/2/r9uzxj24zEW8lHrQ6w27vcZl1D+tkVh6lwOpa/yVUPjQjs4r/hIso68AgaQEJCJFyd1BV/yNqOOIw80G1a+2pmZp9eUypNMVb4s5Xf19A89qYmmWxRV/L1ZU2YcMLNuBiLz/KuAr/hEUKOfeJ3e033tU7ZfkIl7xV6NFVFFlGmoDXtablq94c/DKgohtB4/s77/i3wCUclSR+V/lgQ4AWV7cFX8zGiD0+2NLuXnGYXG9K/4RMD8/X/KfcbQ4+RX/Ajoo2HtjY1le3BV/M5Yo3HmHMorjXvHX4gkH1v4jRodf1XvFXwcI84iEkRUV/lev378FVvYZXtlpLKj1fxFvo77i/cDSanHny1eXBKleV/w7gGI//GXuWL8qfv8erIrKygGT0VXv+xdhjasdVX9aXNW+D4n/AeKsA1opSbg6AAAAAElFTkSuQmCC"
31
+ st.sidebar.image(kstrUrl_image, width=200)
32
+ st.sidebar.markdown('Visualize Provider and Claims anomalies.')
33
+
34
+
35
+ #--- init checkboxes
36
+ strKey = st.sidebar.radio("Go to", list(m_aryPages.keys()))
37
+ pagSel = m_aryPages[strKey]
38
+ writePage(pagSel)
39
+
40
+
41
+
42
+ def init_selectBox():
43
+ #--- init module array of page names, and descr
44
+ init_modDescrAry()
45
+
46
+ # Display the sidebar with a menu of apps
47
+ kstrMsg = """
48
+ __Claims Anomaly Views__
49
+ """
50
+ with st.sidebar:
51
+ st.markdown('---')
52
+ st.markdown(kstrMsg)
53
+ page = st.selectbox('Select:', m_aryModNames, format_func=fmt_modName)
54
+
55
+ #--- display sidebar footer
56
+ with st.sidebar:
57
+ st.markdown('---')
58
+ st.write('Developed by Chavarria, McKone, Sharma')
59
+ st.write('Contact at iain.mckone@gmail.com')
60
+
61
+ # Run the chosen app
62
+ m_aryMods[m_aryModNames.index(page)].run()
63
+
64
+
65
+
66
+ def init_modDescrAry():
67
+ #--- init global array of page names, and descr
68
+ #--- note: you need to specify global scope for fxns to access module-level variables
69
+ global m_aryMods
70
+ global m_aryDescr
71
+
72
+ m_aryMods = []
73
+ m_aryDescr = []
74
+ for modName in m_aryModNames:
75
+ modTemp = importlib.import_module('.'+modName,'uix')
76
+ m_aryMods.append(modTemp)
77
+
78
+ #--- If the module has a description attribute use that in the
79
+ #--- select box otherwise use the module name
80
+ try:
81
+ m_aryDescr.append(modTemp.description)
82
+ except AttributeError:
83
+ m_aryDescr.append(modName)
84
+
85
+
86
+
87
+ #--- display the app descriptions instead of the module names in the selectbox
88
+ def fmt_modName(strName):
89
+ global m_aryModNames
90
+ global m_aryDescr
91
+ return m_aryDescr[m_aryModNames.index(strName)]
92
+
93
+
94
+
95
+ def writePage(uixFile):
96
+ #--- writes out the page for the selected combo
97
+
98
+ # _reload_module(page)
99
+ uixFile.run()
uix/pages/__init__.py ADDED
File without changes
uix/pages/lit_about.py ADDED
@@ -0,0 +1,24 @@
1
+ #--- about page
2
+ import streamlit as st
3
+
4
+ description = "About"
5
+ def run():
6
+
7
+ print("\nINFO (lit_about.run) loading ", description, " page ...")
8
+
9
+ #---
10
+ #st.experimental_memo.clear() #--- try to clear cache each time this page is hit
11
+ #st.cache_data.clear()
12
+
13
+ st.markdown('### About')
14
+ st.markdown('### MLE10 Capstone: Healthcare Anomaly Detection')
15
+ st.markdown('#### Team: McKone, Sharma, Chavarria, Lederer')
16
+
17
+ st.markdown('Kaggle Claims Data:')
18
+ st.markdown('https://www.kaggle.com/code/rohitrox/medical-provider-fraud-detection/data')
19
+ st.markdown(
20
+ """
21
+ About page
22
+ """,
23
+ unsafe_allow_html=True,
24
+ )
uix/pages/lit_anom_superv.py ADDED
@@ -0,0 +1,368 @@
1
+ #--- anomaly detection - supervised page
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+
7
+ import lib.claims as libClaims
8
+ import lib.providers as libProviders
9
+ import lib.utils as libUtils
10
+
11
+ import sys
12
+
13
+ description = "Anomaly Detection - Supervised"
14
+ m_kblnTraceOn = True #--- enable/disable module level tracing
15
+
16
+ def run():
17
+ #--- note: in python, you need to specify global scope for fxns to access module-level variables
18
+ global m_kblnTraceOn
19
+ print("\nINFO (litAnomSuperv.run) loading ", description, " page ...")
20
+
21
+
22
+ #--- page settings
23
+ if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run): Initialize Page Settings ...")
24
+ st.header("Provider Anomalies - Supervised Approach (XG Boost)")
25
+
26
+ #--- provide file drag/drop capability
27
+ m_blnDisableDragDrop = False
28
+ if(not m_blnDisableDragDrop):
29
+ #btnSave = st.button("Save")
30
+ pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])
31
+ m_blnDisableDragDrop = (pklDropped is None)
32
+
33
+
34
+ #if (True):
35
+ try:
36
+
37
+ #--- show: raw claims data analysis
38
+ if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run): load raw claims data ...")
39
+ if (m_blnDisableDragDrop):
40
+ pdfClaims = libClaims.load_claims(False)
41
+ else:
42
+ pdfClaims = pd.read_pickle(pklDropped)
43
+
44
+ #--- get supervised predictions
45
+ if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run): doFeatEng (claims) ...")
46
+ pdfFeatEng = libClaims.do_featEng(pdfClaims)
47
+
48
+ if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run): perform xgb prediction ...")
49
+ pdfPred = libProviders.get_xgbPredict(pdfFeatEng)
50
+
51
+ if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run): get sample ...")
52
+ lngSampleSize = min(50, len(pdfPred.index))
53
+ pdfSample = pdfPred.sample(lngSampleSize)
54
+
55
+ #--- save a test file
56
+ #if (btnSave):
57
+ #btnSave_testFile(pdfClaims, pdfPred)
58
+
59
+ except TypeError as e:
60
+ print("ERROR (litAnomSuperv.run_typeError1): ", e)
61
+
62
+ except:
63
+ e = sys.exc_info()
64
+ print("ERROR (litAnomSuperv.run_genError1): ", e)
65
+
66
+
67
+ try:
68
+ #--- save this file locally as a pkl
69
+ #btnSave_testFile(pdfClaims, pdfPred)
70
+
71
+
72
+ #--- table sorted $insClaims reimbursed by provider
73
+ #--- display providers with predictions, sorted by InscClaimAmt Reimbursed
74
+ pdfTopClaims = pdfSample.sort_values(by=["InscClaimAmtReimbursed"], ascending=False)
75
+ if (m_kblnTraceOn): print("TRACE (litAnomSuperv.run): Show $claims reimbursed by provider ...")
76
+ st.markdown("(Top) Ins Claims Reimbursed by Provider")
77
+ st.dataframe(pdfTopClaims)
78
+
79
+
80
+ #--- chart Top Insurance claims ($) by Provider
81
+ chart_topInsClaimsByProvider(pdfSample)
82
+
83
+
84
+ #--- chart Top deductible amts ($) by Provider
85
+ chart_topDeductiblePaidByProvider(pdfSample)
86
+
87
+
88
+ #--- chart Top IP Annual Reimbursement amts ($) by Provider
89
+ chart_topIPAnnualReimbAmtByProvider(pdfSample)
90
+
91
+
92
+ #--- chart Top IP Annual Deductible amts ($) by Provider
93
+ chart_topIPAnnualDeductAmtByProvider(pdfSample)
94
+
95
+
96
+ #--- chart Top OP Annual Reimbursement amts ($) by Provider
97
+ chart_topOPAnnualReimbAmtByProvider(pdfSample)
98
+
99
+
100
+ #--- chart Top OP Annual Deductible amts ($) by Provider
101
+ chart_topOPAnnualDeductAmtByProvider(pdfSample)
102
+
103
+
104
+ except TypeError as e:
105
+ print("ERROR (litAnomSuperv.run_typeError2): ", e)
106
+
107
+ except:
108
+ e = sys.exc_info()
109
+ print("ERROR (litAnomSuperv.run_genError2): ", e)
110
+
111
+
112
+
113
+ def chart_topOPAnnualReimbAmtByProvider(pdfSample):
114
+ pdfBar = pdfSample.sort_values(by=["OPAnnualReimbursementAmt"], ascending=False)
115
+ pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]
116
+
117
+ #--- chart
118
+ fig = go.Figure(
119
+ layout=dict(
120
+ title="(Sample Anomalies) Top OP Reimb Paid ($) by Provider",
121
+ legend=dict(groupclick="toggleitem"),
122
+ )
123
+ )
124
+
125
+ fig.add_trace(
126
+ go.Bar(
127
+ x=pdfBar.Provider,
128
+ y=pdfBar.OPAnnualReimbursementAmt,
129
+ name="OP Reimb Paid",
130
+ marker_color="LightBlue",
131
+ )
132
+ )
133
+
134
+
135
+ fig.add_trace(
136
+ go.Scatter(
137
+ x=pdfAnoms.Provider,
138
+ y=pdfAnoms.OPAnnualReimbursementAmt,
139
+ mode="markers",
140
+ marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
141
+ name="Anomalies"
142
+ ))
143
+
144
+ st.plotly_chart(fig, use_container_width=True)
145
+
146
+
147
+
148
+ def chart_topOPAnnualDeductAmtByProvider(pdfSample):
149
+ pdfBar = pdfSample.sort_values(by=["OPAnnualDeductibleAmt"], ascending=False)
150
+ pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]
151
+
152
+ #--- chart
153
+ fig = go.Figure(
154
+ layout=dict(
155
+ title="(Sample Anomalies) Top OP Deduct Amt ($) by Provider",
156
+ legend=dict(groupclick="toggleitem"),
157
+ )
158
+ )
159
+
160
+ fig.add_trace(
161
+ go.Bar(
162
+ x=pdfBar.Provider,
163
+ y=pdfBar.OPAnnualDeductibleAmt,
164
+ name="OP Deductible Paid",
165
+ marker_color="LightBlue",
166
+ )
167
+ )
168
+
169
+
170
+ fig.add_trace(
171
+ go.Scatter(
172
+ x=pdfAnoms.Provider,
173
+ y=pdfAnoms.OPAnnualDeductibleAmt,
174
+ mode="markers",
175
+ marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
176
+ name="Anomalies"
177
+ ))
178
+
179
+ st.plotly_chart(fig, use_container_width=True)
180
+
181
+
182
+
183
+ def chart_topIPAnnualReimbAmtByProvider(pdfSample):
184
+ pdfBar = pdfSample.sort_values(by=["IPAnnualReimbursementAmt"], ascending=False)
185
+ pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]
186
+
187
+ #--- chart
188
+ fig = go.Figure(
189
+ layout=dict(
190
+ title="(Sample Anomalies) Top IP Reimb Paid ($) by Provider",
191
+ legend=dict(groupclick="toggleitem"),
192
+ )
193
+ )
194
+
195
+ fig.add_trace(
196
+ go.Bar(
197
+ x=pdfBar.Provider,
198
+ y=pdfBar.IPAnnualReimbursementAmt,
199
+ name="IP Reimb Paid",
200
+ marker_color="LightBlue",
201
+ )
202
+ )
203
+
204
+
205
+ fig.add_trace(
206
+ go.Scatter(
207
+ x=pdfAnoms.Provider,
208
+ y=pdfAnoms.IPAnnualReimbursementAmt,
209
+ mode="markers",
210
+ marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
211
+ name="Anomalies"
212
+ ))
213
+
214
+ st.plotly_chart(fig, use_container_width=True)
215
+
216
+
217
+
218
+ def chart_topIPAnnualDeductAmtByProvider(pdfSample):
219
+ pdfBar = pdfSample.sort_values(by=["IPAnnualDeductibleAmt"], ascending=False)
220
+ pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]
221
+
222
+ #--- chart
223
+ fig = go.Figure(
224
+ layout=dict(
225
+ title="(Sample Anomalies) Top IP Deduct Amt ($) by Provider",
226
+ legend=dict(groupclick="toggleitem"),
227
+ )
228
+ )
229
+
230
+ fig.add_trace(
231
+ go.Bar(
232
+ x=pdfBar.Provider,
233
+ y=pdfBar.IPAnnualDeductibleAmt,
234
+ name="IP Deductible Paid",
235
+ marker_color="LightBlue",
236
+ )
237
+ )
238
+
239
+
240
+ fig.add_trace(
241
+ go.Scatter(
242
+ x=pdfAnoms.Provider,
243
+ y=pdfAnoms.IPAnnualDeductibleAmt,
244
+ mode="markers",
245
+ marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
246
+ name="Anomalies"
247
+ ))
248
+
249
+ st.plotly_chart(fig, use_container_width=True)
250
+
251
+
252
+ def chart_topDeductiblePaidByProvider(pdfSample):
253
+ pdfBar = pdfSample.sort_values(by=["DeductibleAmtPaid"], ascending=False)
254
+ pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]
255
+
256
+ #--- chart
257
+ fig = go.Figure(
258
+ layout=dict(
259
+ title="(Sample Anomalies) Top Deductibles Paid ($) by Provider",
260
+ legend=dict(groupclick="toggleitem"),
261
+ )
262
+ )
263
+
264
+ fig.add_trace(
265
+ go.Bar(
266
+ x=pdfBar.Provider,
267
+ y=pdfBar.DeductibleAmtPaid,
268
+ name="Deductibles Paid",
269
+ marker_color="LightBlue",
270
+ #offsetgroup="anoms",
271
+ #legendgroup="anoms",
272
+ #legendgrouptitle_text="Anoms",
273
+ )
274
+ )
275
+
276
+
277
+ fig.add_trace(
278
+ go.Scatter(
279
+ x=pdfAnoms.Provider,
280
+ y=pdfAnoms.DeductibleAmtPaid,
281
+ mode="markers",
282
+ marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
283
+ #offsetgroup="anoms",
284
+ #legendgroup="anoms",
285
+ name="Anomalies"
286
+ ))
287
+
288
+ st.plotly_chart(fig, use_container_width=True)
289
+
290
+
291
+ def chart_topInsClaimsByProvider(pdfSample):
292
+ pdfTopClaims = pdfSample.sort_values(by=["InscClaimAmtReimbursed"], ascending=False)
293
+ pdfAnoms = pdfTopClaims[pdfTopClaims['hasAnom?'] > 0]
294
+
295
+ #--- chart
296
+ #st.markdown("(Sample Anomalies) Top Insurance claims ($) by Provider")
297
+ fig = go.Figure(
298
+ layout=dict(
299
+ #xaxis=dict(categoryorder="category descending"),
300
+ #yaxis=dict(range=[0, 7]),
301
+ #scattermode="group",
302
+ title="(Sample Anomalies) Top Insurance claims ($) by Provider",
303
+ legend=dict(groupclick="toggleitem"),
304
+ )
305
+ )
306
+
307
+ fig.add_trace(
308
+ go.Bar(
309
+ x=pdfTopClaims.Provider,
310
+ y=pdfTopClaims.InscClaimAmtReimbursed,
311
+ name="Ins Claims Reimbursed",
312
+ marker_color="LightBlue",
313
+ #offsetgroup="anoms",
314
+ #legendgroup="anoms",
315
+ #legendgrouptitle_text="Anoms",
316
+ )
317
+ )
318
+
319
+
320
+ fig.add_trace(
321
+ go.Scatter(
322
+ x=pdfAnoms.Provider,
323
+ y=pdfAnoms.InscClaimAmtReimbursed,
324
+ mode="markers",
325
+ marker = dict(size = 15, color = 'IndianRed', symbol = 'x'),
326
+ #offsetgroup="anoms",
327
+ #legendgroup="anoms",
328
+ name="Anomalies"
329
+ ))
330
+
331
+ st.plotly_chart(fig, use_container_width=True)
332
+
333
+
334
+
335
+ def btnSave_testFile(pdfClaims, pdfPred):
336
+ #--- get all providers for all anoms
337
+ #print("TRACE (lit_anom_superv.btnSave_testFile) query anoms ... ", pdfPred.head(10))
338
+ pdfAnomProv = pdfPred[pdfPred['hasAnom?'] > 0]
339
+ #pdfAnomProv = pdfAnomProv['Provider']
340
+
341
+ #--- filter claims by anomProviders
342
+ print("TRACE (lit_anom_superv.btnSave_testFile) filter claims ... ")
343
+ pdfClaimAnom = pdfClaims[pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
344
+ pdfClaimNoAnom = pdfClaims[~pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
345
+ lngNumAnoms = len(pdfClaimAnom.index)
346
+ lngNumOk = len(pdfClaimNoAnom.index)
347
+ print("TRACE (lit_anom_superv.btnSave_testFile) #anoms: ", lngNumAnoms, ", !anoms: ", lngNumOk)
348
+
349
+ #--- get a sample for remaining records
350
+ print("TRACE (lit_anom_superv.btnSave_testFile) sampling claims ... ")
351
+ pdfSave = pd.concat([pdfClaimAnom.sample(frac=0.6), pdfClaimNoAnom.sample(frac=0.1)])
352
+
353
+ print("TRACE (lit_anom_superv.btnSave_testFile) saving ... ")
354
+ saveProviderTestData(pdfSave)
355
+
356
+
357
+ def saveProviderTestData(pdfTestData):
358
+
359
+ #--- save the file
360
+ from datetime import date
361
+ import time
362
+ import pickle
363
+ strDteNow = date.today().strftime('%Y%m%d')
364
+ strTimeNow = time.strftime('%H%M%S')
365
+ strProvTestFile = libUtils.pth_data + strDteNow + strTimeNow + "_provTestSample.pkl"
366
+ #pd.to_pickle(pdfClaims.sample(200), strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
367
+ pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
368
+
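The six chart_top* functions above differ only in the column they sort and plot and in the title; a hedged refactoring sketch of one parameterized helper that would cover all of them (names are suggestions, not part of the repo):

```python
# sketch: one generic version of the repeated bar + anomaly-marker pattern
import plotly.graph_objects as go
import streamlit as st

def chart_topAmtByProvider(pdfSample, strYFeature, strTitle):
    pdfBar = pdfSample.sort_values(by=[strYFeature], ascending=False)
    pdfAnoms = pdfBar[pdfBar['hasAnom?'] > 0]    # anomalies overlay the bars

    fig = go.Figure(layout=dict(title=strTitle, legend=dict(groupclick="toggleitem")))
    fig.add_trace(go.Bar(x=pdfBar.Provider, y=pdfBar[strYFeature],
                         name=strYFeature, marker_color="LightBlue"))
    fig.add_trace(go.Scatter(x=pdfAnoms.Provider, y=pdfAnoms[strYFeature],
                             mode="markers", name="Anomalies",
                             marker=dict(size=15, color="IndianRed", symbol="x")))
    st.plotly_chart(fig, use_container_width=True)

# e.g. chart_topAmtByProvider(pdfSample, "DeductibleAmtPaid",
#                             "(Sample Anomalies) Top Deductibles Paid ($) by Provider")
```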
uix/pages/lit_anom_unsuperv.py ADDED
@@ -0,0 +1,280 @@
1
+ #--- anomaly detection - unsupervised page
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+
8
+ import lib.claims as libClaims
9
+ import lib.providers as libProviders
10
+ import lib.utils as libUtils
11
+ import sys
12
+
13
+ description = "Anomaly Detection - Unsupervised"
14
+ m_kblnTraceOn = False #--- enable/disable module level tracing
15
+
16
+ def run():
17
+ #--- note: in python, you need to specify global scope for fxns to access module-level variables
18
+ global m_kblnTraceOn
19
+ print("\nINFO (lit_anom_unsuperv.run) loading ", description, " page ...")
20
+
21
+
22
+ try:
23
+
24
+ #--- page settings
25
+ if (m_kblnTraceOn): print("TRACE (litAnomUnSuperv.run): Initialize Page Settings ...")
26
+ st.header("Claims Anomalies - Unsupervised Approach (KMeans)")
27
+
28
+
29
+ #--- provide file drag/drop capability
30
+ m_blnDisableDragDrop = False
31
+ if(not m_blnDisableDragDrop):
32
+ #btnSave = st.button("Save")
33
+ pklDropped = st.file_uploader("Upload a Claims Dataset", type=["pkl"])
34
+ m_blnDisableDragDrop = (pklDropped is None)
35
+
36
+
37
+ #--- show: raw claims data analysis
38
+ if (m_kblnTraceOn): print("TRACE (litAnomUnSuperv.run): load raw claims data ...")
39
+ if (m_blnDisableDragDrop):
40
+ pdfClaims = libClaims.load_claims(False)
41
+ else:
42
+ pdfClaims = pd.read_pickle(pklDropped)
43
+
44
+ #--- show: raw claims data analysis
45
+ if (m_kblnTraceOn): print("TRACE (litAnomUnsuperv.run): Show Raw Claims Dataframe ...")
46
+ pdfClaims = libClaims.load_claims(False)
47
+
48
+
49
+ #--- get unsupervised predictions
50
+ #pdfFeatEng = libClaims.do_featEng(pdfClaims)
51
+ pdfPred = libClaims.get_kmeansPredict(pdfClaims)
52
+ pdfSample = pdfPred.sample(100)
53
+ pdfSample['providerId'] = pdfSample['Provider'].str[3:].astype(np.float64)
54
+
55
+
56
+ #--- save this file locally as a pkl
57
+ #btnSave_testFile(pdfClaims, pdfPred)
58
+
59
+
60
+ #--- table of claims and clusters, sorted by InscClaimAmt Reimbursed
61
+ pdfTopClaims = pdfSample.sort_values(by=["cluster", "InscClaimAmtReimbursed"], ascending=False)
62
+ if (m_kblnTraceOn): print("TRACE (litAnomUnsuperv.run): Show $claims reimbursed by cluster ...")
63
+ st.markdown("(Top) Ins Claim Reimbursed by Cluster")
64
+ st.dataframe(pdfTopClaims)
65
+
66
+
67
+ #--- chart cluster data distribution
68
+ chart_clusterDistr(pdfSample)
69
+
70
+
71
+ col1, col2, col3 = st.columns(3)
72
+
73
+
74
+ #--- chart KMeans clusters": InscClaimAmtReimbursed
75
+ #chart_KMeansClusters(pdfSample, "Age", "InscClaimAmtReimbursed", col1)
76
+ #chart_KMeansClusters(pdfSample, "providerId", "InscClaimAmtReimbursed", col2)
77
+
78
+ chart_KMeansClusters(pdfSample, "providerId", "AdmittedDays", col1)
79
+ chart_KMeansClusters(pdfSample, "providerId", "DeductibleAmtPaid", col2)
80
+ chart_KMeansClusters(pdfSample, "providerId", "InscClaimAmtReimbursed", col3)
81
+
82
+ chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_KidneyDisease", col1)
83
+ chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_Heartfailure", col2)
84
+ chart_KMeansClusters(pdfSample, "providerId", "ChronicCond_ObstrPulmonary", col3)
85
+
86
+ chart_KMeansClusters(pdfSample, "AdmittedDays", "DeductibleAmtPaid", col1)
87
+ chart_KMeansClusters(pdfSample, "AdmittedDays", "InscClaimAmtReimbursed", col2)
88
+ chart_KMeansClusters(pdfSample, "DeductibleAmtPaid", "InscClaimAmtReimbursed", col3)
89
+
90
+
91
+
92
+ #--- chart cluster bars
93
+ #chart_KMeansBars(pdfSample, "cluster", "InscClaimAmtReimbursed", col1)
94
+ #chart_KMeansBars(pdfSample, "cluster", "DeductibleAmtPaid", col2)
95
+
96
+ #chart_KMeansBars(pdfSample, "cluster", "IPAnnualReimbursementAmt", col1)
97
+ #chart_KMeansBars(pdfSample, "cluster", "IPAnnualDeductibleAmt", col2)
98
+
99
+ #chart_KMeansBars(pdfSample, "cluster", "OPAnnualReimbursementAmt", col1)
100
+ #chart_KMeansBars(pdfSample, "cluster", "OPAnnualDeductibleAmt", col2)
101
+
102
+ #chart_KMeansBars(pdfSample, "cluster", "ChronicCond_Heartfailure", col1)
103
+ #chart_KMeansBars(pdfSample, "cluster", "ChronicCond_KidneyDisease", col2)
104
+
105
+ except TypeError as e:
106
+ print("ERROR (litAnomUnsuperv.run_typeError): ", e)
107
+
108
+ except:
109
+ e = sys.exc_info()
110
+ print("ERROR (litAnomUnsuperv.run_genError): ", e)
111
+
112
+
113
+
114
+ def chart_clusterDistr(pdfSample):
115
+ #pdfClustDistr = pdfSample['cluster'].value_counts()
116
+ pdfBar = pdfSample
117
+ pdfCluster0 = pdfBar[pdfBar['cluster'] == 0]
118
+ pdfCluster1 = pdfBar[pdfBar['cluster'] == 1]
119
+ pdfCluster2 = pdfBar[pdfBar['cluster'] == 2]
120
+
121
+ kstrTitle = "(KMeans Clusters) Claims data"
122
+ #--- chart
123
+ fig = go.Figure(
124
+ layout=dict(
125
+ legend=dict(groupclick="toggleitem"),
126
+ xaxis=dict(title='cluster'),
127
+ yaxis=dict(title='#data points')
128
+ )
129
+ )
130
+
131
+ fig.add_trace(
132
+ go.Bar(
133
+ x=pdfCluster0['cluster'],
134
+ y=pdfCluster0['cluster'].value_counts(),
135
+ name='cluster0'
136
+ )
137
+ )
138
+
139
+ if (pdfCluster1.shape[0]>0):
140
+ fig.add_trace(
141
+ go.Bar(
142
+ x=pdfCluster1['cluster'],
143
+ y=pdfCluster1['cluster'].value_counts(),
144
+ name='cluster1'
145
+ ))
146
+
147
+ if (pdfCluster2.shape[0]>0):
148
+ fig.add_trace(
149
+ go.Bar(
150
+ x=pdfCluster2['cluster'],
151
+ y=pdfCluster2['cluster'].value_counts(),
152
+ name='cluster2'
153
+ ))
154
+ st.plotly_chart(fig, use_container_width=True)
155
+
156
+
157
+ def chart_KMeansClusters(pdfSample, strXFeature, strYFeature, stCol):
158
+ pdfScatter = pdfSample
159
+ pdfCluster0 = pdfScatter[pdfScatter['cluster'] == 0]
160
+ pdfCluster1 = pdfScatter[pdfScatter['cluster'] == 1]
161
+ pdfCluster2 = pdfScatter[pdfScatter['cluster'] == 2]
162
+
163
+ kstrTitle = "(KMeans Clusters) Claims data"
164
+ #--- chart
165
+ fig = go.Figure(
166
+ layout=dict(
167
+ legend=dict(groupclick="toggleitem"),
168
+ xaxis=dict(title=strXFeature),
169
+ yaxis=dict(title=strYFeature)
170
+ )
171
+ )
172
+
173
+ fig.add_trace(
174
+ go.Scatter(
175
+ x=pdfCluster0[strXFeature],
176
+ y=pdfCluster0[strYFeature],
177
+ text="claimId: " + pdfCluster0['ClaimID'],
178
+ mode='markers',
179
+ name='cluster0'
180
+ )
181
+ )
182
+
183
+ if (pdfCluster1.shape[0]>0):
184
+ fig.add_trace(
185
+ go.Scatter(
186
+ x=pdfCluster1[strXFeature],
187
+ y=pdfCluster1[strYFeature],
188
+ mode='markers',
189
+ name='cluster1'
190
+ ))
191
+
192
+ if (pdfCluster2.shape[0]>0):
193
+ fig.add_trace(
194
+ go.Scatter(
195
+ x=pdfCluster2[strXFeature],
196
+ y=pdfCluster2[strYFeature],
197
+ mode='markers',
198
+ name='cluster2'
199
+ ))
200
+ stCol.plotly_chart(fig, use_container_width=True)
201
+
202
+
203
+ def chart_KMeansBars(pdfSample, strXFeature, strYFeature, stCol):
204
+ pdfBar = pdfSample
205
+ pdfCluster0 = pdfBar[pdfBar['cluster'] == 0]
206
+ pdfCluster1 = pdfBar[pdfBar['cluster'] == 1]
207
+ pdfCluster2 = pdfBar[pdfBar['cluster'] == 2]
208
+
209
+ kstrTitle = "(KMeans Clusters) Claims data"
210
+ #--- chart
211
+ fig = go.Figure(
212
+ layout=dict(
213
+ legend=dict(groupclick="toggleitem"),
214
+ xaxis=dict(title=strXFeature),
215
+ yaxis=dict(title=strYFeature)
216
+ )
217
+ )
218
+
219
+ fig.add_trace(
220
+ go.Bar(
221
+ x=pdfCluster0[strXFeature],
222
+ y=pdfCluster0[strYFeature],
223
+ name='cluster0'
224
+ )
225
+ )
226
+
227
+ if (pdfCluster1.shape[0]>0):
228
+ fig.add_trace(
229
+ go.Bar(
230
+ x=pdfCluster1[strXFeature],
231
+ y=pdfCluster1[strYFeature],
232
+ name='cluster1'
233
+ ))
234
+
235
+ if (pdfCluster2.shape[0]>0):
236
+ fig.add_trace(
237
+ go.Bar(
238
+ x=pdfCluster2[strXFeature],
239
+ y=pdfCluster2[strYFeature],
240
+ name='cluster2'
241
+ ))
242
+ stCol.plotly_chart(fig, use_container_width=True)
243
+
244
+
245
+
246
+ def btnSave_testFile(pdfClaims, pdfPred):
247
+ #--- get all claims for all anoms
248
+ """ print("TRACE (lit_anom_unsuperv.btnSave_testFile) query anoms ... ", pdfPred.head(10))
249
+ pdfAnomClaims = pdfPred[pdfPred['hasAnom?'] > 0]
250
+ #pdfAnomProv = pdfAnomProv['Provider']
251
+
252
+ #--- filter claims by anomProviders
253
+ print("TRACE (lit_anom_unsuperv.btnSave_testFile) filter claims ... ")
254
+ pdfClaimAnom = pdfClaims[pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
255
+ pdfClaimNoAnom = pdfClaims[~pdfClaims['Provider'].isin(pdfAnomProv['Provider'])]
256
+ lngNumAnoms = len(pdfClaimAnom.index)
257
+ lngNumOk = len(pdfClaimNoAnom.index)
258
+ print("TRACE (lit_anom_unsuperv.btnSave_testFile) #anoms: ", lngNumAnoms, ", !anoms: ", lngNumOk)
259
+
260
+ #--- get a sample for remaining records
261
+ print("TRACE (lit_anom_unsuperv.btnSave_testFile) sampling claims ... ")
262
+ pdfSave = pd.concat([pdfClaimAnom.sample(frac=0.6), pdfClaimNoAnom.sample(frac=0.1)]) """
263
+
264
+ pdfSave = pdfClaims.sample(frac=0.1)
265
+
266
+ print("TRACE (lit_anom_unsuperv.btnSave_testFile) saving ... ")
267
+ saveProviderTestData(pdfSave)
268
+
269
+
270
+ def saveProviderTestData(pdfTestData):
271
+
272
+ #--- save the file
273
+ from datetime import date
274
+ import time
275
+ import pickle
276
+ strDteNow = date.today().strftime('%Y%m%d')
277
+ strTimeNow = time.strftime('%H%M%S')
278
+ strProvTestFile = libUtils.pth_data + strDteNow + strTimeNow + "_claimsTestSample.pkl"
279
+ #pd.to_pickle(pdfClaims.sample(200), strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
280
+ pdfTestData.to_pickle(strProvTestFile, protocol=pickle.HIGHEST_PROTOCOL)
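saveProviderTestData writes pickles named with a yyyymmddHHMMSS prefix, which is the pattern visible under data/demo_data/. A compact sketch of that naming convention plus the matching reload (paths illustrative):

```python
# sketch: the timestamped-pickle convention used for test samples, plus the reload
import pickle
import time
import pandas as pd

def save_sample(pdfTestData, pth_data, strSuffix="_claimsTestSample.pkl"):
    strStamp = time.strftime("%Y%m%d%H%M%S")       # e.g. 20230210170628
    strFile = pth_data + strStamp + strSuffix
    pdfTestData.to_pickle(strFile, protocol=pickle.HIGHEST_PROTOCOL)
    return strFile

# round-trip check:
# pdfBack = pd.read_pickle(save_sample(pdfSample, "data/demo_data/"))
```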
uix/pages/lit_claimAnalysis.py ADDED
@@ -0,0 +1,75 @@
1
+ #--- claim analysis page
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import plotly.express as px
5
+
6
+ import lib.claims as libClaims
7
+
8
+ description = "Claim Analysis"
9
+ m_kbln_traceOn = False #--- enable/disable module level tracing
10
+
11
+
12
+ def run():
13
+ #--- note: in python, you need to specify global scope for fxns to access module-level variables
14
+ global m_kbln_traceOn
15
+
16
+ try:
17
+
18
+ #--- page settings
19
+ if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Initialize Page Settings ...")
20
+ st.header("Claims Analysis")
21
+
22
+
23
+ #--- show: raw claims data analysis
24
+ if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show Raw Claims Dataframe ...")
25
+ dfClaims = libClaims.load_claims(False)
26
+ #dfClaims = libClaims.loadPkl_testClaims() #--- note: a large dataset; reduce before render
27
+ dfRaw = dfClaims.sample(25)
28
+ st.markdown("(Sample) Raw Claims Data: Providers, Beneficiaries, Physicians, Procedures, etc")
29
+ st.dataframe(dfRaw)
30
+
31
+
32
+ #--- show: data grouped by provider
33
+ pdfClaimsByProvider = dfClaims.groupby(
34
+ by=["Provider"], as_index=False).agg(
35
+ {"ClaimID":"count", "InscClaimAmtReimbursed":"sum", "DeductibleAmtPaid":"sum"}
36
+ )
37
+ st.markdown("(Sample) Raw Claims Data: Grouped by Provider")
38
+ st.dataframe(pdfClaimsByProvider.sample(25))
39
+
40
+ #--- show: bar charts
41
+ col1, col2 = st.columns(2)
42
+
43
+ #--- show $claims reimbursed by provider
44
+ if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show $claims reimbursed by provider ...")
45
+ pdfTopClaimsByProv = dfClaims.nlargest(10, "InscClaimAmtReimbursed")
46
+ fig = px.bar(pdfTopClaimsByProv,
47
+ x="Provider", y="InscClaimAmtReimbursed", title="$ Claims by Provider")
48
+ #col1.markdown("(Sample) $Claims Reimbursed by Provider")
49
+ col1.plotly_chart(fig, use_container_width=True)
50
+
51
+ #--- #claims reimbursed by provider
52
+ if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show #claims reimbursed by provider ...")
53
+ #pdfMaxClaimsByProv = dfClaims.groupby(['Provider'])['ClaimID'].count()
54
+ pdfClaimCountByProv = dfClaims.groupby(
55
+ by=["Provider"], as_index=False).agg(
56
+ {"ClaimID": "count"}
57
+ )
58
+ pdfClaimCountByProv = pdfClaimCountByProv.nlargest(10, "ClaimID")
59
+ fig = px.bar(pdfClaimCountByProv,
60
+ x="Provider", y="ClaimID", title="# Claims by Provider", barmode="group")
61
+ #col2.markdown("(Sample) #Claims Reimbursed by Provider") #--- just to even out the display
62
+ col2.plotly_chart(fig, use_container_width=True)
63
+
64
+
65
+ #--- TODO: (optimization) create a single group by dataframe; try not to recreate for each chart
66
+ if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): Show top $deductible paid by provider ...")
67
+ pdfDedAmtPaid = dfClaims.nlargest(10, "DeductibleAmtPaid")
68
+ fig = px.bar(pdfDedAmtPaid,
69
+ x="Provider", y="DeductibleAmtPaid", title="Deductible Paid by Provider")
70
+ col1.plotly_chart(fig, use_container_width=True)
71
+
72
+ if (m_kbln_traceOn): print("TRACE (litClaimAnalysis.run): end of fxn ...")
73
+
74
+ except TypeError as e:
75
+ print("ERROR (litClaimAnalysis.run): ", e)
uix/pages/lit_home.py ADDED
@@ -0,0 +1,41 @@
1
+ #--- about page
2
+ import streamlit as st
3
+
4
+ description = "Home"
5
+ def run():
6
+
7
+ print("\nINFO (lit_home.run) loading ", description, " page ...")
8
+
9
+
10
+ st.markdown('### Home')
11
+ st.markdown('### MLE10 Capstone: Healthcare Anomaly Detection')
12
+ st.markdown('\
13
+ Healthcare fraud is an expensive white-collar crime in the US and leads to an \
14
+ increase in healthcare premiums, and a reduction in quality and access to care.\
15
+ The National Health Care Anti-Fraud Association conservatively estimates that \
16
+ about 3 percent of US healthcare spending is lost to fraud per year ($300 billion \
17
+ approximately).')
18
+
19
+ st.markdown('\
20
+ Machine Learning techniques can identify current and evolving anomalies in claims \
21
+ data. As fraud becomes more sophisticated across an increasing number of annual \
22
+ transactions, an ML solution provides an opportunity to greatly reduce the effort, \
23
+ time and associated cost spent in identifying claims anomalies, and recouping any \
24
+ misappropriated funds. ')
25
+
26
+ st.markdown('\
27
+ To illustrate the capabilities of Machine Learning to identify claims anomalies, \
28
+ this capstone project team has developed two solutions: \
29
+ \n\t - a supervised Logistic Regression Model to identify potential anomalies at \
30
+ the provider level \
31
+ \n\t - an unsupervised KMeans Clustering Model to identify potential anomalies \
32
+ at the claim level.')
33
+
34
+ st.markdown(
35
+ """
36
+
37
+ Home page
38
+
39
+ """,
40
+ unsafe_allow_html=True,
41
+ )
uix/pages/lit_modelPerf.py ADDED
@@ -0,0 +1,6 @@
1
+ description = "Model Performance"
2
+
3
+ def run():
4
+ import streamlit as st
5
+ import pandas as pd
6
+ import plotly.express as px