tiesan committed on
Commit
4ff22c7
•
1 Parent(s): a634c99

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +4 -5
  2. app.py +159 -0
  3. etl.py +159 -0
  4. heliumhealth-a05d595e5991.json +12 -0
  5. requirements.txt +88 -0
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
- title: Qa App Overlap
3
- emoji: 🏒
4
  colorFrom: yellow
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 4.1.2
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Overlap Qa Check
3
+ emoji: 🌍
4
  colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.29.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import datetime
2
+
3
+ import gradio as gr
4
+ import gspread
5
+ import validators
6
+ from validators import ValidationFailure
7
+ from google.oauth2.service_account import Credentials
8
+ from pydrive.auth import GoogleAuth
9
+ from pydrive.drive import GoogleDrive
10
+
11
+ import etl
12
+
13
# Sample sheet for manual testing:
# https://docs.google.com/spreadsheets/d/1iJ0-882HsWkAth0e0P_kf21aL6583Z7m1LwloaRCVEc/edit#gid=0

# Choices offered by the "Week" dropdown: week_1 ... week_50
week_list = ["week_{}".format(n) for n in range(1, 51)]

# service-account key file (relative path)
cred_file_path = 'heliumhealth-a05d595e5991.json'

# scopes requested for the service account
scopes = [
    'https://www.googleapis.com/auth/spreadsheets',
    'https://www.googleapis.com/auth/drive',
]

# build the credentials, then the gspread client used throughout this module
credentials = Credentials.from_service_account_file(cred_file_path, scopes=scopes)
gc = gspread.authorize(credentials)

# PyDrive handles; no other use of them is visible in this file
gauth = GoogleAuth()
drive = GoogleDrive(gauth)
32
+
33
def overlap_matching(row, d_type):
    """
    Label one merged row as 'match' or 'no match'.

    Args:
        row: mapping-like row (e.g. a pandas Series) holding both interns' values
        d_type (str): sheet type (prescription or diagnosis)

    Returns:
        row: the same row with its 'match_status' entry set

    Raises:
        KeyError: d_type is neither 'prescription' nor 'diagnosis'
    """
    if d_type == "prescription":
        left, right = 'RX Norm [Super Generic]_x', 'RX Norm [Super Generic]_y'
    elif d_type == "diagnosis":
        left, right = 'ICD10 Diagnosis_x', 'ICD10 Diagnosis_y'
    else:
        raise KeyError(d_type)

    if row[left] == row[right]:
        row['match_status'] = 'match'
    else:
        row['match_status'] = 'no match'
    return row
44
+
45
def overlap_check(gs, start_date, end_date, week_no, d_type):
    """
    Run the overlap pipeline: load, preprocess, match, post-process, write out.

    Args:
        gs: opened spreadsheet object handed to etl.load_data
        start_date (str): Date str format(YYYY-mm-dd)
        end_date (str): Date str format(YYYY-mm-dd)
        week_no (str): week label handed to etl.output_data, e.g. week_1
        d_type (str): sheet type (prescription or diagnosis)

    The result is written through the module-level `gc` client; nothing is returned.
    """
    # rows inside the requested date window
    loaded = etl.load_data(gs, start_date, end_date, d_type)

    # entries present for both interns
    merged = etl.preprocess_data(loaded, d_type)

    # flag every row as match / no match
    matched = merged.apply(overlap_matching, axis=1, args=(d_type,))

    # tidy the columns, then write to the result sheet
    result = etl.post_process(matched, d_type)
    etl.output_data(gc, result, week_no, d_type)
69
+
70
+
71
def overlap_check_main(sheet_type, start_date_str, end_date_str, week_str, sheet_url):
    """
    Validate the form inputs, run the overlap check and report the outcome.

    Args:
        sheet_type (str): sheet type (prescription or diagnosis)
        start_date_str (str): start date string: e.g. 2023-03-21
        end_date_str (str): end date string: e.g. 2023-03-24
        week_str (str): week string e.g. week_1
        sheet_url (str): URL of the google sheet to read from

    Returns:
        str: success message naming the sheet type, week and date range

    Raises:
        gr.Error: Selection Error - sheet type or week is missing
        gr.Error: Date Format Error - either start_date or end date are bad format
        gr.Error: Date Error - when start_date is greater than end date
        gr.Error: URL Error - Bad url format
        gr.Error: the sheet could not be opened or the overlap check failed
    """
    # a missing selection would otherwise only fail later, on week_str.replace
    if not sheet_type or not week_str:
        raise gr.Error(message="Please select a QA type and a week")

    # format date from string
    try:
        start_date = datetime.strptime(start_date_str.strip(), "%Y-%m-%d").date()
        end_date = datetime.strptime(end_date_str.strip(), "%Y-%m-%d").date()
    except (ValueError, AttributeError) as err:
        raise gr.Error(message="Wrong date format") from err

    # raise error when start date is greater than end date
    # (equal dates are allowed: the date filter in etl.load_data is inclusive)
    if start_date > end_date:
        raise gr.Error(message="Start date cannot be greater end date")

    # check if the input is a valid url
    sheet_url = (sheet_url or "").strip()
    url_check = validators.url(sheet_url)
    if isinstance(url_check, ValidationFailure):
        raise gr.Error(message="Please enter a valid URL")

    try:
        # open the google sheet for reading
        gs = gc.open_by_url(sheet_url)
        # pass the validated dates in canonical YYYY-mm-dd form
        overlap_check(gs, start_date.isoformat(), end_date.isoformat(), week_str, sheet_type)
    except gr.Error:
        # already a user-facing message from a lower layer; keep it as is
        raise
    except Exception as err:
        # the error must be raised, otherwise the success message below is returned
        raise gr.Error(
            message="Permission denied. Please add IAM user to the sheet and try again"
        ) from err

    return f"Successfully ran {sheet_type} overlap check for {week_str.replace('_', ' ').title()} ({start_date_str} - {end_date_str})"
118
+
119
# Build the Gradio UI: instructions, the five inputs, one output box and a
# Run button wired to overlap_check_main. The server starts right after.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## Overlap Check App

        * Add IAM User to sheet you want to test
        * Gsheet tabs required for diagnosis:
        * Diagnosis
        * Gsheet tabs required for prescription:
        * Prescriptions
        * Data headers required for diagnosis
        * Unstructured Name, ICD10 Diagnosis, Intern
        * Data headers required for prescription
        * Unstructured Name, RX Norm [Super Generic], Intern
        """
    )

    # form inputs, created in display order
    sheet_type = gr.Dropdown(['prescription', 'diagnosis'], label="QA Type")
    start_date = gr.Textbox(label="Start Date", placeholder="YYYY-MM-DD")
    end_date = gr.Textbox(label="End Date", placeholder="YYYY-MM-DD")
    week_input = gr.Dropdown(week_list, label="Week")
    url = gr.Textbox(label="URL", placeholder="Enter sheet url ...")

    # result message
    output = gr.Textbox(label="Output Box")

    # the inputs list follows the positional parameters of overlap_check_main
    run_btn = gr.Button("Run")
    run_btn.click(
        fn=overlap_check_main,
        inputs=[sheet_type, start_date, end_date, week_input, url],
        outputs=output,
        api_name="Overlap_check",
    )

demo.launch()
etl.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from gspread_dataframe import set_with_dataframe
4
+
5
def load_data(gs, start_date, end_date, d_type):
    """
    load data from google sheet

    Args:
        gs (Gsheet instance ): GSheet instance with access to google sheet
        start_date (str): Date str format(YYYY-mm-dd)
        end_date (str): Date str format(YYYY-mm-dd)
        d_type (str): sheet type (prescription or diagnosis)

    Raises:
        gr.Error: the tab could not be opened (reported as permission denied)
        gr.Error: the tab holds no records
        gr.Error: a required date column is missing

    Returns:
        data (DataFrame): rows whose date_cleaned lies in [start_date, end_date]
    """
    sheets_dict = {
        "diagnosis": "Diagnosis",
        "prescription": "Prescriptions"
    }

    # open the data tab; any failure here is reported with the original message
    try:
        tab_name = sheets_dict[d_type]
        data_sheet = gs.worksheet(tab_name)
    except Exception as err:
        raise gr.Error(message='Permission denied. Please add IAM user to the sheet and try again') from err

    # read data from the sheet; an empty result has no string headers to clean
    records = data_sheet.get_all_records()
    if not records:
        raise gr.Error(message=f"No data found in the {tab_name} tab")

    all_data = pd.DataFrame(records)
    all_data.columns = all_data.columns.astype(str).str.strip()

    date_cols = ('date_cleaned', 'date_reviewed')
    missing = [col for col in date_cols if col not in all_data.columns]
    if missing:
        raise gr.Error(message=f"Missing required column(s): {', '.join(missing)}")

    # transform date fields (format inference is the pandas default since 2.0)
    for col in date_cols:
        all_data[col] = pd.to_datetime(all_data[col])

    # inclusive date window on date_cleaned
    in_window = (all_data['date_cleaned'] >= start_date) & (all_data['date_cleaned'] <= end_date)

    # hand back an independent frame rather than a view of all_data
    return all_data[in_window].copy()
43
+
44
def preprocess_data(data, d_type):
    """
    preprocess loaded data from google sheet

    The first two distinct interns (in order of appearance) are compared;
    only entries that both of them cleaned are kept.

    Args:
        data (DataFrame): google sheet data as a dataframe (left unmodified)
        d_type (Str): sheet type (prescription or diagnosis)

    Raises:
        gr.Error: no Intern column
        gr.Error: fewer than two interns in the data
        gr.Error: wrong or missing column while building the overlap

    Returns:
        Overlap (DataFrame): Overlap data
    """
    overlap_cols = {
        "prescription": ["RX Norm [Super Generic]_x", "RX Norm [Super Generic]_y"],
        "diagnosis": ["ICD10 Diagnosis_x", "ICD10 Diagnosis_y"]
    }

    # work on a copy so the caller's frame keeps its headers and values
    data = data.copy()
    data.columns = data.columns.str.strip()

    if 'Intern' not in data.columns:
        raise gr.Error(message="No Intern Column")

    # strip the column itself, so padded names are not split into separate
    # interns and the row filters below see the same values as unique()
    data['Intern'] = data['Intern'].astype(str).str.strip()

    # extract the interns
    interns = data['Intern'].unique()
    if len(interns) < 2:
        raise gr.Error(message="Overlap check needs data from at least two interns")
    intern_1, intern_2 = interns[0], interns[1]

    # intern 1 and intern 2 data
    intern_1_data = data[data['Intern'] == intern_1]
    intern_2_data = data[data['Intern'] == intern_2]

    try:
        # extract the overlap
        overlap = pd.merge(intern_1_data, intern_2_data, on='Unstructured Name')

        # remove all white spaces
        for col in overlap_cols[d_type]:
            overlap[col] = overlap[col].str.strip()
    except (KeyError, AttributeError) as err:
        raise gr.Error(message=f"Wrong or missing column: {err}") from err

    return overlap
86
+
87
def post_process(data, d_type):
    """
    Trim and reorder the matched data for output.

    Args:
        data (DataFrame): matched data as dataframe
        d_type (str): sheet type (prescription or diagnosis)

    Returns:
        data (DataFrame): the output columns in their final order, with the
            date columns renamed to date_cleaned / date_reviewed
    """
    # per sheet type: (columns to drop, output columns in order)
    layouts = {
        "prescription": (
            ['Type_y', 'Supervisor_y', 'date_cleaned_y', 'date_reviewed_y'],
            ['Unstructured Name', 'Intern_x', 'Status_x', 'RX Norm [Super Generic]_x',
             'Intern_y', 'Status_y', 'RX Norm [Super Generic]_y', 'match_status', 'Type_x',
             'Supervisor_x', 'date_cleaned_x', 'date_reviewed_x'],
        ),
        "diagnosis": (
            ['Supervisor_y', 'date_cleaned_y', 'date_reviewed_y'],
            ['Unstructured Name', 'Intern_x', 'Status_x', 'ICD10 Diagnosis_x',
             'Intern_y', 'Status_y', 'ICD10 Diagnosis_y', 'match_status',
             'Supervisor_x', 'date_cleaned_x', 'date_reviewed_x'],
        ),
    }
    unwanted, ordered = layouts[d_type]

    trimmed = data.drop(columns=unwanted)
    selected = trimmed[ordered]

    new_names = {
        'date_cleaned_x': 'date_cleaned',
        'date_reviewed_x': 'date_reviewed',
    }
    return selected.rename(columns=new_names)
123
+
124
def output_data(gc, data, week_no, dtype):
    """
    outputs data to a google sheet

    Args:
        gc (GSheet instance): Gsheet permission instance
        data (DataFrame): Data to write to google sheet
        week_no (str): Week no e.g. week_1
        dtype (str): sheet type (prescription or diagnosis)

    Raises:
        gr.Error: the result tab for week_no could not be opened
        gr.Error: the result tab could not be cleared
    """
    out_sheet_dict = {
        "diagnosis": "https://docs.google.com/spreadsheets/d/1UJ2PHR62mcz11D2qeX-Wk2fs1357BW78o8g76lwG93w/edit#gid=2123373592",
        "prescription": "https://docs.google.com/spreadsheets/d/1Fo9V6J_L9eWX3qEScITP2L8nI9exk_VslGMgk-1jCJw/edit#gid=1287467149"
    }

    outsheet_url = out_sheet_dict[dtype]
    out_gs = gc.open_by_url(outsheet_url)

    # look up the week tab; keep the underlying error as the cause
    try:
        out_worksheet = out_gs.worksheet(week_no)
    except Exception as err:
        raise gr.Error(f"Result sheet for {week_no} does not exist") from err

    # clear the worksheet; a failure here is reported separately so it is not
    # mistaken for a missing tab
    try:
        out_worksheet.clear()
    except Exception as err:
        raise gr.Error(f"Result sheet for {week_no} could not be cleared") from err

    # write dataframe to work sheet
    set_with_dataframe(
        worksheet=out_worksheet,
        dataframe=data,
        include_index=False,
        include_column_header=True,
        resize=True)
heliumhealth-a05d595e5991.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "service_account",
3
+ "project_id": "heliumhealth",
4
+ "private_key_id": "a05d595e59911b601cf718b32e967a0f4e1f463d",
5
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEugIBADANBgkqhkiG9w0BAQEFAASCBKQwggSgAgEAAoIBAQCzJJ2dCd32LwAc\njoTiCB2shYwGRFvApmF3/RUkT9BZDg5S+imfuXSsHBgueKIEAW7E0UBGVbioKQC0\nEzKh/+aLVdZEYdqyNdrx4g5HmuDn4sQyd/zP0J3cUa/LlF/cND6HsExJcdkjL9rV\nVZEQZhpVrgB9J6Ui5dZsrN9RWHguP8awI6JFqsfF2aKWJ4Z+Gl9YlxMhCqcwBalF\nf4MUOwnXSK1t0QMXdEAg9zA+wwySYjCz/nVOIrEBB1lHp1mEiwpv2KtuX3f2GGRt\nSGwOThDWJPn6EuLcX2mlxEs7ylEs1hYcKfROPzBMSRX5g/m5lKkRuZLsBpqKZj5G\nrB62FasXAgMBAAECggEADIMutRTG2Zaf0d7MfkMl23J6fMeShwq3/RsCuoYOneHC\n813AwYTBFvJhaAGKA3CcLYgWDrOXEbvHbkx2+WS31BybOiyBi3gCgmmRKMQzGE84\n9/N0h4lhNU3JUxVaerV1dqHEOk8G5RPHG6Z92uDezwQ3yYYW3TCjgPS0JmjhAA3b\nAnAgATuiY4ivscR28xSTDsexQhDnVGdM0ms4hrAUV0nM+sGLbXil6hTBoRi/iEmq\nGESeFsDOcdJKhIAvs94FwhHGLljCniP+2/jN6Wg8+wTFwUpaH0NpV6tfHYvGe8Bp\natfVZwCqwP5+J3dXHuaNUwe/cUK6HSVcGARXp5vubQKBgQD7xXVg2WoMONpGHWnD\nxk3T085JiC+MgBK+y5MzNLTtVxCNzsv3I70lu8ilv7ytfy35bnQ2dx8Ych1Jjc7z\n5kQs9rRxtY5iZvyTRaBB4Zfyr2uGxrR/lMJizsH+M0my48zPiHWLdqJSTupBmufr\n2kFeRIbphEl6u5fir9s5c/+EGwKBgQC2JuCQXrYHryn2vs6qf+Zf/RgzK5+3irHy\nEeYbedEpe6dCy5dWjlh7sXR23938XTXtnwmKWMuovdnLybBXvuWacOsqz5wYuXAn\nK6V8oBPpV9Aj+nVSft26E8nMzNX1lXbvxBgq1GvOUk7Mbi9p4nm6gjKNKYo1/U9P\nH+S9VHQMtQKBgAxqkH9WOYSgySLsEGs3PF9V8rZtoOqs5j/Cil/cGZAa2xYjPKvT\nd2CFAkAqVIO54eqLJ/AHr+Dkv80A0VP15ybQg9WXvo3bxlj89gpJtdSiEgtzgTNJ\ngsycpbSDkv7ffRo/AI0ALMEiYysZGJbpDJA5kO2zOGx1E+h6A7WxoshBAn8UolLO\nB+yW8kDOss62gcaXGRSkt0xgflWqFlz6v9Hx4RARgP6jz3w2huOqk7GR5P027c0m\n3ugzNU52x2Iyjm10EVaSgvIr2tXZmhglBf07cbciXDzuG3ECozs49/tE1qmif5Q9\nRdLwjGJgxhqY5A7mEdmoJAyEES3qyLIgwHBlAoGAS0sA8A5Ay+BbaAGufqSaBnxt\nQgMGNYUgyeG6edksXMRPMqTWbQ5xdjEmv38nQ8AT9RVMZ9TfZqCts1fCD+FgTcAU\niAasrJMvd4mCBhWZWPwGJN+yaeZHvdbrXsSEo3kkyiwjFip03CCJtDJO3MwyowOb\nIxC34Nn4A6XUGjaF7Io=\n-----END PRIVATE KEY-----\n",
6
+ "client_email": "test-gs@heliumhealth.iam.gserviceaccount.com",
7
+ "client_id": "102772078802762265627",
8
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
9
+ "token_uri": "https://oauth2.googleapis.com/token",
10
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/test-gs%40heliumhealth.iam.gserviceaccount.com"
12
+ }
requirements.txt ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.1.0
2
+ aiohttp==3.8.4
3
+ aiosignal==1.3.1
4
+ altair==4.2.2
5
+ anyio==3.6.2
6
+ async-timeout==4.0.2
7
+ attrs==23.1.0
8
+ beautifulsoup4==4.12.2
9
+ cachetools==5.3.0
10
+ certifi==2023.5.7
11
+ charset-normalizer==3.1.0
12
+ click==8.1.3
13
+ contourpy==1.0.7
14
+ cycler==0.11.0
15
+ decorator==5.1.1
16
+ entrypoints==0.4
17
+ fastapi==0.95.1
18
+ ffmpy==0.3.0
19
+ filelock==3.12.0
20
+ fonttools==4.39.3
21
+ frozenlist==1.3.3
22
+ fsspec==2023.5.0
23
+ google==3.0.0
24
+ google-api-core==2.11.0
25
+ google-api-python-client==2.86.0
26
+ google-auth==2.17.3
27
+ google-auth-httplib2==0.1.0
28
+ google-auth-oauthlib==1.0.0
29
+ googleapis-common-protos==1.59.0
30
+ gradio==3.28.3
31
+ gradio_client==0.2.0
32
+ gspread==5.8.0
33
+ gspread-dataframe==3.3.0
34
+ h11==0.14.0
35
+ httpcore==0.17.0
36
+ httplib2==0.22.0
37
+ httpx==0.24.0
38
+ huggingface-hub==0.14.1
39
+ idna==3.4
40
+ Jinja2==3.1.2
41
+ jsonschema==4.17.3
42
+ kiwisolver==1.4.4
43
+ linkify-it-py==2.0.2
44
+ markdown-it-py==2.2.0
45
+ MarkupSafe==2.1.2
46
+ matplotlib==3.7.1
47
+ mdit-py-plugins==0.3.3
48
+ mdurl==0.1.2
49
+ multidict==6.0.4
50
+ numpy==1.24.3
51
+ oauth2client==4.1.3
52
+ oauthlib==3.2.2
53
+ orjson==3.8.12
54
+ packaging==23.1
55
+ pandas==2.0.1
56
+ Pillow==9.5.0
57
+ protobuf==4.22.4
58
+ pyasn1==0.5.0
59
+ pyasn1-modules==0.3.0
60
+ pydantic==1.10.7
61
+ PyDrive==1.3.1
62
+ pydub==0.25.1
63
+ Pygments==2.15.1
64
+ pyparsing==3.0.9
65
+ pyrsistent==0.19.3
66
+ python-dateutil==2.8.2
67
+ python-multipart==0.0.6
68
+ pytz==2023.3
69
+ PyYAML==6.0
70
+ requests==2.30.0
71
+ requests-oauthlib==1.3.1
72
+ rsa==4.9
73
+ semantic-version==2.10.0
74
+ six==1.16.0
75
+ sniffio==1.3.0
76
+ soupsieve==2.4.1
77
+ starlette==0.26.1
78
+ toolz==0.12.0
79
+ tqdm==4.65.0
80
+ typing_extensions==4.5.0
81
+ tzdata==2023.3
82
+ uc-micro-py==1.0.2
83
+ uritemplate==4.1.1
84
+ urllib3==2.0.2
85
+ uvicorn==0.22.0
86
+ validators==0.20.0
87
+ websockets==11.0.3
88
+ yarl==1.9.2