qa-app-overlap / etl.py
tiesan's picture
Upload 5 files
4ff22c7
raw
history blame contribute delete
No virus
5.1 kB
import gradio as gr
import pandas as pd
from gspread_dataframe import set_with_dataframe
def load_data(gs, start_date, end_date, d_type):
"""
load data from google sheet
Args:
gs (Gsheet instance ): GSheet instance with access to google sheet
start_date (str): Date str format(YYYY-mm-dd)
end_date (str): Date str format(YYYY-mm-dd)
d_type (str):sheet type (prescription or diagnosis)
Raises:
gr.Error: 403 permission denied
Returns:
data (DataFrame): loaded data from google sheet
"""
sheets_dict = {
"diagnosis": "Diagnosis",
"prescription": "Prescriptions"
}
# open data and super sheet
try:
data_sheet = gs.worksheet(sheets_dict[d_type])
except:
raise gr.Error(message='Permission denied. Please add IAM user to the sheet and try again')
# read data from sheets
all_data = pd.DataFrame(data_sheet.get_all_records())
all_data.columns = all_data.columns.str.strip()
# transform date fields
all_data['date_cleaned'] = pd.to_datetime(all_data['date_cleaned'], infer_datetime_format=True)
all_data['date_reviewed'] = pd.to_datetime(all_data['date_reviewed'], infer_datetime_format=True)
data = all_data[(all_data['date_cleaned'] >= start_date) & (all_data['date_cleaned'] <= end_date)]
return data
def preprocess_data(data, d_type):
"""
preprocess loaded data from google sheet
Args:
data (DataFrame): google sheet data as a dataframe
d_type (Str): sheet type (prescription or diagnosis)
Raises:
gr.Error (ValueError): Wrong column
Returns:
Overlap (DataFrame): Overlap data
"""
data.columns = data.columns.str.strip()
overlap_cols = {
"prescription": ["RX Norm [Super Generic]_x", "RX Norm [Super Generic]_y" ],
"diagnosis": ["ICD10 Diagnosis_x", "ICD10 Diagnosis_y"]
}
try:
# extract the interns
intern_1 = data['Intern'].unique()[0]
intern_2 = data['Intern'].unique()[1]
# intern 1 and intern 2 data
intern_1_data = data[data['Intern'] == intern_1.strip()]
intern_2_data = data[data['Intern'] == intern_2.strip()]
# extract the overlap
overlap = pd.merge(intern_1_data, intern_2_data, on='Unstructured Name')
# remove all white spaces
col = overlap_cols[d_type]
overlap[col[0]] = overlap[col[0]].str.strip()
overlap[col[1]] = overlap[col[1]].str.strip()
except:
raise gr.Error(message="No Intern Column")
return overlap
def post_process(data, d_type):
"""
post process data
Args:
data (DataFrame): matched data as dataframe
d_type (str): sheet type (prescription or diagnosis)
Returns:
data (DataFrame): postprocessed data as dataframe
"""
drop_cols = {
"prescription": ['Type_y','Supervisor_y', 'date_cleaned_y', 'date_reviewed_y'],
"diagnosis": ['Supervisor_y', 'date_cleaned_y', 'date_reviewed_y']
}
post_cols = {
"prescription": ['Unstructured Name', 'Intern_x', 'Status_x', 'RX Norm [Super Generic]_x',
'Intern_y', 'Status_y', 'RX Norm [Super Generic]_y', 'match_status', 'Type_x',
'Supervisor_x', 'date_cleaned_x', 'date_reviewed_x'],
"diagnosis": ['Unstructured Name', 'Intern_x', 'Status_x', 'ICD10 Diagnosis_x',
'Intern_y', 'Status_y', 'ICD10 Diagnosis_y', 'match_status',
'Supervisor_x', 'date_cleaned_x', 'date_reviewed_x']
}
d_cols = drop_cols[d_type]
p_cols = post_cols[d_type]
data = data.drop(columns=d_cols)
data = data[p_cols]
data = data.rename(columns={
'date_cleaned_x': 'date_cleaned',
'date_reviewed_x': 'date_reviewed'
})
return data
def output_data(gc, data, week_no, dtype):
"""
outputs data to a google sheet
Args:
gc (GSheet instance): Gsheet permission instance
data (DataFrame): Data to write to google sheet
week_no (str): Week no e.g. week_1
d_type (str): sheet type (prescription or diagnosis)
Raises:
gr.exception.Error: Sheet Error
"""
out_sheet_dict = {
"diagnosis": "https://docs.google.com/spreadsheets/d/1UJ2PHR62mcz11D2qeX-Wk2fs1357BW78o8g76lwG93w/edit#gid=2123373592",
"prescription": "https://docs.google.com/spreadsheets/d/1Fo9V6J_L9eWX3qEScITP2L8nI9exk_VslGMgk-1jCJw/edit#gid=1287467149"
}
outsheet_url = out_sheet_dict[dtype]
out_gs = gc.open_by_url(outsheet_url)
# write output to sheet
try:
out_worksheet = out_gs.worksheet(week_no)
# clear the worksheet
out_worksheet.clear()
except:
raise gr.Error(f"Result sheet for {week_no} does not exist")
# write dataframe to work sheet
set_with_dataframe(
worksheet=out_worksheet,
dataframe=data,
include_index=False,
include_column_header=True,
resize=True)