Dalia Gala committed 686717d
Parent(s): 2f1a119
Add application file

Files changed:
- .DS_Store +0 -0
- .Rhistory +0 -0
- data/.DS_Store +0 -0
- data/dataframe.csv +0 -0
- images/.DS_Store +0 -0
- images/.ipynb_checkpoints/pie_charts-checkpoint.png +0 -0
- images/.ipynb_checkpoints/tests-checkpoint.png +0 -0
- images/pie_charts.png +0 -0
- images/tests.png +0 -0
- pages/.DS_Store +0 -0
- pages/.ipynb_checkpoints/1_📊_Define_Target_Variables-checkpoint.py +266 -0
- pages/.ipynb_checkpoints/2_📈_Visualize_the_Results-checkpoint.py +593 -0
- pages/.ipynb_checkpoints/3_💡_Put_the_Idea_into_Practice-checkpoint.py +47 -0
- pages/1_📊_Define_Target_Variables.py +266 -0
- pages/2_📈_Visualize_the_Results.py +593 -0
- pages/3_💡_Put_the_Idea_into_Practice.py +57 -0
- requirements.txt +20 -0
- utils.py +255 -0
- 🏠Home.py +58 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.Rhistory
ADDED
File without changes
data/.DS_Store
ADDED
Binary file (6.15 kB)
data/dataframe.csv
ADDED
The diff for this file is too large to render.
images/.DS_Store
ADDED
Binary file (6.15 kB)
images/.ipynb_checkpoints/pie_charts-checkpoint.png
ADDED
images/.ipynb_checkpoints/tests-checkpoint.png
ADDED
images/pie_charts.png
ADDED
images/tests.png
ADDED
pages/.DS_Store
ADDED
Binary file (6.15 kB)
pages/.ipynb_checkpoints/1_📊_Define_Target_Variables-checkpoint.py
ADDED
@@ -0,0 +1,266 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:16:50 2023

@author: daliagala
"""

### LIBRARIES ###
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import assign_labels_by_probabilities, drop_data, train_and_predict

### PAGE CONFIG ###
st.set_page_config(page_title='EquiVar', page_icon=':robot_face:', layout='wide')

hide_st_style = """
<style>
#GithubIcon {visibility: hidden;}
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
</style>
"""
st.markdown(hide_st_style, unsafe_allow_html=True)

### IMPORT DATA FILES ###
dataframe = pd.read_csv('./data/dataframe.csv')
dataframe = dataframe.drop(["Unnamed: 0"], axis=1)
dataframe = dataframe.rename(columns={"education_level": "education level"})

### DICTIONARIES AND CONSTANTS ###
groups = ["attention", "reasoning", "memory", "behavioural restraint", "information processing speed"]

groups_dict = {
    'Divided Visual Attention': 'attention',
    'Forward Memory Span': 'memory',
    'Arithmetic Reasoning': 'reasoning',
    'Grammatical Reasoning': 'reasoning',
    'Go/No go': 'behavioural restraint',
    'Reverse_Memory_Span': 'memory',
    'Verbal List Learning': 'memory',
    'Delayed Verbal List Learning': 'memory',
    'Digit Symbol Coding': 'information processing speed',
    'Trail Making Part A': 'information processing speed',
    'Trail Making Part B': 'information processing speed'
}

education_dict = {
    1: 'Some high school',
    2: 'High school diploma / GED',
    3: 'Some college',
    4: 'College degree',
    5: 'Professional degree',
    6: "Master's degree",
    7: 'Ph.D.',
    8: "Associate's degree",
    99: 'Other'
}

df_keys_dict = {
    'Divided Visual Attention': 'divided_visual_attention',
    'Forward Memory Span': 'forward_memory_span',
    'Arithmetic Reasoning': 'arithmetic_problem_solving',
    'Grammatical Reasoning': 'logical_reasoning',
    'Go/No go': 'adaptive_behaviour_response_inhibition',
    'Reverse_Memory_Span': 'reverse_memory_span',
    'Verbal List Learning': 'episodic_verbal_learning',
    'Delayed Verbal List Learning': 'delayed_recall',
    'Digit Symbol Coding': 'abstract_symbol_processing_speed',
    'Trail Making Part A': 'numerical_info_processing_speed',
    'Trail Making Part B': 'numerical_and_lexical_info_processing_speed'
}

### CREATE THE "TARGET VARIABLE DEFINITION" PAGE ###
st.title('Target variable definition')

st.markdown('''On this page, we invite you to imagine that you are hiring for a certain role. Using the sliders below, you will specify two different notions of a “good employee” for that role—two different target variables. Once you’re done, the simulator will build two models, one for each of your target variable definitions. You can visualize these datasets and models—and their effects on fairness and overall features of the models and data—in the Visualize the Results page.''')

st.markdown('''You specify the notions of different employees by assigning weights of importance to cognitive characteristics that a “good” employee would have for the role. The cognitive test data that you’ll be working with comes from real-world people, mirroring an increasing number of hiring algorithms that are based on cognitive tests ([Wilson, et al. 2021](https://dl.acm.org/doi/10.1145/3442188.3445928)).''')

st.markdown('''We have pre-set the weights below to reflect two different conceptions of a “good” employee: one conception that emphasizes attentiveness and numerical skills (A) and another conception that emphasizes interpersonal skills and memory (B). If you want to, you can change the slider values as you see fit.''')

st.markdown('''After you’ve set the slider values, click :red[“Assign labels and train your models”]. The simulator will then label certain individuals as "good employees"—in other words, it will assign class label "1" for "good" and "0" for "not good", based on your selections. Then, the simulator will build the two models.''')

st.markdown('''To learn more about target variable definition and hiring models based on cognitive tests, click :green["See explanation"] below.''')

with st.expander("See explanation"):

    st.markdown('''The models that the simulator builds are of a kind increasingly used in hiring software: companies will have applicants play games that test for different kinds of cognitive ability (like reasoning or memory). Then hiring software will be built to predict which applicants will be successful based on which cognitive characteristics they have. What cognitive characteristics make for a successful employee? This will depend on what role is being hired for. And it will also depend on how one defines “successful employee.”''')

    st.markdown('''In the real world, “successful employee” is defined for these kinds of hiring models in the following way. Managers select a group of current employees that they consider to be successful; this group of employees plays the cognitive test games. A model is then trained to identify applicants who share cognitive characteristics with the current employees that are considered successful. The target variable of “successful employee” is thus defined in terms of comparison to certain people who are deemed successful.''')

    st.markdown('''One will get different target variables if one deems different current employees as the successful ones. And, as we discussed on the Home page (and as we explain more on the Putting the Idea into Practice page), there will likely be disagreement between managers about which employees are successful. For instance, a manager who values attentiveness and numerical skills will deem different employees “successful” than a manager who values interpersonal skills and memory. Even when different managers roughly share their sensibilities about what characteristics make for a successful employee, there may still be different, equally good ways to “weight” the importance of the various characteristics.''')

    st.markdown('''In the real world, the cognitive characteristics shared by those considered successful employees are implicit. Companies do not first identify the cognitive characteristics that make for a successful employee; rather, they identify employees who they consider successful, and then the hiring model works backwards to identify what characteristics these employees share.''')

    st.markdown('''In our simulator, the cognitive characteristics shared by “good” employees are explicit. You assign different weights—using the sliders—to the cognitive characteristics you think are more or less important in a good employee (for the role you’re considering). To illustrate how different target variables have different effects on fairness and overall model attributes, you’ll define “good” employee in two ways. (We’ve made the cognitive characteristics explicit both so you can see the point of different target variable definitions more clearly, and because of limitations of the data that we’re working with.)''')

    st.markdown('''The cognitive characteristics that the simulator works with are from one of the datasets of the [NeuroCognitive Performance Test](https://www.nature.com/articles/s41597-022-01872-8). This dataset has eleven different tests, which we have grouped into five categories:''')

    st.markdown(
        """
        - **Memory**: Forward Memory Span, Reverse Memory Span, Verbal List Learning, Delayed Verbal List Learning
        - **Information Processing Speed**: Digit Symbol Coding, Trail Making Part A, Trail Making Part B
        - **Reasoning**: Arithmetic Reasoning, Grammatical Reasoning
        - **Attention**: Divided Visual Attention
        - **Behavioral Restraint**: Go/No go
        """)

    st.markdown('''After you’ve set the weights for these five characteristics using the sliders, you can see which weights are assigned to each test (e.g. Forward Memory Span or Digit Symbol Coding) by ticking the checkbox beneath the sliders.''')

col1, col2 = st.columns(2)

# Initialise slider values
list_values_A = (9, 10, 2, 1, 5)
list_values_B = (1, 2, 10, 9, 3)

selectionsA = {}
selectionsB = {}
results_dict_A = groups_dict
results_dict_B = groups_dict

with col1:
    st.subheader("Define target variable for model A")

    if "slider_values_A" not in st.session_state:
        for count, value in enumerate(groups):
            selectionsA[value] = list_values_A[count]
        st.session_state["slider_values_A"] = selectionsA
    else:
        selectionsA = st.session_state["slider_values_A"]

    for i in groups:
        nameA = f"{i} importance, model A"
        value = selectionsA[i]
        slider = st.slider(nameA, min_value=0, max_value=10, value=value)
        selectionsA[i] = slider

    # Map each subtest to its group weight, then normalise so the weights sum to 1
    results_dict_A = {k: selectionsA.get(v, v) for k, v in results_dict_A.items()}
    total = sum(results_dict_A.values())
    for (key, u) in results_dict_A.items():
        if total != 0:
            w = (u / total)
            results_dict_A[key] = w

    if st.checkbox("Show target variable A weights per subtest", key="A"):
        for (key, u) in results_dict_A.items():
            txt = key.replace("_", " ")
            st.markdown("- " + txt + " : " + f":green[{str(round((u*100), 2))}]")

    st.session_state["slider_values_A"] = selectionsA


with col2:
    st.subheader("Define target variable for model B")

    if "slider_values_B" not in st.session_state:
        for count, value in enumerate(groups):
            selectionsB[value] = list_values_B[count]
        st.session_state["slider_values_B"] = selectionsB
    else:
        selectionsB = st.session_state["slider_values_B"]

    for i in groups:
        nameB = f"{i} importance, model B"
        value = selectionsB[i]
        slider = st.slider(nameB, min_value=0, max_value=10, value=value)
        selectionsB[i] = slider

    # Map each subtest to its group weight, then normalise so the weights sum to 1
    results_dict_B = {k: selectionsB.get(v, v) for k, v in results_dict_B.items()}
    total = sum(results_dict_B.values())
    for (key, u) in results_dict_B.items():
        if total != 0:
            w = (u / total)
            results_dict_B[key] = w

    if st.checkbox("Show target variable B weights per subtest", key="B"):
        for (key, u) in results_dict_B.items():
            txt = key.replace("_", " ")
            st.markdown("- " + txt + " : " + f":green[{str(round((u*100), 2))}]")

    st.session_state["slider_values_B"] = selectionsB

if st.button("Assign labels and train your models", type="primary", use_container_width=True):
    # Clear any results from a previous run
    if 'complete_df' in st.session_state:
        del st.session_state['complete_df']
    if 'clean_df' in st.session_state:
        del st.session_state['clean_df']
    if 'cm_A' in st.session_state:
        del st.session_state['cm_A']
    if 'cm_B' in st.session_state:
        del st.session_state['cm_B']
    scoreA = pd.DataFrame()
    scoreB = pd.DataFrame()
    test1 = all(value == 0 for value in results_dict_A.values())
    test2 = all(value == 0 for value in results_dict_B.values())
    if test1 or test2:
        st.error('Cannot train the models if you do not define the target variables. Make your selections for both models first!', icon="🚨")
    else:
        # Each candidate's score is the weighted sum of their subtest results
        for (key, u) in results_dict_A.items():
            scoreA[df_keys_dict[key]] = u * dataframe[df_keys_dict[key]]
        scoresA = scoreA.sum(axis=1)
        dataframe['model_A_scores'] = scoresA
        for (key, u) in results_dict_B.items():
            scoreB[df_keys_dict[key]] = u * dataframe[df_keys_dict[key]]
        scoresB = scoreB.sum(axis=1)
        dataframe['model_B_scores'] = scoresB

        new_annotated = assign_labels_by_probabilities(dataframe, "model_A_scores", "Model_A_label", "Model_A_probabilities", quantile=0.85, num_samples=100)
        new_annotated = assign_labels_by_probabilities(new_annotated, "model_B_scores", "Model_B_label", "Model_B_probabilities", quantile=0.85, num_samples=100)
        new_annotated = new_annotated.reset_index()

        clean_data = drop_data(new_annotated)
        # specify the columns of interest
        selected_cols = ['Model_A_label', 'Model_B_label']

        # count the number of rows where both selected columns have a value of 1
        num_rows_with_all_flags_1 = len(new_annotated[new_annotated[selected_cols].sum(axis=1) == len(selected_cols)])

        # print the result
        st.write(f"Shared candidates between your target variables: :green[{num_rows_with_all_flags_1}].")
        with st.spinner('Please wait... The models will be trained now.'):
            # Same features for both models; only the labels differ
            X_data, Y_data_A, Y_data_B = clean_data.iloc[:, :-2], clean_data.iloc[:, [-2]], clean_data.iloc[:, [-1]]
            X_data = X_data.drop(["index"], axis=1)
            Y_data_B = Y_data_B.reset_index()
            X_train, X_test, y_train_A, y_test_A = train_test_split(X_data, Y_data_A, test_size=0.2)
            y_train_A = y_train_A.reset_index()
            y_test_A = y_test_A.reset_index()
            # Give model B the same train/test split as model A, with B's labels
            y_train_B = pd.merge(y_train_A, Y_data_B[['index', 'Model_B_label']], on='index', how='left')
            y_test_B = pd.merge(y_test_A, Y_data_B[['index', 'Model_B_label']], on='index', how='left')
            y_train_B = y_train_B.drop(labels='Model_A_label', axis=1)
            y_test_B = y_test_B.drop(labels='Model_A_label', axis=1)
            y_train_A = y_train_A.set_index("index")
            y_train_B = y_train_B.set_index("index")
            y_test_A = y_test_A.set_index("index")
            y_test_B = y_test_B.set_index("index")

            accuracy_A, precision_A, recall_A, X_full_A, cm_A, baseline_accuracy_A = train_and_predict("A", X_train, X_test, y_train_A, y_test_A)
            accuracy_B, precision_B, recall_B, X_full_B, cm_B, baseline_accuracy_B = train_and_predict("B", X_train, X_test, y_train_B, y_test_B)
            full = pd.merge(X_full_A, X_full_B[['index', 'Predicted_B', 'Prob_0_B', "Prob_1_B"]], on='index', how='left')
            complete = pd.merge(full, new_annotated[['index', 'age', 'gender', 'education level', 'country', 'Model_A_label', 'Model_B_label', 'model_A_scores', 'model_B_scores']], on='index', how='left')
            complete = complete.replace({"education level": education_dict})
            complete = complete.rename(columns={"index": "Candidate ID"})

            if 'complete_df' not in st.session_state:
                st.session_state['complete_df'] = complete
            if 'clean_df' not in st.session_state:
                st.session_state['clean_df'] = clean_data
            if 'cm_A' not in st.session_state:
                st.session_state['cm_A'] = cm_A
            if 'cm_B' not in st.session_state:
                st.session_state['cm_B'] = cm_B

            row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
            with row1_1:
                st.write(f"Model A accuracy: :green[{baseline_accuracy_A}].")
            with row1_2:
                st.write(f"Model B accuracy: :green[{baseline_accuracy_B}].")

            st.success('''Success! You have defined the target variables and trained your models. Head to "Visualise the Results" in the sidebar.''')
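Note: the labelling helper above comes from utils.py (+255 lines in this commit), whose diff is not rendered on this page. For orientation only, here is a minimal sketch of what assign_labels_by_probabilities might look like, inferred purely from its call sites above (a score column in; a label column and a probability column out; a quantile cutoff; num_samples positive labels). The actual implementation in utils.py may differ.

# Hypothetical sketch only -- utils.py is part of this commit but its diff is not shown here.
import numpy as np

def assign_labels_by_probabilities(df, score_col, label_col, prob_col, quantile=0.85, num_samples=100):
    # Turn raw weighted scores into selection probabilities (assumes non-negative scores).
    df[prob_col] = df[score_col] / df[score_col].sum()
    df[label_col] = 0
    # Assume only candidates above the score quantile are eligible for a positive label.
    eligible = df[df[score_col] >= df[score_col].quantile(quantile)]
    p = (eligible[prob_col] / eligible[prob_col].sum()).to_numpy()
    # Draw distinct "good employees", weighted by score.
    chosen = np.random.choice(eligible.index.to_numpy(), size=min(num_samples, len(eligible)), replace=False, p=p)
    df.loc[chosen, label_col] = 1
    return df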
pages/.ipynb_checkpoints/2_📈_Visualize_the_Results-checkpoint.py
ADDED
@@ -0,0 +1,593 @@
1 |
+
#!/usr/bin/env python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
Created on Thu May 25 15:30:56 2023
|
5 |
+
|
6 |
+
@author: daliagala
|
7 |
+
"""
|
8 |
+
|
9 |
+
### LIBRARIES ###
|
10 |
+
import streamlit as st
|
11 |
+
import numpy as np
|
12 |
+
import pandas as pd
|
13 |
+
import matplotlib.pyplot as plt
|
14 |
+
import plotly.express as px
|
15 |
+
import plotly.graph_objects as go
|
16 |
+
from matplotlib_venn import venn2
|
17 |
+
from sklearn.metrics import confusion_matrix
|
18 |
+
from utils import display_proportional, plot_data, run_PCA, create_confusion_matrix_heatmap, plot_conf_rates
|
19 |
+
|
20 |
+
### PAGE CONFIG ###
|
21 |
+
st.set_page_config(page_title='EquiVar', page_icon=':robot_face:', layout='wide')
|
22 |
+
|
23 |
+
hide_st_style = """
|
24 |
+
<style>
|
25 |
+
#GithubIcon {visibility: hidden;}
|
26 |
+
#MainMenu {visibility: hidden;}
|
27 |
+
footer {visibility: hidden;}
|
28 |
+
header {visibility: hidden;}
|
29 |
+
</style>
|
30 |
+
"""
|
31 |
+
st.markdown(hide_st_style, unsafe_allow_html=True)
|
32 |
+
|
33 |
+
### DICTIONARIES AND CONSTANTS###
|
34 |
+
|
35 |
+
colours_education = {
|
36 |
+
'Some high school' : 'indigo',
|
37 |
+
'High school diploma / GED' : '#7ae99e',
|
38 |
+
'Some college' : '#0a68c9',
|
39 |
+
'College degree': '#80c4fa',
|
40 |
+
'Professional degree': '#f48508',
|
41 |
+
"Master's degree" : '#2fa493',
|
42 |
+
'Ph.D.' : '#f2a3a1',
|
43 |
+
"Associate's degree" : '#fbcc66',
|
44 |
+
'Other' : '#fa322f'
|
45 |
+
}
|
46 |
+
|
47 |
+
colours_country = {
|
48 |
+
'AU' : '#7ae99e',
|
49 |
+
'US': '#80c4fa',
|
50 |
+
'NZ': '#2fa493',
|
51 |
+
'CA' : '#fbcc66'
|
52 |
+
}
|
53 |
+
|
54 |
+
colours_gender = {
|
55 |
+
'f' : '#83C9FF',
|
56 |
+
'm': '#0067C9'
|
57 |
+
}
|
58 |
+
|
59 |
+
characteristic_dict = {
|
60 |
+
'gender' : colours_gender,
|
61 |
+
'education level' : colours_education,
|
62 |
+
'country' : colours_country,
|
63 |
+
'age' : 'indigo'
|
64 |
+
}
|
65 |
+
|
66 |
+
pred_dict = {
|
67 |
+
'Model A' : 'Predicted_A',
|
68 |
+
'Model B' : 'Predicted_B'
|
69 |
+
}
|
70 |
+
|
71 |
+
prob_dict = {
|
72 |
+
'Model A' : 'Prob_1_A',
|
73 |
+
'Model B' : 'Prob_1_B'
|
74 |
+
}
|
75 |
+
|
76 |
+
model_dict = {
|
77 |
+
'Model A' : 'Model_A_label',
|
78 |
+
'Model B' : 'Model_B_label'
|
79 |
+
}
|
80 |
+
|
81 |
+
df_keys_dict = {
|
82 |
+
'Divided Visual Attention' :'divided_visual_attention',
|
83 |
+
'Forward Memory Span' :'forward_memory_span',
|
84 |
+
'Arithmetic Reasoning' : 'arithmetic_problem_solving',
|
85 |
+
'Grammatical Reasoning' : 'logical_reasoning',
|
86 |
+
'Go/No go': 'adaptive_behaviour_response_inhibition',
|
87 |
+
'Reverse_Memory_Span' : 'reverse_memory_span',
|
88 |
+
'Verbal List Learning': 'episodic_verbal_learning',
|
89 |
+
'Delayed Verbal List Learning': 'delayed_recall',
|
90 |
+
'Digit Symbol Coding': 'abstract_symbol_processing_speed',
|
91 |
+
'Trail Making Part A' :'numerical_info_processing_speed',
|
92 |
+
'Trail Making Part B': 'numerical_and_lexical_info_processing_speed'
|
93 |
+
}
|
94 |
+
|
95 |
+
### DEFINE ALL SUB-PAGES AS FUNCTIONS TO CALL ###
|
96 |
+
|
97 |
+
def mod_prop(cmA, cmB):
|
98 |
+
st.markdown('''This section contains model confusion matrices, which give us a lot of information about how good the models which we produced really are. We obtain the confusion matrices by plotting "Actual" labels on one axis, and "Predicted" labels on the other. For both models, in each square of each confusion matrix, we can see which group of candidates it represents, the number of candidates in this group, and what percentage of all candidates assessed they represent.''')
|
99 |
+
st.markdown('''
|
100 |
+
- **True Negative (TN)**: candidates predicted to have label "0" whose actual label was "0".
|
101 |
+
- **False Positive (FP)**: candidates predicted to have label "1" whose actual label was "0".
|
102 |
+
- **False Negative (FN)** candidates predicted to have label "0" whose actual label was "1".
|
103 |
+
- **True Positive (TP)**: candidates predicted to have label "1" whose actual label was "1".
|
104 |
+
''')
|
105 |
+
|
106 |
+
row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
|
107 |
+
with row1_1:
|
108 |
+
create_confusion_matrix_heatmap(cmB, "Model A")
|
109 |
+
|
110 |
+
with row1_2:
|
111 |
+
model = "B"
|
112 |
+
create_confusion_matrix_heatmap(cmB, "Model B")
|
113 |
+
|
114 |
+
st.subheader("Model Accuracy Rates")
|
115 |
+
st.markdown('''We can also represent each model in terms of its accuracy rates, calculated using the numbers of candidates in each group shown in the confusion matrix:''')
|
116 |
+
st.markdown('''
|
117 |
+
- True Positive Rate (TPR), also called recall, sensitivity or hit rate, is the probability of a positive test result when the result is indeed positive.
|
118 |
+
- True Negative Rate (TNR), or specificity/selectivity, is the probability of a negative test result when the test is indeed negative.
|
119 |
+
- Positive Predictive Value (PPV) or Precision is the ratio of truly positive results to all positive results.
|
120 |
+
- Negative Predictive Value (NPR) is the ratio of truly negative results to all negative results.
|
121 |
+
- False Positive Rate (FPR) or fall-out is the probability of assigning a falsely positive result when the test is negative.
|
122 |
+
- False Negative Rate (FNR) or miss rate is the probability that a test with a positive result will falsely be assigned a negative result.
|
123 |
+
- False Discovery Rate (FDR) it the ratio of false positive results to total number of positive results.
|
124 |
+
''')
|
125 |
+
measures_A = plot_conf_rates(cmA)
|
126 |
+
measures_B = plot_conf_rates(cmB)
|
127 |
+
fig = go.Figure()
|
128 |
+
fig.add_trace(go.Bar(
|
129 |
+
x=measures_A["Measure"],
|
130 |
+
y=measures_A["Score"],
|
131 |
+
name='Model A rates',
|
132 |
+
marker_color='rgb(55, 83, 109)'
|
133 |
+
))
|
134 |
+
fig.add_trace(go.Bar(
|
135 |
+
x=measures_B["Measure"],
|
136 |
+
y=measures_B["Score"],
|
137 |
+
name='Model B rates',
|
138 |
+
marker_color='rgb(26, 118, 255)'
|
139 |
+
))
|
140 |
+
fig.update_layout(
|
141 |
+
title='Model measures comparison',
|
142 |
+
xaxis_tickfont_size=14,
|
143 |
+
xaxis=dict(
|
144 |
+
title='Measure'),
|
145 |
+
yaxis=dict(
|
146 |
+
title='Score',
|
147 |
+
titlefont_size=16,
|
148 |
+
tickfont_size=14,
|
149 |
+
),
|
150 |
+
legend=dict(
|
151 |
+
bgcolor='rgba(255, 255, 255, 0)',
|
152 |
+
bordercolor='rgba(255, 255, 255, 0)'
|
153 |
+
),
|
154 |
+
barmode='group',
|
155 |
+
bargap=0.15, # gap between bars of adjacent location coordinates.
|
156 |
+
bargroupgap=0.1 # gap between bars of the same location coordinate.
|
157 |
+
)
|
158 |
+
st.plotly_chart(fig, use_container_width = True)
|
159 |
+
if st.checkbox("Show tables"):
|
160 |
+
row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
|
161 |
+
# CSS to inject contained in a string
|
162 |
+
hide_table_row_index = """
|
163 |
+
<style>
|
164 |
+
thead tr th:first-child {display:none}
|
165 |
+
tbody th {display:none}
|
166 |
+
</style> """
|
167 |
+
|
168 |
+
# Inject CSS with Markdown
|
169 |
+
st.markdown(hide_table_row_index, unsafe_allow_html=True)
|
170 |
+
with row1_1:
|
171 |
+
st.markdown("Accuracy rates for model A")
|
172 |
+
st.table(measures_A)
|
173 |
+
with row1_2:
|
174 |
+
st.markdown("Accuracy rates for model B")
|
175 |
+
st.table(measures_B)
|
176 |
+
|
177 |
+
def model_scores(dataframe):
|
178 |
+
st.markdown('''This section displays the distribution of scores assigned to each hypothetical employee, according to the values you set on the sliders, compared with the distribution of protected characteristics. These scores are then used to assign labels to the hypothetical employees, creating two distinct datasets to train the models. In essence, these scores explicitly and numerically mimic the viewpoints of hypothetical hiring managers A and B when deciding who to label as "top employees."''')
|
179 |
+
st.markdown('''Just as in the original NCPT dataset analysis, the scores obtained by participants in the cognitive games decline with age. This observation provides insight into the potential issue of ageism inherent in the use of gamified hiring processes.''')
|
180 |
+
# Create a selectbox to choose a protected characteristic to explore
|
181 |
+
plot_radio = st.selectbox('Characteristic to explore', characteristic_dict.keys())
|
182 |
+
row2_space1, row2_1, row2_space2 = st.columns((0.1, 5, 0.1))
|
183 |
+
|
184 |
+
with row2_1:
|
185 |
+
data = dataframe[["model_A_scores", "model_B_scores", plot_radio]]
|
186 |
+
|
187 |
+
if plot_radio == "age":
|
188 |
+
bins= [18,20,30,40,50,60,70,80,90]
|
189 |
+
labels = ['18-20','21-30','31-40','41-50','51-60','61-70','71-80','81-90']
|
190 |
+
data['age_bins'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)
|
191 |
+
plot_radio = 'age_bins'
|
192 |
+
colours = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)']
|
193 |
+
c1, c2, c3, c4, c5 = st.columns((0.1, 3, 0.1, 3, 0.1))
|
194 |
+
with c2:
|
195 |
+
fig = px.box(data, x = plot_radio, y="model_A_scores", labels={"model_A_scores":"Dataset A Input Scores", 'age_bins':"Age"}, title="Input scores in dataset A")
|
196 |
+
fig.update_layout(showlegend=False)
|
197 |
+
fig.update_traces(marker_color='rgba(93, 164, 214, 0.5)')
|
198 |
+
st.plotly_chart(fig, use_container_width=True)
|
199 |
+
with c4:
|
200 |
+
fig = px.box(data, x = plot_radio, y="model_B_scores", labels={"model_B_scores":"Dataset B Input Scores", 'age_bins':"Age"}, title="Input scores in dataset B")
|
201 |
+
fig.update_traces(marker_color = 'rgba(255, 144, 14, 0.5)')
|
202 |
+
fig.update_layout(showlegend=False)
|
203 |
+
st.plotly_chart(fig, use_container_width=True)
|
204 |
+
|
205 |
+
def PCA_general(full_df, dataframe_PCA):
|
206 |
+
st.markdown('''On this page, you can see the distribution of the dataset labels which were assigned based on the scores calculated from the slider values you selected previously. Principal Components Analysis, or PCA, is a technique often used to analyse and subsequently visualize datasets where there are many features per single example. This is the case with the NCPT dataset used in our simulator. Specifically, the battery which we used has 11 features per single example, the example being the player of cognitive games, and, in our metaphor, a hypothetical employee or job candidate. It is impossible to plot 11 dimensions, and PCA allows for the visualisation of multidimensional data, while also preserving as much information as possible.''')
|
207 |
+
choice = st.radio("What would you like to explore?", ("PCAs", "Components loading"), horizontal = True)
|
208 |
+
pcaA, dfA, labelsA, coeffA, componentsA = run_PCA(dataframe_PCA, 'Model_B_label', 'Model_A_label', 2)
|
209 |
+
pcaB, dfB, labelsB, coeffB, componentsB = run_PCA(dataframe_PCA, 'Model_A_label', 'Model_B_label', 2)
|
210 |
+
loadings = pcaB.components_.T * np.sqrt(pcaB.explained_variance_)
|
211 |
+
total_var = pcaA.explained_variance_ratio_.sum() * 100
|
212 |
+
dfA = dfA.rename(columns={'target': 'Dataset A'}).reset_index()
|
213 |
+
dfB = dfB.rename(columns={'target': 'Dataset B'}).reset_index()
|
214 |
+
df_all = pd.merge(dfA, dfB[['index', 'Dataset B']], on='index', how='left')
|
215 |
+
|
216 |
+
conditions = [
|
217 |
+
(df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 0),
|
218 |
+
(df_all['Dataset B'] == 1) & (df_all['Dataset A'] == 0),
|
219 |
+
(df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 1),
|
220 |
+
(df_all['Dataset A'] == 0) & (df_all['Dataset B'] == 0)]
|
221 |
+
|
222 |
+
values = ['Selected A', 'Selected B', 'Selected both', 'Not selected']
|
223 |
+
df_all['All'] = np.select(conditions, values)
|
224 |
+
|
225 |
+
df_all = df_all.drop(["index"], axis = 1)
|
226 |
+
df_all.All=pd.Categorical(df_all.All,categories=['Not selected', 'Selected A', 'Selected B', 'Selected both'])
|
227 |
+
df_all=df_all.sort_values('All')
|
228 |
+
|
229 |
+
selections_dict = {0: 'Not selected', 1: 'Selected'}
|
230 |
+
df_all = df_all.replace({"Dataset A": selections_dict, "Dataset B": selections_dict})
|
231 |
+
|
232 |
+
color_dict_sel = {'Not selected': '#3366CC', 'Selected': 'grey'}
|
233 |
+
|
234 |
+
if "pca_df" not in st.session_state:
|
235 |
+
st.session_state.pca_df = df_all
|
236 |
+
|
237 |
+
if choice == "PCAs":
|
238 |
+
c1, c2 = st.columns(2)
|
239 |
+
with c1:
|
240 |
+
fig = px.scatter(st.session_state.pca_df,
|
241 |
+
x=st.session_state.pca_df['principal component 1'].astype(str),
|
242 |
+
y=st.session_state.pca_df['principal component 2'].astype(str),
|
243 |
+
title='Dataset A PCA',
|
244 |
+
labels={"x": 'PC 1', "y": 'PC 2'},
|
245 |
+
color=st.session_state.pca_df['Dataset A'],
|
246 |
+
color_discrete_map=color_dict_sel)
|
247 |
+
fig.update_traces(marker_size = 8)
|
248 |
+
st.plotly_chart(fig, use_container_width=True)
|
249 |
+
with c2:
|
250 |
+
fig = px.scatter(st.session_state.pca_df,
|
251 |
+
x=st.session_state.pca_df['principal component 1'].astype(str),
|
252 |
+
y=st.session_state.pca_df['principal component 2'].astype(str),
|
253 |
+
title='Dataset B PCA',
|
254 |
+
labels={"x": 'PC 1', "y": 'PC 2'},
|
255 |
+
color=st.session_state.pca_df['Dataset B'],
|
256 |
+
color_discrete_map=color_dict_sel)
|
257 |
+
fig.update_traces(marker_size = 8)
|
258 |
+
st.plotly_chart(fig, use_container_width=True)
|
259 |
+
|
260 |
+
st.markdown(f'''These plots show the reduction of 11 dimensions (11 subtest results) to 2 dimensions. Total Variance for the data is {total_var:.2f}%. Both of the datasets have the same features, therefore they both have the same total variance. Total variance value indicates what percentage of information has been preserved when the dimensionality was reduced. Note that for both datasets, A and B, different points are labelled "1" or "0". This shows that the two datasets represent the two different target variable definitions which were created by you previously. The plots are interactive - zoom in to explore in detail.''')
|
261 |
+
|
262 |
+
pcaA, dfA, labelsA, coeffA, componentsA = run_PCA(dataframe_PCA, 'Model_B_label', 'Model_A_label', 2)
|
263 |
+
pcaB, dfB, labelsB, coeffB, componentsB = run_PCA(dataframe_PCA, 'Model_A_label', 'Model_B_label', 2)
|
264 |
+
loadings = pcaB.components_.T * np.sqrt(pcaB.explained_variance_)
|
265 |
+
total_var = pcaA.explained_variance_ratio_.sum() * 100
|
266 |
+
dfA = dfA.rename(columns={'target': 'Dataset A'}).reset_index()
|
267 |
+
dfB = dfB.rename(columns={'target': 'Dataset B'}).reset_index()
|
268 |
+
df_all = pd.merge(dfA, dfB[['index', 'Dataset B']], on='index', how='left')
|
269 |
+
|
270 |
+
conditions = [
|
271 |
+
(df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 0),
|
272 |
+
(df_all['Dataset B'] == 1) & (df_all['Dataset A'] == 0),
|
273 |
+
(df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 1),
|
274 |
+
(df_all['Dataset A'] == 0) & (df_all['Dataset B'] == 0)]
|
275 |
+
|
276 |
+
values = ['Selected A', 'Selected B', 'Selected both', 'Not selected']
|
277 |
+
df_all['All'] = np.select(conditions, values)
|
278 |
+
|
279 |
+
df_all = df_all.drop(["index"], axis = 1)
|
280 |
+
df_all.All=pd.Categorical(df_all.All,categories=['Not selected', 'Selected A', 'Selected B', 'Selected both'])
|
281 |
+
df_all=df_all.sort_values('All')
|
282 |
+
|
283 |
+
selections_dict = {0: 'Not selected', 1: 'Selected'}
|
284 |
+
df_all = df_all.replace({"Dataset A": selections_dict, "Dataset B": selections_dict})
|
285 |
+
|
286 |
+
if "pca_df" not in st.session_state:
|
287 |
+
st.session_state.pca_df = df_all
|
288 |
+
|
289 |
+
fig = px.scatter(st.session_state.pca_df,
|
290 |
+
x=st.session_state.pca_df['principal component 1'],
|
291 |
+
y=st.session_state.pca_df['principal component 2'],
|
292 |
+
title="PCA with labelled groups",
|
293 |
+
color=st.session_state.pca_df["All"],
|
294 |
+
width = 800, height = 800,
|
295 |
+
color_discrete_sequence=px.colors.qualitative.Safe,
|
296 |
+
opacity = 0.95)
|
297 |
+
|
298 |
+
fig.update_yaxes(
|
299 |
+
scaleanchor="x",
|
300 |
+
scaleratio=1,
|
301 |
+
)
|
302 |
+
fig.update_traces(marker_size = 10)
|
303 |
+
st.plotly_chart(fig)
|
304 |
+
|
305 |
+
if choice == "Components loading":
|
306 |
+
c1,c2 = st.columns(2)
|
307 |
+
loadings_df = pd.DataFrame(loadings, columns = ["PC1", "PC2"])
|
308 |
+
labels_A_proper = { v:k for k,v in df_keys_dict.items()}
|
309 |
+
loadings_df["Features"] = labels_A_proper.values()
|
310 |
+
with c1:
|
311 |
+
fig = px.bar(loadings_df, x="PC1", y="Features", orientation = 'h')
|
312 |
+
st.plotly_chart(fig, use_container_width = True)
|
313 |
+
with c2:
|
314 |
+
fig = px.bar(loadings_df, x="PC2", y="Features", orientation = 'h')
|
315 |
+
st.plotly_chart(fig, use_container_width = True)
|
316 |
+
|
317 |
+
# fig = go.Figure()
|
318 |
+
# fig.add_trace(go.Bar(
|
319 |
+
# x=loadings_df["PC1"],
|
320 |
+
# y=loadings_df["Features"],
|
321 |
+
# name='Principal Component 1',
|
322 |
+
# marker_color='rgb(55, 83, 109)',
|
323 |
+
# orientation='h'
|
324 |
+
# ))
|
325 |
+
# fig.add_trace(go.Bar(
|
326 |
+
# x=loadings_df["PC2"],
|
327 |
+
# y=loadings_df["Features"],
|
328 |
+
# name='Principal Component 2',
|
329 |
+
# marker_color='rgb(26, 118, 255)',
|
330 |
+
# orientation='h'
|
331 |
+
# ))
|
332 |
+
# fig.update_layout(
|
333 |
+
# title='Component loadings',
|
334 |
+
# xaxis_tickfont_size=14,
|
335 |
+
# xaxis=dict(
|
336 |
+
# title='Loading value'),
|
337 |
+
# yaxis=dict(
|
338 |
+
# title='Feature',
|
339 |
+
# titlefont_size=16,
|
340 |
+
# tickfont_size=14,
|
341 |
+
# ),
|
342 |
+
# legend=dict(
|
343 |
+
# bgcolor='rgba(255, 255, 255, 0)',
|
344 |
+
# bordercolor='rgba(255, 255, 255, 0)'
|
345 |
+
# ),
|
346 |
+
# barmode='group',
|
347 |
+
# bargap=0.15, # gap between bars of adjacent location coordinates.
|
348 |
+
# bargroupgap=0.1 # gap between bars of the same location coordinate.
|
349 |
+
# )
|
350 |
+
# st.plotly_chart(fig, use_container_width = True)
|
351 |
+
|
352 |
+
st.markdown('''On this plot, PCA component loadings can be explored. These facilitate the understanding of how much each variable (which there are 11 of) contributes to a particular principal component. Here, the 11 variables were reduced to 2 components, which are labelled PC1 and PC2. The magnitude of the loading (here displayed as the size of the bar in the bar chart) indicates how strong the relationship between the variable and the component is. Therefore, the higher the bar, the stronger the relationship between that component and that variable. The loading's sign can be positive or negative. This indicates whether the principal component and that variable are positively or negatively correlated. We can see that multiple variables are positively correlated with PC2. Two variables, episodic verbal learning and delayed recall are negatively correlated with both of the components.''')
|
353 |
+
|
354 |
+
|
355 |
+
def model_out(full_df):
|
356 |
+
st.markdown('''This section highlights the discrepancies between your two models when presented with the same pool of new, previously unseen candidates to label. Specifically, you'll be investigating the candidates assigned a "1" label by both models. These individuals would be those considered for a job interview or chosen for the role, according to your defined target variable.''')
|
357 |
+
# Create a selectbox to choose a protected characteristic to explore
|
358 |
+
selectbox = st.selectbox('Characteristic to explore', characteristic_dict.keys())
|
359 |
+
representation = st.selectbox("Representation", ("absolute", "proportional"))
|
360 |
+
row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
|
361 |
+
with row1_1:
|
362 |
+
st.subheader("Candidates selected by model A")
|
363 |
+
|
364 |
+
if representation == "absolute":
|
365 |
+
# Select predicted data ==1
|
366 |
+
data = full_df.loc[full_df['Predicted_A'] == 1]
|
367 |
+
|
368 |
+
# Use function plot_data to plot selected data
|
369 |
+
plot_data(data, selectbox, characteristic_dict[selectbox])
|
370 |
+
else:
|
371 |
+
display_proportional(full_df, selectbox, 'Predicted_A')
|
372 |
+
|
373 |
+
with row1_2:
|
374 |
+
st.subheader("Candidates selected by model B")
|
375 |
+
|
376 |
+
if representation == "absolute":
|
377 |
+
# Select predicted data ==1
|
378 |
+
data = full_df.loc[full_df['Predicted_B'] == 1]
|
379 |
+
|
380 |
+
# Use function plot_data to plot selected data
|
381 |
+
plot_data(data, selectbox, characteristic_dict[selectbox])
|
382 |
+
|
383 |
+
else:
|
384 |
+
display_proportional(full_df, selectbox,'Predicted_B')
|
385 |
+
|
386 |
+
|
387 |
+
st.markdown('''In this section, you're comparing the model's selections concerning four protected characteristics: age, gender, education level, and country. You can visualize these differences in two ways: "Absolute" or "Proportional".''')
|
388 |
+
st.markdown('''"Absolute" representation gives you the raw numbers or percentages of each characteristic chosen. For instance, if the model labeled 5 female candidates and 5 male candidates as "1", the "Absolute" outcome will display as 50% for both genders."Proportional" representation, on the other hand, shows the percentage of a group selected by the model relative to the total number of that group in the input data. For example, if the model evaluated 100 male candidates and selected 5, you will see a 5% representation. If it evaluated 200 female candidates and selected 5, it will show a 2.5% representation.''')
|
389 |
+
st.markdown('''If you encounter empty categories in the "Proportional" view, this indicates that while candidates from these categories were evaluated, none were labeled as "1". Hence, their proportional representation amounts to 0%.''')
|
390 |
+
|
391 |
+
def dataframe_out(full_df):
|
392 |
+
selectbox_M = st.selectbox('Choose which model output to rank by', pred_dict.keys())
|
393 |
+
|
394 |
+
# Select data
|
395 |
+
data = full_df.loc[full_df[pred_dict[selectbox_M]] == 1]
|
396 |
+
data = data.sort_values(by = prob_dict[selectbox_M], ascending = False)
|
397 |
+
data = data[['Candidate ID','Prob_1_A', 'Prob_1_B', 'Predicted_A', 'Predicted_B']]
|
398 |
+
data = data.rename(columns={"Prob_1_A": "Ranking, model A", "Prob_1_B": "Ranking, model B", "Predicted_A": "Predicted label A", "Predicted_B": "Predicted label B"})
|
399 |
+
data.index = np.arange(1, len(data) + 1)
|
400 |
+
|
401 |
+
st.table(data.style.background_gradient(subset = ["Ranking, model A", "Ranking, model B"], axis=0, vmin=0.40).highlight_max(color = '#FFCD9B', subset = ["Predicted label A", "Predicted label B"], axis=0))
|
402 |
+
|
403 |
+
st.markdown("""In this section, you can review the data for all candidates labeled "1" by the selected model, found at the top of the page. Simultaneously, you can observe the labels assigned to these same candidates by the other model. It's likely that there will be instances where candidates chosen by one model weren't selected by the other. Candidates labeled "1" are highlighted in orange in the "Predicted label A" and "Predicted label B" columns.""")
|
404 |
+
st.markdown('''In addition to this, you can see the probability with which each candidate was labeled "1". The intensity of the blue color indicates the candidate's ranking position - a darker blue represents a higher ranking (with 1 being the maximum and 0 the minimum). You may notice that some candidates highly ranked by one model may be ranked significantly lower by the other model.''')
|
405 |
+
|
406 |
+
def venn_diagram(full_df):
|
407 |
+
row2_space1, row2_1, row2_space2, row2_2, row2_space3 = st.columns((0.1, 1, 0.1, 1, 0.1))
|
408 |
+
with row2_1:
|
409 |
+
fig, ax = plt.subplots()
|
410 |
+
|
411 |
+
list_A = full_df.loc[full_df['Predicted_A'] == 1, 'Candidate ID'].astype(int)
|
412 |
+
list_B = full_df.loc[full_df['Predicted_B'] == 1, 'Candidate ID'].astype(int)
|
413 |
+
set1 = set(list_A)
|
414 |
+
set2 = set(list_B)
|
415 |
+
|
416 |
+
venn2([set1, set2], ('Model A', 'Model B'), ax=ax)
|
417 |
+
st.pyplot(fig)
|
418 |
+
|
419 |
+
with row2_2:
|
420 |
+
st.markdown('''This Venn Diagram visualizes the number of candidates chosen by both models. It's likely that some candidates will be selected by both models, while others may be chosen by only one model. If we consider Model A as the decision of one hiring manager and Model B as another's, it's easy to see how the selection outcome varies depending on the decision-maker. Some candidates may get the opportunity to be hired, while others might not. This serves as an illustration of the inherent arbitrariness in defining the target variable when dealing with highly subjective outcomes.''')
|
421 |
+
st.markdown('''For instance, it's straightforward to define a target variable in a classification problem like distinguishing dragonflies from butterflies, where there's little room for ambiguity. However, defining what makes a 'good' employee is far more challenging due to its subjective nature.''')
|
422 |
+
|
423 |
+
|
424 |
+
|
425 |
+
def model_vis(full_df):
|
426 |
+
st.markdown('''In this section, you can visualize the demographics of the different subgroups of the data. Firstly, you can see the demographic characteristics of the candidates who have positive labels ("1") and negative labels ("0") which were assigned based on the scores calculated from the slider values you selected previously. Then, you can visualize the demographic distributions of the data which was used for training and evaluation of the models.''')
|
427 |
+
choice = st.radio("**Select desired data:**", ("Positive and negative labels", "Training and evaluation data"), horizontal=True)
|
428 |
+
if choice == "Positive and negative labels":
|
429 |
+
# Create a selectbox to choose a protected characteristic to explore
|
430 |
+
selectbox_Lab = st.selectbox('Label to visualize', ('positive labels', 'negative labels'))
|
431 |
+
|
432 |
+
# Create a selectbox to choose a protected characteristic to explore
|
433 |
+
selectbox_Char = st.selectbox('Protected characteristic', characteristic_dict.keys())
|
434 |
+
|
435 |
+
row2_space1, row2_1, row2_space2, row2_2, row2_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
|
436 |
+
|
437 |
+
with row2_1:
|
438 |
+
st.subheader("Dataset A")
|
439 |
+
|
440 |
+
# Select test data
|
441 |
+
if selectbox_Lab == 'positive labels':
|
442 |
+
data = full_df.loc[full_df['Model_A_label'] == 1]
|
443 |
+
else:
|
444 |
+
data = full_df.loc[full_df['Model_A_label'] == 0]
|
445 |
+
|
446 |
+
# Use function plot_data to plot selected data
|
447 |
+
plot_data(data, selectbox_Char, characteristic_dict[selectbox_Char])
|
448 |
+
|
449 |
+
|
450 |
+
with row2_2:
|
451 |
+
st.subheader("Dataset B")
|
452 |
+
|
453 |
+
# Select test data
|
454 |
+
if selectbox_Lab == 'positive labels':
|
455 |
+
data = full_df.loc[full_df['Model_B_label'] == 1]
|
456 |
+
else:
|
457 |
+
data = full_df.loc[full_df['Model_B_label'] == 0]
|
458 |
+
|
459 |
+
# Use function plot_data to plot selected data
|
460 |
+
plot_data(data, selectbox_Char, characteristic_dict[selectbox_Char])
|
461 |
+
st.markdown('''You are visualising the demographic composition of those hypothetical employees who were assigned labels "1" or "0" based on your definitions of the target variables. You might see differences in proportions of genders between the two models for the positive labels, as well as a major difference in the age between the positive and negative labels. Visualising the labels in this manner before training the model can help understand and mitigate differences in demographic representation in the modelling outcomes. Likely, if all candidates labelled "1" were in younger age groups, the candidates selected by the model at the deployment stage will also be in younger age groups. Moreover, target variable definition affects the proportional representation. Having defined two target variables, one can choose the dataset and the model which offers more proportional representation.''')
|
462 |
+
|
463 |
+
|
464 |
+
if choice == "Training and evaluation data":
|
465 |
+
# Create a selectbox to choose a protected characteristic to explore
|
466 |
+
selectbox = st.selectbox('Characteristic to explore', characteristic_dict.keys())
|
467 |
+
row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 1, 0.1, 1, 0.1))
|
468 |
+
# Plot training data
|
469 |
+
with row1_1:
|
470 |
+
st.subheader("Training data")
|
471 |
+
|
472 |
+
# Select train data
|
473 |
+
train = full_df.loc[full_df["Predicted_A"] == "train"]
|
474 |
+
|
475 |
+
# Use function plot_data to plot selected data
|
476 |
+
plot_data(train, selectbox, characteristic_dict[selectbox])
|
477 |
+
|
478 |
+
# Plot test data
|
479 |
+
|
480 |
+
with row1_2:
|
481 |
+
st.subheader("Test data")
|
482 |
+
|
483 |
+
# Select test data
|
484 |
+
test = full_df.loc[full_df["Predicted_A"] != "train"]
|
485 |
+
|
486 |
+
# Use function plot_data to plot selected data
|
487 |
+
plot_data(test, selectbox, characteristic_dict[selectbox])
|
488 |
+
|
489 |
+
st.markdown('''To train a machine learning model, the data has to be split into two different sets. The first set is the training data, which will be used to teach the model the relationships between the input features (11 subtest results) and the corresponding labels ("0" and "1", assigned based on your definitions of target variables and the values you chose for the sliders). The second set is the test data, or evaluation data. It is used to assess the performance of the model. This is the data which is used to plot the confusion matrices and calculate the model metrics which you saw at the bottom of the "Define the target variable" page. This is also the data whose features you can explore in "Modelling outcomes". It is important that the training and testing data are balanced. Here, you can compare the demographic composition of the training and evaluation data. The training and evaluation datasets compositions were the same and contained the same candidates and same features for both models A and B. However, the labels for each dataset were different and based on what you selected in "Define target variable".''')
|
490 |
+
|
491 |
+
def filter_for_protected(data):
|
492 |
+
st.markdown('''Sometimes, the overall model metrics can be deceptive when it comes to predicting the results for different groups under consideration. Ideally, for our models, the varying model metrics would be similar across different groups, which would indicate that the overall model performance is reflected in how this model performs for a given group. It is often not the case, and it is likely that you will see that models A and B perform differently when it comes to those metrics. Even the same model can have different metrics for different subgroups.''')
|
493 |
+
model = st.selectbox('Choose which model outputs to assess', pred_dict.keys())
|
494 |
+
test = data.loc[data[pred_dict[model]] != "train"]
|
495 |
+
|
496 |
+
selectbox_Char = st.selectbox('Protected characteristic', characteristic_dict.keys())
|
497 |
+
if selectbox_Char == 'age':
|
498 |
+
bins= [18,20,30,40,50,60,70,80,91]
|
499 |
+
labels = ['18-20','21-30','31-40','41-50','51-60','61-70','71-80','81-90']
|
500 |
+
test['age_bins'] = pd.cut(test['age'], bins=bins, labels=labels, right=False)
|
501 |
+
selectbox_Char = 'age_bins'
|
502 |
+
# which_group = st.selectbox('Which group?', test[selectbox_Char].unique())
|
503 |
+
df = pd.DataFrame({'Measure': ['True Positive Rate', 'True Negative Rate', 'Positive Predictive Value', 'Negative Predictive Value', 'False Positive Rate', 'False Negative Rate', 'False Discovery Rate']})
|
504 |
+
for group in test[selectbox_Char].unique():
|
505 |
+
        rslt_df = test[test[selectbox_Char] == group]
        y_true = [int(numeric_string) for numeric_string in rslt_df[model_dict[model]]]
        y_pred = [int(numeric_string) for numeric_string in rslt_df[pred_dict[model]]]
        cm = confusion_matrix(y_true, y_pred)
        if cm.shape == (1, 1):
            cm = np.array([[cm[0, 0], 0], [0, 0]])
        d = plot_conf_rates(cm)
        df[f"{group}"] = d["Score"]

    fig = go.Figure()
    for group in test[selectbox_Char].unique():
        fig.add_trace(go.Bar(
            x=df["Measure"],
            y=df[group],
            name=group
        ))

    fig.update_layout(
        title='Model metrics per group',
        xaxis_tickfont_size=14,
        xaxis=dict(
            title='Metric'),
        yaxis=dict(
            title='Score',
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15,  # gap between bars of adjacent location coordinates
        bargroupgap=0.1  # gap between bars of the same location coordinate
    )
    st.plotly_chart(fig, use_container_width=True)
    if st.checkbox("Show table of scores"):
        # CSS to inject contained in a string
        hide_table_row_index = """
            <style>
            thead tr th:first-child {display:none}
            tbody th {display:none}
            </style> """

        # Inject CSS with Markdown
        st.markdown(hide_table_row_index, unsafe_allow_html=True)
        st.markdown(f"Accuracy rates for {selectbox_Char}")
        st.table(df)

def data_plot(key1, key2, key3, key4):
    st.title('''Visualize the Results''')
    if key1 not in st.session_state:
        st.error('Cannot train the models if you do not define the target variables. Go to "Define Target Variables"!', icon="🚨")
    else:
        tab1, tab2 = st.tabs(["Demographic", "Non-demographic"])
        with tab1:
            dataframe = st.session_state[key1]
            clean_data = st.session_state[key2]
            st.subheader('''**Select what to explore:**''')
            data_choice = st.radio('''What to explore''', ("Modelling outcomes", "Input data"), horizontal=True, label_visibility="collapsed")
            if data_choice == "Modelling outcomes":
                st.subheader('''Demographics of the overall modelling outcomes''')
                model_out(dataframe)
                st.subheader('''Demographics of the selected protected groups''')
                filter_for_protected(dataframe)
            else:
                st.subheader('''Demographics of the input scores''')
                model_scores(dataframe)
                st.subheader('''Demographics of the input labels''')
                model_vis(dataframe)
        with tab2:
            dataframe = st.session_state[key1]
            clean_data = st.session_state[key2]
            cmA = st.session_state[key3]
            cmB = st.session_state[key4]
            st.subheader('''**Select what to explore:**''')
            data_choice = st.radio('''Select what to explore:''', ("Modelling outcomes", "Input data"), horizontal=True, label_visibility="collapsed")
            if data_choice == "Modelling outcomes":
                st.subheader('''Labelled dataframe''')
                dataframe_out(dataframe)
                st.subheader('''Venn Diagram''')
                venn_diagram(dataframe)
                st.subheader('''Model accuracy metrics''')
                mod_prop(cmA, cmB)
            else:
                st.subheader('''Principal Component Analysis''')
                PCA_general(dataframe, clean_data)

data_plot('complete_df', 'clean_df', 'cm_A', 'cm_B')
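The (1, 1) guard inside filter_for_protected above is needed because scikit-learn shrinks the confusion matrix when a subgroup contains only one class. A minimal sketch of that edge case on hypothetical data (the padding assumes the lone class is the negative one, as the app does):

import numpy as np
from sklearn.metrics import confusion_matrix

# A subgroup in which every candidate is labelled and predicted "0"
cm = confusion_matrix([0, 0, 0], [0, 0, 0])
print(cm.shape)  # (1, 1) -- not the usual 2x2

if cm.shape == (1, 1):
    cm = np.array([[cm[0, 0], 0], [0, 0]])  # pad back to a full 2x2 matrix
print(cm)  # [[3 0], [0 0]]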
pages/.ipynb_checkpoints/3_💡_Put_the_Idea_into_Practice-checkpoint.py
ADDED
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:42:38 2023

@author: daliagala
"""

### IMPORT LIBRARIES ###
import streamlit as st

### PAGE CONFIG ###
st.set_page_config(page_title='EquiVar', page_icon=':robot_face:', layout='wide')

### IDEA IN PRACTICE PAGE ###

st.title("Different target variable definitions in practice")
st.markdown('''This dashboard is designed to help you understand how the notion of a “good employee” is translated into machine learning models—by defining target variables—and how target variable definition affects fairness in hiring algorithms (and other aspects of such algorithms too). On this page, we describe how to put the dashboard, and the insights it affords, into practice.''')

st.markdown('''How can this be done? The first step is to make two changes to the dashboard, because you cannot simply take it “off the shelf” and immediately put it into practice. This is because:''')

st.markdown('''- The dashboard is not built using your data or your models.''')

st.markdown('''- How you define a target variable in the dashboard is not how it’s done in practice. (Rather, the way you define it in the dashboard is a straightforward way for you to see the effects of target variable definition.)''')
st.markdown('''Below we describe how to address these two issues.''')

st.subheader('''Using your data and your models''')

st.markdown('''The dashboard offers a starting point: if you want to build something like the dashboard for your data and your models, you now have a blueprint to work from.''')

st.subheader('''Defining the target variable in practice''')

st.markdown('''In the dashboard, you define the target variable by assigning weights to different cognitive characteristics. These weights determine the “positive label”: people in our dataset who perform best on the tests, given the weights you assign, are those assigned the positive label—that is, those that the model treats as “good.” Then, the model is trained to identify people whose cognitive characteristics match those with the positive label.''')

st.markdown('''As we discussed on the Home page, a growing number of hiring algorithms use cognitive tests to identify promising job applicants. However, the way target variable definition works with these real-world algorithms is different from how it works in the dashboard. For example, consider Pymetrics, a leading developer of hiring software. In some cases, Pymetrics builds bespoke algorithms for a company that is hiring for a given role. Pymetrics will ask the company to identify a group of the client’s current employees in that role that the client considers “good.” Then, these “good” employees play cognitive test games similar to the ones used in our dashboard. It is these employees who are assigned the positive labels. From this point on, Pymetrics’ algorithmic development goes just as it does in our dashboard: a model is trained to identify job applicants whose cognitive characteristics are similar to those with the positive label.''')

st.markdown('''So, for hiring algorithms like Pymetrics’, the target variable is defined not by assigning weights to cognitive attributes, but rather by directly identifying a certain group of current employees as “good.” In the dashboard, you can define different target variables by assigning different weights to the cognitive attributes. If you are in practice building an algorithm like Pymetrics’, you can define different target variables by identifying different groups of current employees as “good.”''')

st.markdown('''How might this work? As we discussed on the Home page of the dashboard, reasonable minds may disagree about what makes for a good employee, and, relatedly, reasonable minds may disagree about which current employees are good employees. For example, within a company, two different managers—call them Manager A and Manager B—may not be perfectly aligned in who they consider to be good employees for a certain role. The managers may agree in some cases. We might imagine that there are 50 employees whom both Manager A and Manager B deem good. But the two managers might disagree about other employees. Imagine that there are 25 further employees whom Manager A thinks of as good but Manager B does not (this needn’t mean that Manager B thinks that these employees are bad, just that they are not the best). Likewise, there might be 25 further employees whom Manager B thinks of as good but Manager A does not.''')

st.markdown('''In this case, there are two different (overlapping) groups of 75 employees, each corresponding to what Managers A and B think of as good employees. These two different groups of employees—and in turn, two different target variable definitions—could be used to train two different models.''')

st.markdown('''Instead of constructing two groups of “good” employees directly from the judgments of Managers A and B, you could weight their judgments against one another. For example, you could have two groups of employees, X and Y. Both X and Y contain the 50 employees that Managers A and B agree on. But group X contains 20 of Manager A’s preferred employees and 5 of Manager B’s, while group Y contains 20 of Manager B’s preferred employees and 5 of Manager A’s. Here again we have different groups of “good” employees, and so two different target variables.''')

st.markdown('''One could select different groups of good employees in other ways still. An employer might have different metrics to evaluate employee success, and an employee might rank higher according to one metric than according to another. Depending on what importance is assigned to the different metrics—depending on how you weight the different metrics against one another—different groups of employees may emerge as “good.”''')

st.markdown('''Our focus in the dashboard has been on hiring algorithms that are based on cognitive test games. There are other kinds of algorithms used in hiring—for example, algorithms that identify promising job applicants on the basis of their resumés. In designing any such algorithm, the target variable must be defined, and the notion of a “good employee” must be translated into algorithmic terms. And so the insights of this dashboard apply, and can be put into practice, for almost any kind of hiring algorithm you’re working with.''')
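The Manager A / Manager B example above can be made concrete in a few lines of code. This is an illustrative sketch with hypothetical employee IDs, not part of the dashboard itself:

# Two target variables built from two managers' judgments of "good" employees
shared = set(range(50))        # the 50 employees both managers deem good
only_a = list(range(50, 75))   # 25 further employees only Manager A deems good
only_b = list(range(75, 100))  # 25 further employees only Manager B deems good

group_x = shared | set(only_a[:20]) | set(only_b[:5])  # weight A's judgment more
group_y = shared | set(only_b[:20]) | set(only_a[:5])  # weight B's judgment more

print(len(group_x), len(group_y))  # 75 75 -- two positive classes of equal size
print(len(group_x & group_y))      # 60 here: the 50 shared employees plus coinciding extras
# Each group defines a different positive label, i.e. a different target variable.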
pages/1_📊_Define_Target_Variables.py
ADDED
@@ -0,0 +1,266 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:16:50 2023

@author: daliagala
"""

### LIBRARIES ###
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from utils import assign_labels_by_probabilities, drop_data, train_and_predict

### PAGE CONFIG ###
st.set_page_config(page_title='FairTargetSim', page_icon=':robot_face:', layout='wide')

hide_st_style = """
            <style>
            #GithubIcon {visibility: hidden;}
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            header {visibility: hidden;}
            </style>
            """
st.markdown(hide_st_style, unsafe_allow_html=True)

### IMPORT DATA FILES ###
dataframe = pd.read_csv('./data/dataframe.csv')
dataframe = dataframe.drop(["Unnamed: 0"], axis=1)
dataframe = dataframe.rename(columns={"education_level": "education level"})

### DICTIONARIES AND CONSTANTS ###
groups = ["attention", "reasoning", "memory", "behavioural restraint", "information processing speed"]

groups_dict = {
    'Divided Visual Attention': 'attention',
    'Forward Memory Span': 'memory',
    'Arithmetic Reasoning': 'reasoning',
    'Grammatical Reasoning': 'reasoning',
    'Go/No go': 'behavioural restraint',
    'Reverse_Memory_Span': 'memory',
    'Verbal List Learning': 'memory',
    'Delayed Verbal List Learning': 'memory',
    'Digit Symbol Coding': 'information processing speed',
    'Trail Making Part A': 'information processing speed',
    'Trail Making Part B': 'information processing speed'
}

education_dict = {
    1: 'Some high school',
    2: 'High school diploma / GED',
    3: 'Some college',
    4: 'College degree',
    5: 'Professional degree',
    6: "Master's degree",
    7: 'Ph.D.',
    8: "Associate's degree",
    99: 'Other'
}

df_keys_dict = {
    'Divided Visual Attention': 'divided_visual_attention',
    'Forward Memory Span': 'forward_memory_span',
    'Arithmetic Reasoning': 'arithmetic_problem_solving',
    'Grammatical Reasoning': 'logical_reasoning',
    'Go/No go': 'adaptive_behaviour_response_inhibition',
    'Reverse_Memory_Span': 'reverse_memory_span',
    'Verbal List Learning': 'episodic_verbal_learning',
    'Delayed Verbal List Learning': 'delayed_recall',
    'Digit Symbol Coding': 'abstract_symbol_processing_speed',
    'Trail Making Part A': 'numerical_info_processing_speed',
    'Trail Making Part B': 'numerical_and_lexical_info_processing_speed'
}

### CREATE THE "TARGET VARIABLE DEFINITION" PAGE ###
st.title('Define Target Variables')

st.markdown('''On this page, we invite you to imagine that you are hiring for a certain role. Using the sliders below, you will specify two different notions of a "good" employee for that role—two different target variables. Once you're done, the simulator will build two models, one for each of your target variable definitions (the simulator will also create the two datasets on which the models are trained). You can visualize these models and datasets—and their effects on fairness and on overall features of the models and data—on the "Visualize the Results" page.''')

st.markdown('''You specify the two notions of a "good" employee by assigning weights of importance to the cognitive characteristics that a "good" employee would have for the role. The cognitive test data that you'll be working with comes from real-world people, mirroring an increasing number of hiring algorithms that are based on cognitive tests ([Wilson, et al. 2021](https://dl.acm.org/doi/10.1145/3442188.3445928)).''')

st.markdown('''We have pre-set the weights below to reflect two different conceptions of a "good" employee: one that emphasizes attentiveness and numerical skills (A) and another that emphasizes interpersonal skills and memory (B). If you want to, you can change the slider values as you see fit.''')

st.markdown('''After you've set the slider values, click :red["Assign labels and train your models"]. The simulator will then label certain individuals as "good employees"—in other words, it will assign class labels "1" for "good" and "0" for "not good", based on your selections. Then, the simulator will build the two models.''')

st.markdown('''To learn more about target variable definition and hiring models based on cognitive tests, click :green["See explanation"] below.''')

with st.expander("See explanation"):

    st.markdown('''The models that the simulator builds are of a kind increasingly used in hiring software: companies have applicants play games that test for different kinds of cognitive ability (like reasoning or memory). Hiring software is then built to predict which applicants will be "good" based on which cognitive characteristics they have. What cognitive characteristics make for a "good" employee? This will depend on the role being hired for. And it will also depend on how one defines a "good" employee.''')

    st.markdown('''In the real world, a "good" employee is defined for these kinds of hiring models in the following way. Managers select a group of current employees that they consider to be "good"; this group of employees plays the cognitive test games. A model is then trained to identify applicants who share cognitive characteristics with the current employees that are considered "good". The target variable of "good" employee is thus defined in terms of comparison to certain people who are deemed "good".''')

    st.markdown('''One will get different target variables if one deems different current employees the "good" ones. And, as we discussed on the Home page (and as we explain more on the Putting the Idea into Practice page), there will likely be disagreement between managers about which employees are "good". For instance, a manager who values attentiveness and numerical skills will deem different employees "good" than a manager who values interpersonal skills and memory. Even when different managers roughly share sensibilities about what characteristics make for a "good" employee, there may still be different, equally good ways to "weight" the importance of the various characteristics.''')

    st.markdown('''In the real world, the cognitive characteristics shared by those considered "good" employees are implicit. Companies do not first identify the cognitive characteristics that make for a "good" employee; rather, they identify employees who they consider "good", and then the hiring model works backwards to identify what characteristics these employees share.''')

    st.markdown('''In our simulator, the cognitive characteristics shared by "good" employees are explicit. You assign different weights—using the sliders—to the cognitive characteristics you think are more or less important in a "good" employee (for the role you're considering). To illustrate how different target variables have different effects on fairness and overall model attributes, you'll define "good" employee in two ways. (We've made the cognitive characteristics explicit both so you can see the point of different target variable definitions more clearly, and because of limitations of the data that we're working with.)''')

    st.markdown('''The cognitive characteristics that the simulator works with are from one of the datasets of the [NeuroCognitive Performance Test](https://www.nature.com/articles/s41597-022-01872-8). This dataset has eleven different tests, which we have grouped into five categories:''')

    st.markdown(
        """
        - **Memory**: Forward Memory Span, Reverse Memory Span, Verbal List Learning, Delayed Verbal List Learning
        - **Information Processing Speed**: Digit Symbol Coding, Trail Making Part A, Trail Making Part B
        - **Reasoning**: Arithmetic Reasoning, Grammatical Reasoning
        - **Attention**: Divided Visual Attention
        - **Behavioral Restraint**: Go/No go
        """)

    st.markdown('''After you've set the weights of these five characteristics using the sliders, you can see which weights are assigned to each test (e.g. Forward Memory Span or Digit Symbol Coding) by ticking the checkbox beneath the sliders.''')

col1, col2 = st.columns(2)

# Initialise slider values
list_values_A = (9, 10, 2, 1, 5)
list_values_B = (1, 2, 10, 9, 3)

selectionsA = {}
selectionsB = {}
results_dict_A = groups_dict.copy()  # copies so the two weight mappings stay independent
results_dict_B = groups_dict.copy()

with col1:
    st.subheader("Define target variable for model A")

    if "slider_values_A" not in st.session_state:
        for count, value in enumerate(groups):
            selectionsA[value] = list_values_A[count]
        st.session_state["slider_values_A"] = selectionsA
    else:
        selectionsA = st.session_state["slider_values_A"]

    for i in groups:
        nameA = f"{i} importance, model A"
        value = selectionsA[i]
        slider = st.slider(nameA, min_value=0, max_value=10, value=value)
        selectionsA[i] = slider

    # Map each subtest to its group weight, then normalise so the weights sum to 1
    results_dict_A = {k: selectionsA.get(v, v) for k, v in results_dict_A.items()}
    total = sum(results_dict_A.values())
    for (key, u) in results_dict_A.items():
        if total != 0:
            w = u / total
            results_dict_A[key] = w

    if st.checkbox("Show target variable A weights per subtest", key="A"):
        for (key, u) in results_dict_A.items():
            txt = key.replace("_", " ")
            st.markdown("- " + txt + " : " + f":green[{str(round((u * 100), 2))}]")

    st.session_state["slider_values_A"] = selectionsA

with col2:
    st.subheader("Define target variable for model B")

    if "slider_values_B" not in st.session_state:
        for count, value in enumerate(groups):
            selectionsB[value] = list_values_B[count]
        st.session_state["slider_values_B"] = selectionsB
    else:
        selectionsB = st.session_state["slider_values_B"]

    for i in groups:
        nameB = f"{i} importance, model B"
        value = selectionsB[i]
        slider = st.slider(nameB, min_value=0, max_value=10, value=value)
        selectionsB[i] = slider

    results_dict_B = {k: selectionsB.get(v, v) for k, v in results_dict_B.items()}
    total = sum(results_dict_B.values())
    for (key, u) in results_dict_B.items():
        if total != 0:
            w = u / total
            results_dict_B[key] = w

    if st.checkbox("Show target variable B weights per subtest", key="B"):
        for (key, u) in results_dict_B.items():
            txt = key.replace("_", " ")
            st.markdown("- " + txt + " : " + f":green[{str(round((u * 100), 2))}]")

    st.session_state["slider_values_B"] = selectionsB

if st.button("Assign labels and train your models", type="primary", use_container_width=True):
    if 'complete_df' in st.session_state:
        del st.session_state['complete_df']
    if 'clean_df' in st.session_state:
        del st.session_state['clean_df']
    if 'cm_A' in st.session_state:
        del st.session_state['cm_A']
    if 'cm_B' in st.session_state:
        del st.session_state['cm_B']
    scoreA = pd.DataFrame()
    scoreB = pd.DataFrame()
    test1 = all(value == 0 for value in results_dict_A.values())
    test2 = all(value == 0 for value in results_dict_B.values())
    if test1 or test2:
        st.error('Cannot train the models if you do not define the target variables. Make your selections for both models first!', icon="🚨")
    else:
        # Weighted sum of the 11 subtest scores gives each candidate one score per target variable
        for (key, u) in results_dict_A.items():
            scoreA[df_keys_dict[key]] = u * dataframe[df_keys_dict[key]]
        scoresA = scoreA.sum(axis=1)
        dataframe['model_A_scores'] = scoresA
        for (key, u) in results_dict_B.items():
            scoreB[df_keys_dict[key]] = u * dataframe[df_keys_dict[key]]
        scoresB = scoreB.sum(axis=1)
        dataframe['model_B_scores'] = scoresB

        new_annotated = assign_labels_by_probabilities(dataframe, "model_A_scores", "Model_A_label", "Model_A_probabilities", quantile=0.85, num_samples=100)
        new_annotated = assign_labels_by_probabilities(new_annotated, "model_B_scores", "Model_B_label", "Model_B_probabilities", quantile=0.85, num_samples=100)
        new_annotated = new_annotated.reset_index()

        clean_data = drop_data(new_annotated)
        # Specify the columns of interest
        selected_cols = ['Model_A_label', 'Model_B_label']

        # Count the number of rows where both selected columns have a value of 1
        num_rows_with_all_flags_1 = len(new_annotated[new_annotated[selected_cols].sum(axis=1) == len(selected_cols)])

        # Print the result
        st.write(f"Shared candidates between your target variables: :green[{num_rows_with_all_flags_1}] (among 100 total candidates per target variable).")
        with st.spinner('Please wait... The models will be trained now.'):

            X_data, Y_data_A, Y_data_B = clean_data.iloc[:, :-2], clean_data.iloc[:, [-2]], clean_data.iloc[:, [-1]]
            X_data = X_data.drop(["index"], axis=1)
            Y_data_B = Y_data_B.reset_index()
            X_train, X_test, y_train_A, y_test_A = train_test_split(X_data, Y_data_A, test_size=0.2)
            y_train_A = y_train_A.reset_index()
            y_test_A = y_test_A.reset_index()
            y_train_B = pd.merge(y_train_A, Y_data_B[['index', 'Model_B_label']], on='index', how='left')
            y_test_B = pd.merge(y_test_A, Y_data_B[['index', 'Model_B_label']], on='index', how='left')
            y_train_B = y_train_B.drop(labels='Model_A_label', axis=1)
            y_test_B = y_test_B.drop(labels='Model_A_label', axis=1)
            y_train_A = y_train_A.set_index("index")
            y_train_B = y_train_B.set_index("index")
            y_test_A = y_test_A.set_index("index")
            y_test_B = y_test_B.set_index("index")

            accuracy_A, precision_A, recall_A, X_full_A, cm_A, baseline_accuracy_A = train_and_predict("A", X_train, X_test, y_train_A, y_test_A)
            accuracy_B, precision_B, recall_B, X_full_B, cm_B, baseline_accuracy_B = train_and_predict("B", X_train, X_test, y_train_B, y_test_B)
            full = pd.merge(X_full_A, X_full_B[['index', 'Predicted_B', 'Prob_0_B', "Prob_1_B"]], on='index', how='left')
            complete = pd.merge(full, new_annotated[['index', 'age', 'gender', 'education level', 'country', 'Model_A_label', 'Model_B_label', 'model_A_scores', 'model_B_scores']], on='index', how='left')
            complete = complete.replace({"education level": education_dict})
            complete = complete.rename(columns={"index": "Candidate ID"})

            if 'complete_df' not in st.session_state:
                st.session_state['complete_df'] = complete
            if 'clean_df' not in st.session_state:
                st.session_state['clean_df'] = clean_data
            if 'cm_A' not in st.session_state:
                st.session_state['cm_A'] = cm_A
            if 'cm_B' not in st.session_state:
                st.session_state['cm_B'] = cm_B

            row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
            with row1_1:
                st.write(f"Model A accuracy: :green[{baseline_accuracy_A}].")
            with row1_2:
                st.write(f"Model B accuracy: :green[{baseline_accuracy_B}].")

            st.success('''Success! You have defined the target variables and trained your models. Head to "Visualize the Results" in the sidebar.''')
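assign_labels_by_probabilities lives in utils.py, which this diff does not show, so the following is only a plausible sketch of its quantile-plus-sampling behaviour as suggested by the arguments used above (quantile=0.85, num_samples=100); all data here is synthetic:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({"model_A_scores": rng.normal(size=500)})

quantile, num_samples = 0.85, 100
cutoff = df["model_A_scores"].quantile(quantile)
eligible = df.index[df["model_A_scores"] >= cutoff]   # top-scoring candidates
positives = rng.choice(eligible, size=min(num_samples, len(eligible)), replace=False)

df["Model_A_label"] = 0
df.loc[positives, "Model_A_label"] = 1                # score-driven, sampled positive class
print(df["Model_A_label"].sum())                      # at most num_samples positive labels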
pages/2_📈_Visualize_the_Results.py
ADDED
@@ -0,0 +1,593 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:30:56 2023

@author: daliagala
"""

### LIBRARIES ###
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from matplotlib_venn import venn2
from sklearn.metrics import confusion_matrix
from utils import display_proportional, plot_data, run_PCA, create_confusion_matrix_heatmap, plot_conf_rates

### PAGE CONFIG ###
st.set_page_config(page_title='FairTargetSim', page_icon=':robot_face:', layout='wide')

hide_st_style = """
            <style>
            #GithubIcon {visibility: hidden;}
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            header {visibility: hidden;}
            </style>
            """
st.markdown(hide_st_style, unsafe_allow_html=True)

### DICTIONARIES AND CONSTANTS ###

colours_education = {
    'Some high school': 'indigo',
    'High school diploma / GED': '#7ae99e',
    'Some college': '#0a68c9',
    'College degree': '#80c4fa',
    'Professional degree': '#f48508',
    "Master's degree": '#2fa493',
    'Ph.D.': '#f2a3a1',
    "Associate's degree": '#fbcc66',
    'Other': '#fa322f'
}

colours_country = {
    'AU': '#7ae99e',
    'US': '#80c4fa',
    'NZ': '#2fa493',
    'CA': '#fbcc66'
}

colours_gender = {
    'f': '#83C9FF',
    'm': '#0067C9'
}

characteristic_dict = {
    'gender': colours_gender,
    'education level': colours_education,
    'country': colours_country,
    'age': 'indigo'
}

pred_dict = {
    'Model A': 'Predicted_A',
    'Model B': 'Predicted_B'
}

prob_dict = {
    'Model A': 'Prob_1_A',
    'Model B': 'Prob_1_B'
}

model_dict = {
    'Model A': 'Model_A_label',
    'Model B': 'Model_B_label'
}

df_keys_dict = {
    'Divided Visual Attention': 'divided_visual_attention',
    'Forward Memory Span': 'forward_memory_span',
    'Arithmetic Reasoning': 'arithmetic_problem_solving',
    'Grammatical Reasoning': 'logical_reasoning',
    'Go/No go': 'adaptive_behaviour_response_inhibition',
    'Reverse_Memory_Span': 'reverse_memory_span',
    'Verbal List Learning': 'episodic_verbal_learning',
    'Delayed Verbal List Learning': 'delayed_recall',
    'Digit Symbol Coding': 'abstract_symbol_processing_speed',
    'Trail Making Part A': 'numerical_info_processing_speed',
    'Trail Making Part B': 'numerical_and_lexical_info_processing_speed'
}

### DEFINE ALL SUB-PAGES AS FUNCTIONS TO CALL ###

def mod_prop(cmA, cmB):
    st.markdown('''This section contains the model confusion matrices, which tell us a great deal about how well the models we produced actually perform. We obtain a confusion matrix by plotting "Actual" labels on one axis and "Predicted" labels on the other. For both models, each square of the confusion matrix shows which group of candidates it represents, the number of candidates in this group, and what percentage of all assessed candidates they represent.''')
    st.markdown('''
    - **True Negative (TN)**: candidates predicted to have label "0" whose actual label was "0".
    - **False Positive (FP)**: candidates predicted to have label "1" whose actual label was "0".
    - **False Negative (FN)**: candidates predicted to have label "0" whose actual label was "1".
    - **True Positive (TP)**: candidates predicted to have label "1" whose actual label was "1".
    ''')

    row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
    with row1_1:
        create_confusion_matrix_heatmap(cmA, "Model A")  # was cmB: Model A must be drawn from its own confusion matrix

    with row1_2:
        create_confusion_matrix_heatmap(cmB, "Model B")

    st.subheader("Model Accuracy Rates")
    st.markdown('''We can also represent each model in terms of its accuracy rates, calculated using the numbers of candidates in each group shown in the confusion matrix:''')
    st.markdown('''
    - True Positive Rate (TPR), also called recall, sensitivity or hit rate, is the probability of a positive test result when the true label is indeed positive.
    - True Negative Rate (TNR), or specificity/selectivity, is the probability of a negative test result when the true label is indeed negative.
    - Positive Predictive Value (PPV), or precision, is the ratio of truly positive results to all positive results.
    - Negative Predictive Value (NPV) is the ratio of truly negative results to all negative results.
    - False Positive Rate (FPR), or fall-out, is the probability of a falsely positive result when the true label is negative.
    - False Negative Rate (FNR), or miss rate, is the probability that a truly positive case is falsely assigned a negative result.
    - False Discovery Rate (FDR) is the ratio of false positive results to the total number of positive results.
    ''')
    measures_A = plot_conf_rates(cmA)
    measures_B = plot_conf_rates(cmB)
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=measures_A["Measure"],
        y=measures_A["Score"],
        name='Model A rates',
        marker_color='rgb(55, 83, 109)'
    ))
    fig.add_trace(go.Bar(
        x=measures_B["Measure"],
        y=measures_B["Score"],
        name='Model B rates',
        marker_color='rgb(26, 118, 255)'
    ))
    fig.update_layout(
        title='Model measures comparison',
        xaxis_tickfont_size=14,
        xaxis=dict(
            title='Measure'),
        yaxis=dict(
            title='Score',
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15,  # gap between bars of adjacent location coordinates
        bargroupgap=0.1  # gap between bars of the same location coordinate
    )
    st.plotly_chart(fig, use_container_width=True)
    if st.checkbox("Show tables"):
        row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
        # CSS to inject contained in a string
        hide_table_row_index = """
            <style>
            thead tr th:first-child {display:none}
            tbody th {display:none}
            </style> """

        # Inject CSS with Markdown
        st.markdown(hide_table_row_index, unsafe_allow_html=True)
        with row1_1:
            st.markdown("Accuracy rates for model A")
            st.table(measures_A)
        with row1_2:
            st.markdown("Accuracy rates for model B")
            st.table(measures_B)

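A compact sketch of how the seven rates defined above fall out of a 2x2 confusion matrix. plot_conf_rates lives in utils.py, which is not shown in this diff, so the exact return format here is an assumption; the labels are hypothetical:

import pandas as pd
from sklearn.metrics import confusion_matrix

y_true = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

rates = pd.DataFrame({
    "Measure": ["TPR", "TNR", "PPV", "NPV", "FPR", "FNR", "FDR"],
    "Score": [
        tp / (tp + fn),  # recall / sensitivity
        tn / (tn + fp),  # specificity
        tp / (tp + fp),  # precision
        tn / (tn + fn),  # negative predictive value
        fp / (fp + tn),  # fall-out
        fn / (fn + tp),  # miss rate
        fp / (fp + tp),  # false discovery rate
    ],
})
print(rates)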
def model_scores(dataframe):
    st.markdown('''This section displays the distribution of the scores assigned to each hypothetical employee, according to the values you set on the sliders, compared with the distribution of protected characteristics. These scores are then used to assign labels to the hypothetical employees, creating two distinct datasets to train the models. In essence, these scores explicitly and numerically mimic the viewpoints of hypothetical hiring managers A and B when deciding who to label as "top employees."''')
    st.markdown('''Just as in the original NCPT dataset analysis, the scores obtained by participants in the cognitive games decline with age. This observation provides insight into the potential issue of ageism inherent in the use of gamified hiring processes.''')
    # Create a selectbox to choose a protected characteristic to explore
    plot_radio = st.selectbox('Characteristic to explore', characteristic_dict.keys())
    row2_space1, row2_1, row2_space2 = st.columns((0.1, 5, 0.1))

    with row2_1:
        data = dataframe[["model_A_scores", "model_B_scores", plot_radio]]

        if plot_radio == "age":
            bins = [18, 20, 30, 40, 50, 60, 70, 80, 90]
            labels = ['18-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90']
            data['age_bins'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)
            plot_radio = 'age_bins'
        colours = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)']
        c1, c2, c3, c4, c5 = st.columns((0.1, 3, 0.1, 3, 0.1))
        with c2:
            fig = px.box(data, x=plot_radio, y="model_A_scores", labels={"model_A_scores": "Dataset A Input Scores", 'age_bins': "Age"}, title="Input scores in dataset A")
            fig.update_layout(showlegend=False)
            fig.update_traces(marker_color='rgba(93, 164, 214, 0.5)')
            st.plotly_chart(fig, use_container_width=True)
        with c4:
            fig = px.box(data, x=plot_radio, y="model_B_scores", labels={"model_B_scores": "Dataset B Input Scores", 'age_bins': "Age"}, title="Input scores in dataset B")
            fig.update_traces(marker_color='rgba(255, 144, 14, 0.5)')
            fig.update_layout(showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

def PCA_general(full_df, dataframe_PCA):
    st.markdown('''On this page, you can see the distribution of the dataset labels which were assigned based on the scores calculated from the slider values you selected previously. Principal Component Analysis, or PCA, is a technique often used to analyse and subsequently visualize datasets where there are many features per single example. This is the case with the NCPT dataset used in our simulator. Specifically, the battery which we used has 11 features per single example, the example being the player of cognitive games and, in our metaphor, a hypothetical employee or job candidate. It is impossible to plot 11 dimensions, and PCA allows for the visualization of multidimensional data, while also preserving as much information as possible.''')
    choice = st.radio("What would you like to explore?", ("PCAs", "Components loading"), horizontal=True)
    pcaA, dfA, labelsA, coeffA, componentsA = run_PCA(dataframe_PCA, 'Model_B_label', 'Model_A_label', 2)
    pcaB, dfB, labelsB, coeffB, componentsB = run_PCA(dataframe_PCA, 'Model_A_label', 'Model_B_label', 2)
    loadings = pcaB.components_.T * np.sqrt(pcaB.explained_variance_)
    total_var = pcaA.explained_variance_ratio_.sum() * 100
    dfA = dfA.rename(columns={'target': 'Dataset A'}).reset_index()
    dfB = dfB.rename(columns={'target': 'Dataset B'}).reset_index()
    df_all = pd.merge(dfA, dfB[['index', 'Dataset B']], on='index', how='left')

    conditions = [
        (df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 0),
        (df_all['Dataset B'] == 1) & (df_all['Dataset A'] == 0),
        (df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 1),
        (df_all['Dataset A'] == 0) & (df_all['Dataset B'] == 0)]

    values = ['Selected A', 'Selected B', 'Selected both', 'Not selected']
    df_all['All'] = np.select(conditions, values)

    df_all = df_all.drop(["index"], axis=1)
    df_all.All = pd.Categorical(df_all.All, categories=['Not selected', 'Selected A', 'Selected B', 'Selected both'])
    df_all = df_all.sort_values('All')

    selections_dict = {0: 'Not selected', 1: 'Selected'}
    df_all = df_all.replace({"Dataset A": selections_dict, "Dataset B": selections_dict})

    color_dict_sel = {'Not selected': '#3366CC', 'Selected': 'grey'}

    if "pca_df" not in st.session_state:
        st.session_state.pca_df = df_all

    if choice == "PCAs":
        c1, c2 = st.columns(2)
        with c1:
            fig = px.scatter(st.session_state.pca_df,
                             x=st.session_state.pca_df['principal component 1'].astype(str),
                             y=st.session_state.pca_df['principal component 2'].astype(str),
                             title='Dataset A PCA',
                             labels={"x": 'PC 1', "y": 'PC 2'},
                             color=st.session_state.pca_df['Dataset A'],
                             color_discrete_map=color_dict_sel)
            fig.update_traces(marker_size=8)
            st.plotly_chart(fig, use_container_width=True)
        with c2:
            fig = px.scatter(st.session_state.pca_df,
                             x=st.session_state.pca_df['principal component 1'].astype(str),
                             y=st.session_state.pca_df['principal component 2'].astype(str),
                             title='Dataset B PCA',
                             labels={"x": 'PC 1', "y": 'PC 2'},
                             color=st.session_state.pca_df['Dataset B'],
                             color_discrete_map=color_dict_sel)
            fig.update_traces(marker_size=8)
            st.plotly_chart(fig, use_container_width=True)

        st.markdown(f'''These plots show the reduction of 11 dimensions (11 subtest results) to 2 dimensions. Total variance for the data is {total_var:.2f}%. Both of the datasets have the same features, therefore they both have the same total variance. The total variance value indicates what percentage of information has been preserved when the dimensionality was reduced. Note that for both datasets, A and B, different points are labelled "1" or "0". This shows that the two datasets represent the two different target variable definitions which were created by you previously. The plots are interactive - zoom in to explore in detail.''')

        pcaA, dfA, labelsA, coeffA, componentsA = run_PCA(dataframe_PCA, 'Model_B_label', 'Model_A_label', 2)
        pcaB, dfB, labelsB, coeffB, componentsB = run_PCA(dataframe_PCA, 'Model_A_label', 'Model_B_label', 2)
        loadings = pcaB.components_.T * np.sqrt(pcaB.explained_variance_)
        total_var = pcaA.explained_variance_ratio_.sum() * 100
        dfA = dfA.rename(columns={'target': 'Dataset A'}).reset_index()
        dfB = dfB.rename(columns={'target': 'Dataset B'}).reset_index()
        df_all = pd.merge(dfA, dfB[['index', 'Dataset B']], on='index', how='left')

        conditions = [
            (df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 0),
            (df_all['Dataset B'] == 1) & (df_all['Dataset A'] == 0),
            (df_all['Dataset A'] == 1) & (df_all['Dataset B'] == 1),
            (df_all['Dataset A'] == 0) & (df_all['Dataset B'] == 0)]

        values = ['Selected A', 'Selected B', 'Selected both', 'Not selected']
        df_all['All'] = np.select(conditions, values)

        df_all = df_all.drop(["index"], axis=1)
        df_all.All = pd.Categorical(df_all.All, categories=['Not selected', 'Selected A', 'Selected B', 'Selected both'])
        df_all = df_all.sort_values('All')

        selections_dict = {0: 'Not selected', 1: 'Selected'}
        df_all = df_all.replace({"Dataset A": selections_dict, "Dataset B": selections_dict})

        if "pca_df" not in st.session_state:
            st.session_state.pca_df = df_all

        fig = px.scatter(st.session_state.pca_df,
                         x=st.session_state.pca_df['principal component 1'],
                         y=st.session_state.pca_df['principal component 2'],
                         title="PCA with labelled groups",
                         color=st.session_state.pca_df["All"],
                         width=800, height=800,
                         color_discrete_sequence=px.colors.qualitative.Safe,
                         opacity=0.95)

        fig.update_yaxes(
            scaleanchor="x",
            scaleratio=1,
        )
        fig.update_traces(marker_size=10)
        st.plotly_chart(fig)

    if choice == "Components loading":
        c1, c2 = st.columns(2)
        loadings_df = pd.DataFrame(loadings, columns=["PC1", "PC2"])
        labels_A_proper = {v: k for k, v in df_keys_dict.items()}
        loadings_df["Features"] = labels_A_proper.values()
        with c1:
            fig = px.bar(loadings_df, x="PC1", y="Features", orientation='h')
            st.plotly_chart(fig, use_container_width=True)
        with c2:
            fig = px.bar(loadings_df, x="PC2", y="Features", orientation='h')
            st.plotly_chart(fig, use_container_width=True)

        # fig = go.Figure()
        # fig.add_trace(go.Bar(
        #     x=loadings_df["PC1"],
        #     y=loadings_df["Features"],
        #     name='Principal Component 1',
        #     marker_color='rgb(55, 83, 109)',
        #     orientation='h'
        # ))
        # fig.add_trace(go.Bar(
        #     x=loadings_df["PC2"],
        #     y=loadings_df["Features"],
        #     name='Principal Component 2',
        #     marker_color='rgb(26, 118, 255)',
        #     orientation='h'
        # ))
        # fig.update_layout(
        #     title='Component loadings',
        #     xaxis_tickfont_size=14,
        #     xaxis=dict(
        #         title='Loading value'),
        #     yaxis=dict(
        #         title='Feature',
        #         titlefont_size=16,
        #         tickfont_size=14,
        #     ),
        #     legend=dict(
        #         bgcolor='rgba(255, 255, 255, 0)',
        #         bordercolor='rgba(255, 255, 255, 0)'
        #     ),
        #     barmode='group',
        #     bargap=0.15,  # gap between bars of adjacent location coordinates.
        #     bargroupgap=0.1  # gap between bars of the same location coordinate.
        # )
        # st.plotly_chart(fig, use_container_width = True)

        st.markdown('''On this plot, PCA component loadings can be explored. These facilitate the understanding of how much each variable (of which there are 11) contributes to a particular principal component. Here, the 11 variables were reduced to 2 components, labelled PC1 and PC2. The magnitude of the loading (displayed here as the size of the bar in the bar chart) indicates how strong the relationship between the variable and the component is: the higher the bar, the stronger the relationship. The loading's sign can be positive or negative, indicating whether the principal component and that variable are positively or negatively correlated. We can see that multiple variables are positively correlated with PC2. Two variables, episodic verbal learning and delayed recall, are negatively correlated with both of the components.''')

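run_PCA is defined in utils.py, which is not shown in this diff; assuming it wraps scikit-learn's PCA, the loadings and total-variance numbers used above can be reproduced like this on synthetic stand-in data:

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
X = pd.DataFrame(rng.normal(size=(200, 11)),
                 columns=[f"subtest_{i}" for i in range(11)])  # stand-in for 11 subtest scores

pca = PCA(n_components=2)
components = pca.fit_transform(X)  # 2-D coordinates per candidate
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

total_var = pca.explained_variance_ratio_.sum() * 100  # % of information preserved
print(f"Total variance preserved: {total_var:.2f}%")
print(pd.DataFrame(loadings, index=X.columns, columns=["PC1", "PC2"]))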
def model_out(full_df):
    st.markdown('''This section highlights the discrepancies between your two models when presented with the same pool of new, previously unseen candidates to label. Specifically, you'll be investigating the candidates assigned a "1" label by both models. These individuals would be those considered for a job interview or chosen for the role, according to your defined target variable.''')
    # Create a selectbox to choose a protected characteristic to explore
    selectbox = st.selectbox('Characteristic to explore', characteristic_dict.keys())
    representation = st.selectbox("Representation", ("absolute", "proportional"))
    row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))
    with row1_1:
        st.subheader("Candidates selected by model A")

        if representation == "absolute":
            # Select predicted data == 1
            data = full_df.loc[full_df['Predicted_A'] == 1]

            # Use function plot_data to plot selected data
            plot_data(data, selectbox, characteristic_dict[selectbox])
        else:
            display_proportional(full_df, selectbox, 'Predicted_A')

    with row1_2:
        st.subheader("Candidates selected by model B")

        if representation == "absolute":
            # Select predicted data == 1
            data = full_df.loc[full_df['Predicted_B'] == 1]

            # Use function plot_data to plot selected data
            plot_data(data, selectbox, characteristic_dict[selectbox])

        else:
            display_proportional(full_df, selectbox, 'Predicted_B')

    st.markdown('''In this section, you're comparing the models' selections with respect to four protected characteristics: age, gender, education level, and country. You can visualize these differences in two ways: "Absolute" or "Proportional".''')
    st.markdown('''"Absolute" representation gives you the raw numbers or percentages of each characteristic chosen. For instance, if the model labeled 5 female candidates and 5 male candidates as "1", the "Absolute" outcome will display as 50% for both genders. "Proportional" representation, on the other hand, shows the percentage of a group selected by the model relative to the total number of that group in the input data. For example, if the model evaluated 100 male candidates and selected 5, you will see a 5% representation. If it evaluated 200 female candidates and selected 5, it will show a 2.5% representation.''')
    st.markdown('''If you encounter empty categories in the "Proportional" view, this indicates that while candidates from these categories were evaluated, none were labeled as "1". Hence, their proportional representation amounts to 0%.''')

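The worked 5% vs 2.5% example above amounts to a one-line groupby; a minimal sketch with hypothetical counts:

import pandas as pd

df = pd.DataFrame({"gender": ["m"] * 100 + ["f"] * 200,
                   "Predicted_A": [1] * 5 + [0] * 95 + [1] * 5 + [0] * 195})

# Proportional representation: share of each group that received label "1"
print(df.groupby("gender")["Predicted_A"].mean() * 100)  # f: 2.5, m: 5.0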
def dataframe_out(full_df):
    selectbox_M = st.selectbox('Choose which model output to rank by', pred_dict.keys())

    # Select data
    data = full_df.loc[full_df[pred_dict[selectbox_M]] == 1]
    data = data.sort_values(by=prob_dict[selectbox_M], ascending=False)
    data = data[['Candidate ID', 'Prob_1_A', 'Prob_1_B', 'Predicted_A', 'Predicted_B']]
    data = data.rename(columns={"Prob_1_A": "Ranking, model A", "Prob_1_B": "Ranking, model B", "Predicted_A": "Predicted label A", "Predicted_B": "Predicted label B"})
    data.index = np.arange(1, len(data) + 1)

    st.table(data.style.background_gradient(subset=["Ranking, model A", "Ranking, model B"], axis=0, vmin=0.40).highlight_max(color='#FFCD9B', subset=["Predicted label A", "Predicted label B"], axis=0))

    st.markdown("""In this section, you can review the data for all candidates labeled "1" by the model selected at the top of the page. Simultaneously, you can observe the labels assigned to these same candidates by the other model. It's likely that there will be instances where candidates chosen by one model weren't selected by the other. Candidates labeled "1" are highlighted in orange in the "Predicted label A" and "Predicted label B" columns.""")
    st.markdown('''In addition to this, you can see the probability with which each candidate was labeled "1". The intensity of the blue color indicates the candidate's ranking position - a darker blue represents a higher ranking (with 1 being the maximum and 0 the minimum). You may notice that some candidates highly ranked by one model may be ranked significantly lower by the other model.''')

def venn_diagram(full_df):
    row2_space1, row2_1, row2_space2, row2_2, row2_space3 = st.columns((0.1, 1, 0.1, 1, 0.1))
    with row2_1:
        fig, ax = plt.subplots()

        list_A = full_df.loc[full_df['Predicted_A'] == 1, 'Candidate ID'].astype(int)
        list_B = full_df.loc[full_df['Predicted_B'] == 1, 'Candidate ID'].astype(int)
        set1 = set(list_A)
        set2 = set(list_B)

        venn2([set1, set2], ('Model A', 'Model B'), ax=ax)
        st.pyplot(fig)

    with row2_2:
        st.markdown('''This Venn diagram visualizes the number of candidates chosen by both models. It's likely that some candidates will be selected by both models, while others may be chosen by only one model. If we consider Model A as the decision of one hiring manager and Model B as another's, it's easy to see how the selection outcome varies depending on the decision-maker. Some candidates may get the opportunity to be hired, while others might not. This serves as an illustration of the inherent arbitrariness in defining the target variable when dealing with highly subjective outcomes.''')
        st.markdown('''For instance, it's straightforward to define a target variable in a classification problem like distinguishing dragonflies from butterflies, where there's little room for ambiguity. However, defining what makes a "good" employee is far more challenging due to its subjective nature.''')

def model_vis(full_df):
    st.markdown('''In this section, you can visualize the demographics of the different subgroups of the data. First, you can see the demographic characteristics of the candidates who were assigned positive labels ("1") and negative labels ("0") based on the scores calculated from the slider values you selected previously. Then, you can visualize the demographic distributions of the data used for training and evaluating the models.''')
    choice = st.radio("**Select desired data:**", ("Positive and negative labels", "Training and evaluation data"), horizontal=True)
    if choice == "Positive and negative labels":
        # Create a selectbox to choose a label to explore
        selectbox_Lab = st.selectbox('Label to visualize', ('positive labels', 'negative labels'))

        # Create a selectbox to choose a protected characteristic to explore
        selectbox_Char = st.selectbox('Protected characteristic', characteristic_dict.keys())

        row2_space1, row2_1, row2_space2, row2_2, row2_space3 = st.columns((0.1, 3, 0.1, 3, 0.1))

        with row2_1:
            st.subheader("Dataset A")

            # Select test data
            if selectbox_Lab == 'positive labels':
                data = full_df.loc[full_df['Model_A_label'] == 1]
            else:
                data = full_df.loc[full_df['Model_A_label'] == 0]

            # Use function plot_data to plot selected data
            plot_data(data, selectbox_Char, characteristic_dict[selectbox_Char])

        with row2_2:
            st.subheader("Dataset B")

            # Select test data
            if selectbox_Lab == 'positive labels':
                data = full_df.loc[full_df['Model_B_label'] == 1]
            else:
                data = full_df.loc[full_df['Model_B_label'] == 0]

            # Use function plot_data to plot selected data
            plot_data(data, selectbox_Char, characteristic_dict[selectbox_Char])
        st.markdown('''You are visualizing the demographic composition of those hypothetical employees who were assigned labels "1" or "0" based on your definitions of the target variables. You might see differences in the proportions of genders between the two models for the positive labels, as well as a major difference in age between the positive and negative labels. Visualizing the labels in this manner before training the model can help you understand and mitigate differences in demographic representation in the modelling outcomes. Likely, if all candidates labelled "1" were in younger age groups, the candidates selected by the model at the deployment stage will also be in younger age groups. Moreover, target variable definition affects proportional representation. Having defined two target variables, one can choose the dataset and the model which offers more proportional representation.''')

    if choice == "Training and evaluation data":
        # Create a selectbox to choose a protected characteristic to explore
        selectbox = st.selectbox('Characteristic to explore', characteristic_dict.keys())
        row1_space1, row1_1, row1_space2, row1_2, row1_space3 = st.columns((0.1, 1, 0.1, 1, 0.1))
        # Plot training data
        with row1_1:
            st.subheader("Training data")

            # Select train data
            train = full_df.loc[full_df["Predicted_A"] == "train"]

            # Use function plot_data to plot selected data
            plot_data(train, selectbox, characteristic_dict[selectbox])

        # Plot test data

        with row1_2:
            st.subheader("Test data")

            # Select test data
            test = full_df.loc[full_df["Predicted_A"] != "train"]

            # Use function plot_data to plot selected data
            plot_data(test, selectbox, characteristic_dict[selectbox])

        st.markdown('''To train a machine learning model, the data has to be split into two different sets. The first set is the training data, which is used to teach the model the relationships between the input features (11 subtest results) and the corresponding labels ("0" and "1", assigned based on your definitions of the target variables and the values you chose for the sliders). The second set is the test data, or evaluation data, which is used to assess the performance of the model. This is the data used to plot the confusion matrices and calculate the model metrics shown at the bottom of the "Define Target Variables" page, and whose features you can explore in "Modelling outcomes". It is important that the training and testing data are balanced. Here, you can compare the demographic composition of the training and evaluation data. The training and evaluation dataset compositions were the same and contained the same candidates and the same features for both models A and B. However, the labels for each dataset were different, based on what you selected in "Define Target Variables".''')

491 |
+
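
# A minimal sketch of the split described above, assuming hypothetical
# `features` and `labels` dataframes and an illustrative 80/20 ratio (the app
# performs its own split on the Define Target Variables page):
#
#   from sklearn.model_selection import train_test_split
#   X_train, X_test, y_train, y_test = train_test_split(
#       features, labels, test_size=0.2, random_state=42)
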
def filter_for_protected(data):
    st.markdown('''Sometimes, the overall model metrics can be deceptive when it comes to predicting results for the different groups under consideration. Ideally, the model metrics would be similar across groups, which would indicate that the overall performance is reflected in how the model performs for each group. This is often not the case, and you will likely see that models A and B perform differently on these metrics. Even the same model can have different metrics for different subgroups.''')
    model = st.selectbox('Choose which model outputs to assess', pred_dict.keys())
    # Keep only evaluation rows; .copy() avoids a pandas SettingWithCopyWarning
    # when the age-bin column is added below
    test = data.loc[data[pred_dict[model]] != "train"].copy()

    selectbox_Char = st.selectbox('Protected characteristic', characteristic_dict.keys())
    if selectbox_Char == 'age':
        bins = [18, 20, 30, 40, 50, 60, 70, 80, 91]
        labels = ['18-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90']
        test['age_bins'] = pd.cut(test['age'], bins=bins, labels=labels, right=False)
        selectbox_Char = 'age_bins'
    df = pd.DataFrame({'Measure': ['True Positive Rate', 'True Negative Rate', 'Positive Predictive Value', 'Negative Predictive Value', 'False Positive Rate', 'False Negative Rate', 'False Discovery Rate']})
    for group in test[selectbox_Char].unique():
        rslt_df = test[test[selectbox_Char] == group]
        y_true = [int(numeric_string) for numeric_string in rslt_df[model_dict[model]]]
        y_pred = [int(numeric_string) for numeric_string in rslt_df[pred_dict[model]]]
        # Fixing the label order keeps the matrix 2x2 even when a group
        # contains only one class
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        d = plot_conf_rates(cm)
        df[f"{group}"] = d["Score"]

    fig = go.Figure()
    for group in test[selectbox_Char].unique():
        fig.add_trace(go.Bar(
            x=df["Measure"],
            y=df[group],
            name=group
        ))

    fig.update_layout(
        title='Model metrics per group',
        xaxis_tickfont_size=14,
        xaxis=dict(
            title='Metric'),
        yaxis=dict(
            title='Score',
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15,  # gap between bars of adjacent location coordinates
        bargroupgap=0.1  # gap between bars of the same location coordinate
    )
    st.plotly_chart(fig, use_container_width=True)
    if st.checkbox("Show table of scores"):
        # CSS to inject contained in a string
        hide_table_row_index = """
            <style>
            thead tr th:first-child {display:none}
            tbody th {display:none}
            </style> """

        # Inject CSS with Markdown
        st.markdown(hide_table_row_index, unsafe_allow_html=True)
        st.markdown(f"Model metrics for {selectbox_Char}")
        st.table(df)

def data_plot(key1, key2, key3, key4):
    st.title('''Visualize the Results''')
    if key1 not in st.session_state:
        st.error('Cannot train the models if you do not define the target variables. Go to "Define Target Variables"!', icon="🚨")
    else:
        tab1, tab2 = st.tabs(["Demographic", "Non-demographic"])
        with tab1:
            dataframe = st.session_state[key1]
            clean_data = st.session_state[key2]
            st.subheader('''**Select what to explore:**''')
            data_choice = st.radio('''What to explore''', ("Modelling outcomes", "Input data"), horizontal=True, label_visibility="collapsed")
            if data_choice == "Modelling outcomes":
                st.subheader('''Demographics of the overall modelling outcomes''')
                model_out(dataframe)
                st.subheader('''Demographics of the selected protected groups''')
                filter_for_protected(dataframe)
            else:
                st.subheader('''Demographics of the input scores''')
                model_scores(dataframe)
                st.subheader('''Demographics of the input labels''')
                model_vis(dataframe)
        with tab2:
            dataframe = st.session_state[key1]
            clean_data = st.session_state[key2]
            cmA = st.session_state[key3]
            cmB = st.session_state[key4]
            st.subheader('''**Select what to explore:**''')
            data_choice = st.radio('''Select what to explore:''', ("Modelling outcomes", "Input data"), horizontal=True, label_visibility="collapsed")
            if data_choice == "Modelling outcomes":
                st.subheader('''Labelled dataframe''')
                dataframe_out(dataframe)
                st.subheader('''Venn Diagram''')
                venn_diagram(dataframe)
                st.subheader('''Model accuracy metrics''')
                mod_prop(cmA, cmB)
            else:
                st.subheader('''Principal Component Analysis''')
                PCA_general(dataframe, clean_data)

data_plot('complete_df', 'clean_df', 'cm_A', 'cm_B')
pages/3_💡_Put_the_Idea_into_Practice.py
ADDED
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:42:38 2023

@author: daliagala
"""

### IMPORT LIBRARIES ###
import streamlit as st

### PAGE CONFIG ###
st.set_page_config(page_title='FairTargetSim', page_icon=':robot_face:', layout='wide')

hide_st_style = """
            <style>
            #GithubIcon {visibility: hidden;}
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            header {visibility: hidden;}
            </style>
            """
st.markdown(hide_st_style, unsafe_allow_html=True)

### IDEA IN PRACTICE PAGE ###

st.title("Put the Idea into Practice")
st.markdown('''This simulator is designed to help you understand the impacts—social, ethical, and technical—of target variable definition. On this page, we describe how to put the simulator, and the insights it affords, into practice. As with much of the simulator, we will focus on the case of hiring, but what we say applies just as well to other domains, such as college admissions or news recommendation, as we discussed on the Home page.''')

st.markdown('''How can the simulator be put into practice? The first step is to make two changes to the simulator: you cannot simply take it “off the shelf” and immediately put it into practice, for two reasons:''')

st.markdown('''- The simulator is not built using your data or your models.''')

st.markdown('''- How you define a target variable in the simulator is not how it’s done in practice. (Rather, the way you define it in the simulator is a straightforward way for you to see the effects of target variable definition.)''')
st.markdown('''Below, we describe how to address these two issues.''')

st.subheader('''Using your data and your models''')

st.markdown('''The simulator offers a starting point: its code is open-source. If you want to build something like the simulator for your own data and models, you now have a blueprint to work from.''')

st.subheader('''Defining the target variable in practice''')

st.markdown('''In the simulator, you define the target variable by assigning weights to different cognitive characteristics. These weights determine the “positive label”: the people in our dataset who perform best on the tests, given the weights you assign, are those assigned the positive label—that is, those the model treats as “good.” The model is then trained to identify people whose cognitive characteristics match those with the positive label.''')

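# A minimal sketch of the weighting-and-labelling scheme described above. The
# column names, weights, and hard 0.85 cutoff are illustrative assumptions; the
# app's own pipeline (utils.assign_labels_by_probabilities) instead samples the
# positive labels probabilistically, favouring higher weighted scores.
#
#   weights = {"memory": 0.3, "attention": 0.2, "planning": 0.5}  # hypothetical
#   scores = sum(weight * df[col] for col, weight in weights.items())
#   df["label"] = (scores >= scores.quantile(0.85)).astype(int)  # top scorers get "1"
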
st.markdown('''As we discussed on the Home page, a growing number of hiring algorithms use cognitive tests to identify promising job applicants. However, as we touched on in the Visualize the Results page, the way target variable definition works with these real-world algorithms differs from how it works in the simulator. For example, consider Pymetrics, a leading developer of hiring software. In some cases, Pymetrics builds bespoke algorithms for a company that is hiring for a given role. Pymetrics will ask the company to identify a group of its current employees in that role whom it considers “good.” These “good” employees then play cognitive test games similar to the ones used in our simulator, and it is they who are assigned the positive labels. From this point on, Pymetrics’ algorithmic development goes just as it does in our simulator: a model is trained to identify job applicants whose cognitive characteristics are similar to those with the positive label.''')

st.markdown('''So, for hiring algorithms like Pymetrics’, the target variable is defined not by assigning weights to cognitive attributes, but rather by directly identifying a certain group of current employees as “good.” In the simulator, you can define different target variables by assigning different weights to the cognitive attributes. If you are in practice building an algorithm like Pymetrics’, you can define different target variables by identifying different groups of current employees as “good.”''')

st.markdown('''How might this work? As we discussed on the Home page of the simulator, reasonable minds may disagree about what makes for a good employee, and, relatedly, about which current employees are good employees. For example, within a company, two different managers—call them Manager A and Manager B—may not be perfectly aligned in whom they consider to be good employees for a certain role. The managers may agree in some cases: we might imagine that there are 50 employees whom both Manager A and Manager B deem good. But the two managers might disagree about other employees. Imagine that there are 25 further employees whom Manager A thinks of as good but Manager B does not (this needn’t mean that Manager B thinks these employees are bad, just that they are not the best). Likewise, there might be 25 further employees whom Manager B thinks of as good but Manager A does not.''')

st.markdown('''In this case, there are two different (overlapping) groups of 75 employees, each corresponding to what Managers A and B think of as good employees. These two different groups of employees—and in turn, two different target variable definitions—could be used to train two different models.''')

st.markdown('''Instead of constructing two groups of “good” employees directly from the judgments of Managers A and B, you could weight their judgments against one another. For example, you could have two groups of employees, X and Y. Both X and Y contain the 50 employees that Managers A and B agree on. But group X contains 20 of Manager A’s preferred employees and 5 of Manager B’s, while group Y contains 20 of Manager B’s preferred employees and 5 of Manager A’s. Here again we have two different groups of “good” employees, and so two different target variables.''')

st.markdown('''One could select different groups of good employees in other ways still. An employer might have several metrics for evaluating employee success, and different employees might do better on one metric than on another. Depending on the importance assigned to the different metrics—on how you weight them against one another—different groups of employees may emerge as “good.”''')

st.markdown('''Our focus in the simulator has been on hiring algorithms that are based on cognitive test games. There are other kinds of algorithms used in hiring—for example, algorithms that identify promising job applicants on the basis of their résumés. In designing any such algorithm, the target variable must be defined, and the notion of a “good employee” must be translated into algorithmic terms. And so the insights of this simulator apply, and can be put into practice, for almost any kind of hiring algorithm you’re working with.''')
requirements.txt
ADDED
@@ -0,0 +1,20 @@
streamlit
plotly
seaborn
matplotlib_venn
adjustText
statsmodels
numpy
scipy
scikit-learn
pandas
ipywidgets
ipython
matplotlib
tqdm
phik
matplotlib-inline
pytest
Jinja2
statkit
utils.py
ADDED
@@ -0,0 +1,255 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:04:19 2023

@author: daliagala
"""

### IMPORT LIBRARIES ###
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import math
from sklearn.decomposition import PCA
from numpy import random
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import accuracy_score
from statkit.non_parametric import bootstrap_score
import plotly.graph_objects as go

### FUNCTIONS ###

# Function: assign labels by sampling with score-based probabilities
def assign_labels_by_probabilities(df, scores_col, label_col, probs_col, quantile=0.85, num_samples=100):
    # Sort the dataframe by the scores column in descending order
    annotated = df.sort_values(by=scores_col, ascending=False)
    annotated.reset_index(drop=True, inplace=True)

    # Assign probability 0 to all scores below the chosen quantile
    annotated.loc[annotated[scores_col] < annotated[scores_col].quantile(quantile), probs_col] = 0

    # Count the NaN values in the probabilities column - how many scores are left
    num_nans = annotated[probs_col].isna().sum()

    # Build a linearly decreasing weight function (top scorers get the highest
    # probability) and normalise it so the probabilities sum to 1
    function = np.linspace(start=0.99, stop=0.01, num=num_nans)
    sum_func = np.sum(function)
    function = function / sum_func
    function = pd.Series(function)

    # Assign the decreasing probabilities to all remaining NaNs
    annotated[probs_col].fillna(value=function, inplace=True)

    # Randomly select users based on the assigned probabilities
    selected = random.choice(annotated["user_id"], size=num_samples, replace=False, p=annotated[probs_col])
    annotated[label_col] = 0
    annotated.loc[annotated['user_id'].isin(selected), label_col] = 1

    return annotated

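# Example usage, with the score/label/probability column names used elsewhere
# in this repo (the calling page may pass different arguments):
#   df = assign_labels_by_probabilities(df, "model_A_scores", "Model_A_label",
#                                       "Model_A_probabilities", quantile=0.85,
#                                       num_samples=100)
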
# A function to remove protected characteristics and useless data
def drop_data(df):
    labels_to_drop = ["user_id", "age", "gender", "education level", "country", "test_run_id", "battery_id", "time_of_day",
                      "model_A_scores", "model_B_scores", "Model_A_probabilities", "Model_B_probabilities"]
    clean = df.drop(labels_to_drop, axis=1)
    return clean

# A function to train an SVM classifier and predict on the test set
def train_and_predict(name, X_train, X_test, y_train, y_test, kernel='poly'):
    # Create an SVM classifier (polynomial kernel by default)
    clf = svm.SVC(kernel=kernel, probability=True)

    # Train the model using the training sets
    model = clf.fit(X_train, y_train.values.ravel())

    # Predict the response for the test dataset
    y_pred = clf.predict(X_test)
    # sklearn expects (y_true, y_pred): rows are actual values, columns predicted
    cm = confusion_matrix(y_test, y_pred)

    # Predict the class probabilities for the test dataset
    y_pred_proba = clf.predict_proba(X_test)

    # Split the class probabilities into two flat numpy arrays
    array1 = y_pred_proba[:, 0].flatten()
    array2 = y_pred_proba[:, 1].flatten()

    # Append predictions to a copy of the X_test dataframe
    X_eval = X_test.copy(deep=True)
    X_eval[f"Predicted_{name}"] = y_pred

    # Append probability predictions to the X_test dataframe
    X_eval[f"Prob_0_{name}"] = array1
    X_eval[f"Prob_1_{name}"] = array2

    # Mark which data was used for training
    X_tr = X_train.copy(deep=True)
    X_tr[f"Predicted_{name}"] = "train"

    # Concatenate training and test data
    X_full = pd.concat([X_eval, X_tr])

    # Reset the index, retaining the old index to be able to get back to sensitive data
    X_full = X_full.reset_index()

    # Calculate accuracy, precision and recall
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    # Bootstrap the accuracy score to estimate its uncertainty
    baseline_accuracy = bootstrap_score(y_test, y_pred, metric=accuracy_score, random_state=5)

    return accuracy, precision, recall, X_full, cm, baseline_accuracy
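
# Example call, assuming train/test splits produced by sklearn's
# train_test_split on the Define Target Variables page (the exact arguments
# used there may differ):
#   accuracy, precision, recall, X_full, cm, baseline_accuracy = train_and_predict(
#       "A", X_train, X_test, y_train, y_test)
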
# A function to display proportional representation of protected characteristics
def display_proportional(data, protected_characteristic, which_model):
    if protected_characteristic == 'age':
        # Upper edge 91 so that, with right=False, 90-year-olds fall in the '81-90' bin
        bins = [18, 20, 30, 40, 50, 60, 70, 80, 91]
        labels = ['18-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90']
        data['age_bins'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)
        data_all = data.loc[data[which_model] != "train"]
        info_all = data_all["age_bins"].value_counts()
        data_sel = data.loc[data[which_model] == 1]
        info_sel = data_sel["age_bins"].value_counts()
        dict_all = dict(info_all)
        dict_sel = dict(info_sel)
        for key in dict_all.keys():
            if key not in dict_sel.keys():
                dict_sel[key] = 0
        dict_percentage = {k: round(((dict_sel[k] / dict_all[k]) * 100), 2) for k in dict_all if k in dict_sel}
        values = []
        for label in labels:
            values.append(dict_percentage[label])
        fig = px.bar(x=labels, y=values, text_auto='.2s')
        fig.update_layout(yaxis_title="percentage value", xaxis_title="category")
        st.plotly_chart(fig, use_container_width=True)
    else:
        data_all = data.loc[data[which_model] != "train"]
        info_all = data_all[protected_characteristic].value_counts()
        data_sel = data.loc[data[which_model] == 1]
        info_sel = data_sel[protected_characteristic].value_counts()
        dict_all = dict(info_all)
        dict_sel = dict(info_sel)
        for key in dict_all.keys():
            if key not in dict_sel.keys():
                dict_sel[key] = 0
        dict_percentage = {k: round(((dict_sel[k] / dict_all[k]) * 100), 2) for k in dict_all if k in dict_sel}
        names = list(dict_percentage.keys())
        values = list(dict_percentage.values())
        fig = px.bar(x=names, y=values, text_auto='.2s')
        fig.update_layout(yaxis_title="percentage value", xaxis_title="category")
        st.plotly_chart(fig, use_container_width=True)

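# Example call (illustrative arguments): "Predicted_A" follows the
# "Predicted_<name>" column convention created in train_and_predict above.
#   display_proportional(full_df, "gender", "Predicted_A")
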
# A function to plot data according to its type
def plot_data(data, protected_characteristic, colour_code):
    if protected_characteristic == 'age':
        mean = data.loc[:, 'age'].mean().round(2)
        st.markdown(f':green[The mean age for this group is {mean} years.]')
        bin_width = 1
        nbins = math.ceil((data["age"].max() - data["age"].min()) / bin_width)
        fig = px.histogram(data, x='age', nbins=nbins)
        fig.update_layout(margin=dict(l=20, r=20, t=30, b=0))
        st.plotly_chart(fig, use_container_width=True)

    elif protected_characteristic == 'education level':
        data = data[protected_characteristic].value_counts().to_frame().reset_index()
        fig = px.bar(data, x=data.iloc[:, 1], y=data.iloc[:, 0], orientation='h', color=data.iloc[:, 1])
        fig.update_layout(margin=dict(l=20, r=20, t=30, b=0))
        fig.update_coloraxes(showscale=False)
        fig.update_layout(yaxis_title=None)
        fig.update_layout(xaxis_title=None)
        st.plotly_chart(fig, use_container_width=True)

    else:
        data = data[protected_characteristic].value_counts().to_frame().reset_index()
        fig = px.pie(data, values=data.iloc[:, 1], names=data.iloc[:, 0], color=data.iloc[:, 0],
                     height=300, width=200, color_discrete_map=colour_code)
        fig.update_layout(margin=dict(l=20, r=20, t=30, b=0))
        st.plotly_chart(fig, use_container_width=True)

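# Example call (illustrative arguments): colour_code is only used by the
# pie-chart branch, as a plotly color_discrete_map such as the values stored in
# the pages' characteristic_dict, e.g. {"male": "#636EFA", "female": "#EF553B"}.
#   plot_data(data, "gender", characteristic_dict["gender"])
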
# A function to run PCA with a custom number of components using sklearn
def run_PCA(df, drop_1, retain_this, n):
    df_clean = df.drop(columns=[drop_1, retain_this, "index"])
    labels = list(df_clean.columns)
    pca = PCA(n_components=n)
    principalComponents = pca.fit_transform(df_clean)
    if n == 2:
        principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
    else:
        principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2', 'principal component 3'])
    finalDf = pd.concat([principalDf, df[[retain_this]]], axis=1)
    finalDf2 = finalDf.rename(columns={retain_this: 'target'})
    coeff = np.transpose(pca.components_[0:2, :])
    return pca, finalDf2, labels, coeff, principalComponents

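# Example call (illustrative arguments; the visualization page supplies its
# own column names): drop one model's labels and keep the other as the target.
#   pca, final_df, labels, coeff, pcs = run_PCA(df, "Model_B_label", "Model_A_label", 2)
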
# Plot a confusion matrix as a heatmap
def create_confusion_matrix_heatmap(confusion_matrix, model):
    group_names = ['True Negative (TN)', 'False Positive (FP)', 'False Negative (FN)', 'True Positive (TP)']
    group_counts = ["{0:0.0f}".format(value) for value in confusion_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in confusion_matrix.flatten() / np.sum(confusion_matrix)]
    labels = [f"{v1}<br>{v2}<br>{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)

    layout = {
        "title": f"Confusion Matrix, {model}",
        "xaxis": {
            "title": "Predicted value",
            "tickmode": 'array',
            "tickvals": [0, 1],
            "ticktext": ["0", "1"]},
        "yaxis": {
            "title": "Actual value",
            "tickmode": 'array',
            "tickvals": [0, 1],
            "ticktext": ["0", "1"]},
    }
    fig = go.Figure(data=go.Heatmap(
        z=confusion_matrix,
        text=labels,
        texttemplate="%{text}",
        textfont={"size": 15}), layout=layout)
    st.plotly_chart(fig, use_container_width=True)

# Compute model metric rates from a confusion matrix and return them as a dict
def plot_conf_rates(confusion_matrix):
    TN = confusion_matrix[0, 0]
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    FN = confusion_matrix[1, 0]

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
    # Precision or positive predictive value
    PPV = TP / (TP + FP)
    # Negative predictive value
    NPV = TN / (TN + FN)
    # Fall-out or false positive rate
    FPR = FP / (FP + TN)
    # False negative rate
    FNR = FN / (TP + FN)
    # False discovery rate
    FDR = FP / (TP + FP)

    # Overall accuracy (computed for reference; not included in the dict below)
    ACC = (TP + TN) / (TP + FP + FN + TN)
    d = {'Measure': ['True Positive Rate', 'True Negative Rate', 'Positive Predictive Value', 'Negative Predictive Value', 'False Positive Rate', 'False Negative Rate', 'False Discovery Rate'],
         'Equation': ['TPR = TP/(TP+FN)', 'TNR = TN/(TN+FP)', 'PPV = TP/(TP+FP)', 'NPV = TN/(TN+FN)', 'FPR = FP/(FP+TN)', 'FNR = FN/(TP+FN)', 'FDR = FP/(TP+FP)'],
         'Score': [TPR, TNR, PPV, NPV, FPR, FNR, FDR]}
    return d
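
# Example (illustrative): the returned dict is plain data, so it can be rendered
# as a table or used per group, as the Visualize the Results page does with d["Score"]:
#   d = plot_conf_rates(cm)
#   st.table(pd.DataFrame(d))
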
🏠Home.py
ADDED
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May 25 15:13:58 2023

@author: daliagala
"""

### LIBRARIES ###
import streamlit as st

### PAGE CONFIG ###
st.set_page_config(page_title='FairTargetSim', page_icon=':robot_face:', layout='wide')

hide_st_style = """
            <style>
            #GithubIcon {visibility: hidden;}
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            header {visibility: hidden;}
            </style>
            """
st.markdown(hide_st_style, unsafe_allow_html=True)

#Set title
st.title('FairTargetSim')
st.subheader('''Interactive Target Variable Definition Simulator''')

#Project description
st.subheader('Motivation')

st.markdown('''As machine learning techniques advance, algorithmic systems play an increasing role in almost every domain of our lives. Machine learning requires translating real-world problems into mathematical representations. Sometimes, the translation is straightforward—for example, in predicting whether someone will default on a loan. Other times, things aren't so simple. Consider an algorithm that aims to predict which job applicants will be a good employee for a given job. In order for an algorithmic system to identify potentially good employees, the notion of a “good” employee must be defined in terms that an algorithm can work with. In machine learning, this is called “defining the target variable” (in the case of hiring, the algorithm “aims at the target” of finding good employees).''')

st.markdown('''Defining the target variable is difficult. Imagine that you are hiring a salesperson. What makes for a good salesperson? Simply someone who makes the most profitable sales? Or is a good salesperson also a good leader? Does a good salesperson come up with new ideas that can improve how the sales team operates as a whole, and not just their individual sales? (The list could go on.) Perhaps the answer is: some of everything.''')

st.markdown('''But then we ask: how much of each thing? How much more important are individual sales than leadership, for example? Put another way: there may be different ways of understanding which qualities matter for being a good salesperson, and to what degree; reasonable minds may disagree on these issues (as anyone who’s been on a hiring committee has experienced). Even once it’s decided what makes for a good salesperson, there is a further question of how to make the notion precise in algorithmic terms: how do we identify job applicants with sales ability, leadership qualities, or innovative thinking? In order for the algorithm to be able to positively select those applicants, those qualities have to somehow be encoded numerically.''')

st.markdown('''Defining the target variable is not only difficult; it can also have profound effects on fairness—by resulting in hiring disparities for protected groups [(Passi & Barocas 2019)](https://dl.acm.org/doi/10.1145/3287560.3287567). For example, if you define the notion of a “good” employee in one way, you might end up hiring more women than if you were to define “good” in another way. Relatedly, machine learning models might behave differently depending on how a “good” employee is defined. Defining the notion in one way might lead to your model being less accurate for older applicants than for younger applicants.''')

st.markdown('''These issues are not limited to hiring contexts. They arise in any case where there is no simple mathematical translation of a real-world problem. Take university admissions, for example. One might use an algorithm to predict which applicants will be “good” students. What makes for a good student, though? A student who performs best on exams at the end of their degree? A student who improves the most in their time at university? A student who doesn’t drop out, or who wins awards, or who gets a prestigious job after graduating, or who contributes to the university through extracurricular activities? As with saying what makes for a good salesperson, the answer may be “some of everything,” and so again the question arises: how much of everything? Or consider another case: a news recommendation algorithm for an online platform. What makes for a “good” recommendation? Is it one that maximizes the user’s time on the platform, or that maximizes ad sales, or that is not “biased” along political lines (and then: which political lines?), or that best satisfies the user’s preferences, or that does not spread misinformation, or that prevents political polarization, or…? How these questions are answered—and how these different considerations are weighed against one another—has profound implications for fairness and other social and ethical concerns.''')

st.markdown('''Target variable definition, then, is not a merely technical matter. The question of what makes for a “good” employee, student, or news recommendation (and so on) is fundamentally value-laden. It calls for close attention and transparency [(Fazelpour & Danks, 2021)](https://compass.onlinelibrary.wiley.com/doi/full/10.1111/phc3.12760). All too often, though, target variables are defined in technical settings without attention to fairness. Further, stakeholders who aren't part of the technical process—like managers in non-technical roles, or those working in upper management or human resources—either do not understand, or are simply not aware of, the fraught nature of target variable definition.''')

st.markdown('''We have developed FairTargetSim (FTS) to help address this issue. The simulator makes the implications of target variable definition explicit—and transparent—and offers a blueprint for those who want to address these effects in practice. FTS uses the case study of hiring; the lessons one can draw from it extend to any domain in which there are no clear-cut answers to the question of target variable definition.''')

st.subheader('Overview of the simulator')
st.markdown('''The simulator has three pages, which are best visited in order.''')
st.markdown('''- **Define Target Variables.** On this page, we invite you to imagine that you are building a hiring algorithm for a certain role. You can define two different target variables—two different ways of understanding what counts as a good employee. The simulator then uses your target variables to generate two datasets and two models. The first model predicts which candidates will be good employees according to your first definition of “good;” the second model predicts which candidates will be good employees according to your second definition.''')
st.markdown("- **Visualize the Results.** This page contains visualizations that illustrate how your two target variable definitions impact issues of fairness and overall model performance. You can see, for example, which model selects more female applicants, or which model is more accurate for older applicants. You can also see, among other things, how the two models differ in overall performance. In addition, you can see how your target variable definitions affect the data that go into training the models.")
st.markdown("- **Put the Idea into Practice.** This page contains guidance for putting the simulator, and the ideas behind it, into practice. A practitioner who is building or using their own hiring algorithms cannot take our simulator “off the shelf” and apply it directly to their own data or models. We give guidance for how a practitioner could adapt our simulator for use in their own work.")

st.subheader('Example')
st.markdown('''Below is an example of the simulator in action. On the Define Target Variables page, you’ll assign the importance of five different cognitive characteristics by setting sliders; you do this twice, and the simulator then builds two models, A and B. On the Visualize the Results page, you’ll see how even very small changes—such as changing one point of importance for “behavioral restraint” (highlighted in green)—can result in completely different outcomes for the models.''')
st.markdown('''**From the Define Target Variables page:**''')
st.image('./images/tests.png')

st.markdown('''**From the Visualize the Results page:**''')
st.image('./images/pie_charts.png')