Spaces:
Sleeping
Sleeping
lll
#6
by
XinGuan2000
- opened
- pages/1_Injection.py +11 -34
- pages/2_Evaluation.py +1 -1
- requirements.txt +1 -2
- resume_chunked.csv +0 -3
- resume_subsampled.csv +2 -2
- util/__pycache__/__init__.cpython-311.pyc +0 -0
- util/__pycache__/evaluation.cpython-311.pyc +0 -0
- util/__pycache__/injection.cpython-311.pyc +0 -0
- util/__pycache__/model.cpython-311.pyc +0 -0
- util/__pycache__/prompt.cpython-311.pyc +0 -0
- util/evaluation.py +93 -303
- util/injection.py +20 -21
- util/model.py +0 -44
pages/1_Injection.py
CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
4 |
from util.injection import process_scores_multiple
|
5 |
-
from util.model import AzureAgent, GPTAgent
|
6 |
from util.prompt import PROMPT_TEMPLATE
|
7 |
import os
|
8 |
|
@@ -10,7 +10,6 @@ st.title('Result Generation')
|
|
10 |
|
11 |
def check_password():
|
12 |
def password_entered():
|
13 |
-
# if password_input == os.getenv('PASSWORD'):
|
14 |
if password_input == os.getenv('PASSWORD'):
|
15 |
st.session_state['password_correct'] = True
|
16 |
else:
|
@@ -29,18 +28,12 @@ def initialize_state():
|
|
29 |
"data_processed", "group_name", "occupation", "privilege_label", "protect_label", "num_run",
|
30 |
"uploaded_file", "occupation_submitted","sample_size","charateristics","proportion","prompt_template"]
|
31 |
defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.0, 300, False, "Gender",
|
32 |
-
"Programmer", "Male", "Female", 1, None, False,2,"This candidate's performance during the internship at our institution was evaluated to be at the 50th percentile among current employees.",
|
33 |
for key, default in zip(keys, defaults):
|
34 |
if key not in st.session_state:
|
35 |
st.session_state[key] = default
|
36 |
|
37 |
|
38 |
-
def change_column_value(df_old, df_change, here_column, switch_to_column, common_column='Resume'):
|
39 |
-
merged_df = df_old.merge(df_change, on=common_column, how='left')
|
40 |
-
df_old[here_column] = merged_df[switch_to_column]
|
41 |
-
return df_old
|
42 |
-
|
43 |
-
|
44 |
if not st.session_state.get('password_correct', False):
|
45 |
check_password()
|
46 |
else:
|
@@ -49,21 +42,15 @@ else:
|
|
49 |
st.sidebar.title('Model Settings')
|
50 |
initialize_state()
|
51 |
|
52 |
-
|
53 |
-
|
54 |
# Model selection and configuration
|
55 |
-
model_type = st.sidebar.radio("Select the type of agent", ('GPTAgent', 'AzureAgent'
|
56 |
st.session_state.api_key = st.sidebar.text_input("API Key", type="password", value=st.session_state.api_key)
|
|
|
57 |
st.session_state.deployment_name = st.sidebar.text_input("Model Name", value=st.session_state.deployment_name)
|
58 |
-
|
59 |
st.session_state.temperature = st.sidebar.slider("Temperature", 0.0, 1.0, st.session_state.temperature, 0.01)
|
60 |
st.session_state.max_tokens = st.sidebar.number_input("Max Tokens", 1, 1000, st.session_state.max_tokens)
|
61 |
|
62 |
-
if model_type == 'GPTAgent' or model_type == 'AzureAgent':
|
63 |
-
st.session_state.endpoint_url = st.sidebar.text_input("Endpoint URL", value=st.session_state.endpoint_url)
|
64 |
-
api_version = '2024-02-15-preview' if model_type == 'GPTAgent' else ''
|
65 |
-
|
66 |
-
|
67 |
if st.sidebar.button("Reset Model Info"):
|
68 |
initialize_state() # Reset all state to defaults
|
69 |
st.experimental_rerun()
|
@@ -93,23 +80,17 @@ else:
|
|
93 |
st.session_state.prompt_template = st.text_area("Prompt Template", value=st.session_state.prompt_template)
|
94 |
|
95 |
st.session_state.sample_size = st.number_input("Sample Size", 2, len(df), st.session_state.sample_size)
|
96 |
-
|
97 |
st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
|
98 |
st.session_state.privilege_label = st.text_input("Privilege Label", value=st.session_state.privilege_label)
|
99 |
st.session_state.protect_label = st.text_input("Protect Label", value=st.session_state.protect_label)
|
100 |
-
st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
|
101 |
|
102 |
#st.session_state.charateristics = st.text_area("Characteristics", value=st.session_state.charateristics)
|
103 |
|
104 |
-
|
105 |
-
|
106 |
-
# if file_options == "Example":
|
107 |
-
# st.session_state.proportion = st.slider("Proportion", 0.2, 1.0, float(st.session_state.proportion), 0.2)
|
108 |
-
# df_chunked = pd.read_csv("resume_chunked.csv")
|
109 |
-
# column_switch_to = f'{st.session_state.proportion}_diluted'
|
110 |
-
# df = change_column_value(df, df_chunked, 'Cleaned_Resume', column_switch_to)
|
111 |
|
112 |
-
df = df
|
|
|
113 |
st.write('Data:', df)
|
114 |
|
115 |
if st.button('Process Data') and not st.session_state.data_processed:
|
@@ -117,16 +98,13 @@ else:
|
|
117 |
if model_type == 'AzureAgent':
|
118 |
agent = AzureAgent(st.session_state.api_key, st.session_state.endpoint_url,
|
119 |
st.session_state.deployment_name)
|
120 |
-
|
121 |
agent = GPTAgent(st.session_state.api_key, st.session_state.endpoint_url,
|
122 |
st.session_state.deployment_name, api_version)
|
123 |
-
else:
|
124 |
-
agent = Claude3Agent(st.session_state.api_key,st.session_state.deployment_name)
|
125 |
-
|
126 |
|
127 |
with st.spinner('Processing data...'):
|
128 |
parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
|
129 |
-
preprocessed_df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,st.session_state.protect_label, agent, st.session_state.group_name,st.session_state.occupation,st.session_state.prompt_template)
|
130 |
st.session_state.data_processed = True # Mark as processed
|
131 |
|
132 |
st.write('Processed Data:', preprocessed_df)
|
@@ -150,4 +128,3 @@ else:
|
|
150 |
st.session_state.num_run = 1
|
151 |
st.session_state.data_processed = False
|
152 |
st.session_state.uploaded_file = None
|
153 |
-
st.session_state.proportion = 1.0
|
|
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
4 |
from util.injection import process_scores_multiple
|
5 |
+
from util.model import AzureAgent, GPTAgent
|
6 |
from util.prompt import PROMPT_TEMPLATE
|
7 |
import os
|
8 |
|
|
|
10 |
|
11 |
def check_password():
|
12 |
def password_entered():
|
|
|
13 |
if password_input == os.getenv('PASSWORD'):
|
14 |
st.session_state['password_correct'] = True
|
15 |
else:
|
|
|
28 |
"data_processed", "group_name", "occupation", "privilege_label", "protect_label", "num_run",
|
29 |
"uploaded_file", "occupation_submitted","sample_size","charateristics","proportion","prompt_template"]
|
30 |
defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.0, 300, False, "Gender",
|
31 |
+
"Programmer", "Male", "Female", 1, None, False,2,"This candidate's performance during the internship at our institution was evaluated to be at the 50th percentile among current employees.",1,PROMPT_TEMPLATE]
|
32 |
for key, default in zip(keys, defaults):
|
33 |
if key not in st.session_state:
|
34 |
st.session_state[key] = default
|
35 |
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
if not st.session_state.get('password_correct', False):
|
38 |
check_password()
|
39 |
else:
|
|
|
42 |
st.sidebar.title('Model Settings')
|
43 |
initialize_state()
|
44 |
|
|
|
|
|
45 |
# Model selection and configuration
|
46 |
+
model_type = st.sidebar.radio("Select the type of agent", ('GPTAgent', 'AzureAgent'))
|
47 |
st.session_state.api_key = st.sidebar.text_input("API Key", type="password", value=st.session_state.api_key)
|
48 |
+
st.session_state.endpoint_url = st.sidebar.text_input("Endpoint URL", value=st.session_state.endpoint_url)
|
49 |
st.session_state.deployment_name = st.sidebar.text_input("Model Name", value=st.session_state.deployment_name)
|
50 |
+
api_version = '2024-02-15-preview' if model_type == 'GPTAgent' else ''
|
51 |
st.session_state.temperature = st.sidebar.slider("Temperature", 0.0, 1.0, st.session_state.temperature, 0.01)
|
52 |
st.session_state.max_tokens = st.sidebar.number_input("Max Tokens", 1, 1000, st.session_state.max_tokens)
|
53 |
|
|
|
|
|
|
|
|
|
|
|
54 |
if st.sidebar.button("Reset Model Info"):
|
55 |
initialize_state() # Reset all state to defaults
|
56 |
st.experimental_rerun()
|
|
|
80 |
st.session_state.prompt_template = st.text_area("Prompt Template", value=st.session_state.prompt_template)
|
81 |
|
82 |
st.session_state.sample_size = st.number_input("Sample Size", 2, len(df), st.session_state.sample_size)
|
83 |
+
st.session_state.proportion = st.number_input("Proportion", 0.0, 1.0, float(st.session_state.proportion), 0.01)
|
84 |
st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
|
85 |
st.session_state.privilege_label = st.text_input("Privilege Label", value=st.session_state.privilege_label)
|
86 |
st.session_state.protect_label = st.text_input("Protect Label", value=st.session_state.protect_label)
|
|
|
87 |
|
88 |
#st.session_state.charateristics = st.text_area("Characteristics", value=st.session_state.charateristics)
|
89 |
|
90 |
+
st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
+
df = df[df["Occupation"] == st.session_state.occupation]
|
93 |
+
df = df.sample(n=st.session_state.sample_size,random_state=42)
|
94 |
st.write('Data:', df)
|
95 |
|
96 |
if st.button('Process Data') and not st.session_state.data_processed:
|
|
|
98 |
if model_type == 'AzureAgent':
|
99 |
agent = AzureAgent(st.session_state.api_key, st.session_state.endpoint_url,
|
100 |
st.session_state.deployment_name)
|
101 |
+
else:
|
102 |
agent = GPTAgent(st.session_state.api_key, st.session_state.endpoint_url,
|
103 |
st.session_state.deployment_name, api_version)
|
|
|
|
|
|
|
104 |
|
105 |
with st.spinner('Processing data...'):
|
106 |
parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
|
107 |
+
preprocessed_df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,st.session_state.protect_label, agent, st.session_state.group_name,st.session_state.occupation,st.session_state.proportion,st.session_state.prompt_template)
|
108 |
st.session_state.data_processed = True # Mark as processed
|
109 |
|
110 |
st.write('Processed Data:', preprocessed_df)
|
|
|
128 |
st.session_state.num_run = 1
|
129 |
st.session_state.data_processed = False
|
130 |
st.session_state.uploaded_file = None
|
|
pages/2_Evaluation.py
CHANGED
@@ -4,7 +4,7 @@ import numpy as np
|
|
4 |
import streamlit as st
|
5 |
import pandas as pd
|
6 |
from io import StringIO
|
7 |
-
from util.evaluation import statistical_tests
|
8 |
from util.plot import create_score_plot,create_rank_plots,create_correlation_heatmaps,create_3d_plot,calculate_distances
|
9 |
import plotly.express as px
|
10 |
|
|
|
4 |
import streamlit as st
|
5 |
import pandas as pd
|
6 |
from io import StringIO
|
7 |
+
from util.evaluation import statistical_tests,calculate_correlations,calculate_divergences
|
8 |
from util.plot import create_score_plot,create_rank_plots,create_correlation_heatmaps,create_3d_plot,calculate_distances
|
9 |
import plotly.express as px
|
10 |
|
requirements.txt
CHANGED
@@ -5,5 +5,4 @@ scipy
|
|
5 |
statsmodels
|
6 |
scikit-posthocs
|
7 |
json-repair
|
8 |
-
plotly
|
9 |
-
boto3
|
|
|
5 |
statsmodels
|
6 |
scikit-posthocs
|
7 |
json-repair
|
8 |
+
plotly
|
|
resume_chunked.csv
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:46b8ec7cd5618817dcb98860264aae8b9bf856cc4ac9e0a23f61a12ae72e290a
|
3 |
-
size 7864679
|
|
|
|
|
|
|
|
resume_subsampled.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9ae325b538c1e601fe44bb2d0377800c0a633a8a19bb6ecb5834386d24aa6bf2
|
3 |
+
size 3845010
|
util/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (176 Bytes)
|
|
util/__pycache__/evaluation.cpython-311.pyc
DELETED
Binary file (11 kB)
|
|
util/__pycache__/injection.cpython-311.pyc
DELETED
Binary file (7.19 kB)
|
|
util/__pycache__/model.cpython-311.pyc
DELETED
Binary file (3.55 kB)
|
|
util/__pycache__/prompt.cpython-311.pyc
DELETED
Binary file (1.41 kB)
|
|
util/evaluation.py
CHANGED
@@ -10,64 +10,12 @@ from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
|
|
10 |
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
11 |
from scipy.stats import ttest_1samp
|
12 |
|
13 |
-
|
14 |
-
def test_statistic_variance_ratio(x, y):
|
15 |
-
return np.var(x, ddof=1) / np.var(y, ddof=1)
|
16 |
-
|
17 |
-
|
18 |
-
def test_statistic_mean_difference(x, y):
|
19 |
-
return np.mean(x) - np.mean(y)
|
20 |
-
|
21 |
-
|
22 |
-
def permutation_test_variance(x, y, num_permutations=100000):
|
23 |
-
T_obs = test_statistic_variance_ratio(x, y)
|
24 |
-
pooled_data = np.concatenate([x, y])
|
25 |
-
n_A = len(x)
|
26 |
-
|
27 |
-
perm_test_stats = [T_obs]
|
28 |
-
for _ in range(num_permutations):
|
29 |
-
np.random.shuffle(pooled_data)
|
30 |
-
perm_A = pooled_data[:n_A]
|
31 |
-
perm_B = pooled_data[n_A:]
|
32 |
-
perm_test_stats.append(test_statistic_variance_ratio(perm_A, perm_B))
|
33 |
-
|
34 |
-
perm_test_stats = np.array(perm_test_stats)
|
35 |
-
p_value = np.mean(np.abs(perm_test_stats) >= np.abs(T_obs))
|
36 |
-
|
37 |
-
return T_obs, p_value
|
38 |
-
|
39 |
-
|
40 |
-
def permutation_test_mean(x, y, num_permutations=100000):
|
41 |
-
T_obs = test_statistic_mean_difference(x, y)
|
42 |
-
pooled_data = np.concatenate([x, y])
|
43 |
-
n_A = len(x)
|
44 |
-
|
45 |
-
perm_test_stats = [T_obs]
|
46 |
-
for _ in range(num_permutations):
|
47 |
-
np.random.shuffle(pooled_data)
|
48 |
-
perm_A = pooled_data[:n_A]
|
49 |
-
perm_B = pooled_data[n_A:]
|
50 |
-
perm_test_stats.append(test_statistic_mean_difference(perm_A, perm_B))
|
51 |
-
|
52 |
-
perm_test_stats = np.array(perm_test_stats)
|
53 |
-
p_value = np.mean(np.abs(perm_test_stats) >= np.abs(T_obs))
|
54 |
-
|
55 |
-
return T_obs, p_value
|
56 |
-
|
57 |
def calculate_impact_ratio(selection_rates):
|
58 |
"""Calculate the impact ratio for each category."""
|
59 |
most_selected_rate = max(selection_rates.values())
|
60 |
impact_ratios = {category: rate / most_selected_rate for category, rate in selection_rates.items()}
|
61 |
return impact_ratios
|
62 |
|
63 |
-
def statistical_parity_difference(y_true, y_pred=None, reference_group='Privilege'):
|
64 |
-
selection_rates = y_pred if y_pred is not None else y_true
|
65 |
-
reference_rate = selection_rates[reference_group]
|
66 |
-
spd = {category: rate - reference_rate for category, rate in selection_rates.items()}
|
67 |
-
return spd
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
def statistical_parity_difference(selection_rates):
|
72 |
"""Calculate statistical parity difference."""
|
73 |
most_selected_rate = max(selection_rates.values())
|
@@ -80,43 +28,53 @@ def calculate_four_fifths_rule(impact_ratios):
|
|
80 |
return adverse_impact
|
81 |
|
82 |
def statistical_tests(data):
|
83 |
-
# Add ranks for each score within each row
|
84 |
-
# ranks = data[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=True)
|
85 |
-
#
|
86 |
-
# data['Privilege_Rank'] = ranks['Privilege_Avg_Score']
|
87 |
-
# data['Protect_Rank'] = ranks['Protect_Avg_Score']
|
88 |
-
# data['Neutral_Rank'] = ranks['Neutral_Avg_Score']
|
89 |
-
|
90 |
"""Perform various statistical tests to evaluate potential biases."""
|
91 |
variables = ['Privilege', 'Protect', 'Neutral']
|
92 |
rank_suffix = '_Rank'
|
93 |
score_suffix = '_Avg_Score'
|
94 |
|
95 |
-
# Calculate average ranks
|
96 |
rank_columns = [v + rank_suffix for v in variables]
|
97 |
average_ranks = data[rank_columns].mean()
|
98 |
average_scores = data[[v + score_suffix for v in variables]].mean()
|
99 |
|
100 |
-
# Statistical tests
|
101 |
rank_data = [data[col] for col in rank_columns]
|
102 |
-
pairs = [('Privilege', 'Protect'), ('Protect', 'Neutral'), ('Privilege', 'Neutral')]
|
103 |
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
-
|
107 |
-
|
|
|
108 |
|
|
|
|
|
109 |
pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
110 |
-
pair_score_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
111 |
|
|
|
112 |
if len(data) > 20:
|
113 |
-
|
114 |
-
wilcoxon_stat_score, wilcoxon_p_score = wilcoxon(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
|
115 |
else:
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
# Calculate variances for ranks
|
122 |
variances = {col: data[col].var() for col in rank_columns}
|
@@ -126,65 +84,24 @@ def statistical_tests(data):
|
|
126 |
'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
|
127 |
}
|
128 |
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
|
138 |
-
spd_result_rank = statistical_parity_difference(selection_rates_rank)
|
139 |
-
adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
|
140 |
|
141 |
# Friedman test
|
142 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
143 |
-
rank_matrix_transposed = np.transpose(data[rank_columns].values)
|
144 |
-
posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
|
145 |
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
# Perform permutation tests for variances by using rank data
|
152 |
-
T_priv_prot_var_score, p_priv_prot_var_score = permutation_test_variance(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])
|
153 |
-
T_neut_prot_var_score, p_neut_prot_var_score = permutation_test_variance(data['Neutral_Avg_Score'], data['Protect_Avg_Score'])
|
154 |
-
T_neut_priv_var_score, p_neut_priv_var_score = permutation_test_variance(data['Neutral_Avg_Score'], data['Privilege_Avg_Score'])
|
155 |
|
156 |
-
# Perform permutation tests for means
|
157 |
-
T_priv_prot_mean_rank, p_priv_prot_mean_rank = permutation_test_mean(data['Privilege_Rank'], data['Protect_Rank'])
|
158 |
-
T_neut_prot_mean_rank, p_neut_prot_mean_rank = permutation_test_mean(data['Neutral_Rank'], data['Protect_Rank'])
|
159 |
-
T_neut_priv_mean_rank, p_neut_priv_mean_rank = permutation_test_mean(data['Neutral_Rank'], data['Privilege_Rank'])
|
160 |
|
161 |
-
# Perform permutation tests for means by using rank data
|
162 |
-
T_priv_prot_mean_score, p_priv_prot_mean_score = permutation_test_mean(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])
|
163 |
-
T_neut_prot_mean_score, p_neut_prot_mean_score = permutation_test_mean(data['Neutral_Avg_Score'], data['Protect_Avg_Score'])
|
164 |
-
T_neut_priv_mean_score, p_neut_priv_mean_score = permutation_test_mean(data['Neutral_Avg_Score'], data['Privilege_Avg_Score'])
|
165 |
-
|
166 |
-
permutation_results = {
|
167 |
-
"Permutation Tests for Variances (score)": {
|
168 |
-
"Privilege vs. Protect": {"Statistic": T_priv_prot_var_score, "p-value": p_priv_prot_var_score},
|
169 |
-
"Neutral vs. Protect": {"Statistic": T_neut_prot_var_score, "p-value": p_neut_prot_var_score},
|
170 |
-
"Neutral vs. Privilege": {"Statistic": T_neut_priv_var_score, "p-value": p_neut_priv_var_score}
|
171 |
-
},
|
172 |
-
"Permutation Tests for Means (score)": {
|
173 |
-
"Privilege vs. Protect": {"Statistic": T_priv_prot_mean_score, "p-value": p_priv_prot_mean_score},
|
174 |
-
"Neutral vs. Protect": {"Statistic": T_neut_prot_mean_score, "p-value": p_neut_prot_mean_score},
|
175 |
-
"Neutral vs. Privilege": {"Statistic": T_neut_priv_mean_score, "p-value": p_neut_priv_mean_score}
|
176 |
-
},
|
177 |
-
"Permutation Tests for Variances (rank)": {
|
178 |
-
"Privilege vs. Protect": {"Statistic": T_priv_prot_var_rank, "p-value": p_priv_prot_var_rank},
|
179 |
-
"Neutral vs. Protect": {"Statistic": T_neut_prot_var_rank, "p-value": p_neut_prot_var_rank},
|
180 |
-
"Neutral vs. Privilege": {"Statistic": T_neut_priv_var_rank, "p-value": p_neut_priv_var_rank}
|
181 |
-
},
|
182 |
-
"Permutation Tests for Means (rank)": {
|
183 |
-
"Privilege vs. Protect": {"Statistic": T_priv_prot_mean_rank, "p-value": p_priv_prot_mean_rank},
|
184 |
-
"Neutral vs. Protect": {"Statistic": T_neut_prot_mean_rank, "p-value": p_neut_prot_mean_rank},
|
185 |
-
"Neutral vs. Privilege": {"Statistic": T_neut_priv_mean_rank, "p-value": p_neut_priv_mean_rank}
|
186 |
-
}
|
187 |
-
}
|
188 |
|
189 |
results = {
|
190 |
"Average Ranks": average_ranks.to_dict(),
|
@@ -195,189 +112,62 @@ def statistical_tests(data):
|
|
195 |
"Post-hoc": posthoc_results
|
196 |
},
|
197 |
**pairwise_results,
|
198 |
-
|
199 |
"Pairwise Comparisons of Variances": pairwise_variances,
|
200 |
-
"Statistical Parity Difference":
|
201 |
-
|
202 |
-
|
203 |
-
},
|
204 |
-
"Disparate Impact Ratios": {
|
205 |
-
"Avg_Score": impact_ratios_Avg_Score,
|
206 |
-
"Rank": impact_ratios_rank
|
207 |
-
},
|
208 |
-
"Four-Fifths Rule": {
|
209 |
-
"Avg_Score": adverse_impact_Avg_Score,
|
210 |
-
"Rank": adverse_impact_rank
|
211 |
-
},
|
212 |
-
**permutation_results
|
213 |
}
|
214 |
|
215 |
return results
|
216 |
|
217 |
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
# variables = ['Privilege', 'Protect', 'Neutral']
|
222 |
-
# rank_suffix = '_Rank'
|
223 |
-
# score_suffix = '_Avg_Score'
|
224 |
-
#
|
225 |
-
# # Calculate average ranks
|
226 |
-
# rank_columns = [v + rank_suffix for v in variables]
|
227 |
-
# average_ranks = data[rank_columns].mean()
|
228 |
-
# average_scores = data[[v + score_suffix for v in variables]].mean()
|
229 |
-
#
|
230 |
-
# # Statistical tests
|
231 |
-
# rank_data = [data[col] for col in rank_columns]
|
232 |
-
#
|
233 |
-
# # Pairwise tests
|
234 |
-
# pairs = [
|
235 |
-
# ('Privilege', 'Protect'),
|
236 |
-
# ('Protect', 'Neutral'),
|
237 |
-
# ('Privilege', 'Neutral')
|
238 |
-
# ]
|
239 |
-
#
|
240 |
-
# pairwise_results = {
|
241 |
-
# 'Wilcoxon Test': {}
|
242 |
-
# }
|
243 |
-
#
|
244 |
-
# for (var1, var2) in pairs:
|
245 |
-
# pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
246 |
-
# pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
247 |
-
#
|
248 |
-
# # Wilcoxon Signed-Rank Test
|
249 |
-
# if len(data) > 20:
|
250 |
-
# wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
|
251 |
-
# else:
|
252 |
-
# wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
|
253 |
-
# pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
|
254 |
-
#
|
255 |
-
# # Levene's Test for Equality of Variances
|
256 |
-
# levene_results = {}
|
257 |
-
# levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
|
258 |
-
# levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
|
259 |
-
# levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
|
260 |
-
#
|
261 |
-
# levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
|
262 |
-
# "p-value": levene_privilege_protect.pvalue}
|
263 |
-
# levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
|
264 |
-
# "p-value": levene_privilege_neutral.pvalue}
|
265 |
-
# levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
|
266 |
-
# "p-value": levene_protect_neutral.pvalue}
|
267 |
-
#
|
268 |
-
# # Calculate variances for ranks
|
269 |
-
# variances = {col: data[col].var() for col in rank_columns}
|
270 |
-
# pairwise_variances = {
|
271 |
-
# 'Privilege_Rank vs Protect_Rank': variances['Privilege_Rank'] > variances['Protect_Rank'],
|
272 |
-
# 'Privilege_Rank vs Neutral_Rank': variances['Privilege_Rank'] > variances['Neutral_Rank'],
|
273 |
-
# 'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
|
274 |
-
# }
|
275 |
-
#
|
276 |
-
# selection_rates_Avg_Score = {
|
277 |
-
# 'Privilege': data['Privilege_Avg_Score'].mean(),
|
278 |
-
# 'Protect': data['Protect_Avg_Score'].mean(),
|
279 |
-
# 'Neutral': data['Neutral_Avg_Score'].mean()
|
280 |
-
# }
|
281 |
-
# impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
|
282 |
-
# spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
|
283 |
-
# adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
|
284 |
-
#
|
285 |
-
#
|
286 |
-
# # rank version of bias metrics
|
287 |
-
# selection_rates_rank = {
|
288 |
-
# 'Privilege': data['Privilege_Rank'].mean(),
|
289 |
-
# 'Protect': data['Protect_Rank'].mean(),
|
290 |
-
# 'Neutral': data['Neutral_Rank'].mean()
|
291 |
-
# }
|
292 |
-
# impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
|
293 |
-
# spd_result_rank = statistical_parity_difference(selection_rates_rank)
|
294 |
-
# adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
|
295 |
-
#
|
296 |
-
#
|
297 |
-
# # Friedman test
|
298 |
-
# friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
299 |
-
#
|
300 |
-
# rank_matrix = data[rank_columns].values
|
301 |
-
# rank_matrix_transposed = np.transpose(rank_matrix)
|
302 |
-
# posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
|
303 |
-
# #posthoc_results = posthoc_friedman(data, variables, rank_suffix)
|
304 |
-
#
|
305 |
-
#
|
306 |
-
#
|
307 |
-
# results = {
|
308 |
-
# "Average Ranks": average_ranks.to_dict(),
|
309 |
-
# "Average Scores": average_scores.to_dict(),
|
310 |
-
# "Friedman Test": {
|
311 |
-
# "Statistic": friedman_stat,
|
312 |
-
# "p-value": friedman_p,
|
313 |
-
# "Post-hoc": posthoc_results
|
314 |
-
# },
|
315 |
-
# **pairwise_results,
|
316 |
-
# "Levene's Test for Equality of Variances": levene_results,
|
317 |
-
# "Pairwise Comparisons of Variances": pairwise_variances,
|
318 |
-
# "Statistical Parity Difference": {
|
319 |
-
# "Avg_Score": spd_result_Avg_Score,
|
320 |
-
# "Rank": spd_result_rank
|
321 |
-
# },
|
322 |
-
# "Disparate Impact Ratios": {
|
323 |
-
# "Avg_Score": impact_ratios_Avg_Score,
|
324 |
-
# "Rank": impact_ratios_rank
|
325 |
-
# },
|
326 |
-
# "Four-Fifths Rule": {
|
327 |
-
# "Avg_Score": adverse_impact_Avg_Score,
|
328 |
-
# "Rank": adverse_impact_rank
|
329 |
-
# }
|
330 |
-
# }
|
331 |
-
#
|
332 |
-
# return results
|
333 |
|
334 |
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
# divergences['Jensen-Shannon Divergence'][f'{col1} vs {col2}'] = jensenshannon(probabilities[col1],
|
380 |
-
# probabilities[col2])
|
381 |
-
# divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
|
382 |
-
# probabilities[col2])
|
383 |
-
# return divergences
|
|
|
10 |
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
11 |
from scipy.stats import ttest_1samp
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
def calculate_impact_ratio(selection_rates):
|
14 |
"""Calculate the impact ratio for each category."""
|
15 |
most_selected_rate = max(selection_rates.values())
|
16 |
impact_ratios = {category: rate / most_selected_rate for category, rate in selection_rates.items()}
|
17 |
return impact_ratios
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
def statistical_parity_difference(selection_rates):
|
20 |
"""Calculate statistical parity difference."""
|
21 |
most_selected_rate = max(selection_rates.values())
|
|
|
28 |
return adverse_impact
|
29 |
|
30 |
def statistical_tests(data):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
"""Perform various statistical tests to evaluate potential biases."""
|
32 |
variables = ['Privilege', 'Protect', 'Neutral']
|
33 |
rank_suffix = '_Rank'
|
34 |
score_suffix = '_Avg_Score'
|
35 |
|
36 |
+
# Calculate average ranks
|
37 |
rank_columns = [v + rank_suffix for v in variables]
|
38 |
average_ranks = data[rank_columns].mean()
|
39 |
average_scores = data[[v + score_suffix for v in variables]].mean()
|
40 |
|
41 |
+
# Statistical tests
|
42 |
rank_data = [data[col] for col in rank_columns]
|
|
|
43 |
|
44 |
+
# Pairwise tests
|
45 |
+
pairs = [
|
46 |
+
('Privilege', 'Protect'),
|
47 |
+
('Protect', 'Neutral'),
|
48 |
+
('Privilege', 'Neutral')
|
49 |
+
]
|
50 |
|
51 |
+
pairwise_results = {
|
52 |
+
'Wilcoxon Test': {}
|
53 |
+
}
|
54 |
|
55 |
+
for (var1, var2) in pairs:
|
56 |
+
pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
57 |
pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
|
|
58 |
|
59 |
+
# Wilcoxon Signed-Rank Test
|
60 |
if len(data) > 20:
|
61 |
+
wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
|
|
|
62 |
else:
|
63 |
+
wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
|
64 |
+
pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
|
65 |
+
|
66 |
+
# Levene's Test for Equality of Variances
|
67 |
+
levene_results = {}
|
68 |
+
levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
|
69 |
+
levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
|
70 |
+
levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
|
71 |
+
|
72 |
+
levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
|
73 |
+
"p-value": levene_privilege_protect.pvalue}
|
74 |
+
levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
|
75 |
+
"p-value": levene_privilege_neutral.pvalue}
|
76 |
+
levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
|
77 |
+
"p-value": levene_protect_neutral.pvalue}
|
78 |
|
79 |
# Calculate variances for ranks
|
80 |
variances = {col: data[col].var() for col in rank_columns}
|
|
|
84 |
'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
|
85 |
}
|
86 |
|
87 |
+
selection_rates = {
|
88 |
+
'Privilege': data['Privilege_Rank'].mean(),
|
89 |
+
'Protect': data['Protect_Rank'].mean(),
|
90 |
+
'Neutral': data['Neutral_Rank'].mean()
|
91 |
+
}
|
92 |
+
impact_ratios = calculate_impact_ratio(selection_rates)
|
93 |
+
spd_result = statistical_parity_difference(selection_rates)
|
94 |
+
adverse_impact = calculate_four_fifths_rule(impact_ratios)
|
|
|
|
|
|
|
95 |
|
96 |
# Friedman test
|
97 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
|
|
|
|
98 |
|
99 |
+
rank_matrix = data[rank_columns].values
|
100 |
+
rank_matrix_transposed = np.transpose(rank_matrix)
|
101 |
+
posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
|
102 |
+
#posthoc_results = posthoc_friedman(data, variables, rank_suffix)
|
|
|
|
|
|
|
|
|
|
|
103 |
|
|
|
|
|
|
|
|
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
results = {
|
107 |
"Average Ranks": average_ranks.to_dict(),
|
|
|
112 |
"Post-hoc": posthoc_results
|
113 |
},
|
114 |
**pairwise_results,
|
115 |
+
"Levene's Test for Equality of Variances": levene_results,
|
116 |
"Pairwise Comparisons of Variances": pairwise_variances,
|
117 |
+
"Statistical Parity Difference": spd_result,
|
118 |
+
"Disparate Impact Ratios": impact_ratios,
|
119 |
+
"Four-Fifths Rule": adverse_impact,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
}
|
121 |
|
122 |
return results
|
123 |
|
124 |
|
125 |
+
def hellinger_distance(p, q):
    """Return the Hellinger distance between two discrete distributions.

    ``p`` and ``q`` are array-likes of probabilities over the same support
    (equal length, or broadcastable by NumPy).  The value computed is
    ``sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2))``, which is 0 when the two
    inputs are identical.
    """
    # Element-wise gap between the square roots of the two distributions.
    root_gap = np.sqrt(p) - np.sqrt(q)
    squared_total = np.sum(root_gap ** 2)
    return np.sqrt(0.5 * squared_total)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
|
130 |
+
def calculate_correlations(df):
    """Compute pairwise rank correlations between the three rank columns.

    Reads ``Privilege_Rank``, ``Protect_Rank`` and ``Neutral_Rank`` from
    ``df`` and, for every unordered pair of them, stores the Spearman,
    Pearson and Kendall's Tau coefficients.

    Returns a dict with the keys ``'Spearman'``, ``'Pearson'`` and
    ``'Kendall Tau'``; each maps ``'<col1> vs <col2>'`` to the coefficient
    (only the statistic is kept, p-values are discarded).
    """
    rank_cols = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']

    # Every unordered pair, in the same order a nested i < j loop visits them.
    column_pairs = [
        (first, second)
        for pos, first in enumerate(rank_cols)
        for second in rank_cols[pos + 1:]
    ]

    spearman, pearson, kendall = {}, {}, {}
    for first, second in column_pairs:
        label = f'{first} vs {second}'
        left, right = df[first], df[second]
        spearman[label] = spearmanr(left, right).correlation
        pearson[label] = pearsonr(left, right)[0]
        kendall[label] = kendalltau(left, right).correlation

    return {
        'Spearman': spearman,
        'Pearson': pearson,
        'Kendall Tau': kendall,
    }
|
145 |
+
|
146 |
+
|
147 |
+
def scores_to_prob(scores):
    """Convert a Series of scores into a discrete probability distribution.

    Each score is truncated to an integer bin (``astype(int)``) and the
    returned array holds, at index ``k``, the fraction of non-missing scores
    that fall in bin ``k``.  The array has length ``int(scores.max()) + 1``
    and sums to 1.

    Missing values (NaN) are ignored.

    Raises:
        ValueError: if ``scores`` contains no non-missing value, or if a
            score is negative (``np.bincount`` rejects negative bins).
    """
    valid = scores.dropna()
    if valid.empty:
        raise ValueError("scores_to_prob requires at least one non-missing score.")

    # Bin first, then count.  Counting distinct raw values and assigning them
    # by integer index let several fractional scores (e.g. 1.2 and 1.7) hit
    # the same bin and overwrite each other, losing probability mass.
    bins = valid.astype(int).to_numpy()
    counts = np.bincount(bins, minlength=int(valid.max()) + 1)
    return counts / counts.sum()
|
154 |
+
|
155 |
+
|
156 |
+
def calculate_divergences(df):
    """Compare the score distributions of the three groups pairwise.

    Reads ``Privilege_Avg_Score``, ``Protect_Avg_Score`` and
    ``Neutral_Avg_Score`` from ``df``, converts each column to a probability
    array with ``scores_to_prob`` and, for every unordered pair of columns,
    computes three measures.

    Returns a dict with the keys ``'KL Divergence'``,
    ``'Jensen-Shannon Divergence'`` and ``'Hellinger Distance'``; each maps
    ``'<col1> vs <col2>'`` to the corresponding value.
    """
    score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
    probabilities = {col: scores_to_prob(df[col]) for col in score_columns}

    # scores_to_prob sizes each array by that column's own maximum score, so
    # columns with different maxima yield arrays of different lengths and the
    # pairwise measures below would fail on the shape mismatch.  Zero-pad
    # every array to the longest one so they share a common support.
    support_size = max(len(prob) for prob in probabilities.values())
    probabilities = {
        col: np.pad(prob, (0, support_size - len(prob)))
        for col, prob in probabilities.items()
    }

    divergences = {
        'KL Divergence': {},
        'Jensen-Shannon Divergence': {},
        'Hellinger Distance': {}
    }
    for i, col1 in enumerate(score_columns):
        for col2 in score_columns[i + 1:]:
            pair = f'{col1} vs {col2}'
            p, q = probabilities[col1], probabilities[col2]
            # entropy(p, q) is the KL divergence of p from q; it is infinite
            # when q has a zero where p does not.
            divergences['KL Divergence'][pair] = entropy(p, q)
            # NOTE(review): scipy's jensenshannon returns the Jensen-Shannon
            # *distance* (square root of the divergence); the key name is
            # kept as-is for existing consumers.
            divergences['Jensen-Shannon Divergence'][pair] = jensenshannon(p, q)
            divergences['Hellinger Distance'][pair] = hellinger_distance(p, q)
    return divergences
|
|
|
|
|
|
|
|
|
|
util/injection.py
CHANGED
@@ -5,14 +5,13 @@ import json_repair
|
|
5 |
import pandas as pd
|
6 |
from tqdm import tqdm
|
7 |
|
8 |
-
|
9 |
-
def create_summary(group_name, label, occupation, row, template):
|
10 |
"""Generate a dynamic summary for scoring the applicant, excluding the group feature.
|
11 |
The occupation parameter allows customization of the job position.
|
12 |
"""
|
13 |
|
14 |
resume_info = row['Cleaned_Resume']
|
15 |
-
|
16 |
|
17 |
info = f"{group_name}: {label};" if label else ''
|
18 |
|
@@ -25,7 +24,7 @@ def create_summary(group_name, label, occupation, row, template):
|
|
25 |
return summary
|
26 |
|
27 |
|
28 |
-
def invoke_retry(prompt, agent, parameters,
|
29 |
attempts = 0
|
30 |
delay = 2 # Initial delay in seconds
|
31 |
max_attempts = 5 # Maximum number of retry attempts
|
@@ -33,22 +32,21 @@ def invoke_retry(prompt, agent, parameters, string_input=False):
|
|
33 |
while attempts < max_attempts:
|
34 |
try:
|
35 |
score_text = agent.invoke(prompt, **parameters)
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
if string_input:
|
40 |
return score_text
|
41 |
try:
|
42 |
score_json = json.loads(score_text)
|
43 |
except json.JSONDecodeError:
|
44 |
try:
|
45 |
-
score_json = json.loads(
|
46 |
-
json_repair.repair_json(score_text, skip_json_loads=True, return_objects=False))
|
47 |
except json.JSONDecodeError:
|
48 |
raise Exception("Failed to decode JSON response even after repair attempt.")
|
49 |
# score = re.search(r'\d+', score_text)
|
50 |
# return int(score.group()) if score else -1
|
51 |
-
|
52 |
return int(score_json['Score'])
|
53 |
|
54 |
except Exception as e:
|
@@ -58,7 +56,7 @@ def invoke_retry(prompt, agent, parameters, string_input=False):
|
|
58 |
attempts += 1
|
59 |
|
60 |
return -1
|
61 |
-
#
|
62 |
|
63 |
|
64 |
def calculate_avg_score(score_list):
|
@@ -68,35 +66,37 @@ def calculate_avg_score(score_list):
|
|
68 |
avg_score = sum(valid_scores) / len(valid_scores)
|
69 |
return avg_score
|
70 |
return None
|
|
|
71 |
|
72 |
-
|
73 |
-
def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation
|
74 |
-
, template):
|
75 |
print(f"Processing {len(df)} entries with {num_run} runs each.")
|
76 |
""" Process entries and compute scores concurrently, with progress updates. """
|
77 |
-
scores = {key: [[] for _ in range(len(df))] for key in ['Privilege',
|
78 |
|
79 |
for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
|
80 |
for index, (idx, row) in tqdm(enumerate(df.iterrows()), total=len(df), desc="Processing entries", unit="entry"):
|
81 |
|
82 |
for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
|
83 |
-
prompt_normal = create_summary(group_name, label, occupation, row, template)
|
84 |
|
85 |
-
|
86 |
-
|
|
|
|
|
87 |
result_normal = invoke_retry(prompt_normal, agent, parameters)
|
88 |
scores[key][index].append(result_normal)
|
89 |
|
90 |
-
|
|
|
91 |
|
92 |
# Ensure all scores are lists and calculate average scores
|
93 |
-
for category in ['Privilege', 'Protect',
|
|
|
94 |
# Ensure the scores are lists and check before assignment
|
95 |
series_data = [lst if isinstance(lst, list) else [lst] for lst in scores[category]]
|
96 |
df[f'{category}_Scores'] = series_data
|
97 |
|
98 |
# Calculate the average score with additional debug info
|
99 |
|
|
|
100 |
df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
|
101 |
|
102 |
# Add ranks for each score within each row
|
@@ -107,4 +107,3 @@ def process_scores_multiple(df, num_run, parameters, privilege_label, protect_la
|
|
107 |
df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
|
108 |
|
109 |
return df
|
110 |
-
|
|
|
5 |
import pandas as pd
|
6 |
from tqdm import tqdm
|
7 |
|
8 |
+
def create_summary(group_name, label, occupation, row, proportion,template):
|
|
|
9 |
"""Generate a dynamic summary for scoring the applicant, excluding the group feature.
|
10 |
The occupation parameter allows customization of the job position.
|
11 |
"""
|
12 |
|
13 |
resume_info = row['Cleaned_Resume']
|
14 |
+
resume_info = resume_info[:int(len(resume_info) * proportion)]
|
15 |
|
16 |
info = f"{group_name}: {label};" if label else ''
|
17 |
|
|
|
24 |
return summary
|
25 |
|
26 |
|
27 |
+
def invoke_retry(prompt, agent, parameters,string_input=False):
|
28 |
attempts = 0
|
29 |
delay = 2 # Initial delay in seconds
|
30 |
max_attempts = 5 # Maximum number of retry attempts
|
|
|
32 |
while attempts < max_attempts:
|
33 |
try:
|
34 |
score_text = agent.invoke(prompt, **parameters)
|
35 |
+
print(f"Prompt: {prompt}")
|
36 |
+
print(f"Score text: {score_text}")
|
37 |
+
print("=============================================================")
|
38 |
if string_input:
|
39 |
return score_text
|
40 |
try:
|
41 |
score_json = json.loads(score_text)
|
42 |
except json.JSONDecodeError:
|
43 |
try:
|
44 |
+
score_json = json.loads(json_repair.repair_json(score_text, skip_json_loads=True, return_objects=False))
|
|
|
45 |
except json.JSONDecodeError:
|
46 |
raise Exception("Failed to decode JSON response even after repair attempt.")
|
47 |
# score = re.search(r'\d+', score_text)
|
48 |
# return int(score.group()) if score else -1
|
49 |
+
print(f"Score JSON: {score_json}")
|
50 |
return int(score_json['Score'])
|
51 |
|
52 |
except Exception as e:
|
|
|
56 |
attempts += 1
|
57 |
|
58 |
return -1
|
59 |
+
#raise Exception("Failed to complete the API call after maximum retry attempts.")
|
60 |
|
61 |
|
62 |
def calculate_avg_score(score_list):
|
|
|
66 |
avg_score = sum(valid_scores) / len(valid_scores)
|
67 |
return avg_score
|
68 |
return None
|
69 |
+
def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation,proportion,template):
|
70 |
|
|
|
|
|
|
|
71 |
print(f"Processing {len(df)} entries with {num_run} runs each.")
|
72 |
""" Process entries and compute scores concurrently, with progress updates. """
|
73 |
+
scores = {key: [[] for _ in range(len(df))] for key in ['Privilege','Protect','Neutral']}
|
74 |
|
75 |
for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
|
76 |
for index, (idx, row) in tqdm(enumerate(df.iterrows()), total=len(df), desc="Processing entries", unit="entry"):
|
77 |
|
78 |
for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
|
|
|
79 |
|
80 |
+
prompt_normal = create_summary(group_name, label, occupation,row,proportion,template)
|
81 |
+
|
82 |
+
print(f"Run {run + 1} - Entry {index + 1} - {key}")
|
83 |
+
print("=============================================================")
|
84 |
result_normal = invoke_retry(prompt_normal, agent, parameters)
|
85 |
scores[key][index].append(result_normal)
|
86 |
|
87 |
+
print(f"Scores: {scores}")
|
88 |
+
|
89 |
|
90 |
# Ensure all scores are lists and calculate average scores
|
91 |
+
for category in ['Privilege', 'Protect','Neutral']:
|
92 |
+
|
93 |
# Ensure the scores are lists and check before assignment
|
94 |
series_data = [lst if isinstance(lst, list) else [lst] for lst in scores[category]]
|
95 |
df[f'{category}_Scores'] = series_data
|
96 |
|
97 |
# Calculate the average score with additional debug info
|
98 |
|
99 |
+
|
100 |
df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
|
101 |
|
102 |
# Add ranks for each score within each row
|
|
|
107 |
df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
|
108 |
|
109 |
return df
|
|
util/model.py
CHANGED
@@ -1,49 +1,6 @@
|
|
1 |
import json
|
2 |
import http.client
|
3 |
from openai import AzureOpenAI
|
4 |
-
import time
|
5 |
-
from tqdm import tqdm
|
6 |
-
from typing import Any, List
|
7 |
-
from botocore.exceptions import ClientError
|
8 |
-
from enum import Enum
|
9 |
-
import boto3
|
10 |
-
import json
|
11 |
-
import logging
|
12 |
-
|
13 |
-
|
14 |
-
class Model(Enum):
|
15 |
-
CLAUDE3_SONNET = "anthropic.claude-3-sonnet-20240229-v1:0"
|
16 |
-
CLAUDE3_HAIKU = "anthropic.claude-3-haiku-20240307-v1:0"
|
17 |
-
|
18 |
-
|
19 |
-
class Claude3Agent:
|
20 |
-
def __init__(self, aws_secret_access_key: str,model: str ):
|
21 |
-
self.client = boto3.client("bedrock-runtime", region_name="us-east-1", aws_access_key_id="AKIAZR6ZJPKTKJAMLP5W",
|
22 |
-
aws_secret_access_key=aws_secret_access_key)
|
23 |
-
if model == "SONNET":
|
24 |
-
self.model = Model.CLAUDE3_SONNET
|
25 |
-
elif model == "HAIKU":
|
26 |
-
self.model = Model.CLAUDE3_HAIKU
|
27 |
-
else:
|
28 |
-
raise ValueError("Invalid model type. Please choose from 'SONNET' or 'HAIKU' models.")
|
29 |
-
|
30 |
-
def invoke(self, text: str,**kwargs) -> str:
|
31 |
-
try:
|
32 |
-
body = json.dumps(
|
33 |
-
{
|
34 |
-
"anthropic_version": "bedrock-2023-05-31",
|
35 |
-
"messages": [
|
36 |
-
{"role": "user", "content": [{"type": "text", "text": text}]}
|
37 |
-
],
|
38 |
-
**kwargs
|
39 |
-
}
|
40 |
-
)
|
41 |
-
response = self.client.invoke_model(modelId=self.model.value, body=body)
|
42 |
-
completion = json.loads(response["body"].read())["content"][0]["text"]
|
43 |
-
return completion
|
44 |
-
except ClientError:
|
45 |
-
logging.error("Couldn't invoke model")
|
46 |
-
raise
|
47 |
|
48 |
class ContentFormatter:
|
49 |
@staticmethod
|
@@ -96,4 +53,3 @@ class GPTAgent:
|
|
96 |
**kwargs
|
97 |
)
|
98 |
return response.choices[0].message.content
|
99 |
-
|
|
|
1 |
import json
|
2 |
import http.client
|
3 |
from openai import AzureOpenAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
class ContentFormatter:
|
6 |
@staticmethod
|
|
|
53 |
**kwargs
|
54 |
)
|
55 |
return response.choices[0].message.content
|
|