Zekun Wu committed on
Commit 24180f4
1 Parent(s): 8e6aee2
pages/2_Injection_Multiple.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.generation import process_scores_multiple
+from util.generation import process_scores
 from util.model import AzureAgent, GPTAgent
 
 # Set up the Streamlit interface
@@ -74,9 +74,9 @@ if st.session_state.model_submitted:
     # Process data and display results
     with st.spinner('Processing data...'):
         parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-        df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
+        df = process_scores(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
                                      st.session_state.protect_label, agent, st.session_state.group_name,
-                                     st.session_state.occupation)
+                                     st.session_state.occupation, test_type='multiple')
         st.session_state.data_processed = True # Mark as processed
 
         st.write('Processed Data:', df)
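For reference, a minimal usage sketch for the multiple-scenario page. It is illustrative only and not part of the commit: it assumes the three labels are packed into a list so they pair up with the ['Privilege', 'Protect', 'Neutral'] categories that process_scores zips its labels argument against (the None entry for Neutral mirrors the removed process_scores_multiple).

# Hypothetical sketch, not part of the commit: pack the labels into a list so
# zip(categories, labels) inside process_scores pairs each category with its label.
labels = [st.session_state.privilege_label, st.session_state.protect_label, None]
df = process_scores(df, st.session_state.num_run, parameters, labels,
                    agent, st.session_state.group_name,
                    st.session_state.occupation, test_type='multiple')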
pages/4_Injection_Single.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.generation import process_scores_single
+from util.generation import process_scores
 from util.model import AzureAgent, GPTAgent
 
 # Set up the Streamlit interface
@@ -73,9 +73,9 @@ if st.session_state.model_submitted:
     # Process data and display results
     with st.spinner('Processing data...'):
         parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-        df = process_scores_single(df, st.session_state.num_run, parameters, st.session_state.counterfactual_label,
+        df = process_scores(df, st.session_state.num_run, parameters, st.session_state.counterfactual_label,
                                    agent, st.session_state.group_name,
-                                   st.session_state.occupation)
+                                   st.session_state.occupation, test_type='multiple')
         st.session_state.data_processed = True # Mark as processed
 
         st.write('Processed Data:', df)
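A corresponding sketch for the single-scenario page, under the same assumptions and likewise not part of the commit: the counterfactual label is assumed to be paired with None for the neutral baseline, and the two-category mode is assumed to be selected with test_type='single'.

# Hypothetical sketch, not part of the commit: the single flow pairs the
# counterfactual label with None and selects the two-category mode.
labels = [st.session_state.counterfactual_label, None]
df = process_scores(df, st.session_state.num_run, parameters, labels,
                    agent, st.session_state.group_name,
                    st.session_state.occupation, test_type='single')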
util/analysis.py CHANGED
@@ -3,232 +3,6 @@ import numpy as np
 from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
 from statsmodels.stats.multicomp import MultiComparison
 
-def statistical_tests_multiple(data):
-    # Calculate average ranks
-    average_ranks = data[['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']].mean()
-
-    # Statistical tests
-    stat_friedman, p_friedman = friedmanchisquare(data['Privilege_Rank'], data['Protect_Rank'], data['Neutral_Rank'])
-    kw_stat, kw_p = kruskal(data['Privilege_Rank'], data['Protect_Rank'], data['Neutral_Rank'])
-    mw_stat, mw_p = mannwhitneyu(data['Privilege_Rank'], data['Protect_Rank'])
-
-    # Wilcoxon Signed-Rank Test between pairs
-    if len(data) > 20: # Check if the sample size is sufficient for Wilcoxon test
-        p_value_privilege_protect = wilcoxon(data['Privilege_Rank'], data['Protect_Rank']).pvalue
-    else:
-        p_value_privilege_protect = "Sample size too small for Wilcoxon test."
-
-    # Levene's Test for equality of variances
-    levene_stat, levene_p = levene(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])
-
-    # T-test for independent samples (Privilege vs Protect)
-    if levene_p > 0.05: # Assume equal variances if Levene's test is not significant
-        t_stat, t_p = ttest_ind(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], equal_var=True)
-    else:
-        t_stat, t_p = ttest_ind(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], equal_var=False)
-
-    # ANOVA and post-hoc tests if applicable
-    anova_stat, anova_p = f_oneway(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], data['Neutral_Avg_Score'])
-    if anova_p < 0.05:
-        mc = MultiComparison(
-            pd.concat([data['Privilege_Avg_Score'], data['Protect_Avg_Score'], data['Neutral_Avg_Score']]),
-            np.repeat(['Privilege', 'Protect', 'Neutral'], len(data)))
-        tukey_result = mc.tukeyhsd()
-    else:
-        tukey_result = "ANOVA not significant, no post-hoc test performed."
-
-    results = {
-        "Average Ranks": average_ranks,
-        "Friedman Test": {"Statistic": stat_friedman, "p-value": p_friedman},
-        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-        "Wilcoxon Test Between Privilege and Protect": p_value_privilege_protect,
-        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
-        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-        "Tukey HSD Test": tukey_result
-    }
-
-    return results
-
-
-def result_evaluation_multiple(test_results):
-    evaluation = {}
-
-    # Average Ranks: Provide insights based on the ranking
-    evaluation['Average Ranks'] = "Privilege: {:.2f}, Protect: {:.2f}, Neutral: {:.2f}".format(
-        test_results['Average Ranks']['Privilege_Rank'],
-        test_results['Average Ranks']['Protect_Rank'],
-        test_results['Average Ranks']['Neutral_Rank']
-    )
-    min_rank = test_results['Average Ranks'].idxmin()
-    max_rank = test_results['Average Ranks'].idxmax()
-    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    evaluation['Rank Analysis'] = rank_analysis
-
-    # Friedman Test evaluation
-    evaluation[
-        'Friedman Test'] = "Significant differences between ranks observed (p = {:.5f}), suggesting potential bias.".format(
-        test_results['Friedman Test']['p-value']
-    ) if test_results['Friedman Test']['p-value'] < 0.05 else "No significant differences between ranks."
-
-    # Kruskal-Wallis Test evaluation
-    evaluation[
-        'Kruskal-Wallis Test'] = "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(
-        test_results['Kruskal-Wallis Test']['p-value']
-    ) if test_results['Kruskal-Wallis Test']['p-value'] < 0.05 else "No significant differences among groups."
-
-    # Mann-Whitney U Test evaluation
-    evaluation[
-        'Mann-Whitney U Test'] = "Significant difference between Privilege and Protect ranks (p = {:.5f}), suggesting bias.".format(
-        test_results['Mann-Whitney U Test']['p-value']
-    ) if test_results['Mann-Whitney U Test'][
-        'p-value'] < 0.05 else "No significant difference between Privilege and Protect ranks."
-
-    # Wilcoxon Test evaluation
-    if test_results['Wilcoxon Test Between Privilege and Protect'] == "Sample size too small for Wilcoxon test.":
-        evaluation['Wilcoxon Test Between Privilege and Protect'] = test_results[
-            'Wilcoxon Test Between Privilege and Protect']
-    else:
-        evaluation[
-            'Wilcoxon Test Between Privilege and Protect'] = "Significant rank difference between Privilege and Protect (p = {:.5f}), indicating bias.".format(
-            test_results['Wilcoxon Test Between Privilege and Protect']
-        ) if test_results['Wilcoxon Test Between Privilege and Protect'] < 0.05 else "No significant rank difference between Privilege and Protect."
-
-    # Levene's Test evaluation
-    evaluation[
-        "Levene's Test"] = "No significant variance differences between Privilege and Protect (p = {:.5f}).".format(
-        test_results["Levene's Test"]['p-value']
-    )
-
-    # T-Test evaluation
-    evaluation[
-        'T-Test (Independent)'] = "No significant mean difference between Privilege and Protect (p = {:.5f}).".format(
-        test_results['T-Test (Independent)']['p-value']
-    )
-
-    # ANOVA Test evaluation
-    evaluation[
-        'ANOVA Test'] = "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(
-        test_results['ANOVA Test']['p-value']
-    )
-
-    # Tukey HSD Test evaluation
-    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
-
-    return evaluation
-
-def statistical_tests_single(data):
-    # Calculate average ranks
-    average_ranks = data[['Counterfactual_Rank','Neutral_Rank']].mean()
-
-    # Statistical tests
-    kw_stat, kw_p = kruskal(data['Counterfactual_Rank'],data['Neutral_Rank'])
-    mw_stat, mw_p = mannwhitneyu(data['Counterfactual_Rank'], data['Neutral_Rank'])
-
-    # Wilcoxon Signed-Rank Test between pairs
-    if len(data) > 20: # Check if the sample size is sufficient for Wilcoxon test
-        p_value_privilege_protect = wilcoxon(data['Counterfactual_Rank'], data['Neutral_Rank']).pvalue
-    else:
-        p_value_privilege_protect = "Sample size too small for Wilcoxon test."
-
-    # Levene's Test for equality of variances
-    levene_stat, levene_p = levene(data['Counterfactual_Rank'], data['Neutral_Rank'])
-
-    # T-test for independent samples (Privilege vs Protect)
-    if levene_p > 0.05: # Assume equal variances if Levene's test is not significant
-        t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=True)
-    else:
-        t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=False)
-
-    # ANOVA and post-hoc tests if applicable
-    anova_stat, anova_p = f_oneway(data['Counterfactual_Rank'], data['Neutral_Rank'])
-    if anova_p < 0.05:
-        mc = MultiComparison(
-            pd.concat([data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score']]),
-            np.repeat(['Counterfactual', 'Neutral'], len(data)))
-        tukey_result = mc.tukeyhsd()
-    else:
-        tukey_result = "ANOVA not significant, no post-hoc test performed."
-
-    results = {
-        "Average Ranks": average_ranks,
-        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-        "Wilcoxon Test Between Counterfactual and Neutral": p_value_privilege_protect,
-        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
-        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-        "Tukey HSD Test": tukey_result
-    }
-
-    return results
-
-
-def result_evaluation_single(test_results):
-    evaluation = {}
-
-    # Average Ranks: Provide insights based on the ranking
-    evaluation['Average Ranks'] = "Counterfactual: {:.2f}, Neutral: {:.2f}".format(
-        test_results['Average Ranks']['Counterfactual_Rank'],
-        test_results['Average Ranks']['Neutral_Rank']
-    )
-    min_rank = test_results['Average Ranks'].idxmin()
-    max_rank = test_results['Average Ranks'].idxmax()
-    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    evaluation['Rank Analysis'] = rank_analysis
-
-    # Kruskal-Wallis Test evaluation
-    evaluation[
-        'Kruskal-Wallis Test'] = "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(
-        test_results['Kruskal-Wallis Test']['p-value']
-    ) if test_results['Kruskal-Wallis Test']['p-value'] < 0.05 else "No significant differences among groups."
-
-    # Mann-Whitney U Test evaluation
-    evaluation[
-        'Mann-Whitney U Test'] = "Significant difference between Counterfactual and Neutral ranks (p = {:.5f}), suggesting bias.".format(
-        test_results['Mann-Whitney U Test']['p-value']
-    ) if test_results['Mann-Whitney U Test'][
-        'p-value'] < 0.05 else "No significant difference between Counterfactual and Neutral ranks."
-
-    # Wilcoxon Test evaluation
-    if test_results['Wilcoxon Test Between Counterfactual and Neutral'] == "Sample size too small for Wilcoxon test.":
-        evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = test_results[
-            'Wilcoxon Test Between Counterfactual and Neutral']
-    else:
-        evaluation[
-            'Wilcoxon Test Between Counterfactual and Neutral'] = "Significant rank difference between Counterfactual and Neutral (p = {:.5f}), indicating bias.".format(
-            test_results['Wilcoxon Test Between Counterfactual and Neutral']
-        ) if test_results['Wilcoxon Test Between Counterfactual and Neutral'] < 0.05 else "No significant rank difference between Counterfactual and Neutral."
-
-    # Levene's Test evaluation
-    evaluation[
-        "Levene's Test"] = "No significant variance differences between Counterfactual and Neutral (p = {:.5f}).".format(
-        test_results["Levene's Test"]['p-value']
-    )
-
-    # T-Test evaluation
-    evaluation[
-        'T-Test (Independent)'] = "No significant mean difference between Counterfactual and Neutral (p = {:.5f}).".format(
-        test_results['T-Test (Independent)']['p-value']
-    )
-
-    # ANOVA Test evaluation
-    evaluation[
-        'ANOVA Test'] = "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(
-        test_results['ANOVA Test']['p-value']
-    )
-
-    # Tukey HSD Test evaluation
-    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
-
-    return evaluation
-
-
-
-
-
-
 def statistical_tests(data, test_type='multiple'):
     if test_type == 'multiple':
         variables = ['Privilege', 'Protect', 'Neutral']
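The removed helpers expected per-group *_Rank and *_Avg_Score columns. A toy call against the retained, consolidated runner might look like the sketch below; the column naming follows the removed functions, while the return structure of statistical_tests is assumed rather than confirmed by this diff.

# Hypothetical sketch, not part of the commit: column names follow the removed
# functions; the return value of statistical_tests is assumed to be a dict
# keyed by test name, as the removed per-mode variants were.
import pandas as pd

toy = pd.DataFrame({
    'Privilege_Rank': [1, 2, 1, 3], 'Protect_Rank': [2, 1, 3, 1], 'Neutral_Rank': [3, 3, 2, 2],
    'Privilege_Avg_Score': [4.5, 4.0, 4.2, 3.9],
    'Protect_Avg_Score': [3.8, 4.1, 3.5, 4.0],
    'Neutral_Avg_Score': [4.0, 3.9, 4.1, 3.7],
})
results = statistical_tests(toy, test_type='multiple')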
 
util/generation.py CHANGED
@@ -55,46 +55,34 @@ def invoke_retry(prompt,agent,parameters):
 
     raise Exception("Failed to complete the API call after maximum retry attempts.")
 
-def process_scores_multiple(df, num_run,parameters,privilege_label,protect_label,agent,group_name,occupation):
-    """ Process entries and compute scores concurrently, with progress updates. """
-    scores = {key: [[] for _ in range(len(df))] for key in ['Privilege', 'Protect', 'Neutral']}
-
-    for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
-        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing entries", unit="entry"):
-            for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, None]):
-                prompt_temp = create_summary(row,group_name,label,occupation)
-                # print(f"Run {run + 1} - Entry {index + 1} - {key}:\n{prompt_temp}")
-                # print("=============================================================")
-                result = invoke_retry(prompt_temp,agent,parameters)
-                scores[key][index].append(result)
-
-    # Assign score lists and calculate average scores
-    for category in ['Privilege', 'Protect', 'Neutral']:
-        df[f'{category}_Scores'] = pd.Series([lst for lst in scores[category]])
-        df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(
-            lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
-        )
-
-    return df
+def process_scores(df, num_run, parameters, labels, agent, group_name, occupation, test_type='multiple'):
+    """
+    Process entries and compute scores concurrently, with progress updates.
+    Accepts test_type to switch between 'multiple' and 'single' processing modes.
+    """
+    if test_type == 'multiple':
+        categories = ['Privilege', 'Protect', 'Neutral']
+    elif test_type == 'single':
+        categories = ['Counterfactual', 'Neutral']
+    else:
+        raise ValueError("test_type must be either 'multiple' or 'single'")
 
-def process_scores_single(df, num_run,parameters,counterfactual_label,agent,group_name,occupation):
-    """ Process entries and compute scores concurrently, with progress updates. """
-    scores = {key: [[] for _ in range(len(df))] for key in ['Counterfactual', 'Neutral']}
+    # Initialize scores dictionary
+    scores = {category: [[] for _ in range(len(df))] for category in categories}
 
+    # Processing loop
     for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
         for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing entries", unit="entry"):
-            for key, label in zip(['Counterfactual', 'Neutral'], [counterfactual_label, None]):
-                prompt_temp = create_summary(row,group_name,label,occupation)
-                # print(f"Run {run + 1} - Entry {index + 1} - {key}:\n{prompt_temp}")
-                # print("=============================================================")
-                result = invoke_retry(prompt_temp,agent,parameters)
-                scores[key][index].append(result)
+            for category, label in zip(categories, labels):
+                prompt_temp = create_summary(row, group_name, label, occupation)
+                result = invoke_retry(prompt_temp, agent, parameters)
+                scores[category][index].append(result)
 
     # Assign score lists and calculate average scores
-    for category in ['Counterfactual', 'Neutral']:
+    for category in categories:
         df[f'{category}_Scores'] = pd.Series([lst for lst in scores[category]])
         df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(
            lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
        )
 
-    return df
+    return df
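One behaviour the consolidated helper keeps from both removed functions is the averaging lambda: results that come back as None (failed calls) are dropped from the numerator but still counted in the denominator. A small self-contained check of that behaviour:

# Mirrors the lambda above: None results are excluded from the sum,
# but len(scores) still counts them, so a failed run lowers the average.
scores = [4.0, None, 5.0, 3.0]
avg = sum(s for s in scores if s is not None) / len(scores) if scores else None
print(avg)  # 3.0 rather than 4.0, because the None run stays in the denominator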