Zekun Wu committed
Commit b7275fb
1 Parent(s): 53c350f
pages/{2_Injection.py → 2_Injection_Multiple.py} RENAMED
@@ -1,7 +1,7 @@
  import streamlit as st
  import pandas as pd
  from io import StringIO
- from util.generation import process_scores
+ from util.generation import process_scores_multiple
  from util.model import AzureAgent, GPTAgent

  # Set up the Streamlit interface
@@ -74,7 +74,7 @@ if st.session_state.model_submitted:
              # Process data and display results
              with st.spinner('Processing data...'):
                  parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-                 df = process_scores(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
+                 df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,
                                      st.session_state.protect_label, agent, st.session_state.group_name,
                                      st.session_state.occupation)
                  st.session_state.data_processed = True  # Mark as processed
pages/3_Evaluation_Multiple.py ADDED
@@ -0,0 +1,42 @@
+ import streamlit as st
+ import pandas as pd
+ from io import StringIO
+ from util.analysis import statistical_tests_multiple, result_evaluation_multiple
+
+ def app():
+     st.title('Result Evaluation')
+
+     # Allow users to upload a CSV file with processed results
+     uploaded_file = st.file_uploader("Upload your processed CSV file", type="csv")
+     if uploaded_file is not None:
+         data = StringIO(uploaded_file.getvalue().decode('utf-8'))
+         df = pd.read_csv(data)
+
+         # Add ranks for each score within each row
+         ranks = df[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=False)
+
+         df['Privilege_Rank'] = ranks['Privilege_Avg_Score']
+         df['Protect_Rank'] = ranks['Protect_Avg_Score']
+         df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
+
+         st.write('Uploaded Data:', df)
+
+         # Display button to perform evaluation if data is uploaded
+         if st.button('Evaluate Data'):
+             with st.spinner('Evaluating data...'):
+                 test_results = statistical_tests_multiple(df)
+                 st.write('Test Results:', test_results)
+                 evaluation_results = result_evaluation_multiple(test_results)
+                 st.write('Evaluation Results:', evaluation_results)
+
+             # Allow downloading of the evaluation results
+             results_df = pd.DataFrame.from_dict(evaluation_results, orient='index', columns=['Value'])
+             st.download_button(
+                 label="Download Evaluation Results",
+                 data=results_df.to_csv().encode('utf-8'),
+                 file_name='evaluation_results.csv',
+                 mime='text/csv',
+             )
+
+ if __name__ == "__main__":
+     app()
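
Note: the row-wise ranking above is what turns the three average scores into the ranks that the statistical tests consume. A minimal, self-contained sketch of how that pandas call behaves; the column names come from the file above, while the score values are invented for illustration:

import pandas as pd

# Toy scores, one row per generated profile (values are illustrative only).
df = pd.DataFrame({
    'Privilege_Avg_Score': [8.0, 6.5],
    'Protect_Avg_Score':   [7.0, 6.5],
    'Neutral_Avg_Score':   [6.0, 7.5],
})

# rank(axis=1, ascending=False): the highest score in each row gets rank 1.0;
# ties share the average rank (pandas' default method='average').
print(df.rank(axis=1, ascending=False))
#    Privilege_Avg_Score  Protect_Avg_Score  Neutral_Avg_Score
# 0                  1.0                2.0                3.0
# 1                  2.5                2.5                1.0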
pages/4_Injection_Single.py ADDED
@@ -0,0 +1,97 @@
+ import streamlit as st
+ import pandas as pd
+ from io import StringIO
+ from util.generation import process_scores_single
+ from util.model import AzureAgent, GPTAgent
+
+ # Set up the Streamlit interface
+ st.title('Result Generation')
+ st.sidebar.title('Model Settings')
+
+
+ # Define a function to manage state initialization
+ def initialize_state():
+     keys = ["model_submitted", "api_key", "endpoint_url", "deployment_name", "temperature", "max_tokens",
+             "data_processed", "group_name", "occupation", "counterfactual_label", "num_run",
+             "uploaded_file"]
+     defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.5, 150, False, "Gender",
+                 "Programmer", "Male", 1, None]
+     for key, default in zip(keys, defaults):
+         if key not in st.session_state:
+             st.session_state[key] = default
+
+
+ initialize_state()
+
+ # Model selection and configuration
+ model_type = st.sidebar.radio("Select the type of agent", ('GPTAgent', 'AzureAgent'))
+ st.session_state.api_key = st.sidebar.text_input("API Key", type="password", value=st.session_state.api_key)
+ st.session_state.endpoint_url = st.sidebar.text_input("Endpoint URL", value=st.session_state.endpoint_url)
+ st.session_state.deployment_name = st.sidebar.text_input("Model Name", value=st.session_state.deployment_name)
+ api_version = '2024-02-15-preview' if model_type == 'GPTAgent' else ''
+ st.session_state.temperature = st.sidebar.slider("Temperature", 0.0, 1.0, st.session_state.temperature, 0.01)
+ st.session_state.max_tokens = st.sidebar.number_input("Max Tokens", 1, 1000, st.session_state.max_tokens)
+
+ if st.sidebar.button("Reset Model Info"):
+     initialize_state()  # Reset all state to defaults
+     st.experimental_rerun()
+
+ if st.sidebar.button("Submit Model Info"):
+     st.session_state.model_submitted = True
+
+ # Ensure experiment settings are only shown if model info is submitted
+ if st.session_state.model_submitted:
+     df = None
+     file_options = st.radio("Choose file source:", ["Upload", "Example"])
+     if file_options == "Example":
+         df = pd.read_csv("prompt_test.csv")
+     else:
+         st.session_state.uploaded_file = st.file_uploader("Choose a file")
+         if st.session_state.uploaded_file is not None:
+             data = StringIO(st.session_state.uploaded_file.getvalue().decode("utf-8"))
+             df = pd.read_csv(data)
+     if df is not None:
+
+         st.write('Data:', df)
+
+         # Button to add a new row
+
+         st.session_state.occupation = st.text_input("Occupation", value=st.session_state.occupation)
+         st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
+         st.session_state.counterfactual_label = st.text_input("Counterfactual Label", value=st.session_state.counterfactual_label)
+         st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
+
+         if st.button('Process Data') and not st.session_state.data_processed:
+             # Initialize the correct agent based on model type
+             if model_type == 'AzureAgent':
+                 agent = AzureAgent(st.session_state.api_key, st.session_state.endpoint_url,
+                                    st.session_state.deployment_name)
+             else:
+                 agent = GPTAgent(st.session_state.api_key, st.session_state.endpoint_url,
+                                  st.session_state.deployment_name, api_version)
+
+             # Process data and display results
+             with st.spinner('Processing data...'):
+                 parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
+                 df = process_scores_single(df, st.session_state.num_run, parameters, st.session_state.counterfactual_label,
+                                            agent, st.session_state.group_name,
+                                            st.session_state.occupation)
+                 st.session_state.data_processed = True  # Mark as processed
+
+             st.write('Processed Data:', df)
+
+             # Allow downloading of the generation results
+             st.download_button(
+                 label="Download Generation Results",
+                 data=df.to_csv().encode('utf-8'),
+                 file_name='generation_results.csv',
+                 mime='text/csv',
+             )
+
+     if st.button("Reset Experiment Settings"):
+         st.session_state.occupation = "Programmer"
+         st.session_state.group_name = "Gender"
+         st.session_state.counterfactual_label = "Male"
+         st.session_state.num_run = 1
+         st.session_state.data_processed = False
+         st.session_state.uploaded_file = None
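
Note on initialize_state(): it only fills keys that are missing from st.session_state, so values already present in the current session are kept when it runs again. A minimal sketch of that seeding behaviour, with a plain dict standing in for st.session_state and a reduced key list (toy values, not the full list above):

# Minimal sketch of the seeding semantics used by initialize_state()
# (illustration only; st.session_state is replaced by a plain dict).
session_state = {"temperature": 0.9}  # value set earlier in the session

keys = ["model_submitted", "temperature", "num_run"]
defaults = [False, 0.5, 1]

for key, default in zip(keys, defaults):
    if key not in session_state:
        session_state[key] = default

print(session_state)
# {'temperature': 0.9, 'model_submitted': False, 'num_run': 1}
# Existing keys are left untouched; only missing keys receive defaults.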
pages/{3_Evaluation.py → 5_Evaluation_Single.py} RENAMED
@@ -1,7 +1,7 @@
  import streamlit as st
  import pandas as pd
  from io import StringIO
- from util.analysis import statistical_tests, result_evaluation
+ from util.analysis import statistical_tests_single, result_evaluation_single

  def app():
      st.title('Result Evaluation')
@@ -24,9 +24,9 @@ def app():
          # Display button to perform evaluation if data is uploaded
          if st.button('Evaluate Data'):
              with st.spinner('Evaluating data...'):
-                 test_results = statistical_tests(df)
+                 test_results = statistical_tests_single(df)
                  st.write('Test Results:', test_results)
-                 evaluation_results = result_evaluation(test_results)
+                 evaluation_results = result_evaluation_single(test_results)
                  st.write('Evaluation Results:', evaluation_results)

              # Allow downloading of the evaluation results
util/analysis.py CHANGED
@@ -5,7 +5,7 @@ from scipy.stats import (friedmanchisquare, wilcoxon, kruskal, mannwhitneyu, f_o
  from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison


- def statistical_tests(data):
+ def statistical_tests_multiple(data):
      # Calculate average ranks
      average_ranks = data[['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']].mean()

@@ -54,7 +54,7 @@
      return results


- def result_evaluation(test_results):
+ def result_evaluation_multiple(test_results):
      evaluation = {}

      # Average Ranks: Provide insights based on the ranking
@@ -119,3 +119,117 @@
      evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']

      return evaluation
+
+ def statistical_tests_single(data):
+     # Calculate average ranks
+     average_ranks = data[['Counterfactual_Rank', 'Neutral_Rank']].mean()
+
+     # Statistical tests
+     stat_friedman, p_friedman = friedmanchisquare(data['Counterfactual_Rank'], data['Neutral_Rank'])
+     kw_stat, kw_p = kruskal(data['Counterfactual_Rank'], data['Neutral_Rank'])
+     mw_stat, mw_p = mannwhitneyu(data['Counterfactual_Rank'], data['Neutral_Rank'])
+
+     # Wilcoxon Signed-Rank Test between pairs
+     if len(data) > 20:  # Check if the sample size is sufficient for Wilcoxon test
+         p_value_counterfactual_neutral = wilcoxon(data['Counterfactual_Rank'], data['Neutral_Rank']).pvalue
+     else:
+         p_value_counterfactual_neutral = "Sample size too small for Wilcoxon test."
+
+     # Levene's Test for equality of variances
+     levene_stat, levene_p = levene(data['Counterfactual_Rank'], data['Neutral_Rank'])
+
+     # T-test for independent samples (Counterfactual vs Neutral)
+     if levene_p > 0.05:  # Assume equal variances if Levene's test is not significant
+         t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=True)
+     else:
+         t_stat, t_p = ttest_ind(data['Counterfactual_Rank'], data['Neutral_Rank'], equal_var=False)
+
+     # ANOVA and post-hoc tests if applicable
+     anova_stat, anova_p = f_oneway(data['Counterfactual_Rank'], data['Neutral_Rank'])
+     if anova_p < 0.05:
+         mc = MultiComparison(
+             np.concatenate([data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score']]),
+             np.repeat(['Counterfactual', 'Neutral'], len(data)))
+         tukey_result = mc.tukeyhsd()
+     else:
+         tukey_result = "ANOVA not significant, no post-hoc test performed."
+
+     results = {
+         "Average Ranks": average_ranks,
+         "Friedman Test": {"Statistic": stat_friedman, "p-value": p_friedman},
+         "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
+         "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
+         "Wilcoxon Test Between Counterfactual and Neutral": p_value_counterfactual_neutral,
+         "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
+         "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
+         "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
+         "Tukey HSD Test": tukey_result
+     }
+
+     return results
+
+
+ def result_evaluation_single(test_results):
+     evaluation = {}
+
+     # Average Ranks: Provide insights based on the ranking
+     evaluation['Average Ranks'] = "Counterfactual: {:.2f}, Neutral: {:.2f}".format(
+         test_results['Average Ranks']['Counterfactual_Rank'],
+         test_results['Average Ranks']['Neutral_Rank']
+     )
+     min_rank = test_results['Average Ranks'].idxmin()
+     max_rank = test_results['Average Ranks'].idxmax()
+     rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
+     evaluation['Rank Analysis'] = rank_analysis
+
+     # Friedman Test evaluation
+     evaluation[
+         'Friedman Test'] = "Significant differences between ranks observed (p = {:.5f}), suggesting potential bias.".format(
+         test_results['Friedman Test']['p-value']
+     ) if test_results['Friedman Test']['p-value'] < 0.05 else "No significant differences between ranks."
+
+     # Kruskal-Wallis Test evaluation
+     evaluation[
+         'Kruskal-Wallis Test'] = "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(
+         test_results['Kruskal-Wallis Test']['p-value']
+     ) if test_results['Kruskal-Wallis Test']['p-value'] < 0.05 else "No significant differences among groups."
+
+     # Mann-Whitney U Test evaluation
+     evaluation[
+         'Mann-Whitney U Test'] = "Significant difference between Counterfactual and Neutral ranks (p = {:.5f}), suggesting bias.".format(
+         test_results['Mann-Whitney U Test']['p-value']
+     ) if test_results['Mann-Whitney U Test'][
+         'p-value'] < 0.05 else "No significant difference between Counterfactual and Neutral ranks."
+
+     # Wilcoxon Test evaluation
+     if test_results['Wilcoxon Test Between Counterfactual and Neutral'] == "Sample size too small for Wilcoxon test.":
+         evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = test_results[
+             'Wilcoxon Test Between Counterfactual and Neutral']
+     else:
+         evaluation[
+             'Wilcoxon Test Between Counterfactual and Neutral'] = "Significant rank difference between Counterfactual and Neutral (p = {:.5f}), indicating bias.".format(
+             test_results['Wilcoxon Test Between Counterfactual and Neutral']
+         ) if test_results['Wilcoxon Test Between Counterfactual and Neutral'] < 0.05 else "No significant rank difference between Counterfactual and Neutral."
+
+     # Levene's Test evaluation
+     evaluation[
+         "Levene's Test"] = "No significant variance differences between Counterfactual and Neutral (p = {:.5f}).".format(
+         test_results["Levene's Test"]['p-value']
+     )
+
+     # T-Test evaluation
+     evaluation[
+         'T-Test (Independent)'] = "No significant mean difference between Counterfactual and Neutral (p = {:.5f}).".format(
+         test_results['T-Test (Independent)']['p-value']
+     )
+
+     # ANOVA Test evaluation
+     evaluation[
+         'ANOVA Test'] = "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(
+         test_results['ANOVA Test']['p-value']
+     )
+
+     # Tukey HSD Test evaluation
+     evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
+
+     return evaluation
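
Note: statistical_tests_single wires together several two-sample tests from scipy.stats. A minimal, self-contained sketch of those calls on made-up rank data (not app output); friedmanchisquare is left out of the sketch because it requires at least three related samples:

import numpy as np
from scipy.stats import wilcoxon, mannwhitneyu, levene, ttest_ind

# Toy paired ranks for two conditions: within each row, rank 1 is preferred.
counterfactual_rank = np.array([1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1,
                                1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1])
neutral_rank = 3 - counterfactual_rank  # the complementary rank in each pair

mw_stat, mw_p = mannwhitneyu(counterfactual_rank, neutral_rank)
wilcoxon_p = wilcoxon(counterfactual_rank, neutral_rank).pvalue  # paired test, n > 20
levene_stat, levene_p = levene(counterfactual_rank, neutral_rank)
t_stat, t_p = ttest_ind(counterfactual_rank, neutral_rank, equal_var=levene_p > 0.05)

print(f"Mann-Whitney p={mw_p:.4f}, Wilcoxon p={wilcoxon_p:.4f}, "
      f"Levene p={levene_p:.4f}, t-test p={t_p:.4f}")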
util/generation.py CHANGED
@@ -47,7 +47,7 @@ def invoke_retry(prompt,agent,parameters):

      raise Exception("Failed to complete the API call after maximum retry attempts.")

- def process_scores(df, num_run,parameters,privilege_label,protect_label,agent,group_name,occupation):
+ def process_scores_multiple(df, num_run,parameters,privilege_label,protect_label,agent,group_name,occupation):
      """ Process entries and compute scores concurrently, with progress updates. """
      scores = {key: [[] for _ in range(len(df))] for key in ['Privilege', 'Protect', 'Neutral']}

@@ -67,4 +67,26 @@ def process_scores(df, num_run,parameters,privilege_label,protect_label,agent,gr
              lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
          )

      return df
+
+ def process_scores_single(df, num_run,parameters,counterfactual_label,agent,group_name,occupation):
+     """ Process entries and compute scores concurrently, with progress updates. """
+     scores = {key: [[] for _ in range(len(df))] for key in ['Counterfactual', 'Neutral']}
+
+     for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
+         for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing entries", unit="entry"):
+             for key, label in zip(['Counterfactual', 'Neutral'], [counterfactual_label, None]):
+                 prompt_temp = create_summary(row,group_name,label,occupation)
+                 # print(f"Run {run + 1} - Entry {index + 1} - {key}:\n{prompt_temp}")
+                 # print("=============================================================")
+                 result = invoke_retry(prompt_temp,agent,parameters)
+                 scores[key][index].append(result)
+
+     # Assign score lists and calculate average scores
+     for category in ['Counterfactual', 'Neutral']:
+         df[f'{category}_Scores'] = pd.Series([lst for lst in scores[category]])
+         df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(
+             lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
+         )
+
+     return df
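
Note: both process_scores_* functions share the same averaging lambda. A small, self-contained illustration on made-up score lists (not produced by the app): None results are dropped from the sum but still counted in len(scores), so failed calls pull the average toward zero.

import pandas as pd

# Toy score lists: the second entry simulates three failed API calls.
df = pd.DataFrame({'Counterfactual_Scores': [[8, 9, None], [None, None, None]]})
df['Counterfactual_Avg_Score'] = df['Counterfactual_Scores'].apply(
    lambda scores: sum(score for score in scores if score is not None) / len(scores) if scores else None
)
print(df['Counterfactual_Avg_Score'].tolist())
# [5.666666666666667, 0.0]   i.e. (8 + 9) / 3 and 0 / 3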