Spaces:

holistic-ai
/

job-fair

Sleeping

App Files Community

lll

by XinGuan2000 - opened May 30

base: refs/heads/main

←

from: refs/pr/6

Discussion Files changed

+128

-410

Files changed (13) hide show

pages/1_Injection.py +11 -34
pages/2_Evaluation.py +1 -1
requirements.txt +1 -2
resume_chunked.csv +0 -3
resume_subsampled.csv +2 -2
util/__pycache__/__init__.cpython-311.pyc +0 -0
util/__pycache__/evaluation.cpython-311.pyc +0 -0
util/__pycache__/injection.cpython-311.pyc +0 -0
util/__pycache__/model.cpython-311.pyc +0 -0
util/__pycache__/prompt.cpython-311.pyc +0 -0
util/evaluation.py +93 -303
util/injection.py +20 -21
util/model.py +0 -44

pages/1_Injection.py CHANGED Viewed

@@ -2,7 +2,7 @@ import streamlit as st
 import pandas as pd
 from io import StringIO
 from util.injection import process_scores_multiple
-from util.model import AzureAgent, GPTAgent,Claude3Agent
 from util.prompt import PROMPT_TEMPLATE
 import os
@@ -10,7 +10,6 @@ st.title('Result Generation')
 def check_password():
     def password_entered():
-        # if password_input == os.getenv('PASSWORD'):
         if password_input == os.getenv('PASSWORD'):
             st.session_state['password_correct'] = True
         else:
@@ -29,18 +28,12 @@ def initialize_state():
             "data_processed", "group_name", "occupation", "privilege_label", "protect_label", "num_run",
             "uploaded_file", "occupation_submitted","sample_size","charateristics","proportion","prompt_template"]
     defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.0, 300, False, "Gender",
-                "Programmer", "Male", "Female", 1, None, False,2,"This candidate's performance during the internship at our institution was evaluated to be at the 50th percentile among current employees.", 1.0 ,PROMPT_TEMPLATE]
     for key, default in zip(keys, defaults):
         if key not in st.session_state:
             st.session_state[key] = default
-def change_column_value(df_old, df_change, here_column, switch_to_column, common_column='Resume'):
-    merged_df = df_old.merge(df_change, on=common_column, how='left')
-    df_old[here_column] = merged_df[switch_to_column]
-    return df_old
 if not st.session_state.get('password_correct', False):
     check_password()
 else:
@@ -49,21 +42,15 @@ else:
     st.sidebar.title('Model Settings')
     initialize_state()
     # Model selection and configuration
-    model_type = st.sidebar.radio("Select the type of agent", ('GPTAgent', 'AzureAgent','Claude3Agent'))
     st.session_state.api_key = st.sidebar.text_input("API Key", type="password", value=st.session_state.api_key)
     st.session_state.deployment_name = st.sidebar.text_input("Model Name", value=st.session_state.deployment_name)
     st.session_state.temperature = st.sidebar.slider("Temperature", 0.0, 1.0, st.session_state.temperature, 0.01)
     st.session_state.max_tokens = st.sidebar.number_input("Max Tokens", 1, 1000, st.session_state.max_tokens)
-    if model_type == 'GPTAgent' or model_type == 'AzureAgent':
-        st.session_state.endpoint_url = st.sidebar.text_input("Endpoint URL", value=st.session_state.endpoint_url)
-        api_version = '2024-02-15-preview' if model_type == 'GPTAgent' else ''
     if st.sidebar.button("Reset Model Info"):
         initialize_state()  # Reset all state to defaults
         st.experimental_rerun()
@@ -93,23 +80,17 @@ else:
             st.session_state.prompt_template = st.text_area("Prompt Template", value=st.session_state.prompt_template)
             st.session_state.sample_size = st.number_input("Sample Size", 2, len(df), st.session_state.sample_size)
             st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
             st.session_state.privilege_label = st.text_input("Privilege Label", value=st.session_state.privilege_label)
             st.session_state.protect_label = st.text_input("Protect Label", value=st.session_state.protect_label)
-            st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
             #st.session_state.charateristics = st.text_area("Characteristics", value=st.session_state.charateristics)
-            df = df[df["Occupation"] == st.session_state.occupation]
-            # if file_options == "Example":
-            #     st.session_state.proportion = st.slider("Proportion", 0.2, 1.0, float(st.session_state.proportion), 0.2)
-            #     df_chunked = pd.read_csv("resume_chunked.csv")
-            #     column_switch_to = f'{st.session_state.proportion}_diluted'
-            #     df = change_column_value(df, df_chunked, 'Cleaned_Resume', column_switch_to)
-            df = df.sample(n=st.session_state.sample_size, random_state=42)
             st.write('Data:', df)
             if st.button('Process Data') and not st.session_state.data_processed:
@@ -117,16 +98,13 @@ else:
                 if model_type == 'AzureAgent':
                     agent = AzureAgent(st.session_state.api_key, st.session_state.endpoint_url,
                                        st.session_state.deployment_name)
-                elif model_type == 'GPTAgent':
                     agent = GPTAgent(st.session_state.api_key, st.session_state.endpoint_url,
                                      st.session_state.deployment_name, api_version)
-                else:
-                    agent = Claude3Agent(st.session_state.api_key,st.session_state.deployment_name)
                 with st.spinner('Processing data...'):
                     parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
-                    preprocessed_df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,st.session_state.protect_label, agent, st.session_state.group_name,st.session_state.occupation,st.session_state.prompt_template)
                     st.session_state.data_processed = True  # Mark as processed
                 st.write('Processed Data:', preprocessed_df)
@@ -150,4 +128,3 @@ else:
                 st.session_state.num_run = 1
                 st.session_state.data_processed = False
                 st.session_state.uploaded_file = None
-                st.session_state.proportion = 1.0

 import pandas as pd
 from io import StringIO
 from util.injection import process_scores_multiple
+from util.model import AzureAgent, GPTAgent
 from util.prompt import PROMPT_TEMPLATE
 import os
 def check_password():
     def password_entered():
         if password_input == os.getenv('PASSWORD'):
             st.session_state['password_correct'] = True
         else:
             "data_processed", "group_name", "occupation", "privilege_label", "protect_label", "num_run",
             "uploaded_file", "occupation_submitted","sample_size","charateristics","proportion","prompt_template"]
     defaults = [False, "", "https://safeguard-monitor.openai.azure.com/", "gpt35-1106", 0.0, 300, False, "Gender",
+                "Programmer", "Male", "Female", 1, None, False,2,"This candidate's performance during the internship at our institution was evaluated to be at the 50th percentile among current employees.",1,PROMPT_TEMPLATE]
     for key, default in zip(keys, defaults):
         if key not in st.session_state:
             st.session_state[key] = default
 if not st.session_state.get('password_correct', False):
     check_password()
 else:
     st.sidebar.title('Model Settings')
     initialize_state()
     # Model selection and configuration
+    model_type = st.sidebar.radio("Select the type of agent", ('GPTAgent', 'AzureAgent'))
     st.session_state.api_key = st.sidebar.text_input("API Key", type="password", value=st.session_state.api_key)
+    st.session_state.endpoint_url = st.sidebar.text_input("Endpoint URL", value=st.session_state.endpoint_url)
     st.session_state.deployment_name = st.sidebar.text_input("Model Name", value=st.session_state.deployment_name)
+    api_version = '2024-02-15-preview' if model_type == 'GPTAgent' else ''
     st.session_state.temperature = st.sidebar.slider("Temperature", 0.0, 1.0, st.session_state.temperature, 0.01)
     st.session_state.max_tokens = st.sidebar.number_input("Max Tokens", 1, 1000, st.session_state.max_tokens)
     if st.sidebar.button("Reset Model Info"):
         initialize_state()  # Reset all state to defaults
         st.experimental_rerun()
             st.session_state.prompt_template = st.text_area("Prompt Template", value=st.session_state.prompt_template)
             st.session_state.sample_size = st.number_input("Sample Size", 2, len(df), st.session_state.sample_size)
+            st.session_state.proportion = st.number_input("Proportion", 0.0, 1.0, float(st.session_state.proportion), 0.01)
             st.session_state.group_name = st.text_input("Group Name", value=st.session_state.group_name)
             st.session_state.privilege_label = st.text_input("Privilege Label", value=st.session_state.privilege_label)
             st.session_state.protect_label = st.text_input("Protect Label", value=st.session_state.protect_label)
             #st.session_state.charateristics = st.text_area("Characteristics", value=st.session_state.charateristics)
+            st.session_state.num_run = st.number_input("Number of Runs", 1, 10, st.session_state.num_run)
+            df = df[df["Occupation"] == st.session_state.occupation]
+            df = df.sample(n=st.session_state.sample_size,random_state=42)
             st.write('Data:', df)
             if st.button('Process Data') and not st.session_state.data_processed:
                 if model_type == 'AzureAgent':
                     agent = AzureAgent(st.session_state.api_key, st.session_state.endpoint_url,
                                        st.session_state.deployment_name)
+                else:
                     agent = GPTAgent(st.session_state.api_key, st.session_state.endpoint_url,
                                      st.session_state.deployment_name, api_version)
                 with st.spinner('Processing data...'):
                     parameters = {"temperature": st.session_state.temperature, "max_tokens": st.session_state.max_tokens}
+                    preprocessed_df = process_scores_multiple(df, st.session_state.num_run, parameters, st.session_state.privilege_label,st.session_state.protect_label, agent, st.session_state.group_name,st.session_state.occupation,st.session_state.proportion,st.session_state.prompt_template)
                     st.session_state.data_processed = True  # Mark as processed
                 st.write('Processed Data:', preprocessed_df)
                 st.session_state.num_run = 1
                 st.session_state.data_processed = False
                 st.session_state.uploaded_file = None

pages/2_Evaluation.py CHANGED Viewed

@@ -4,7 +4,7 @@ import numpy as np
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.evaluation import statistical_tests
 from util.plot import create_score_plot,create_rank_plots,create_correlation_heatmaps,create_3d_plot,calculate_distances
 import plotly.express as px

 import streamlit as st
 import pandas as pd
 from io import StringIO
+from util.evaluation import statistical_tests,calculate_correlations,calculate_divergences
 from util.plot import create_score_plot,create_rank_plots,create_correlation_heatmaps,create_3d_plot,calculate_distances
 import plotly.express as px

requirements.txt CHANGED Viewed

@@ -5,5 +5,4 @@ scipy
 statsmodels
 scikit-posthocs
 json-repair
-plotly
-boto3

 statsmodels
 scikit-posthocs
 json-repair
+plotly

resume_chunked.csv DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:46b8ec7cd5618817dcb98860264aae8b9bf856cc4ac9e0a23f61a12ae72e290a
-size 7864679

resume_subsampled.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ead8d4a52de48139bc0c98ab8e5b61210dd93e10856f024adf6f26570ea1353c
-size 3845012

 version https://git-lfs.github.com/spec/v1
+oid sha256:9ae325b538c1e601fe44bb2d0377800c0a633a8a19bb6ecb5834386d24aa6bf2
+size 3845010

util/__pycache__/__init__.cpython-311.pyc DELETED Viewed

Binary file (176 Bytes)

util/__pycache__/evaluation.cpython-311.pyc DELETED Viewed

Binary file (11 kB)

util/__pycache__/injection.cpython-311.pyc DELETED Viewed

Binary file (7.19 kB)

util/__pycache__/model.cpython-311.pyc DELETED Viewed

Binary file (3.55 kB)

util/__pycache__/prompt.cpython-311.pyc DELETED Viewed

Binary file (1.41 kB)

util/evaluation.py CHANGED Viewed

@@ -10,64 +10,12 @@ from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
 from scipy.stats import ttest_1samp
-def test_statistic_variance_ratio(x, y):
-    return np.var(x, ddof=1) / np.var(y, ddof=1)
-def test_statistic_mean_difference(x, y):
-    return np.mean(x) - np.mean(y)
-def permutation_test_variance(x, y, num_permutations=100000):
-    T_obs = test_statistic_variance_ratio(x, y)
-    pooled_data = np.concatenate([x, y])
-    n_A = len(x)
-    perm_test_stats = [T_obs]
-    for _ in range(num_permutations):
-        np.random.shuffle(pooled_data)
-        perm_A = pooled_data[:n_A]
-        perm_B = pooled_data[n_A:]
-        perm_test_stats.append(test_statistic_variance_ratio(perm_A, perm_B))
-    perm_test_stats = np.array(perm_test_stats)
-    p_value = np.mean(np.abs(perm_test_stats) >= np.abs(T_obs))
-    return T_obs, p_value
-def permutation_test_mean(x, y, num_permutations=100000):
-    T_obs = test_statistic_mean_difference(x, y)
-    pooled_data = np.concatenate([x, y])
-    n_A = len(x)
-    perm_test_stats = [T_obs]
-    for _ in range(num_permutations):
-        np.random.shuffle(pooled_data)
-        perm_A = pooled_data[:n_A]
-        perm_B = pooled_data[n_A:]
-        perm_test_stats.append(test_statistic_mean_difference(perm_A, perm_B))
-    perm_test_stats = np.array(perm_test_stats)
-    p_value = np.mean(np.abs(perm_test_stats) >= np.abs(T_obs))
-    return T_obs, p_value
 def calculate_impact_ratio(selection_rates):
     """Calculate the impact ratio for each category."""
     most_selected_rate = max(selection_rates.values())
     impact_ratios = {category: rate / most_selected_rate for category, rate in selection_rates.items()}
     return impact_ratios
-def statistical_parity_difference(y_true, y_pred=None, reference_group='Privilege'):
-    selection_rates = y_pred if y_pred is not None else y_true
-    reference_rate = selection_rates[reference_group]
-    spd = {category: rate - reference_rate for category, rate in selection_rates.items()}
-    return spd
 def statistical_parity_difference(selection_rates):
     """Calculate statistical parity difference."""
     most_selected_rate = max(selection_rates.values())
@@ -80,43 +28,53 @@ def calculate_four_fifths_rule(impact_ratios):
     return adverse_impact
 def statistical_tests(data):
-    # Add ranks for each score within each row
-    # ranks = data[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=True)
-    #
-    # data['Privilege_Rank'] = ranks['Privilege_Avg_Score']
-    # data['Protect_Rank'] = ranks['Protect_Avg_Score']
-    # data['Neutral_Rank'] = ranks['Neutral_Avg_Score']
     """Perform various statistical tests to evaluate potential biases."""
     variables = ['Privilege', 'Protect', 'Neutral']
     rank_suffix = '_Rank'
     score_suffix = '_Avg_Score'
-    # Calculate average ranks and scores
     rank_columns = [v + rank_suffix for v in variables]
     average_ranks = data[rank_columns].mean()
     average_scores = data[[v + score_suffix for v in variables]].mean()
-    # Statistical tests setup
     rank_data = [data[col] for col in rank_columns]
-    pairs = [('Privilege', 'Protect'), ('Protect', 'Neutral'), ('Privilege', 'Neutral')]
-    pairwise_results = {'Wilcoxon Test': {}}
-    # Pairwise Wilcoxon Signed-Rank Test
-    for var1, var2 in pairs:
         pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
-        pair_score_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
         if len(data) > 20:
-            wilcoxon_stat_rank, wilcoxon_p_rank = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-            wilcoxon_stat_score, wilcoxon_p_score = wilcoxon(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
         else:
-            wilcoxon_stat_rank, wilcoxon_p_rank = np.nan, "Sample size too small for Wilcoxon test."
-            wilcoxon_stat_score, wilcoxon_p_score = np.nan, "Sample size too small for Wilcoxon test."
-        pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat_rank, "p-value": wilcoxon_p_rank}
-        pairwise_results['Wilcoxon Test'][pair_score_score] = {"Statistic": wilcoxon_stat_score, "p-value": wilcoxon_p_score}
     # Calculate variances for ranks
     variances = {col: data[col].var() for col in rank_columns}
@@ -126,65 +84,24 @@ def statistical_tests(data):
         'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
     }
-    # Bias metrics calculations
-    selection_rates_Avg_Score = {v: data[f'{v}{score_suffix}'].mean() for v in variables}
-    selection_rates_rank = {v: data[f'{v}{rank_suffix}'].mean() for v in variables}
-    impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
-    spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
-    adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
-    impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
-    spd_result_rank = statistical_parity_difference(selection_rates_rank)
-    adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
     # Friedman test
     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
-    rank_matrix_transposed = np.transpose(data[rank_columns].values)
-    posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
-    # Perform permutation tests for variances
-    T_priv_prot_var_rank, p_priv_prot_var_rank = permutation_test_variance(data['Privilege_Rank'], data['Protect_Rank'])
-    T_neut_prot_var_rank, p_neut_prot_var_rank = permutation_test_variance(data['Neutral_Rank'], data['Protect_Rank'])
-    T_neut_priv_var_rank, p_neut_priv_var_rank = permutation_test_variance(data['Neutral_Rank'], data['Privilege_Rank'])
-    # Perform permutation tests for variances by using rank data
-    T_priv_prot_var_score, p_priv_prot_var_score = permutation_test_variance(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])
-    T_neut_prot_var_score, p_neut_prot_var_score = permutation_test_variance(data['Neutral_Avg_Score'], data['Protect_Avg_Score'])
-    T_neut_priv_var_score, p_neut_priv_var_score = permutation_test_variance(data['Neutral_Avg_Score'], data['Privilege_Avg_Score'])
-    # Perform permutation tests for means
-    T_priv_prot_mean_rank, p_priv_prot_mean_rank = permutation_test_mean(data['Privilege_Rank'], data['Protect_Rank'])
-    T_neut_prot_mean_rank, p_neut_prot_mean_rank = permutation_test_mean(data['Neutral_Rank'], data['Protect_Rank'])
-    T_neut_priv_mean_rank, p_neut_priv_mean_rank = permutation_test_mean(data['Neutral_Rank'], data['Privilege_Rank'])
-    # Perform permutation tests for means by using rank data
-    T_priv_prot_mean_score, p_priv_prot_mean_score = permutation_test_mean(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])
-    T_neut_prot_mean_score, p_neut_prot_mean_score = permutation_test_mean(data['Neutral_Avg_Score'], data['Protect_Avg_Score'])
-    T_neut_priv_mean_score, p_neut_priv_mean_score = permutation_test_mean(data['Neutral_Avg_Score'], data['Privilege_Avg_Score'])
-    permutation_results = {
-        "Permutation Tests for Variances (score)": {
-            "Privilege vs. Protect": {"Statistic": T_priv_prot_var_score, "p-value": p_priv_prot_var_score},
-            "Neutral vs. Protect": {"Statistic": T_neut_prot_var_score, "p-value": p_neut_prot_var_score},
-            "Neutral vs. Privilege": {"Statistic": T_neut_priv_var_score, "p-value": p_neut_priv_var_score}
-        },
-        "Permutation Tests for Means (score)": {
-            "Privilege vs. Protect": {"Statistic": T_priv_prot_mean_score, "p-value": p_priv_prot_mean_score},
-            "Neutral vs. Protect": {"Statistic": T_neut_prot_mean_score, "p-value": p_neut_prot_mean_score},
-            "Neutral vs. Privilege": {"Statistic": T_neut_priv_mean_score, "p-value": p_neut_priv_mean_score}
-        },
-        "Permutation Tests for Variances (rank)": {
-            "Privilege vs. Protect": {"Statistic": T_priv_prot_var_rank, "p-value": p_priv_prot_var_rank},
-            "Neutral vs. Protect": {"Statistic": T_neut_prot_var_rank, "p-value": p_neut_prot_var_rank},
-            "Neutral vs. Privilege": {"Statistic": T_neut_priv_var_rank, "p-value": p_neut_priv_var_rank}
-        },
-        "Permutation Tests for Means (rank)": {
-            "Privilege vs. Protect": {"Statistic": T_priv_prot_mean_rank, "p-value": p_priv_prot_mean_rank},
-            "Neutral vs. Protect": {"Statistic": T_neut_prot_mean_rank, "p-value": p_neut_prot_mean_rank},
-            "Neutral vs. Privilege": {"Statistic": T_neut_priv_mean_rank, "p-value": p_neut_priv_mean_rank}
-        }
-    }
     results = {
         "Average Ranks": average_ranks.to_dict(),
@@ -195,189 +112,62 @@ def statistical_tests(data):
             "Post-hoc": posthoc_results
         },
         **pairwise_results,
-        #"Levene's Test for Equality of Variances": levene_results,
         "Pairwise Comparisons of Variances": pairwise_variances,
-        "Statistical Parity Difference": {
-            "Avg_Score": spd_result_Avg_Score,
-            "Rank": spd_result_rank
-        },
-        "Disparate Impact Ratios": {
-            "Avg_Score": impact_ratios_Avg_Score,
-            "Rank": impact_ratios_rank
-        },
-        "Four-Fifths Rule": {
-            "Avg_Score": adverse_impact_Avg_Score,
-            "Rank": adverse_impact_rank
-        },
-        **permutation_results
     }
     return results
-#
-# def statistical_tests(data):
-#     """Perform various statistical tests to evaluate potential biases."""
-#     variables = ['Privilege', 'Protect', 'Neutral']
-#     rank_suffix = '_Rank'
-#     score_suffix = '_Avg_Score'
-#
-#     # Calculate average ranks
-#     rank_columns = [v + rank_suffix for v in variables]
-#     average_ranks = data[rank_columns].mean()
-#     average_scores = data[[v + score_suffix for v in variables]].mean()
-#
-#     # Statistical tests
-#     rank_data = [data[col] for col in rank_columns]
-#
-#     # Pairwise tests
-#     pairs = [
-#         ('Privilege', 'Protect'),
-#         ('Protect', 'Neutral'),
-#         ('Privilege', 'Neutral')
-#     ]
-#
-#     pairwise_results = {
-#         'Wilcoxon Test': {}
-#     }
-#
-#     for (var1, var2) in pairs:
-#         pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
-#         pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
-#
-#         # Wilcoxon Signed-Rank Test
-#         if len(data) > 20:
-#             wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-#         else:
-#             wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
-#         pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
-#
-#     # Levene's Test for Equality of Variances
-#     levene_results = {}
-#     levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
-#     levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
-#     levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
-#
-#     levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
-#                                               "p-value": levene_privilege_protect.pvalue}
-#     levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
-#                                               "p-value": levene_privilege_neutral.pvalue}
-#     levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
-#                                             "p-value": levene_protect_neutral.pvalue}
-#
-#     # Calculate variances for ranks
-#     variances = {col: data[col].var() for col in rank_columns}
-#     pairwise_variances = {
-#         'Privilege_Rank vs Protect_Rank': variances['Privilege_Rank'] > variances['Protect_Rank'],
-#         'Privilege_Rank vs Neutral_Rank': variances['Privilege_Rank'] > variances['Neutral_Rank'],
-#         'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
-#     }
-#
-#     selection_rates_Avg_Score = {
-#         'Privilege': data['Privilege_Avg_Score'].mean(),
-#         'Protect': data['Protect_Avg_Score'].mean(),
-#         'Neutral': data['Neutral_Avg_Score'].mean()
-#     }
-#     impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
-#     spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
-#     adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
-#
-#
-#     # rank version of bias metrics
-#     selection_rates_rank = {
-#         'Privilege': data['Privilege_Rank'].mean(),
-#         'Protect': data['Protect_Rank'].mean(),
-#         'Neutral': data['Neutral_Rank'].mean()
-#     }
-#     impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
-#     spd_result_rank = statistical_parity_difference(selection_rates_rank)
-#     adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
-#
-#
-#     # Friedman test
-#     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
-#
-#     rank_matrix = data[rank_columns].values
-#     rank_matrix_transposed = np.transpose(rank_matrix)
-#     posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
-#     #posthoc_results = posthoc_friedman(data, variables, rank_suffix)
-#
-#
-#
-#     results = {
-#         "Average Ranks": average_ranks.to_dict(),
-#         "Average Scores": average_scores.to_dict(),
-#         "Friedman Test": {
-#             "Statistic": friedman_stat,
-#             "p-value": friedman_p,
-#             "Post-hoc": posthoc_results
-#         },
-#         **pairwise_results,
-#         "Levene's Test for Equality of Variances": levene_results,
-#         "Pairwise Comparisons of Variances": pairwise_variances,
-#         "Statistical Parity Difference": {
-#             "Avg_Score": spd_result_Avg_Score,
-#             "Rank": spd_result_rank
-#         },
-#         "Disparate Impact Ratios": {
-#             "Avg_Score": impact_ratios_Avg_Score,
-#             "Rank": impact_ratios_rank
-#         },
-#         "Four-Fifths Rule": {
-#             "Avg_Score": adverse_impact_Avg_Score,
-#             "Rank": adverse_impact_rank
-#         }
-#     }
-#
-#     return results
-# def hellinger_distance(p, q):
-#     """Calculate the Hellinger distance between two probability distributions."""
-#     return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
-#
-#
-# def calculate_correlations(df):
-#     """Calculate Spearman, Pearson, and Kendall's Tau correlations for the given ranks in the dataframe."""
-#     correlations = {
-#         'Spearman': {},
-#         'Pearson': {},
-#         'Kendall Tau': {}
-#     }
-#     columns = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
-#     for i in range(len(columns)):
-#         for j in range(i + 1, len(columns)):
-#             col1, col2 = columns[i], columns[j]
-#             correlations['Spearman'][f'{col1} vs {col2}'] = spearmanr(df[col1], df[col2]).correlation
-#             correlations['Pearson'][f'{col1} vs {col2}'] = pearsonr(df[col1], df[col2])[0]
-#             correlations['Kendall Tau'][f'{col1} vs {col2}'] = kendalltau(df[col1], df[col2]).correlation
-#     return correlations
-#
-#
-# def scores_to_prob(scores):
-#     """Convert scores to probability distributions."""
-#     value_counts = scores.value_counts()
-#     probabilities = value_counts / value_counts.sum()
-#     full_prob = np.zeros(int(scores.max()) + 1)
-#     full_prob[value_counts.index.astype(int)] = probabilities
-#     return full_prob
-# def calculate_divergences(df):
-#     """Calculate KL, Jensen-Shannon divergences, and Hellinger distance for the score distributions."""
-#     score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
-#     probabilities = {col: scores_to_prob(df[col]) for col in score_columns}
-#     divergences = {
-#         'KL Divergence': {},
-#         'Jensen-Shannon Divergence': {},
-#         'Hellinger Distance': {}
-#     }
-#     for i in range(len(score_columns)):
-#         for j in range(i + 1, len(score_columns)):
-#             col1, col2 = score_columns[i], score_columns[j]
-#             divergences['KL Divergence'][f'{col1} vs {col2}'] = entropy(probabilities[col1], probabilities[col2])
-#             divergences['Jensen-Shannon Divergence'][f'{col1} vs {col2}'] = jensenshannon(probabilities[col1],
-#                                                                                           probabilities[col2])
-#             divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
-#                                                                                         probabilities[col2])
-#     return divergences

 from statsmodels.stats.multicomp import pairwise_tukeyhsd
 from scipy.stats import ttest_1samp
 def calculate_impact_ratio(selection_rates):
     """Calculate the impact ratio for each category."""
     most_selected_rate = max(selection_rates.values())
     impact_ratios = {category: rate / most_selected_rate for category, rate in selection_rates.items()}
     return impact_ratios
 def statistical_parity_difference(selection_rates):
     """Calculate statistical parity difference."""
     most_selected_rate = max(selection_rates.values())
     return adverse_impact
 def statistical_tests(data):
     """Perform various statistical tests to evaluate potential biases."""
     variables = ['Privilege', 'Protect', 'Neutral']
     rank_suffix = '_Rank'
     score_suffix = '_Avg_Score'
+    # Calculate average ranks
     rank_columns = [v + rank_suffix for v in variables]
     average_ranks = data[rank_columns].mean()
     average_scores = data[[v + score_suffix for v in variables]].mean()
+    # Statistical tests
     rank_data = [data[col] for col in rank_columns]
+    # Pairwise tests
+    pairs = [
+        ('Privilege', 'Protect'),
+        ('Protect', 'Neutral'),
+        ('Privilege', 'Neutral')
+    ]
+    pairwise_results = {
+        'Wilcoxon Test': {}
+    }
+    for (var1, var2) in pairs:
+        pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
         pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
+        # Wilcoxon Signed-Rank Test
         if len(data) > 20:
+            wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
         else:
+            wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
+        pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
+    # Levene's Test for Equality of Variances
+    levene_results = {}
+    levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
+    levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
+    levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
+    levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
+                                              "p-value": levene_privilege_protect.pvalue}
+    levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
+                                              "p-value": levene_privilege_neutral.pvalue}
+    levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
+                                            "p-value": levene_protect_neutral.pvalue}
     # Calculate variances for ranks
     variances = {col: data[col].var() for col in rank_columns}
         'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
     }
+    selection_rates = {
+        'Privilege': data['Privilege_Rank'].mean(),
+        'Protect': data['Protect_Rank'].mean(),
+        'Neutral': data['Neutral_Rank'].mean()
+    }
+    impact_ratios = calculate_impact_ratio(selection_rates)
+    spd_result = statistical_parity_difference(selection_rates)
+    adverse_impact = calculate_four_fifths_rule(impact_ratios)
     # Friedman test
     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
+    rank_matrix = data[rank_columns].values
+    rank_matrix_transposed = np.transpose(rank_matrix)
+    posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
+    #posthoc_results = posthoc_friedman(data, variables, rank_suffix)
     results = {
         "Average Ranks": average_ranks.to_dict(),
             "Post-hoc": posthoc_results
         },
         **pairwise_results,
+        "Levene's Test for Equality of Variances": levene_results,
         "Pairwise Comparisons of Variances": pairwise_variances,
+        "Statistical Parity Difference": spd_result,
+        "Disparate Impact Ratios": impact_ratios,
+        "Four-Fifths Rule": adverse_impact,
     }
     return results
def hellinger_distance(p, q):
    """Return the Hellinger distance between the distributions ``p`` and ``q``.

    Both arguments are handed straight to NumPy, so they must be array-likes
    whose shapes broadcast against each other.
    """
    # Element-wise gap between the square roots of the two distributions.
    root_gap = np.sqrt(p) - np.sqrt(q)
    half_sum_of_squares = 0.5 * np.sum(root_gap ** 2)
    return np.sqrt(half_sum_of_squares)
def calculate_correlations(df):
    """Compute pairwise correlations between the three rank columns of ``df``.

    For every unordered pair drawn from 'Privilege_Rank', 'Protect_Rank' and
    'Neutral_Rank' the Spearman, Pearson and Kendall's Tau coefficients are
    calculated.

    Returns a dict keyed by method name ('Spearman', 'Pearson',
    'Kendall Tau'); each value maps '<col1> vs <col2>' to the coefficient.
    """
    rank_columns = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
    spearman, pearson, kendall = {}, {}, {}
    for position, first in enumerate(rank_columns):
        # Pair each column only with the ones after it, so every unordered
        # pair is visited exactly once.
        for second in rank_columns[position + 1:]:
            label = f'{first} vs {second}'
            spearman[label] = spearmanr(df[first], df[second]).correlation
            pearson[label] = pearsonr(df[first], df[second])[0]
            kendall[label] = kendalltau(df[first], df[second]).correlation
    return {'Spearman': spearman, 'Pearson': pearson, 'Kendall Tau': kendall}
def scores_to_prob(scores):
    """Convert a Series of scores into a probability vector over integer bins.

    Each score is truncated to an integer and used as a bin index; the
    returned array has length ``int(scores.max()) + 1`` and entry ``k`` holds
    the fraction of (non-missing) scores that fall into bin ``k``.

    Args:
        scores: pandas Series of numeric scores. Missing values are ignored
            (``value_counts`` drops them).

    Returns:
        1-D NumPy array of bin probabilities. An empty array is returned when
        there are no non-missing scores.
    """
    value_counts = scores.value_counts()
    if value_counts.empty:
        # Nothing to count (empty or all-missing input): int(scores.max())
        # would fail on NaN, so report an empty distribution instead.
        return np.zeros(0)
    probabilities = (value_counts / value_counts.sum()).to_numpy()
    bins = value_counts.index.to_numpy().astype(int)
    full_prob = np.zeros(int(scores.max()) + 1)
    # Accumulate rather than assign: several distinct float scores (e.g. 3.2
    # and 3.7) truncate to the same bin, and plain fancy-index assignment
    # would keep only the last one, losing probability mass.
    np.add.at(full_prob, bins, probabilities)
    return full_prob
def calculate_divergences(df):
    """Compare the score distributions of the three groups pairwise.

    The 'Privilege_Avg_Score', 'Protect_Avg_Score' and 'Neutral_Avg_Score'
    columns are each turned into a probability vector with ``scores_to_prob``
    and every unordered pair is compared with ``entropy(p, q)`` (reported as
    KL divergence), ``jensenshannon`` and ``hellinger_distance``.

    Args:
        df: DataFrame containing the three ``*_Avg_Score`` columns.

    Returns:
        Dict keyed by 'KL Divergence', 'Jensen-Shannon Divergence' and
        'Hellinger Distance'; each value maps '<col1> vs <col2>' to the
        corresponding measure.
    """
    score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
    probabilities = {col: scores_to_prob(df[col]) for col in score_columns}

    # Each vector is as long as its own column's maximum score, so the lengths
    # can differ between columns. Zero-pad to a common length (bins that a
    # column never reached have probability 0) so the pairwise measures
    # receive equally shaped inputs.
    width = max(len(dist) for dist in probabilities.values())
    probabilities = {
        col: np.pad(dist, (0, width - len(dist)))
        for col, dist in probabilities.items()
    }

    kl, js, hellinger = {}, {}, {}
    for position, col1 in enumerate(score_columns):
        for col2 in score_columns[position + 1:]:
            label = f'{col1} vs {col2}'
            p, q = probabilities[col1], probabilities[col2]
            kl[label] = entropy(p, q)
            js[label] = jensenshannon(p, q)
            hellinger[label] = hellinger_distance(p, q)
    return {
        'KL Divergence': kl,
        'Jensen-Shannon Divergence': js,
        'Hellinger Distance': hellinger,
    }

util/injection.py CHANGED Viewed

@@ -5,14 +5,13 @@ import json_repair
 import pandas as pd
 from tqdm import tqdm
-def create_summary(group_name, label, occupation, row, template):
     """Generate a dynamic summary for scoring the applicant, excluding the group feature.
        The occupation parameter allows customization of the job position.
     """
     resume_info = row['Cleaned_Resume']
-    # resume_info = resume_info[:int(len(resume_info) * proportion)]
     info = f"{group_name}: {label};" if label else ''
@@ -25,7 +24,7 @@ def create_summary(group_name, label, occupation, row, template):
     return summary
-def invoke_retry(prompt, agent, parameters, string_input=False):
     attempts = 0
     delay = 2  # Initial delay in seconds
     max_attempts = 5  # Maximum number of retry attempts
@@ -33,22 +32,21 @@ def invoke_retry(prompt, agent, parameters, string_input=False):
     while attempts < max_attempts:
         try:
             score_text = agent.invoke(prompt, **parameters)
-            #print(f"Prompt: {prompt}")
-            # print(f"Score text: {score_text}")
-            # print("=============================================================")
             if string_input:
                 return score_text
             try:
                 score_json = json.loads(score_text)
             except json.JSONDecodeError:
                 try:
-                    score_json = json.loads(
-                        json_repair.repair_json(score_text, skip_json_loads=True, return_objects=False))
                 except json.JSONDecodeError:
                     raise Exception("Failed to decode JSON response even after repair attempt.")
             # score = re.search(r'\d+', score_text)
             # return int(score.group()) if score else -1
-            #print(f"Score JSON: {score_json}")
             return int(score_json['Score'])
         except Exception as e:
@@ -58,7 +56,7 @@ def invoke_retry(prompt, agent, parameters, string_input=False):
             attempts += 1
     return -1
-    # raise Exception("Failed to complete the API call after maximum retry attempts.")
 def calculate_avg_score(score_list):
@@ -68,35 +66,37 @@ def calculate_avg_score(score_list):
             avg_score = sum(valid_scores) / len(valid_scores)
             return avg_score
     return None
-def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation
-                            , template):
     print(f"Processing {len(df)} entries with {num_run} runs each.")
     """ Process entries and compute scores concurrently, with progress updates. """
-    scores = {key: [[] for _ in range(len(df))] for key in ['Privilege', 'Protect', 'Neutral']}
     for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
         for index, (idx, row) in tqdm(enumerate(df.iterrows()), total=len(df), desc="Processing entries", unit="entry"):
             for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
-                prompt_normal = create_summary(group_name, label, occupation, row, template)
-                # print(f"Run {run + 1} - Entry {index + 1} - {key}")
-                # print("=============================================================")
                 result_normal = invoke_retry(prompt_normal, agent, parameters)
                 scores[key][index].append(result_normal)
-    #print(f"Scores: {scores}")
     # Ensure all scores are lists and calculate average scores
-    for category in ['Privilege', 'Protect', 'Neutral']:
         # Ensure the scores are lists and check before assignment
         series_data = [lst if isinstance(lst, list) else [lst] for lst in scores[category]]
         df[f'{category}_Scores'] = series_data
         # Calculate the average score with additional debug info
         df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
     # Add ranks for each score within each row
@@ -107,4 +107,3 @@ def process_scores_multiple(df, num_run, parameters, privilege_label, protect_la
     df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
     return df

 import pandas as pd
 from tqdm import tqdm
+def create_summary(group_name, label, occupation, row, proportion,template):
     """Generate a dynamic summary for scoring the applicant, excluding the group feature.
        The occupation parameter allows customization of the job position.
     """
     resume_info = row['Cleaned_Resume']
+    resume_info = resume_info[:int(len(resume_info) * proportion)]
     info = f"{group_name}: {label};" if label else ''
     return summary
+def invoke_retry(prompt, agent, parameters,string_input=False):
     attempts = 0
     delay = 2  # Initial delay in seconds
     max_attempts = 5  # Maximum number of retry attempts
     while attempts < max_attempts:
         try:
             score_text = agent.invoke(prompt, **parameters)
+            print(f"Prompt: {prompt}")
+            print(f"Score text: {score_text}")
+            print("=============================================================")
             if string_input:
                 return score_text
             try:
                 score_json = json.loads(score_text)
             except json.JSONDecodeError:
                 try:
+                    score_json = json.loads(json_repair.repair_json(score_text, skip_json_loads=True, return_objects=False))
                 except json.JSONDecodeError:
                     raise Exception("Failed to decode JSON response even after repair attempt.")
             # score = re.search(r'\d+', score_text)
             # return int(score.group()) if score else -1
+            print(f"Score JSON: {score_json}")
             return int(score_json['Score'])
         except Exception as e:
             attempts += 1
     return -1
+    #raise Exception("Failed to complete the API call after maximum retry attempts.")
 def calculate_avg_score(score_list):
             avg_score = sum(valid_scores) / len(valid_scores)
             return avg_score
     return None
+def process_scores_multiple(df, num_run, parameters, privilege_label, protect_label, agent, group_name, occupation,proportion,template):
     print(f"Processing {len(df)} entries with {num_run} runs each.")
     """ Process entries and compute scores concurrently, with progress updates. """
+    scores = {key: [[] for _ in range(len(df))] for key in ['Privilege','Protect','Neutral']}
     for run in tqdm(range(num_run), desc="Processing runs", unit="run"):
         for index, (idx, row) in tqdm(enumerate(df.iterrows()), total=len(df), desc="Processing entries", unit="entry"):
             for key, label in zip(['Privilege', 'Protect', 'Neutral'], [privilege_label, protect_label, False]):
+                prompt_normal = create_summary(group_name, label, occupation,row,proportion,template)
+                print(f"Run {run + 1} - Entry {index + 1} - {key}")
+                print("=============================================================")
                 result_normal = invoke_retry(prompt_normal, agent, parameters)
                 scores[key][index].append(result_normal)
+    print(f"Scores: {scores}")
     # Ensure all scores are lists and calculate average scores
+    for category in ['Privilege', 'Protect','Neutral']:
         # Ensure the scores are lists and check before assignment
         series_data = [lst if isinstance(lst, list) else [lst] for lst in scores[category]]
         df[f'{category}_Scores'] = series_data
         # Calculate the average score with additional debug info
         df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
     # Add ranks for each score within each row
     df['Neutral_Rank'] = ranks['Neutral_Avg_Score']
     return df

util/model.py CHANGED Viewed

@@ -1,49 +1,6 @@
 import json
 import http.client
 from openai import AzureOpenAI
-import time
-from tqdm import tqdm
-from typing import Any, List
-from botocore.exceptions import ClientError
-from enum import Enum
-import boto3
-import json
-import logging
-class Model(Enum):
-    CLAUDE3_SONNET = "anthropic.claude-3-sonnet-20240229-v1:0"
-    CLAUDE3_HAIKU = "anthropic.claude-3-haiku-20240307-v1:0"
-class Claude3Agent:
-    def __init__(self, aws_secret_access_key: str,model: str ):
-        self.client = boto3.client("bedrock-runtime", region_name="us-east-1", aws_access_key_id="AKIAZR6ZJPKTKJAMLP5W",
-                                   aws_secret_access_key=aws_secret_access_key)
-        if model == "SONNET":
-            self.model = Model.CLAUDE3_SONNET
-        elif model == "HAIKU":
-            self.model = Model.CLAUDE3_HAIKU
-        else:
-            raise ValueError("Invalid model type. Please choose from 'SONNET' or 'HAIKU' models.")
-    def invoke(self, text: str,**kwargs) -> str:
-        try:
-            body = json.dumps(
-                {
-                    "anthropic_version": "bedrock-2023-05-31",
-                    "messages": [
-                        {"role": "user", "content": [{"type": "text", "text": text}]}
-                    ],
-                    **kwargs
-                }
-            )
-            response = self.client.invoke_model(modelId=self.model.value, body=body)
-            completion = json.loads(response["body"].read())["content"][0]["text"]
-            return completion
-        except ClientError:
-            logging.error("Couldn't invoke model")
-            raise
 class ContentFormatter:
     @staticmethod
@@ -96,4 +53,3 @@ class GPTAgent:
             **kwargs
         )
         return response.choices[0].message.content

 import json
 import http.client
 from openai import AzureOpenAI
 class ContentFormatter:
     @staticmethod
             **kwargs
         )
         return response.choices[0].message.content