# Source: biasaware/scripts/gender_profession_bias.py
# Last commit 8ab9329 by freyam: "Add sample size limit and AVID report"
# (Repository page residue: raw / history / blame / "No virus" scan / 4.31 kB)
import re
import json
import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English
# Shared spaCy pipeline built from the bare English language class (no
# trained model is loaded here via spacy.load).
nlp = English()
# The "sentencizer" component sets sentence boundaries; the matcher function
# in this module relies on it when it iterates over `doc.sents`.
nlp.add_pipe("sentencizer")
def call_multiprocessing_pool(df_text):
    """Run the gender/profession matcher over every text using a thread pool.

    Args:
        df_text: Iterable of texts (for example a pandas Series); each item
            is handed to ``get_gender_prof_match_details``.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence and the columns
        "Split Text", "Male Pronoun", "Female Pronoun", "Profession" and
        "Both Match".
    """
    worker_count = multiprocessing.cpu_count()

    # Using the pool as a context manager guarantees its worker threads are
    # torn down even when one of the mapped calls raises.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        result_list = pool.map(
            get_gender_prof_match_details, df_text, chunksize=1
        )

    # Each call returns a list of per-sentence tuples; flatten them into rows.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)
def get_gender_prof_match_details(df_text):
    """Find gender-pronoun and profession mentions in each sentence of a text.

    The lexicons are read from ``config/gender_lexicons.json`` and
    ``config/profession_lexicons.json`` (relative to the working directory),
    compiled into case-insensitive whole-word patterns, and applied to every
    sentence produced by the module-level ``nlp`` pipeline.

    Args:
        df_text: The text to analyse.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_pronouns, female_pronouns, professions, both_match)``.
        The three match fields are comma-joined strings (empty when nothing
        matched). ``both_match`` is "Yes" when the sentence contains a
        profession together with a male or a female pronoun, otherwise "No".
    """
    # Context managers close the lexicon files deterministically; this
    # function runs once per input text, so unclosed handles would pile up.
    # NOTE(review): the lexicons are re-read on every call; consider caching
    # if the config files are known not to change at runtime.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        gender_lexicons = json.load(lexicon_file)
    with open("config/profession_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        profession_lexicons = json.load(lexicon_file)

    male_pronouns = gender_lexicons.get("male_pronouns")
    female_pronouns = gender_lexicons.get("female_pronouns")
    professions = profession_lexicons.get("professions")

    # NOTE(review): lexicon entries are inserted into the pattern unescaped,
    # so any regex metacharacters in the JSON are interpreted as regex syntax.
    male_pronoun_pat, female_pronoun_pat, professions_pat = (
        re.compile(r"\b({})\b".format("|".join(terms)), flags=re.IGNORECASE)
        for terms in (male_pronouns, female_pronouns, professions)
    )

    doc = nlp(df_text)

    results = []
    for sentence in doc.sents:
        sentence_text = str(sentence)
        male_matches = male_pronoun_pat.findall(sentence_text)
        female_matches = female_pronoun_pat.findall(sentence_text)
        prof_matches = professions_pat.findall(sentence_text)

        # A sentence counts when a profession co-occurs with any pronoun.
        has_pronoun = bool(male_matches or female_matches)
        both_match = "Yes" if prof_matches and has_pronoun else "No"

        results.append(
            (
                sentence_text,
                ",".join(male_matches),
                ",".join(female_matches),
                ",".join(prof_matches),
                both_match,
            )
        )

    return results
def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Both Match", "Male Pronoun",
            "Female Pronoun" and "Profession".

    Returns:
        A dict mapping each metric name to its count rendered as a string.
    """
    # Boolean masks, one per condition of interest.
    is_both = result["Both Match"] == "Yes"
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    # Order matters: callers turn this dict into a table row by row.
    counts = (
        ("both_gender_prof_match", is_both.sum()),
        ("count_male_pronoun", has_male.sum()),
        ("count_female_pronoun", has_female.sum()),
        ("count_male_pronoun_profession", (has_male & has_profession).sum()),
        ("count_female_pronoun_profession", (has_female & has_profession).sum()),
        ("total_sentence", len(result)),
    )
    return {metric: str(value) for metric, value in counts}
def get_plot(result_json):
    """Build a pie chart from the statistics dict.

    Args:
        result_json: Mapping that holds the five count metrics as values
            convertible with ``int`` (for example the string counts returned
            by ``get_statistics``).

    Returns:
        The figure returned by ``px.pie``.
    """
    # (slice label, statistics key) pairs, in the order they are charted.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )

    # Convert the counts first, in the same key order as the slices.
    values = [int(result_json[key]) for _, key in slices]
    labels = [label for label, _ in slices]

    chart_data = {"Labels": labels, "Values": values}
    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence on the first column of a table.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A ``(result_df, result_plot)`` tuple: ``result_df`` is a two-column
        DataFrame ("Metric", "Value") built from ``get_statistics``, and
        ``result_plot`` is the figure returned by ``get_plot``.
    """
    # Missing cells would stay NaN through the ``.str`` calls and then be
    # passed to the spaCy pipeline as floats, so drop them up front.
    texts = data[data.columns[0]].dropna().str.lower().str.strip()

    result = call_multiprocessing_pool(texts)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    return result_df, result_plot