File size: 4,306 Bytes
e0db39e 0946447 e0db39e d1a2df2 7192c24 d1a2df2 e0db39e 7192c24 e0db39e 7192c24 e0db39e 7192c24 e0db39e 6d2d9db e0db39e d1a2df2 0946447 e0db39e 0946447 8ab9329 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
import functools
import json
import multiprocessing.pool
import re

import pandas as pd
import plotly.express as px
from spacy.lang.en import English
# Module-level spaCy English pipeline; the "sentencizer" component is added so
# that get_gender_prof_match_details can iterate `doc.sents` to split text
# into sentences.
nlp = English()
nlp.add_pipe("sentencizer")
def call_multiprocessing_pool(df_text):
    """Run the gender/profession matcher over every text with a thread pool.

    Args:
        df_text: Iterable of texts; each item is handed to
            ``get_gender_prof_match_details``.

    Returns:
        pandas.DataFrame with one row per tuple returned by the workers and
        the columns "Split Text", "Male Pronoun", "Female Pronoun",
        "Profession" and "Both Match".
    """
    worker_count = multiprocessing.cpu_count()
    # The context manager tears the pool down even when a worker raises, so
    # threads are not leaked on the error path. map() has already collected
    # every result by the time the block exits.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        per_text_rows = pool.map(
            get_gender_prof_match_details, df_text, chunksize=1
        )
    # Each worker returns a list of row tuples; flatten them into one list.
    rows = [row for text_rows in per_text_rows for row in text_rows]
    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(rows, columns=cols)
@functools.lru_cache(maxsize=None)
def _load_lexicon_patterns():
    """Load both lexicon files once and compile one regex per lexicon.

    Returns:
        Tuple ``(male_pronoun_pat, female_pronoun_pat, professions_pat)`` of
        case-insensitive, word-bounded alternation patterns.
    """
    # `with` guarantees the handles are closed; JSON text is UTF-8.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as handle:
        gender_lexicons = json.load(handle)
    with open("config/profession_lexicons.json", "r", encoding="utf-8") as handle:
        profession_lexicons = json.load(handle)
    term_lists = (
        gender_lexicons.get("male_pronouns"),
        gender_lexicons.get("female_pronouns"),
        profession_lexicons.get("professions"),
    )
    # NOTE(review): terms are joined verbatim, so regex metacharacters in a
    # lexicon entry are interpreted as regex syntax (same as before).
    return tuple(
        re.compile(r"\b({})\b".format("|".join(terms)), flags=re.IGNORECASE)
        for terms in term_lists
    )


def get_gender_prof_match_details(df_text):
    """Find pronoun and profession matches in each sentence of a text.

    Args:
        df_text: Text to analyse; it is passed to the module-level ``nlp``
            pipeline and split into sentences via ``doc.sents``.

    Returns:
        List with one tuple per sentence:
        ``(sentence, male_pronouns, female_pronouns, professions, both_match)``.
        The three match fields are comma-joined strings ("" when nothing
        matched). ``both_match`` is "Yes" when the sentence contains a
        profession together with at least one male or female pronoun,
        otherwise "No".
    """
    # The patterns are cached: the lexicons are not re-read and recompiled
    # for every text processed by the thread pool.
    male_pronoun_pat, female_pronoun_pat, professions_pat = _load_lexicon_patterns()
    results = []
    for sent in nlp(df_text).sents:
        sentence = str(sent)
        male_hits = male_pronoun_pat.findall(sentence)
        female_hits = female_pronoun_pat.findall(sentence)
        prof_hits = professions_pat.findall(sentence)
        both_match = "Yes" if prof_hits and (male_hits or female_hits) else "No"
        results.append(
            (
                sentence,
                ",".join(male_hits),
                ",".join(female_hits),
                ",".join(prof_hits),
                both_match,
            )
        )
    return results
def get_statistics(result):
    """Summarise the per-sentence match table as string-valued counts.

    Args:
        result: DataFrame with the columns "Both Match", "Male Pronoun",
            "Female Pronoun" and "Profession".

    Returns:
        Dict mapping each metric name to its count rendered as a string.
    """
    # Boolean masks; a non-empty cell means at least one match was recorded.
    is_both = result["Both Match"] == "Yes"
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""

    counts = {
        "both_gender_prof_match": is_both.sum(),
        "count_male_pronoun": has_male.sum(),
        "count_female_pronoun": has_female.sum(),
        "count_male_pronoun_profession": (has_male & has_profession).sum(),
        "count_female_pronoun_profession": (has_female & has_profession).sum(),
        "total_sentence": len(result),
    }
    return {metric: str(value) for metric, value in counts.items()}
def get_plot(result_json):
    """Build a pie chart from the statistics mapping.

    Args:
        result_json: Mapping holding the five count entries read below; each
            value is converted with ``int``.

    Returns:
        The figure produced by ``px.pie``.
    """
    # (slice label, key in result_json), in display order.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )
    slice_values = [int(result_json[key]) for _, key in slices]
    slice_labels = [label for label, _ in slices]
    chart_data = {"Labels": slice_labels, "Values": slice_values}
    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )
def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence for the first column of data.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        Tuple ``(result_df, result_plot)``: a two-column DataFrame
        ("Metric", "Value") built from ``get_statistics`` and the figure
        returned by ``get_plot``.
    """
    # Missing cells are dropped and remaining values coerced to str so that
    # NaN or non-string entries never reach the sentence splitter; for an
    # all-string column this is a no-op.
    texts = data[data.columns[0]].dropna().astype(str).str.lower().str.strip()
    result = call_multiprocessing_pool(texts)
    result_json = get_statistics(result)
    result_plot = get_plot(result_json)
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )
    return result_df, result_plot
|