File size: 4,306 Bytes
e0db39e
 
 
 
0946447
e0db39e
 
 
 
 
 
 
 
d1a2df2
7192c24
d1a2df2
 
 
 
 
 
 
 
 
 
 
 
e0db39e
7192c24
 
 
e0db39e
 
 
 
7192c24
 
 
e0db39e
 
7192c24
 
e0db39e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d2d9db
 
 
 
 
 
 
 
 
 
 
e0db39e
 
 
 
 
d1a2df2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0946447
 
e0db39e
 
 
 
 
0946447
 
 
 
 
 
 
 
 
8ab9329
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
import json

import pandas as pd
import plotly.express as px
import multiprocessing.pool
from spacy.lang.en import English


# Module-level spaCy English pipeline with the "sentencizer" component added.
# get_gender_prof_match_details calls it to split each input text into
# sentences (via doc.sents); it is created once at import time and shared by
# every call.
nlp = English()
nlp.add_pipe("sentencizer")


def call_multiprocessing_pool(df_text):
    """Run the per-text gender/profession matcher over every text in a thread pool.

    Args:
        df_text: Iterable of texts; each item is passed on its own to
            ``get_gender_prof_match_details``.

    Returns:
        pandas.DataFrame with one row per sentence tuple returned by the
        workers, under the columns "Split Text", "Male Pronoun",
        "Female Pronoun", "Profession" and "Both Match".

    Raises:
        Any exception raised by ``get_gender_prof_match_details`` for one of
        the texts is re-raised by ``pool.map``.
    """
    worker_count = multiprocessing.cpu_count()

    # The context manager tears the pool down on every exit path, including
    # when a worker raises. ``map`` blocks until all results are in, so no
    # pending work is lost when the pool is terminated on exit.
    with multiprocessing.pool.ThreadPool(processes=worker_count) as pool:
        result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)

    # Each worker returns a list of per-sentence tuples; flatten to one list.
    flat_return_list = [item for sublist in result_list for item in sublist]

    cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_return_list, columns=cols)


def _compile_lexicon_pattern(terms):
    """Compile a case-insensitive whole-word pattern matching any of *terms*."""
    # NOTE(review): terms are inserted unescaped, so regex metacharacters in a
    # lexicon entry are interpreted as regex syntax - confirm this is intended.
    alternation = "|".join(terms)
    if not alternation:
        # An empty alternation would compile to r"\b()\b", which matches the
        # empty string at every word boundary and reports bogus hits. Use a
        # pattern that can never match instead.
        return re.compile(r"(?!)")
    return re.compile(r"\b({})\b".format(alternation), flags=re.IGNORECASE)


def get_gender_prof_match_details(df_text):
    """Find gendered pronouns and profession terms in each sentence of a text.

    The lexicons are read from ``config/gender_lexicons.json`` (keys
    "male_pronouns" and "female_pronouns") and
    ``config/profession_lexicons.json`` (key "professions") on every call.

    Args:
        df_text: A single text; it is split into sentences with the
            module-level ``nlp`` pipeline.

    Returns:
        A list with one tuple per sentence:
        ``(sentence, male_pronouns, female_pronouns, professions, both_match)``.
        The three match fields are comma-joined strings of the matched terms
        (empty when nothing matched). ``both_match`` is "Yes" when the
        sentence contains a profession term and at least one male or female
        pronoun, otherwise "No".

    Raises:
        OSError: If a lexicon file cannot be opened.
        TypeError: If an expected lexicon key is missing from its file.
    """
    # Context managers close the lexicon files deterministically.
    with open("config/gender_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        gender_lexicons = json.load(lexicon_file)
    with open("config/profession_lexicons.json", "r", encoding="utf-8") as lexicon_file:
        profession_lexicons = json.load(lexicon_file)

    male_pronoun_pat = _compile_lexicon_pattern(gender_lexicons.get("male_pronouns"))
    female_pronoun_pat = _compile_lexicon_pattern(
        gender_lexicons.get("female_pronouns")
    )
    professions_pat = _compile_lexicon_pattern(profession_lexicons.get("professions"))

    doc = nlp(df_text)

    results = []

    for sent in doc.sents:
        sentence = str(sent)

        male_pronoun_match = male_pronoun_pat.findall(sentence)
        female_pronoun_match = female_pronoun_pat.findall(sentence)
        prof_match = professions_pat.findall(sentence)

        # A sentence counts as a match when a profession co-occurs with a
        # pronoun of either gender.
        has_pronoun = bool(male_pronoun_match or female_pronoun_match)
        both_match = "Yes" if prof_match and has_pronoun else "No"

        results.append(
            (
                sentence,
                ",".join(male_pronoun_match),
                ",".join(female_pronoun_match),
                ",".join(prof_match),
                both_match,
            )
        )

    return results


def get_statistics(result):
    """Summarise the per-sentence match table as a dict of string counts.

    Args:
        result: DataFrame with the columns "Male Pronoun", "Female Pronoun",
            "Profession" and "Both Match", where an empty string in a match
            column means nothing matched.

    Returns:
        dict mapping each metric name to its count rendered as a string.
    """
    # Boolean masks, one per condition, reused by the combined metrics below.
    has_male = result["Male Pronoun"] != ""
    has_female = result["Female Pronoun"] != ""
    has_profession = result["Profession"] != ""
    is_both = result["Both Match"] == "Yes"

    counts = {
        "both_gender_prof_match": is_both.sum(),
        "count_male_pronoun": has_male.sum(),
        "count_female_pronoun": has_female.sum(),
        "count_male_pronoun_profession": (has_male & has_profession).sum(),
        "count_female_pronoun_profession": (has_female & has_profession).sum(),
        "total_sentence": len(result),
    }

    # Counts are reported as strings, in the insertion order above.
    return {metric: str(count) for metric, count in counts.items()}


def get_plot(result_json):
    """Build a pie chart of the gender/profession match counts.

    Args:
        result_json: Mapping that holds the five count metrics
            ("both_gender_prof_match", "count_male_pronoun",
            "count_female_pronoun", "count_male_pronoun_profession",
            "count_female_pronoun_profession") as values ``int()`` can parse.

    Returns:
        The figure returned by ``px.pie`` with one slice per metric.
    """
    # (slice label, metric key) pairs, in the order they appear in the chart.
    slices = (
        ("Both Gender & Profession Match", "both_gender_prof_match"),
        ("Male Pronoun", "count_male_pronoun"),
        ("Female Pronoun", "count_female_pronoun"),
        ("Male Pronoun & Profession", "count_male_pronoun_profession"),
        ("Female Pronoun & Profession", "count_female_pronoun_profession"),
    )

    chart_data = {
        "Labels": [label for label, _ in slices],
        "Values": [int(result_json[metric]) for _, metric in slices],
    }

    return px.pie(
        chart_data,
        names="Labels",
        values="Values",
        title="Gender & Profession Match Statistics",
    )


def eval_gender_profession(data):
    """Evaluate gender/profession co-occurrence over the first column of *data*.

    Args:
        data: DataFrame whose first column holds the texts to analyse.

    Returns:
        A ``(result_df, result_plot)`` tuple: ``result_df`` has one row per
        statistic with the columns "Metric" and "Value"; ``result_plot`` is
        the figure produced by ``get_plot`` from the same statistics.
    """
    # Missing values would survive ``.str.lower()`` as NaN and then be handed
    # to the sentence splitter, which only accepts text. Drop them, and coerce
    # the rest to ``str`` so a non-string column does not break the ``.str``
    # accessor.
    texts = data[data.columns[0]].dropna().astype(str).str.lower().str.strip()

    result = call_multiprocessing_pool(texts)

    result_json = get_statistics(result)
    result_plot = get_plot(result_json)

    # Turn the {metric: value} dict into a two-column table.
    result_df = (
        pd.DataFrame.from_dict(result_json, orient="index")
        .reset_index()
        .rename(columns={"index": "Metric", 0: "Value"})
    )

    return result_df, result_plot