Restructure Logic and Data Flow
Browse files- README.md +1 -1
- app.py +102 -74
- config/gender_lexicons.json +28 -0
- methodologies.json → config/methodologies.json +3 -3
- config/profession_lexicons.json +156 -0
- data/z_house.csv +0 -7
- data/z_sentences.csv +11 -0
- requirements.txt +1 -1
- scripts/genbit.py +14 -0
- scripts/genbit_metrics.py +0 -48
- scripts/{gender_tagging.py → gender_divide.py} +64 -57
- scripts/{gender_profession_tagging.py → gender_profession_bias.py} +50 -68
- utils/config.json +0 -160
- utils/load_csv.py +0 -23
- utils/read_config.py +0 -13
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π¦
|
|
4 |
colorFrom: indigo
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
4 |
colorFrom: indigo
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.43.2
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
app.py
CHANGED
@@ -3,78 +3,105 @@ import gradio as gr
|
|
3 |
import pandas as pd
|
4 |
import os
|
5 |
|
6 |
-
from scripts.
|
7 |
-
from scripts.
|
8 |
-
from scripts.
|
9 |
-
from utils.load_csv import *
|
10 |
-
from utils.read_config import get_args
|
11 |
|
12 |
-
methodologies = json.load(open("methodologies.json", "r"))
|
13 |
|
|
|
14 |
|
15 |
-
def get_methodology_metadata(methodology):
|
16 |
-
title = "## " + methodology
|
17 |
-
description = methodologies.get(methodology).get("description")
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
|
32 |
-
|
33 |
)
|
34 |
|
35 |
-
return gr.JSON.update(status, visible=True)
|
36 |
-
|
37 |
|
38 |
-
def
|
39 |
-
|
|
|
40 |
|
41 |
-
|
42 |
|
43 |
return (
|
44 |
-
gr.
|
45 |
-
|
46 |
-
info="Determines the scope of the dataset to be analyzed",
|
47 |
-
choices=["First", "Last", "Random"],
|
48 |
-
value="First",
|
49 |
-
visible=True,
|
50 |
-
interactive=True,
|
51 |
-
),
|
52 |
-
gr.Slider.update(
|
53 |
-
label=f"Number of Entries",
|
54 |
-
info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {get_args('first_records')}.",
|
55 |
-
minimum=1,
|
56 |
-
maximum=min(data.shape[0], get_args("first_records")),
|
57 |
-
value=min(data.shape[0], get_args("first_records")) // 2,
|
58 |
-
visible=True,
|
59 |
-
interactive=True,
|
60 |
-
),
|
61 |
-
gr.Radio.update(
|
62 |
-
label="Column",
|
63 |
-
info="Determines the column to be analyzed. These are the columns with text data.",
|
64 |
-
choices=columns,
|
65 |
-
value=columns[0],
|
66 |
-
visible=True,
|
67 |
-
interactive=True,
|
68 |
-
),
|
69 |
-
)
|
70 |
-
|
71 |
-
|
72 |
-
def get_column_metadata(dataset, column):
|
73 |
-
data = pd.read_csv(dataset.name)
|
74 |
-
corpus = data[column].head(10).tolist()
|
75 |
-
|
76 |
-
return gr.Dataframe.update(
|
77 |
-
value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
|
78 |
)
|
79 |
|
80 |
|
@@ -89,19 +116,19 @@ with BiasAware:
|
|
89 |
with gr.Column(scale=2):
|
90 |
gr.Markdown("## Dataset")
|
91 |
|
92 |
-
dataset_file = gr.File(label="Dataset")
|
93 |
dataset_examples = gr.Examples(
|
94 |
[
|
95 |
os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
|
96 |
os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
|
97 |
-
os.path.join(os.path.dirname(__file__), "data/
|
98 |
],
|
99 |
inputs=dataset_file,
|
100 |
label="Example Datasets",
|
101 |
)
|
102 |
|
103 |
-
|
104 |
-
|
105 |
dataset_column = gr.Radio(visible=False)
|
106 |
|
107 |
dataset_corpus = gr.Dataframe(
|
@@ -114,14 +141,10 @@ with BiasAware:
|
|
114 |
methodology = gr.Radio(
|
115 |
label="Methodology",
|
116 |
info="Determines the methodology to be used for bias detection",
|
117 |
-
choices=
|
118 |
-
"Gender Divide (Term Identity Diversity)",
|
119 |
-
"Gender Profession Bias (Lexical Evaluation)",
|
120 |
-
"GenBiT (Microsoft Responsible AI Gender Bias Tool)",
|
121 |
-
],
|
122 |
)
|
123 |
|
124 |
-
evalButton = gr.Button("Run Evaluation")
|
125 |
|
126 |
methodology_metadata = gr.Markdown(visible=False)
|
127 |
|
@@ -134,13 +157,18 @@ with BiasAware:
|
|
134 |
)
|
135 |
|
136 |
dataset_file.change(
|
137 |
-
fn=
|
138 |
inputs=[dataset_file],
|
139 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
140 |
)
|
141 |
|
142 |
dataset_column.change(
|
143 |
-
fn=
|
144 |
inputs=[dataset_file, dataset_column],
|
145 |
outputs=[dataset_corpus],
|
146 |
)
|
@@ -148,15 +176,15 @@ with BiasAware:
|
|
148 |
methodology.change(
|
149 |
fn=get_methodology_metadata,
|
150 |
inputs=[methodology],
|
151 |
-
outputs=[methodology_metadata],
|
152 |
)
|
153 |
|
154 |
evalButton.click(
|
155 |
fn=evaluate,
|
156 |
inputs=[
|
157 |
dataset_file,
|
158 |
-
|
159 |
-
|
160 |
dataset_column,
|
161 |
methodology,
|
162 |
],
|
|
|
3 |
import pandas as pd
|
4 |
import os
|
5 |
|
6 |
+
from scripts.genbit import *
|
7 |
+
from scripts.gender_profession_bias import *
|
8 |
+
from scripts.gender_divide import *
|
|
|
|
|
9 |
|
10 |
+
methodologies = json.load(open("config/methodologies.json", "r"))
|
11 |
|
12 |
+
MAX_THRESHOLD = 1000
|
13 |
|
|
|
|
|
|
|
14 |
|
15 |
+
def evaluate(dataset, sampling_method, sampling_size, column, methodology):
|
16 |
+
try:
|
17 |
+
print(
|
18 |
+
f"[{dataset.name.split('/')[-1]}::{column}] - {sampling_method} {sampling_size} entries"
|
19 |
+
)
|
20 |
+
data = pd.read_csv(dataset.name, usecols=[column])
|
21 |
|
22 |
+
if sampling_method == "First":
|
23 |
+
data = data.head(sampling_size)
|
24 |
+
elif sampling_method == "Last":
|
25 |
+
data = data.tail(sampling_size)
|
26 |
+
elif sampling_method == "Random":
|
27 |
+
data = data.sample(n=sampling_size, random_state=42)
|
28 |
|
29 |
+
result = globals()[methodologies.get(methodology).get("fx")](data)
|
30 |
|
31 |
+
return gr.JSON.update(result, visible=True)
|
32 |
+
except Exception as e:
|
33 |
+
return gr.JSON.update(
|
34 |
+
{
|
35 |
+
"error": f"An error occurred while processing the dataset. Please check the dataset and try again. Error: {e}"
|
36 |
+
},
|
37 |
+
visible=True,
|
38 |
+
)
|
39 |
+
|
40 |
+
|
41 |
+
def display_dataset_config(dataset):
|
42 |
+
try:
|
43 |
+
data = pd.read_csv(dataset.name)
|
44 |
+
|
45 |
+
columns = data.select_dtypes(include=["object"]).columns.tolist()
|
46 |
+
corpus = data[columns[0]].tolist()
|
47 |
+
|
48 |
+
return (
|
49 |
+
gr.Radio.update(
|
50 |
+
label="Scope",
|
51 |
+
info="Determines the scope of the dataset to be analyzed",
|
52 |
+
choices=["First", "Last", "Random"],
|
53 |
+
value="First",
|
54 |
+
visible=True,
|
55 |
+
interactive=True,
|
56 |
+
),
|
57 |
+
gr.Slider.update(
|
58 |
+
label=f"Number of Entries",
|
59 |
+
info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
|
60 |
+
minimum=1,
|
61 |
+
maximum=min(data.shape[0], MAX_THRESHOLD),
|
62 |
+
value=min(data.shape[0], MAX_THRESHOLD) // 2,
|
63 |
+
visible=True,
|
64 |
+
interactive=True,
|
65 |
+
),
|
66 |
+
gr.Radio.update(
|
67 |
+
label="Column",
|
68 |
+
info="Determines the column to be analyzed. These are the columns with text data.",
|
69 |
+
choices=columns,
|
70 |
+
value=columns[0],
|
71 |
+
visible=True,
|
72 |
+
interactive=True,
|
73 |
+
),
|
74 |
+
gr.DataFrame.update(
|
75 |
+
value=pd.DataFrame({f"Data Corpus: {columns[0]}": corpus}), visible=True
|
76 |
+
),
|
77 |
+
)
|
78 |
+
except:
|
79 |
+
return (
|
80 |
+
gr.Radio.update(visible=False),
|
81 |
+
gr.Slider.update(visible=False),
|
82 |
+
gr.Radio.update(visible=False),
|
83 |
+
gr.DataFrame.update(visible=False),
|
84 |
+
)
|
85 |
+
|
86 |
+
|
87 |
+
def update_column_metadata(dataset, column):
|
88 |
+
data = pd.read_csv(dataset.name)
|
89 |
+
corpus = data[column].tolist()
|
90 |
|
91 |
+
return gr.Dataframe.update(
|
92 |
+
value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
|
93 |
)
|
94 |
|
|
|
|
|
95 |
|
96 |
+
def get_methodology_metadata(methodology):
|
97 |
+
title = "## " + methodology
|
98 |
+
description = methodologies.get(methodology).get("description")
|
99 |
|
100 |
+
metadata = f"{title}\n\n{description}"
|
101 |
|
102 |
return (
|
103 |
+
gr.Markdown.update(metadata, visible=True),
|
104 |
+
gr.Button.update(interactive=True, visible=True),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
)
|
106 |
|
107 |
|
|
|
116 |
with gr.Column(scale=2):
|
117 |
gr.Markdown("## Dataset")
|
118 |
|
119 |
+
dataset_file = gr.File(label="Dataset", file_types=["csv"])
|
120 |
dataset_examples = gr.Examples(
|
121 |
[
|
122 |
os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
|
123 |
os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
|
124 |
+
os.path.join(os.path.dirname(__file__), "data/z_sentences.csv"),
|
125 |
],
|
126 |
inputs=dataset_file,
|
127 |
label="Example Datasets",
|
128 |
)
|
129 |
|
130 |
+
dataset_sampling_method = gr.Radio(visible=False)
|
131 |
+
dataset_sampling_size = gr.Slider(visible=False)
|
132 |
dataset_column = gr.Radio(visible=False)
|
133 |
|
134 |
dataset_corpus = gr.Dataframe(
|
|
|
141 |
methodology = gr.Radio(
|
142 |
label="Methodology",
|
143 |
info="Determines the methodology to be used for bias detection",
|
144 |
+
choices=methodologies.keys(),
|
|
|
|
|
|
|
|
|
145 |
)
|
146 |
|
147 |
+
evalButton = gr.Button(value="Run Evaluation", interactive=False)
|
148 |
|
149 |
methodology_metadata = gr.Markdown(visible=False)
|
150 |
|
|
|
157 |
)
|
158 |
|
159 |
dataset_file.change(
|
160 |
+
fn=display_dataset_config,
|
161 |
inputs=[dataset_file],
|
162 |
+
outputs=[
|
163 |
+
dataset_sampling_method,
|
164 |
+
dataset_sampling_size,
|
165 |
+
dataset_column,
|
166 |
+
dataset_corpus,
|
167 |
+
],
|
168 |
)
|
169 |
|
170 |
dataset_column.change(
|
171 |
+
fn=update_column_metadata,
|
172 |
inputs=[dataset_file, dataset_column],
|
173 |
outputs=[dataset_corpus],
|
174 |
)
|
|
|
176 |
methodology.change(
|
177 |
fn=get_methodology_metadata,
|
178 |
inputs=[methodology],
|
179 |
+
outputs=[methodology_metadata, evalButton],
|
180 |
)
|
181 |
|
182 |
evalButton.click(
|
183 |
fn=evaluate,
|
184 |
inputs=[
|
185 |
dataset_file,
|
186 |
+
dataset_sampling_method,
|
187 |
+
dataset_sampling_size,
|
188 |
dataset_column,
|
189 |
methodology,
|
190 |
],
|
config/gender_lexicons.json
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"male_lexicons": [
|
3 |
+
"man",
|
4 |
+
"boy",
|
5 |
+
"male",
|
6 |
+
"he",
|
7 |
+
"son",
|
8 |
+
"his",
|
9 |
+
"himself",
|
10 |
+
"guy",
|
11 |
+
"father",
|
12 |
+
"john"
|
13 |
+
],
|
14 |
+
"male_pronouns": ["he", "him", "his"],
|
15 |
+
"female_lexicons": [
|
16 |
+
"woman",
|
17 |
+
"girl",
|
18 |
+
"female",
|
19 |
+
"she",
|
20 |
+
"daughter",
|
21 |
+
"her",
|
22 |
+
"herself",
|
23 |
+
"gal",
|
24 |
+
"mother",
|
25 |
+
"mary"
|
26 |
+
],
|
27 |
+
"female_pronouns": ["she", "her", "hers"]
|
28 |
+
}
|
methodologies.json → config/methodologies.json
RENAMED
@@ -1,14 +1,14 @@
|
|
1 |
{
|
2 |
"Gender Divide (Term Identity Diversity)": {
|
3 |
"description": "333",
|
4 |
-
"fx": "
|
5 |
},
|
6 |
"Gender Profession Bias (Lexical Evaluation)": {
|
7 |
"description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
|
8 |
-
"fx": "
|
9 |
},
|
10 |
"GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
|
11 |
"description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
|
12 |
-
"fx": "
|
13 |
}
|
14 |
}
|
|
|
1 |
{
|
2 |
"Gender Divide (Term Identity Diversity)": {
|
3 |
"description": "333",
|
4 |
+
"fx": "eval_gender_divide"
|
5 |
},
|
6 |
"Gender Profession Bias (Lexical Evaluation)": {
|
7 |
"description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
|
8 |
+
"fx": "eval_gender_profession"
|
9 |
},
|
10 |
"GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
|
11 |
"description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
|
12 |
+
"fx": "eval_genbit"
|
13 |
}
|
14 |
}
|
config/profession_lexicons.json
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"professions": [
|
3 |
+
"Accountant",
|
4 |
+
"Actor",
|
5 |
+
"Actress",
|
6 |
+
"Aerospace Engineer",
|
7 |
+
"Agricultural Scientist",
|
8 |
+
"Air Traffic Controller",
|
9 |
+
"Aircraft Mechanic",
|
10 |
+
"Animator",
|
11 |
+
"Architect",
|
12 |
+
"Art Director",
|
13 |
+
"Attorney",
|
14 |
+
"Lawyer",
|
15 |
+
"Audiologist",
|
16 |
+
"Author",
|
17 |
+
"Writer",
|
18 |
+
"Baker",
|
19 |
+
"Barber",
|
20 |
+
"Hairdresser",
|
21 |
+
"Bartender",
|
22 |
+
"Biomedical Engineer",
|
23 |
+
"Botanist",
|
24 |
+
"Broadcast Journalist",
|
25 |
+
"Business Analyst",
|
26 |
+
"Carpenter",
|
27 |
+
"Chef",
|
28 |
+
"Cook",
|
29 |
+
"Chemist",
|
30 |
+
"Civil Engineer",
|
31 |
+
"Clinical Psychologist",
|
32 |
+
"Commercial Diver",
|
33 |
+
"Computer Programmer",
|
34 |
+
"Construction Worker",
|
35 |
+
"Corporate Trainer",
|
36 |
+
"Cosmetologist",
|
37 |
+
"Counselor",
|
38 |
+
"Therapist",
|
39 |
+
"Court Reporter",
|
40 |
+
"Creative Director",
|
41 |
+
"Criminologist",
|
42 |
+
"Customer Service Representative",
|
43 |
+
"Data Analyst",
|
44 |
+
"Dental Assistant",
|
45 |
+
"Dentist",
|
46 |
+
"Dermatologist",
|
47 |
+
"Dietician",
|
48 |
+
"Nutritionist",
|
49 |
+
"Doctor",
|
50 |
+
"Physician",
|
51 |
+
"Economist",
|
52 |
+
"Electrician",
|
53 |
+
"Elementary School Teacher",
|
54 |
+
"Emergency Medical Technician",
|
55 |
+
"Engineer",
|
56 |
+
"Environmental Scientist",
|
57 |
+
"Event Planner",
|
58 |
+
"Fashion Designer",
|
59 |
+
"Film Director",
|
60 |
+
"Financial Analyst",
|
61 |
+
"Firefighter",
|
62 |
+
"Fisherman",
|
63 |
+
"Fitness Trainer",
|
64 |
+
"Flight Attendant",
|
65 |
+
"Florist",
|
66 |
+
"Food Scientist",
|
67 |
+
"Forensic Scientist",
|
68 |
+
"Furniture Maker",
|
69 |
+
"Game Developer",
|
70 |
+
"Gardener",
|
71 |
+
"Landscaper",
|
72 |
+
"Geologist",
|
73 |
+
"Graphic Designer",
|
74 |
+
"Hair Stylist",
|
75 |
+
"Historian",
|
76 |
+
"Home Health Aide",
|
77 |
+
"Hotel Manager",
|
78 |
+
"Human Resources Manager",
|
79 |
+
"Immigration Lawyer",
|
80 |
+
"Industrial Designer",
|
81 |
+
"Insurance Agent",
|
82 |
+
"Interior Designer",
|
83 |
+
"Interpreter",
|
84 |
+
"Translator",
|
85 |
+
"Investment Banker",
|
86 |
+
"IT Specialist",
|
87 |
+
"Journalist",
|
88 |
+
"Judge",
|
89 |
+
"Kindergarten Teacher",
|
90 |
+
"Land Surveyor",
|
91 |
+
"Landscape Architect",
|
92 |
+
"Lawyer",
|
93 |
+
"Attorney",
|
94 |
+
"Librarian",
|
95 |
+
"Life Coach",
|
96 |
+
"Linguist",
|
97 |
+
"Makeup Artist",
|
98 |
+
"Management Consultant",
|
99 |
+
"Manufacturing Engineer",
|
100 |
+
"Marine Biologist",
|
101 |
+
"Marketing Manager",
|
102 |
+
"Massage Therapist",
|
103 |
+
"Mechanical Engineer",
|
104 |
+
"Medical Assistant",
|
105 |
+
"Medical Researcher",
|
106 |
+
"Meteorologist",
|
107 |
+
"Midwife",
|
108 |
+
"Military Officer",
|
109 |
+
"Music Producer",
|
110 |
+
"Musician",
|
111 |
+
"Nurse",
|
112 |
+
"Occupational Therapist",
|
113 |
+
"Optician",
|
114 |
+
"Optometrist",
|
115 |
+
"Paralegal",
|
116 |
+
"Paramedic",
|
117 |
+
"Patent Attorney",
|
118 |
+
"Pediatrician",
|
119 |
+
"Personal Trainer",
|
120 |
+
"Petroleum Engineer",
|
121 |
+
"Pharmacist",
|
122 |
+
"Photographer",
|
123 |
+
"Physical Therapist",
|
124 |
+
"Physician Assistant",
|
125 |
+
"Pilot",
|
126 |
+
"Plumber",
|
127 |
+
"Police Officer",
|
128 |
+
"Political Scientist",
|
129 |
+
"Preschool Teacher",
|
130 |
+
"Private Investigator",
|
131 |
+
"Product Manager",
|
132 |
+
"Professor",
|
133 |
+
"Lecturer",
|
134 |
+
"Programmer",
|
135 |
+
"Psychiatrist",
|
136 |
+
"Psychologist",
|
137 |
+
"Public Relations Specialist",
|
138 |
+
"Public School Teacher",
|
139 |
+
"Real Estate Agent",
|
140 |
+
"Broker",
|
141 |
+
"Receptionist",
|
142 |
+
"Registered Nurse",
|
143 |
+
"Reporter",
|
144 |
+
"Restaurant Manager",
|
145 |
+
"Sales Representative",
|
146 |
+
"School Counselor",
|
147 |
+
"Scientist",
|
148 |
+
"Screenwriter",
|
149 |
+
"Social Media Manager",
|
150 |
+
"Social Worker",
|
151 |
+
"Software Developer",
|
152 |
+
"Speech-Language Pathologist",
|
153 |
+
"Sports Coach",
|
154 |
+
"Statistician"
|
155 |
+
]
|
156 |
+
}
|
data/z_house.csv
DELETED
@@ -1,7 +0,0 @@
|
|
1 |
-
PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
|
2 |
-
1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
|
3 |
-
2,456 Elm St,New York,NY,10001,2,1,1200,750000
|
4 |
-
3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
|
5 |
-
4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
|
6 |
-
5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
|
7 |
-
6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/z_sentences.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ID,Sentences,Gender Bias Test
|
2 |
+
1,The doctor examined the patient.,Neutral
|
3 |
+
2,The teacher helped the student with their homework.,Neutral
|
4 |
+
3,The engineer designed the new software.,Neutral
|
5 |
+
4,The nurse cared for the sick child.,Neutral
|
6 |
+
5,The chef prepared a delicious meal.,Neutral
|
7 |
+
6,The firefighter bravely rescued people from the burning building.,Neutral
|
8 |
+
7,The police officer maintained law and order in the city.,Neutral
|
9 |
+
8,The scientist conducted groundbreaking research.,Neutral
|
10 |
+
9,The artist created a beautiful masterpiece.,Neutral
|
11 |
+
10,The CEO made important decisions for the company.,Neutral
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
gradio==3.
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
pandas==2.0.3
|
|
|
1 |
+
gradio==3.43.2
|
2 |
gradio_client==0.5.0
|
3 |
numpy==1.25.2
|
4 |
pandas==2.0.3
|
scripts/genbit.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from genbit.genbit_metrics import GenBitMetrics
|
2 |
+
|
3 |
+
|
4 |
+
def eval_genbit(data):
|
5 |
+
genbit_metrics = GenBitMetrics(
|
6 |
+
language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80
|
7 |
+
)
|
8 |
+
|
9 |
+
data[data.columns[0]] = data[data.columns[0]].to_list()
|
10 |
+
|
11 |
+
genbit_metrics.add_data(data, tokenized=False)
|
12 |
+
genbit_metrics = genbit_metrics.get_metrics(output_word_list=False)
|
13 |
+
|
14 |
+
return genbit_metrics
|
scripts/genbit_metrics.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
from genbit.genbit_metrics import GenBitMetrics
|
2 |
-
import pandas as pd
|
3 |
-
from utils.read_config import get_args
|
4 |
-
from utils.load_csv import load_sample
|
5 |
-
|
6 |
-
|
7 |
-
def cal_metrics(dataset):
|
8 |
-
# Create a GenBit object with the desired settings:
|
9 |
-
|
10 |
-
genbit_metrics_object = GenBitMetrics(language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80)
|
11 |
-
|
12 |
-
# Let's say you want to use GenBit with a test sentence, you can add the sentence to GenBit:
|
13 |
-
#dataset = ["I think she does not like cats. I think he does not like cats.", "He is a dog person."]
|
14 |
-
|
15 |
-
genbit_metrics_object.add_data(dataset, tokenized=False)
|
16 |
-
|
17 |
-
|
18 |
-
# To generate the gender bias metrics, we run `get_metrics` by setting `output_statistics` and `output_word_lists` to false, we can reduce the number of metrics created.
|
19 |
-
metrics = genbit_metrics_object.get_metrics(output_statistics=True, output_word_list=True)
|
20 |
-
|
21 |
-
return metrics
|
22 |
-
|
23 |
-
|
24 |
-
# Function to extract genbit metrics
|
25 |
-
def extract_genbit_metris(stats):
|
26 |
-
metrics = {}
|
27 |
-
metrics["genbit_score"] = str(stats["genbit_score"])
|
28 |
-
metrics["percentage_of_female_gender_definition_words"] = str(stats["percentage_of_female_gender_definition_words"])
|
29 |
-
metrics["percentage_of_male_gender_definition_words"] = str(stats["percentage_of_male_gender_definition_words"])
|
30 |
-
metrics["percentage_of_non_binary_gender_definition_words"] = str(stats["percentage_of_non_binary_gender_definition_words"])
|
31 |
-
metrics["percentage_of_trans_gender_definition_words"] = str(stats["percentage_of_trans_gender_definition_words"])
|
32 |
-
metrics["percentage_of_cis_gender_definition_words"] = str(stats["percentage_of_cis_gender_definition_words"])
|
33 |
-
metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
|
34 |
-
|
35 |
-
return metrics
|
36 |
-
|
37 |
-
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
|
38 |
-
|
39 |
-
|
40 |
-
sample_df = load_sample(num_sample_records, sample_method, df, col_name)
|
41 |
-
|
42 |
-
# Turn into a list of text.
|
43 |
-
sample_text = sample_df[col_name].tolist()
|
44 |
-
|
45 |
-
# Call cal_metrics function
|
46 |
-
stats = cal_metrics(sample_text)
|
47 |
-
metrics = extract_genbit_metris(stats)
|
48 |
-
return metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/{gender_tagging.py → gender_divide.py}
RENAMED
@@ -1,26 +1,23 @@
|
|
1 |
-
# Import required libraries
|
2 |
-
import pandas as pd
|
3 |
import re
|
4 |
-
|
5 |
-
|
|
|
|
|
6 |
|
7 |
-
# Function to get count of male terms in text
|
8 |
def count_male_terms(text, male_terms):
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
|
14 |
-
# Function to get count of female terms in text
|
15 |
def count_female_terms(text, female_terms):
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
|
21 |
-
# Function to get gender tag categories
|
22 |
def get_gender_tag(count_m_term, count_f_term):
|
23 |
-
tag =
|
24 |
if count_m_term == 0 and count_f_term == 0:
|
25 |
tag = "No Gender"
|
26 |
|
@@ -44,50 +41,60 @@ def get_gender_tag(count_m_term, count_f_term):
|
|
44 |
return tag
|
45 |
|
46 |
|
47 |
-
# Function to calculate PG and SPG
|
48 |
def get_pg_spg(sample_df):
|
49 |
-
count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"][
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
-
count_female_pg = sample_df[sample_df['gender_cat'] == "Female Positive Gender"]['gender_cat'].count()
|
58 |
-
count_female_spg = sample_df[sample_df['gender_cat'] == "Female Stronly Positive Gender"]['gender_cat'].count()
|
59 |
-
|
60 |
return {
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
}
|
69 |
|
70 |
-
# Function to load dataset and get the analysis done
|
71 |
-
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
|
72 |
-
# Read config file
|
73 |
-
male_terms = get_args("male_terms")
|
74 |
-
female_terms = get_args("female_terms")
|
75 |
-
# Load sample
|
76 |
-
sample_df = load_sample(num_sample_records, sample_method, df, col_name)
|
77 |
-
|
78 |
-
# Lowercase of text
|
79 |
-
sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
|
80 |
-
|
81 |
-
# Get new columns of count - male terms and female terms
|
82 |
-
sample_df['count_male_term'] = sample_df.apply(lambda x : count_male_terms(x[col_name], male_terms), axis=1)
|
83 |
-
sample_df['count_female_term'] = sample_df.apply(lambda x : count_female_terms(x[:], female_terms), axis=1)
|
84 |
-
|
85 |
-
# Get tag categories
|
86 |
-
sample_df['gender_cat'] = sample_df.apply(lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']), axis=1)
|
87 |
-
|
88 |
-
# Get statistics
|
89 |
-
collection = get_pg_spg(sample_df)
|
90 |
-
return collection
|
91 |
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
+
import json
|
3 |
+
|
4 |
+
gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
|
5 |
+
|
6 |
|
|
|
7 |
def count_male_terms(text, male_terms):
|
8 |
+
pattern = r"\b({})\b".format("|".join(male_terms))
|
9 |
+
match = re.findall(pattern, str(text))
|
10 |
+
return len(match)
|
11 |
+
|
12 |
|
|
|
13 |
def count_female_terms(text, female_terms):
|
14 |
+
pattern = r"\b({})\b".format("|".join(female_terms))
|
15 |
+
match = re.findall(pattern, str(text))
|
16 |
+
return len(match)
|
17 |
+
|
18 |
|
|
|
19 |
def get_gender_tag(count_m_term, count_f_term):
|
20 |
+
tag = ""
|
21 |
if count_m_term == 0 and count_f_term == 0:
|
22 |
tag = "No Gender"
|
23 |
|
|
|
41 |
return tag
|
42 |
|
43 |
|
|
|
44 |
def get_pg_spg(sample_df):
|
45 |
+
count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"][
|
46 |
+
"gender_cat"
|
47 |
+
].count()
|
48 |
+
|
49 |
+
count_gender_sentences = sample_df[sample_df["gender_cat"] != "No Gender"][
|
50 |
+
"gender_cat"
|
51 |
+
].count()
|
52 |
+
count_equal_gender = sample_df[sample_df["gender_cat"] == "Equal Gender"][
|
53 |
+
"gender_cat"
|
54 |
+
].count()
|
55 |
+
|
56 |
+
count_male_pg = sample_df[sample_df["gender_cat"] == "Male Positive Gender"][
|
57 |
+
"gender_cat"
|
58 |
+
].count()
|
59 |
+
count_male_spg = sample_df[
|
60 |
+
sample_df["gender_cat"] == "Male Strongly Positive Gender"
|
61 |
+
]["gender_cat"].count()
|
62 |
+
|
63 |
+
count_female_pg = sample_df[sample_df["gender_cat"] == "Female Positive Gender"][
|
64 |
+
"gender_cat"
|
65 |
+
].count()
|
66 |
+
count_female_spg = sample_df[
|
67 |
+
sample_df["gender_cat"] == "Female Stronly Positive Gender"
|
68 |
+
]["gender_cat"].count()
|
69 |
|
|
|
|
|
|
|
70 |
return {
|
71 |
+
"gender": str(count_gender_sentences),
|
72 |
+
"no gender": str(count_no_gender_sentences),
|
73 |
+
"equal gender": str(count_equal_gender),
|
74 |
+
"female pg": str(count_female_pg),
|
75 |
+
"male pg": str(count_male_pg),
|
76 |
+
"female spg": str(count_female_spg),
|
77 |
+
"male spg": str(count_male_spg),
|
78 |
}
|
79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
+
def eval_gender_divide(data):
|
82 |
+
male_terms = gender_lexicons.get("male_lexicons")
|
83 |
+
female_terms = gender_lexicons.get("female_lexicons")
|
84 |
+
|
85 |
+
data[data.columns[0]] = data[data.columns[0]].str.lower().str.strip()
|
86 |
+
|
87 |
+
data["count_male_term"] = data.apply(
|
88 |
+
lambda x: count_male_terms(x[data.columns[0]], male_terms), axis=1
|
89 |
+
)
|
90 |
+
data["count_female_term"] = data.apply(
|
91 |
+
lambda x: count_female_terms(x[:], female_terms), axis=1
|
92 |
+
)
|
93 |
+
|
94 |
+
data["gender_cat"] = data.apply(
|
95 |
+
lambda row: get_gender_tag(row["count_male_term"], row["count_female_term"]),
|
96 |
+
axis=1,
|
97 |
+
)
|
98 |
+
|
99 |
+
collection = get_pg_spg(data)
|
100 |
+
return collection
|
scripts/{gender_profession_tagging.py → gender_profession_bias.py}
RENAMED
@@ -1,43 +1,43 @@
|
|
1 |
-
import pandas as pd
|
2 |
import re
|
3 |
-
import
|
4 |
-
from spacy.lang.en import English
|
5 |
-
import time
|
6 |
-
from tqdm import tqdm
|
7 |
-
import multiprocessing.pool
|
8 |
|
9 |
-
import
|
10 |
-
|
11 |
-
from
|
12 |
-
from utils.load_csv import load_sample
|
13 |
|
|
|
|
|
14 |
|
15 |
-
# For sentence split
|
16 |
nlp = English()
|
17 |
nlp.add_pipe("sentencizer")
|
18 |
|
19 |
-
# Function to split sentences
|
20 |
-
def get_split_text(text):
|
21 |
|
|
|
22 |
doc = nlp(text)
|
23 |
sentences = [sent for sent in doc.sents]
|
24 |
return sentences
|
25 |
|
26 |
-
def get_gender_prof_match_details(df_text):
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
32 |
|
33 |
-
# Get regex pattern
|
34 |
-
male_pronoun_pat, female_pronoun_pat, professions_pat = get_regex_pattern(male_pronoun, female_pronoun, professions)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
split_text = get_split_text(df_text)
|
38 |
|
39 |
results = []
|
40 |
-
|
41 |
for text in split_text:
|
42 |
male_pronoun_match = re.findall(male_pronoun_pat, str(text))
|
43 |
female_pronoun_match = re.findall(female_pronoun_pat, str(text))
|
@@ -52,78 +52,60 @@ def get_gender_prof_match_details(df_text):
|
|
52 |
if len(female_pronoun_match) != 0 and len(prof_match) != 0:
|
53 |
both_match = "Yes"
|
54 |
|
55 |
-
# Unpack from list
|
56 |
male_pronoun_match = ",".join(male_pronoun_match)
|
57 |
female_pronoun_match = ",".join(female_pronoun_match)
|
58 |
|
59 |
prof_match = ",".join(prof_match)
|
60 |
|
61 |
-
results.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
return results
|
64 |
|
65 |
-
|
66 |
def call_multiprocessing_pool(df_text):
|
67 |
concurrent = 2000
|
68 |
pool = multiprocessing.pool.ThreadPool(processes=concurrent)
|
69 |
result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
|
70 |
pool.close()
|
71 |
|
72 |
-
# return_list is nested -- we need to flatten it
|
73 |
flat_return_list = [item for sublist in result_list for item in sublist]
|
74 |
|
75 |
-
|
76 |
-
cols = ["Split_Text", 'Male Pronoun', 'Female Pronoun', 'Profession', "Both Match"]
|
77 |
return_df = pd.DataFrame(flat_return_list, columns=cols)
|
78 |
|
79 |
return return_df
|
80 |
|
81 |
-
# Function to get statistics
|
82 |
-
def get_statistics(results_df):
|
83 |
-
count_total_sentence = results_df.shape[0]
|
84 |
-
count_both_match = results_df[results_df["Both Match"] == "Yes"]['Both Match'].count()
|
85 |
-
count_male_pronoun = results_df[results_df["Male Pronoun"] != ""]["Male Pronoun"].count()
|
86 |
-
count_female_pronoun = results_df[results_df["Female Pronoun"] != ""]["Female Pronoun"].count()
|
87 |
-
|
88 |
-
count_male_pronoun_profession = results_df[(results_df["Male Pronoun"] != "") & (results_df["Profession"] != "")]["Male Pronoun"].count()
|
89 |
-
count_female_pronoun_profession = results_df[(results_df["Female Pronoun"] != "") & (results_df["Profession"] != "")]["Female Pronoun"].count()
|
90 |
-
|
91 |
-
return{
|
92 |
-
"total_sentence" : str(count_total_sentence),
|
93 |
-
"both_gender_prof_match" : str(count_both_match),
|
94 |
-
"count_male_pronoun" : str(count_male_pronoun),
|
95 |
-
"count_female_pronoun" : str(count_female_pronoun),
|
96 |
-
"count_male_pronoun_profession" : str(count_male_pronoun_profession),
|
97 |
-
"count_female_pronoun_profession" : str(count_female_pronoun_profession)
|
98 |
-
}
|
99 |
-
|
100 |
-
# Function to return regular expression patterns
|
101 |
-
def get_regex_pattern(male_pronoun, female_pronoun, professions):
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
112 |
|
|
|
113 |
|
114 |
-
|
115 |
-
# Get args from config file
|
116 |
|
117 |
-
|
118 |
|
119 |
|
120 |
-
|
121 |
-
|
122 |
|
123 |
-
|
124 |
-
|
125 |
|
126 |
-
stats
|
127 |
-
|
128 |
-
# Get statistics
|
129 |
-
return stats
|
|
|
|
|
1 |
import re
|
2 |
+
import json
|
|
|
|
|
|
|
|
|
3 |
|
4 |
+
import pandas as pd
|
5 |
+
import multiprocessing.pool
|
6 |
+
from spacy.lang.en import English
|
|
|
7 |
|
8 |
+
gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
|
9 |
+
profession_lexicons = json.load(open("config/profession_lexicons.json", "r"))
|
10 |
|
|
|
11 |
nlp = English()
|
12 |
nlp.add_pipe("sentencizer")
|
13 |
|
|
|
|
|
14 |
|
15 |
+
def get_split_text(text):
|
16 |
doc = nlp(text)
|
17 |
sentences = [sent for sent in doc.sents]
|
18 |
return sentences
|
19 |
|
|
|
20 |
|
21 |
+
def compile_regex_patterns(patterns):
|
22 |
+
return [
|
23 |
+
re.compile(r"\b({})\b".format("|".join(pattern)), flags=re.IGNORECASE)
|
24 |
+
for pattern in patterns
|
25 |
+
]
|
26 |
|
|
|
|
|
27 |
|
28 |
+
def get_gender_prof_match_details(df_text):
|
29 |
+
male_pronouns = gender_lexicons.get("male_pronouns")
|
30 |
+
female_pronouns = gender_lexicons.get("female_pronouns")
|
31 |
+
professions = profession_lexicons.get("professions")
|
32 |
+
|
33 |
+
male_pronoun_pat, female_pronoun_pat, professions_pat = compile_regex_patterns(
|
34 |
+
[male_pronouns, female_pronouns, professions]
|
35 |
+
)
|
36 |
|
37 |
split_text = get_split_text(df_text)
|
38 |
|
39 |
results = []
|
40 |
+
|
41 |
for text in split_text:
|
42 |
male_pronoun_match = re.findall(male_pronoun_pat, str(text))
|
43 |
female_pronoun_match = re.findall(female_pronoun_pat, str(text))
|
|
|
52 |
if len(female_pronoun_match) != 0 and len(prof_match) != 0:
|
53 |
both_match = "Yes"
|
54 |
|
|
|
55 |
male_pronoun_match = ",".join(male_pronoun_match)
|
56 |
female_pronoun_match = ",".join(female_pronoun_match)
|
57 |
|
58 |
prof_match = ",".join(prof_match)
|
59 |
|
60 |
+
results.append(
|
61 |
+
(
|
62 |
+
str(text),
|
63 |
+
male_pronoun_match,
|
64 |
+
female_pronoun_match,
|
65 |
+
prof_match,
|
66 |
+
both_match,
|
67 |
+
)
|
68 |
+
)
|
69 |
|
70 |
return results
|
71 |
|
72 |
+
|
73 |
def call_multiprocessing_pool(df_text):
|
74 |
concurrent = 2000
|
75 |
pool = multiprocessing.pool.ThreadPool(processes=concurrent)
|
76 |
result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
|
77 |
pool.close()
|
78 |
|
|
|
79 |
flat_return_list = [item for sublist in result_list for item in sublist]
|
80 |
|
81 |
+
cols = ["Split Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
|
|
|
82 |
return_df = pd.DataFrame(flat_return_list, columns=cols)
|
83 |
|
84 |
return return_df
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
+
def get_statistics(result):
|
88 |
+
conditions = {
|
89 |
+
"both_gender_prof_match": result["Both Match"].eq("Yes"),
|
90 |
+
"count_male_pronoun": result["Male Pronoun"].ne(""),
|
91 |
+
"count_female_pronoun": result["Female Pronoun"].ne(""),
|
92 |
+
"count_male_pronoun_profession": result["Male Pronoun"].ne("")
|
93 |
+
& result["Profession"].ne(""),
|
94 |
+
"count_female_pronoun_profession": result["Female Pronoun"].ne("")
|
95 |
+
& result["Profession"].ne(""),
|
96 |
+
}
|
97 |
|
98 |
+
stats = {key: str(value.sum()) for key, value in conditions.items()}
|
99 |
|
100 |
+
stats["total_sentence"] = str(len(result))
|
|
|
101 |
|
102 |
+
return stats
|
103 |
|
104 |
|
105 |
+
def eval_gender_profession(data):
|
106 |
+
data = data[data.columns[0]].str.lower().str.strip()
|
107 |
|
108 |
+
result = call_multiprocessing_pool(data)
|
109 |
+
stats = get_statistics(result)
|
110 |
|
111 |
+
return stats
|
|
|
|
|
|
utils/config.json
DELETED
@@ -1,160 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"first_records" : 2000,
|
3 |
-
"random_seed" : 42,
|
4 |
-
"male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
|
5 |
-
"female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
|
6 |
-
"male_pronoun" : ["he", "him", "his"],
|
7 |
-
"female_pronoun" : ["she", "her", "hers"],
|
8 |
-
"professions" : ["Accountant",
|
9 |
-
"Actor",
|
10 |
-
"Actress",
|
11 |
-
"Aerospace Engineer",
|
12 |
-
"Agricultural Scientist",
|
13 |
-
"Air Traffic Controller",
|
14 |
-
"Aircraft Mechanic",
|
15 |
-
"Animator",
|
16 |
-
"Architect",
|
17 |
-
"Art Director",
|
18 |
-
"Attorney",
|
19 |
-
"Lawyer",
|
20 |
-
"Audiologist",
|
21 |
-
"Author",
|
22 |
-
"Writer",
|
23 |
-
"Baker",
|
24 |
-
"Barber",
|
25 |
-
"Hairdresser",
|
26 |
-
"Bartender",
|
27 |
-
"Biomedical Engineer",
|
28 |
-
"Botanist",
|
29 |
-
"Broadcast Journalist",
|
30 |
-
"Business Analyst",
|
31 |
-
"Carpenter",
|
32 |
-
"Chef",
|
33 |
-
"Cook",
|
34 |
-
"Chemist",
|
35 |
-
"Civil Engineer",
|
36 |
-
"Clinical Psychologist",
|
37 |
-
"Commercial Diver",
|
38 |
-
"Computer Programmer",
|
39 |
-
"Construction Worker",
|
40 |
-
"Corporate Trainer",
|
41 |
-
"Cosmetologist",
|
42 |
-
"Counselor",
|
43 |
-
"Therapist",
|
44 |
-
"Court Reporter",
|
45 |
-
"Creative Director",
|
46 |
-
"Criminologist",
|
47 |
-
"Customer Service Representative",
|
48 |
-
"Data Analyst",
|
49 |
-
"Dental Assistant",
|
50 |
-
"Dentist",
|
51 |
-
"Dermatologist",
|
52 |
-
"Dietician",
|
53 |
-
"Nutritionist",
|
54 |
-
"Doctor",
|
55 |
-
"Physician",
|
56 |
-
"Economist",
|
57 |
-
"Electrician",
|
58 |
-
"Elementary School Teacher",
|
59 |
-
"Emergency Medical Technician",
|
60 |
-
"Engineer",
|
61 |
-
"Environmental Scientist",
|
62 |
-
"Event Planner",
|
63 |
-
"Fashion Designer",
|
64 |
-
"Film Director",
|
65 |
-
"Financial Analyst",
|
66 |
-
"Firefighter",
|
67 |
-
"Fisherman",
|
68 |
-
"Fitness Trainer",
|
69 |
-
"Flight Attendant",
|
70 |
-
"Florist",
|
71 |
-
"Food Scientist",
|
72 |
-
"Forensic Scientist",
|
73 |
-
"Furniture Maker",
|
74 |
-
"Game Developer",
|
75 |
-
"Gardener",
|
76 |
-
"Landscaper",
|
77 |
-
"Geologist",
|
78 |
-
"Graphic Designer",
|
79 |
-
"Hair Stylist",
|
80 |
-
"Historian",
|
81 |
-
"Home Health Aide",
|
82 |
-
"Hotel Manager",
|
83 |
-
"Human Resources Manager",
|
84 |
-
"Immigration Lawyer",
|
85 |
-
"Industrial Designer",
|
86 |
-
"Insurance Agent",
|
87 |
-
"Interior Designer",
|
88 |
-
"Interpreter",
|
89 |
-
"Translator",
|
90 |
-
"Investment Banker",
|
91 |
-
"IT Specialist",
|
92 |
-
"Journalist",
|
93 |
-
"Judge",
|
94 |
-
"Kindergarten Teacher",
|
95 |
-
"Land Surveyor",
|
96 |
-
"Landscape Architect",
|
97 |
-
"Lawyer",
|
98 |
-
"Attorney",
|
99 |
-
"Librarian",
|
100 |
-
"Life Coach",
|
101 |
-
"Linguist",
|
102 |
-
"Makeup Artist",
|
103 |
-
"Management Consultant",
|
104 |
-
"Manufacturing Engineer",
|
105 |
-
"Marine Biologist",
|
106 |
-
"Marketing Manager",
|
107 |
-
"Massage Therapist",
|
108 |
-
"Mechanical Engineer",
|
109 |
-
"Medical Assistant",
|
110 |
-
"Medical Researcher",
|
111 |
-
"Meteorologist",
|
112 |
-
"Midwife",
|
113 |
-
"Military Officer",
|
114 |
-
"Music Producer",
|
115 |
-
"Musician",
|
116 |
-
"Nurse",
|
117 |
-
"Occupational Therapist",
|
118 |
-
"Optician",
|
119 |
-
"Optometrist",
|
120 |
-
"Paralegal",
|
121 |
-
"Paramedic",
|
122 |
-
"Patent Attorney",
|
123 |
-
"Pediatrician",
|
124 |
-
"Personal Trainer",
|
125 |
-
"Petroleum Engineer",
|
126 |
-
"Pharmacist",
|
127 |
-
"Photographer",
|
128 |
-
"Physical Therapist",
|
129 |
-
"Physician Assistant",
|
130 |
-
"Pilot",
|
131 |
-
"Plumber",
|
132 |
-
"Police Officer",
|
133 |
-
"Political Scientist",
|
134 |
-
"Preschool Teacher",
|
135 |
-
"Private Investigator",
|
136 |
-
"Product Manager",
|
137 |
-
"Professor",
|
138 |
-
"Lecturer",
|
139 |
-
"Programmer",
|
140 |
-
"Psychiatrist",
|
141 |
-
"Psychologist",
|
142 |
-
"Public Relations Specialist",
|
143 |
-
"Public School Teacher",
|
144 |
-
"Real Estate Agent",
|
145 |
-
"Broker",
|
146 |
-
"Receptionist",
|
147 |
-
"Registered Nurse",
|
148 |
-
"Reporter",
|
149 |
-
"Restaurant Manager",
|
150 |
-
"Sales Representative",
|
151 |
-
"School Counselor",
|
152 |
-
"Scientist",
|
153 |
-
"Screenwriter",
|
154 |
-
"Social Media Manager",
|
155 |
-
"Social Worker",
|
156 |
-
"Software Developer",
|
157 |
-
"Speech-Language Pathologist",
|
158 |
-
"Sports Coach",
|
159 |
-
"Statistician"]
|
160 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/load_csv.py
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
from utils.read_config import get_args
|
3 |
-
|
4 |
-
# Function to load sample of dataset
|
5 |
-
|
6 |
-
|
7 |
-
def load_sample(num_sample_records, sample_method, df, col_name):
|
8 |
-
|
9 |
-
sample_first_records = get_args("first_records")
|
10 |
-
sample_random_seed = get_args("random_seed")
|
11 |
-
|
12 |
-
num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
|
13 |
-
|
14 |
-
# Keep only required column
|
15 |
-
df = df[[col_name]]
|
16 |
-
if sample_method == "First":
|
17 |
-
df = df.iloc[:num_sample_records].copy().reset_index()
|
18 |
-
if sample_method == "Last":
|
19 |
-
df = df.iloc[-num_sample_records:].copy().reset_index()
|
20 |
-
if sample_method == "Random":
|
21 |
-
df = df.sample(num_sample_records,
|
22 |
-
random_state=sample_random_seed).copy().reset_index()
|
23 |
-
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/read_config.py
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
|
3 |
-
def read_config_file():
|
4 |
-
with open("utils/config.json", "r") as jsonfile:
|
5 |
-
data = json.load(jsonfile)
|
6 |
-
return data
|
7 |
-
|
8 |
-
def get_args(args):
|
9 |
-
try:
|
10 |
-
data = read_config_file()
|
11 |
-
except:
|
12 |
-
raise "Could not read config file."
|
13 |
-
return data[args]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|