freyam committed on
Commit
7192c24
1 Parent(s): e0a1479

Restructure UX and optimise scripts for performance

Browse files
app.py CHANGED
@@ -110,7 +110,7 @@ def load_dataset(local_dataset, hf_dataset):
110
  )
111
 
112
  dataset_import_btn = gr.Button(
113
- value="Import",
114
  interactive=True,
115
  variant="primary",
116
  visible=True,
@@ -156,7 +156,17 @@ def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_colum
156
  DATASET["sampling_size"] = dataset_sampling_size
157
  DATASET["column"] = dataset_column
158
 
159
- return gr.Markdown(f"## Dataset (`{DATASET['name']}`)")
 
 
 
 
 
 
 
 
 
 
160
 
161
 
162
  def import_methodology(methodology):
@@ -164,8 +174,7 @@ def import_methodology(methodology):
164
 
165
  return (
166
  gr.Markdown(
167
- f"## Methodology (`{methodology}`)",
168
- visible=True,
169
  ),
170
  gr.Markdown(
171
  METHODOLOGIES[methodology]["description"],
@@ -173,7 +182,7 @@ def import_methodology(methodology):
173
  ),
174
  gr.Button(
175
  value="Evaluate",
176
- interactive=True,
177
  variant="primary",
178
  visible=True,
179
  ),
@@ -220,8 +229,11 @@ with BiasAware:
220
  hf_dataset = gr.Textbox(visible=False)
221
  hf_dataset_search_results = gr.Radio(visible=False)
222
 
223
- dataset_load_btn = gr.Button(visible=False)
224
- dataset_import_btn = gr.Button(visible=False)
 
 
 
225
 
226
  dataset_sampling_method = gr.Radio(visible=False)
227
  dataset_sampling_size = gr.Slider(visible=False)
@@ -237,6 +249,11 @@ with BiasAware:
237
  choices=METHODOLOGIES.keys(),
238
  )
239
 
 
 
 
 
 
240
  evaluation_btn = gr.Button(
241
  value="Evaluate",
242
  interactive=False,
@@ -244,11 +261,6 @@ with BiasAware:
244
  visible=True,
245
  )
246
 
247
- methodology_description = gr.Markdown(visible=False)
248
-
249
- with gr.Column(scale=2):
250
- result_title = gr.Markdown("## Results")
251
-
252
  result_description = gr.Markdown(visible=False)
253
  result_plot = gr.Plot(show_label=False, container=False, visible=False)
254
  result_df = gr.DataFrame(visible=False)
@@ -343,13 +355,13 @@ with BiasAware:
343
  dataset_sampling_size,
344
  dataset_column,
345
  ],
346
- outputs=[dataset_title],
347
  )
348
 
349
  methodology.input(
350
  fn=import_methodology,
351
  inputs=[methodology],
352
- outputs=[methodology_title, methodology_description, evaluation_btn],
353
  )
354
 
355
  evaluation_btn.click(
 
110
  )
111
 
112
  dataset_import_btn = gr.Button(
113
+ value="Import Dataset",
114
  interactive=True,
115
  variant="primary",
116
  visible=True,
 
156
  DATASET["sampling_size"] = dataset_sampling_size
157
  DATASET["column"] = dataset_column
158
 
159
+ return (
160
+ gr.Markdown(
161
+ f"## Results (Dataset: {'✅' if DATASET['name'] else '❎'}) (Methodology: {'✅' if DATASET['methodology'] else '❎'})"
162
+ ),
163
+ gr.Button(
164
+ value="Evaluate",
165
+ interactive=(True if DATASET["name"] and DATASET["methodology"] else False),
166
+ variant="primary",
167
+ visible=True,
168
+ ),
169
+ )
170
 
171
 
172
  def import_methodology(methodology):
 
174
 
175
  return (
176
  gr.Markdown(
177
+ f"## Results (Dataset: {'✅' if DATASET['name'] else '❎'}) (Methodology: {'✅' if DATASET['methodology'] else '❎'})"
 
178
  ),
179
  gr.Markdown(
180
  METHODOLOGIES[methodology]["description"],
 
182
  ),
183
  gr.Button(
184
  value="Evaluate",
185
+ interactive=(True if DATASET["name"] and DATASET["methodology"] else False),
186
  variant="primary",
187
  visible=True,
188
  ),
 
229
  hf_dataset = gr.Textbox(visible=False)
230
  hf_dataset_search_results = gr.Radio(visible=False)
231
 
232
+ with gr.Row():
233
+ with gr.Column(scale=1):
234
+ dataset_load_btn = gr.Button(visible=False)
235
+ with gr.Column(scale=1):
236
+ dataset_import_btn = gr.Button(visible=False)
237
 
238
  dataset_sampling_method = gr.Radio(visible=False)
239
  dataset_sampling_size = gr.Slider(visible=False)
 
249
  choices=METHODOLOGIES.keys(),
250
  )
251
 
252
+ methodology_description = gr.Markdown(visible=False)
253
+
254
+ with gr.Column(scale=2):
255
+ result_title = gr.Markdown("## Results (Dataset: ❎) (Methodology: ❎)")
256
+
257
  evaluation_btn = gr.Button(
258
  value="Evaluate",
259
  interactive=False,
 
261
  visible=True,
262
  )
263
 
 
 
 
 
 
264
  result_description = gr.Markdown(visible=False)
265
  result_plot = gr.Plot(show_label=False, container=False, visible=False)
266
  result_df = gr.DataFrame(visible=False)
 
355
  dataset_sampling_size,
356
  dataset_column,
357
  ],
358
+ outputs=[result_title, evaluation_btn],
359
  )
360
 
361
  methodology.input(
362
  fn=import_methodology,
363
  inputs=[methodology],
364
+ outputs=[result_title, methodology_description, evaluation_btn],
365
  )
366
 
367
  evaluation_btn.click(
scripts/gender_distribution.py CHANGED
@@ -3,83 +3,68 @@ import json
3
  import plotly.express as px
4
  import pandas as pd
5
 
6
- with open("config/gender_lexicons.json", "r") as lexicon_file:
7
- gender_lexicons = json.load(lexicon_file)
8
 
9
- male_lexicon = set(gender_lexicons.get("male_lexicons"))
10
- female_lexicon = set(gender_lexicons.get("female_lexicons"))
11
-
12
- male_pattern = re.compile(r"\b({})\b".format("|".join(map(re.escape, male_lexicon))))
13
- female_pattern = re.compile(
14
- r"\b({})\b".format("|".join(map(re.escape, female_lexicon)))
15
- )
16
 
17
 
18
  def count_gender_terms(text, gender_pattern):
19
- matches = re.findall(gender_pattern, text)
20
- return len(matches)
21
 
22
 
23
  def get_gender_tag(count_male_terms, count_female_terms):
24
  total_terms = count_male_terms + count_female_terms
25
-
26
  if total_terms == 0:
27
  return "No Gender"
28
 
29
  male_proportion = (count_male_terms / total_terms) * 100
 
 
30
  if male_proportion >= 75:
31
  return "Male Strongly Positive Gender"
32
  elif male_proportion >= 50:
33
  return "Male Positive Gender"
34
-
35
- female_proportion = (count_female_terms / total_terms) * 100
36
- if female_proportion >= 75:
37
  return "Female Strongly Positive Gender"
38
  elif female_proportion >= 50:
39
  return "Female Positive Gender"
40
-
41
  return "Equal Gender"
42
 
43
 
44
- def get_gender_category_counts(sample_df):
45
- gender_labels = [
46
- "No Gender",
47
- "Equal Gender",
48
- "Male Positive Gender",
49
- "Male Strongly Positive Gender",
50
- "Female Positive Gender",
51
- "Female Strongly Positive Gender",
52
- ]
53
-
54
- gender_counts = sample_df["gender_category"].value_counts()
55
- result = {label: str(gender_counts.get(label, 0)) for label in gender_labels}
56
 
57
- return result
 
 
 
 
 
58
 
 
 
 
 
59
 
60
- def plot_gender_category_counts(gender_labels):
61
- labels = [
62
- "No Gender",
63
- "Equal Gender",
64
- "Male Positive Gender",
65
- "Male Strongly Positive Gender",
66
- "Female Positive Gender",
67
- "Female Strongly Positive Gender",
68
- ]
69
 
70
- values = [gender_labels[label] for label in labels]
71
 
 
72
  fig = px.pie(
73
  values=values,
74
  names=labels,
75
  title="Gender Distribution",
76
- category_orders={"names": labels},
77
  )
78
 
79
  fig.update_traces(
80
  pull=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
81
  textinfo="percent+label",
82
- marker=dict(line=dict(color="#000000", width=1)),
 
 
83
  )
84
 
85
  fig.update_layout(showlegend=False)
@@ -88,28 +73,29 @@ def plot_gender_category_counts(gender_labels):
88
 
89
 
90
  def eval_gender_distribution(data):
91
- data[data.columns[0]] = data[data.columns[0]].str.lower().str.strip()
92
 
93
- data["count_male_terms"] = data[data.columns[0]].apply(
94
- lambda x: count_gender_terms(x, male_pattern)
95
- )
96
- data["count_female_terms"] = data[data.columns[0]].apply(
97
- lambda x: count_gender_terms(x, female_pattern)
98
  )
99
 
100
- data["gender_category"] = data.apply(
101
- lambda row: get_gender_tag(row["count_male_terms"], row["count_female_terms"]),
102
- axis=1,
103
- )
 
 
 
 
104
 
105
- result_json = get_gender_category_counts(data)
106
- result_plot = plot_gender_category_counts(result_json)
 
107
 
108
- result_df = (
109
- pd.DataFrame.from_dict(result_json, orient="index")
110
- .reset_index()
111
- .rename(columns={"index": "Metric", 0: "Value"})
112
  )
 
113
 
114
  result_conclusion = ""
115
 
 
3
  import plotly.express as px
4
  import pandas as pd
5
 
 
 
6
 
7
+ def load_gender_lexicons():
8
+ with open("config/gender_lexicons.json", "r") as lexicon_file:
9
+ gender_lexicons = json.load(lexicon_file)
10
+ return gender_lexicons
 
 
 
11
 
12
 
13
  def count_gender_terms(text, gender_pattern):
14
+ return len(gender_pattern.findall(text))
 
15
 
16
 
17
  def get_gender_tag(count_male_terms, count_female_terms):
18
  total_terms = count_male_terms + count_female_terms
 
19
  if total_terms == 0:
20
  return "No Gender"
21
 
22
  male_proportion = (count_male_terms / total_terms) * 100
23
+ female_proportion = (count_female_terms / total_terms) * 100
24
+
25
  if male_proportion >= 75:
26
  return "Male Strongly Positive Gender"
27
  elif male_proportion >= 50:
28
  return "Male Positive Gender"
29
+ elif female_proportion >= 75:
 
 
30
  return "Female Strongly Positive Gender"
31
  elif female_proportion >= 50:
32
  return "Female Positive Gender"
 
33
  return "Equal Gender"
34
 
35
 
36
+ def analyze_text(text, gender_lexicons):
37
+ male_lexicon = set(gender_lexicons.get("male_lexicons"))
38
+ female_lexicon = set(gender_lexicons.get("female_lexicons"))
 
 
 
 
 
 
 
 
 
39
 
40
+ male_pattern = re.compile(
41
+ r"\b({})\b".format("|".join(map(re.escape, male_lexicon)))
42
+ )
43
+ female_pattern = re.compile(
44
+ r"\b({})\b".format("|".join(map(re.escape, female_lexicon)))
45
+ )
46
 
47
+ text = text.lower().strip()
48
+ count_male_terms = count_gender_terms(text, male_pattern)
49
+ count_female_terms = count_gender_terms(text, female_pattern)
50
+ gender_category = get_gender_tag(count_male_terms, count_female_terms)
51
 
52
+ return count_male_terms, count_female_terms, gender_category
 
 
 
 
 
 
 
 
53
 
 
54
 
55
+ def plot_gender_category_counts(labels, values):
56
  fig = px.pie(
57
  values=values,
58
  names=labels,
59
  title="Gender Distribution",
 
60
  )
61
 
62
  fig.update_traces(
63
  pull=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1],
64
  textinfo="percent+label",
65
+ marker=dict(
66
+ line=dict(color="#000000", width=1),
67
+ ),
68
  )
69
 
70
  fig.update_layout(showlegend=False)
 
73
 
74
 
75
  def eval_gender_distribution(data):
76
+ gender_lexicons = load_gender_lexicons()
77
 
78
+ data["count_male_terms"], data["count_female_terms"], data["gender_category"] = zip(
79
+ *data[data.columns[0]].apply(lambda x: analyze_text(x, gender_lexicons))
 
 
 
80
  )
81
 
82
+ gender_labels = [
83
+ "No Gender",
84
+ "Equal Gender",
85
+ "Male Positive Gender",
86
+ "Male Strongly Positive Gender",
87
+ "Female Positive Gender",
88
+ "Female Strongly Positive Gender",
89
+ ]
90
 
91
+ gender_counts = (
92
+ data["gender_category"].value_counts().reindex(gender_labels, fill_value=0)
93
+ )
94
 
95
+ result_df = pd.DataFrame(
96
+ {"Metric": gender_counts.index, "Value": gender_counts.values}
 
 
97
  )
98
+ result_plot = plot_gender_category_counts(gender_labels, gender_counts)
99
 
100
  result_conclusion = ""
101
 
scripts/gender_profession_bias.py CHANGED
@@ -6,15 +6,13 @@ import plotly.express as px
6
  import multiprocessing.pool
7
  from spacy.lang.en import English
8
 
9
- gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
10
- profession_lexicons = json.load(open("config/profession_lexicons.json", "r"))
11
 
12
  nlp = English()
13
  nlp.add_pipe("sentencizer")
14
 
15
 
16
  def call_multiprocessing_pool(df_text):
17
- concurrent = 2000
18
  pool = multiprocessing.pool.ThreadPool(processes=concurrent)
19
  result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
20
  pool.close()
@@ -27,29 +25,21 @@ def call_multiprocessing_pool(df_text):
27
  return return_df
28
 
29
 
30
- def get_split_text(text):
31
- doc = nlp(text)
32
- sentences = [sent for sent in doc.sents]
33
- return sentences
34
-
35
-
36
- def compile_regex_patterns(patterns):
37
- return [
38
- re.compile(r"\b({})\b".format("|".join(pattern)), flags=re.IGNORECASE)
39
- for pattern in patterns
40
- ]
41
-
42
-
43
  def get_gender_prof_match_details(df_text):
 
 
 
44
  male_pronouns = gender_lexicons.get("male_pronouns")
45
  female_pronouns = gender_lexicons.get("female_pronouns")
46
  professions = profession_lexicons.get("professions")
47
 
48
- male_pronoun_pat, female_pronoun_pat, professions_pat = compile_regex_patterns(
49
- [male_pronouns, female_pronouns, professions]
 
50
  )
51
 
52
- split_text = get_split_text(df_text)
 
53
 
54
  results = []
55
 
 
6
  import multiprocessing.pool
7
  from spacy.lang.en import English
8
 
 
 
9
 
10
  nlp = English()
11
  nlp.add_pipe("sentencizer")
12
 
13
 
14
  def call_multiprocessing_pool(df_text):
15
+ concurrent = multiprocessing.cpu_count()
16
  pool = multiprocessing.pool.ThreadPool(processes=concurrent)
17
  result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
18
  pool.close()
 
25
  return return_df
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def get_gender_prof_match_details(df_text):
29
+ gender_lexicons = json.load(open("config/gender_lexicons.json", "r"))
30
+ profession_lexicons = json.load(open("config/profession_lexicons.json", "r"))
31
+
32
  male_pronouns = gender_lexicons.get("male_pronouns")
33
  female_pronouns = gender_lexicons.get("female_pronouns")
34
  professions = profession_lexicons.get("professions")
35
 
36
+ male_pronoun_pat, female_pronoun_pat, professions_pat = (
37
+ re.compile(r"\b({})\b".format("|".join(pattern)), flags=re.IGNORECASE)
38
+ for pattern in [male_pronouns, female_pronouns, professions]
39
  )
40
 
41
+ doc = nlp(df_text)
42
+ split_text = [sent for sent in doc.sents]
43
 
44
  results = []
45