freyam committed on
Commit
0998e6d
1 Parent(s): 8ab9329

Fix HuggingFace global search

Browse files
Files changed (1) hide show
  1. app.py +123 -94
app.py CHANGED
@@ -1,26 +1,24 @@
1
- import os
2
  import json
 
3
  import timeit
 
 
4
  import gradio as gr
5
  import pandas as pd
6
- from datetime import date
 
 
 
7
 
8
  from scripts.genbit import *
9
- from scripts.gender_profession_bias import *
10
  from scripts.gender_distribution import *
 
11
 
12
- from datasets import load_dataset as hf_load_dataset
13
- from huggingface_hub import DatasetFilter, list_datasets
14
-
15
- from avidtools.datamodels.report import Report
16
- from avidtools.datamodels.components import *
17
- from avidtools.datamodels.enums import *
18
-
19
- MAX_THRESHOLD = 1000
20
- METHODOLOGIES = json.load(open("config/methodologies.json", "r", encoding="utf8"))
21
 
22
- DATASET = {
23
- "name": None,
24
  "source": None,
25
  "df": None,
26
  "sampling_method": None,
@@ -32,15 +30,15 @@ DATASET = {
32
 
33
 
34
  def generate_avid_report():
35
- dataset_id = DATASET["name"]
36
- methodology = DATASET["methodology"]
37
- result_json = DATASET["result_df"].to_dict(orient="list")
38
 
39
  report = Report()
40
 
41
  report.affects = Affects(
42
  developer=[],
43
- deployer=["Hugging Face"] if DATASET["source"] == "HuggingFace Hub" else [],
44
  artifacts=[Artifact(type=ArtifactTypeEnum.dataset, name=dataset_id)],
45
  )
46
  report.problemtype = Problemtype(
@@ -60,13 +58,14 @@ def generate_avid_report():
60
  report.references = (
61
  [
62
  Reference(
63
- label="""{dataset_id} on Hugging Face""".format(model_id=dataset_id),
64
- url="""https://huggingface.co/{dataset_id}""".format(
 
65
  dataset_id=dataset_id
66
  ),
67
  )
68
  ]
69
- if DATASET["source"] == "HuggingFace Hub"
70
  else []
71
  )
72
  report.description = LangValue(
@@ -87,16 +86,16 @@ def generate_avid_report():
87
 
88
 
89
  def evaluate():
90
- if DATASET["methodology"] == "GenBiT (Microsoft Gender Bias Tool)":
91
- DATASET["sampling_size"] = min(DATASET["sampling_size"], 100)
92
 
93
  print(
94
- f"Dataset : {DATASET['name']}\n"
95
- f"Source : {DATASET['source']}\n"
96
- f"Sampling Method : {DATASET['sampling_method']}\n"
97
- f"Sampling Size : {DATASET['sampling_size']}\n"
98
- f"Column : {DATASET['column']}\n"
99
- f"Methodology : {DATASET['methodology']}\n"
100
  f"Time Taken : ",
101
  end="",
102
  )
@@ -104,21 +103,21 @@ def evaluate():
104
  try:
105
  start = timeit.default_timer()
106
 
107
- data = DATASET["df"].copy()
108
- data = data[[DATASET["column"]]]
109
 
110
- if DATASET["sampling_method"] == "First":
111
- data = data.head(DATASET["sampling_size"])
112
- elif DATASET["sampling_method"] == "Last":
113
- data = data.tail(DATASET["sampling_size"])
114
- elif DATASET["sampling_method"] == "Random":
115
- data = data.sample(n=DATASET["sampling_size"], random_state=42)
116
 
117
  result_df, result_plot = globals()[
118
- METHODOLOGIES.get(DATASET["methodology"]).get("fx")
119
  ](data)
120
 
121
- DATASET["result_df"] = result_df
122
 
123
  stop = timeit.default_timer()
124
 
@@ -141,20 +140,19 @@ def evaluate():
141
 
142
 
143
  def load_dataset(local_dataset, hf_dataset):
144
- DATASET["name"] = (
145
- os.path.splitext(os.path.basename(local_dataset.name))[0]
146
- if local_dataset
147
- else hf_dataset
148
- )
149
- DATASET["source"] = "Local Dataset" if local_dataset else "HuggingFace Hub"
150
- DATASET["df"] = (
151
- pd.read_csv(local_dataset.name)
152
- if local_dataset
153
- else hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()
154
- )
155
-
156
- columns = DATASET["df"].select_dtypes(include=["object"]).columns.tolist()
157
- column_corpus = DATASET["df"][columns[0]].tolist()[:5]
158
 
159
  dataset_sampling_method = gr.Radio(
160
  label="Scope",
@@ -167,10 +165,10 @@ def load_dataset(local_dataset, hf_dataset):
167
 
168
  dataset_sampling_size = gr.Slider(
169
  label=f"Number of Entries",
170
- info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
171
  minimum=1,
172
- maximum=min(DATASET["df"].shape[0], MAX_THRESHOLD),
173
- value=min(DATASET["df"].shape[0], MAX_THRESHOLD),
174
  visible=True,
175
  interactive=True,
176
  )
@@ -204,44 +202,27 @@ def load_dataset(local_dataset, hf_dataset):
204
  )
205
 
206
 
207
- def show_hf_dataset_search_results(hf_dataset):
208
- choices = [
209
- dataset.id
210
- for dataset in list_datasets(
211
- filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
212
- )
213
- ]
214
-
215
- return (
216
- gr.Button(
217
- value=f"Load",
218
- interactive=True,
219
- variant="secondary",
220
- visible=True,
221
- ),
222
- gr.Radio(
223
- label="HuggingFace Hub Search Results",
224
- info="Select the dataset to be imported",
225
- choices=choices,
226
- value=choices[0],
227
- interactive=True,
228
- visible=True,
229
- ),
230
- )
231
-
232
-
233
  def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
234
- DATASET["sampling_method"] = dataset_sampling_method
235
- DATASET["sampling_size"] = dataset_sampling_size
236
- DATASET["column"] = dataset_column
237
 
238
  return (
239
  gr.Markdown(
240
- f"## Results (Dataset: {'✅' if DATASET['name'] else '❎'}) (Methodology: {'✅' if DATASET['methodology'] else '❎'})"
 
 
 
 
 
241
  ),
242
  gr.Button(
243
  value="Evaluate",
244
- interactive=(True if DATASET["name"] and DATASET["methodology"] else False),
 
 
 
 
245
  variant="primary",
246
  visible=True,
247
  ),
@@ -249,11 +230,16 @@ def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_colum
249
 
250
 
251
  def import_methodology(methodology):
252
- DATASET["methodology"] = methodology
253
 
254
  return (
255
  gr.Markdown(
256
- f"## Results (Dataset: {'✅' if DATASET['name'] else '❎'}) (Methodology: {'✅' if DATASET['methodology'] else '❎'})"
 
 
 
 
 
257
  ),
258
  gr.Markdown(
259
  METHODOLOGIES[methodology]["description"],
@@ -261,7 +247,11 @@ def import_methodology(methodology):
261
  ),
262
  gr.Button(
263
  value="Evaluate",
264
- interactive=(True if DATASET["name"] and DATASET["methodology"] else False),
 
 
 
 
265
  variant="primary",
266
  visible=True,
267
  ),
@@ -330,7 +320,9 @@ with BiasAware:
330
  methodology_description = gr.Markdown(visible=False)
331
 
332
  with gr.Column(scale=2):
333
- result_title = gr.Markdown("## Results (Dataset: ❎) (Methodology: ❎)")
 
 
334
 
335
  evaluation_btn = gr.Button(
336
  value="Evaluate",
@@ -361,7 +353,7 @@ with BiasAware:
361
  gr.Textbox(
362
  label="HuggingFace Hub",
363
  placeholder="Search for a dataset",
364
- value="amazon_multi",
365
  interactive=True,
366
  visible=True,
367
  )
@@ -399,7 +391,12 @@ with BiasAware:
399
  )
400
 
401
  hf_dataset.submit(
402
- fn=show_hf_dataset_search_results,
 
 
 
 
 
403
  inputs=[hf_dataset],
404
  outputs=[dataset_load_btn],
405
  )
@@ -419,7 +416,7 @@ with BiasAware:
419
  dataset_column.input(
420
  fn=lambda column: gr.Dataframe(
421
  value=pd.DataFrame(
422
- {f"{column}": DATASET["df"][column].tolist()[:5]},
423
  ),
424
  visible=True,
425
  ),
@@ -456,3 +453,35 @@ with BiasAware:
456
 
457
  if __name__ == "__main__":
458
  BiasAware.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import os
3
  import timeit
4
+ from datetime import date
5
+
6
  import gradio as gr
7
  import pandas as pd
8
+ from avidtools.datamodels.components import *
9
+ from avidtools.datamodels.enums import *
10
+ from avidtools.datamodels.report import Report
11
+ from datasets import load_dataset as hf_load_dataset
12
 
13
  from scripts.genbit import *
 
14
  from scripts.gender_distribution import *
15
+ from scripts.gender_profession_bias import *
16
 
17
+ SAMPLING_SIZE_THRESHOLD = 2000
18
+ METHODOLOGIES = json.load(open("config/methodologies.json", "r"))
 
 
 
 
 
 
 
19
 
20
+ EVALUATION = {
21
+ "dataset_id": None,
22
  "source": None,
23
  "df": None,
24
  "sampling_method": None,
 
30
 
31
 
32
  def generate_avid_report():
33
+ dataset_id = EVALUATION["dataset_id"]
34
+ methodology = EVALUATION["methodology"]
35
+ result_json = EVALUATION["result_df"].to_dict(orient="list")
36
 
37
  report = Report()
38
 
39
  report.affects = Affects(
40
  developer=[],
41
+ deployer=["Hugging Face"] if EVALUATION["source"] == "HuggingFace Hub" else [],
42
  artifacts=[Artifact(type=ArtifactTypeEnum.dataset, name=dataset_id)],
43
  )
44
  report.problemtype = Problemtype(
 
58
  report.references = (
59
  [
60
  Reference(
61
+ type="",
62
+ label="""{dataset_id} on Hugging Face""".format(dataset_id=dataset_id),
63
+ url="""https://huggingface.co/datasets/{dataset_id}""".format(
64
  dataset_id=dataset_id
65
  ),
66
  )
67
  ]
68
+ if EVALUATION["source"] == "HuggingFace Hub"
69
  else []
70
  )
71
  report.description = LangValue(
 
86
 
87
 
88
  def evaluate():
89
+ if EVALUATION["methodology"] == "GenBiT (Microsoft Gender Bias Tool)":
90
+ EVALUATION["sampling_size"] = min(EVALUATION["sampling_size"], 100)
91
 
92
  print(
93
+ f"Dataset : {EVALUATION['dataset_id']}\n"
94
+ f"Source : {EVALUATION['source']}\n"
95
+ f"Sampling Method : {EVALUATION['sampling_method']}\n"
96
+ f"Sampling Size : {EVALUATION['sampling_size']}\n"
97
+ f"Column : {EVALUATION['column']}\n"
98
+ f"Methodology : {EVALUATION['methodology']}\n"
99
  f"Time Taken : ",
100
  end="",
101
  )
 
103
  try:
104
  start = timeit.default_timer()
105
 
106
+ data = EVALUATION["df"].copy()
107
+ data = data[[EVALUATION["column"]]]
108
 
109
+ if EVALUATION["sampling_method"] == "First":
110
+ data = data.head(EVALUATION["sampling_size"])
111
+ elif EVALUATION["sampling_method"] == "Last":
112
+ data = data.tail(EVALUATION["sampling_size"])
113
+ elif EVALUATION["sampling_method"] == "Random":
114
+ data = data.sample(n=EVALUATION["sampling_size"], random_state=42)
115
 
116
  result_df, result_plot = globals()[
117
+ METHODOLOGIES.get(EVALUATION["methodology"]).get("fx")
118
  ](data)
119
 
120
+ EVALUATION["result_df"] = result_df
121
 
122
  stop = timeit.default_timer()
123
 
 
140
 
141
 
142
  def load_dataset(local_dataset, hf_dataset):
143
+ if local_dataset:
144
+ EVALUATION["dataset_id"] = os.path.splitext(
145
+ os.path.basename(local_dataset.name)
146
+ )[0]
147
+ EVALUATION["source"] = "Local Dataset"
148
+ EVALUATION["df"] = pd.read_csv(local_dataset.name)
149
+ else:
150
+ EVALUATION["dataset_id"] = hf_dataset
151
+ EVALUATION["source"] = "HuggingFace Hub"
152
+ EVALUATION["df"] = hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()
153
+
154
+ columns = EVALUATION["df"].select_dtypes(include=["object"]).columns.tolist()
155
+ column_corpus = EVALUATION["df"][columns[0]].tolist()[:5]
 
156
 
157
  dataset_sampling_method = gr.Radio(
158
  label="Scope",
 
165
 
166
  dataset_sampling_size = gr.Slider(
167
  label=f"Number of Entries",
168
+ info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {SAMPLING_SIZE_THRESHOLD}.",
169
  minimum=1,
170
+ maximum=min(EVALUATION["df"].shape[0], SAMPLING_SIZE_THRESHOLD),
171
+ value=min(EVALUATION["df"].shape[0], SAMPLING_SIZE_THRESHOLD),
172
  visible=True,
173
  interactive=True,
174
  )
 
202
  )
203
 
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
206
+ EVALUATION["sampling_method"] = dataset_sampling_method
207
+ EVALUATION["sampling_size"] = dataset_sampling_size
208
+ EVALUATION["column"] = dataset_column
209
 
210
  return (
211
  gr.Markdown(
212
+ "## Results (Dataset: {}{}) (Methodology: {}{})".format(
213
+ "\u2705" if EVALUATION["dataset_id"] else "\u274E",
214
+ "",
215
+ "\u2705" if EVALUATION["methodology"] else "\u274E",
216
+ "",
217
+ )
218
  ),
219
  gr.Button(
220
  value="Evaluate",
221
+ interactive=(
222
+ True
223
+ if EVALUATION["dataset_id"] and EVALUATION["methodology"]
224
+ else False
225
+ ),
226
  variant="primary",
227
  visible=True,
228
  ),
 
230
 
231
 
232
  def import_methodology(methodology):
233
+ EVALUATION["methodology"] = methodology
234
 
235
  return (
236
  gr.Markdown(
237
+ "## Results (Dataset: {}{}) (Methodology: {}{})".format(
238
+ "\u2705" if EVALUATION["dataset_id"] else "\u274E",
239
+ "",
240
+ "\u2705" if EVALUATION["methodology"] else "\u274E",
241
+ "",
242
+ )
243
  ),
244
  gr.Markdown(
245
  METHODOLOGIES[methodology]["description"],
 
247
  ),
248
  gr.Button(
249
  value="Evaluate",
250
+ interactive=(
251
+ True
252
+ if EVALUATION["dataset_id"] and EVALUATION["methodology"]
253
+ else False
254
+ ),
255
  variant="primary",
256
  visible=True,
257
  ),
 
320
  methodology_description = gr.Markdown(visible=False)
321
 
322
  with gr.Column(scale=2):
323
+ result_title = gr.Markdown(
324
+ "## Results (Dataset: \u274E) (Methodology: \u274E)"
325
+ )
326
 
327
  evaluation_btn = gr.Button(
328
  value="Evaluate",
 
353
  gr.Textbox(
354
  label="HuggingFace Hub",
355
  placeholder="Search for a dataset",
356
+ value="imdb",
357
  interactive=True,
358
  visible=True,
359
  )
 
391
  )
392
 
393
  hf_dataset.submit(
394
+ fn=lambda _: gr.Button(
395
+ value=f"Load",
396
+ interactive=True,
397
+ variant="secondary",
398
+ visible=True,
399
+ ),
400
  inputs=[hf_dataset],
401
  outputs=[dataset_load_btn],
402
  )
 
416
  dataset_column.input(
417
  fn=lambda column: gr.Dataframe(
418
  value=pd.DataFrame(
419
+ {f"{column}": EVALUATION["df"][column].tolist()[:5]},
420
  ),
421
  visible=True,
422
  ),
 
453
 
454
  if __name__ == "__main__":
455
  BiasAware.launch()
456
+
457
+
458
+ if __name__ == "__main__":
459
+ BiasAware.launch()
460
+
461
+
462
+ if __name__ == "__main__":
463
+ BiasAware.launch()
464
+
465
+
466
+ if __name__ == "__main__":
467
+ BiasAware.launch()
468
+
469
+
470
+ if __name__ == "__main__":
471
+ BiasAware.launch()
472
+
473
+
474
+ if __name__ == "__main__":
475
+ BiasAware.launch()
476
+
477
+
478
+ if __name__ == "__main__":
479
+ BiasAware.launch()
480
+
481
+
482
+ if __name__ == "__main__":
483
+ BiasAware.launch()
484
+
485
+
486
+ if __name__ == "__main__":
487
+ BiasAware.launch()