|
import json |
|
import gradio as gr |
|
import pandas as pd |
|
|
|
from scripts.genbit import * |
|
from scripts.gender_profession_bias import * |
|
from scripts.gender_distribution import * |
|
|
|
from datasets import load_dataset as hf_load_dataset |
|
from huggingface_hub import DatasetFilter, list_datasets |
|
|
|
# Maximum number of rows a single evaluation run may analyze.
MAX_THRESHOLD = 1000

# Mapping of methodology name -> {"fx": <function name>, "description": ...},
# loaded once at import time. Use a context manager so the file handle is
# closed (the original `json.load(open(...))` leaked it).
with open("config/methodologies.json", "r") as _methodologies_file:
    METHODOLOGIES = json.load(_methodologies_file)

# Mutable module-level state describing the dataset currently under analysis.
# Populated incrementally by load_dataset / import_dataset / import_methodology.
DATASET = {
    "name": None,             # display name of the dataset
    "source": None,           # "Local Dataset" or "HuggingFace Hub"
    "df": None,               # full pandas DataFrame
    "sampling_method": None,  # "First" | "Last" | "Random"
    "sampling_size": None,    # number of rows to analyze
    "column": None,           # text column selected for analysis
    "methodology": None,      # selected methodology name
}
|
|
|
|
|
def evaluate():
    """Run the selected methodology over the sampled dataset column.

    Reads all configuration from the module-level ``DATASET`` dict and returns
    a ``(Markdown, Plot, Dataframe)`` triple of gradio component updates. On
    any failure the three components are returned hidden instead.
    """
    print(
        f"Dataset : {DATASET['name']}\n"
        f"Source : {DATASET['source']}\n"
        f"Sampling Method : {DATASET['sampling_method']}\n"
        f"Sampling Size : {DATASET['sampling_size']}\n"
        f"Column : {DATASET['column']}\n"
        f"Methodology : {DATASET['methodology']}\n"
    )

    try:
        # Work on a copy restricted to the selected text column.
        data = DATASET["df"].copy()
        data = data[[DATASET["column"]]]

        # Narrow the data to the requested sample.
        if DATASET["sampling_method"] == "First":
            data = data.head(DATASET["sampling_size"])
        elif DATASET["sampling_method"] == "Last":
            data = data.tail(DATASET["sampling_size"])
        elif DATASET["sampling_method"] == "Random":
            # Fixed seed so repeated evaluations are reproducible.
            data = data.sample(n=DATASET["sampling_size"], random_state=42)

        # Resolve the methodology's entry-point function by name and call it.
        result_df, result_plot, result_description = globals()[
            METHODOLOGIES.get(DATASET["methodology"]).get("fx")
        ](data)

        return (
            # Bug fix: show the methodology's actual result description
            # instead of the "lorem ipsum" placeholder, which silently
            # discarded result_description.
            gr.Markdown(f"### Result Summary\n\n{result_description}", visible=True),
            gr.Plot(result_plot, visible=True),
            gr.Dataframe(result_df, visible=True),
        )
    except Exception as e:
        # Best-effort UI: log the error and hide the result components rather
        # than surfacing a traceback to the user.
        print(e)
        return (
            gr.Markdown(visible=False),
            gr.Plot(visible=False),
            gr.Dataframe(visible=False),
        )
|
|
|
|
|
def load_dataset(local_dataset, hf_dataset):
    """Load a dataset from a local CSV upload or the HuggingFace Hub.

    Populates the module-level ``DATASET`` dict (name, source, df) and returns
    updated gradio components (sampling method/size, column selector, column
    preview, import button) so the user can configure the analysis.

    Args:
        local_dataset: gradio File value for an uploaded CSV, or None.
        hf_dataset: name of the selected HuggingFace Hub dataset; used only
            when no local file was provided.

    Raises:
        ValueError: if the dataset contains no text (object-dtype) columns.
    """
    # A local upload takes precedence over the Hub selection.
    if local_dataset:
        DATASET["name"] = local_dataset.name.split("/")[-1].split(".")[0]
        DATASET["source"] = "Local Dataset"
        DATASET["df"] = pd.read_csv(local_dataset.name)
    else:
        DATASET["name"] = hf_dataset
        DATASET["source"] = "HuggingFace Hub"
        # Only the first 100 rows are fetched to keep the download small.
        DATASET["df"] = hf_load_dataset(hf_dataset, split="train[0:100]").to_pandas()

    # Candidate columns for analysis are the text (object-dtype) ones.
    columns = DATASET["df"].select_dtypes(include=["object"]).columns.tolist()
    if not columns:
        # Robustness fix: fail with a clear message instead of a bare
        # IndexError on columns[0] when the dataset has no text columns.
        raise ValueError("The dataset does not contain any text columns to analyze.")
    # Preview: first 5 entries of the first text column.
    column_corpus = DATASET["df"][columns[0]].tolist()[:5]

    dataset_sampling_method = gr.Radio(
        label="Scope",
        info="Determines the scope of the dataset to be analyzed",
        choices=["First", "Last", "Random"],
        value="First",
        visible=True,
        interactive=True,
    )

    dataset_sampling_size = gr.Slider(
        label="Number of Entries",
        info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {MAX_THRESHOLD}.",
        minimum=1,
        # Cap both the range and the default at the dataset size / threshold.
        maximum=min(DATASET["df"].shape[0], MAX_THRESHOLD),
        value=min(DATASET["df"].shape[0], MAX_THRESHOLD),
        visible=True,
        interactive=True,
    )

    dataset_column = gr.Radio(
        label="Column",
        info="Determines the column to be analyzed. These are the columns with text data.",
        choices=columns,
        value=columns[0],
        visible=True,
        interactive=True,
    )

    dataset_column_corpus = gr.Dataframe(
        value=pd.DataFrame({columns[0]: column_corpus}), visible=True
    )

    dataset_import_btn = gr.Button(
        value="Import",
        interactive=True,
        variant="primary",
        visible=True,
    )

    return (
        dataset_sampling_method,
        dataset_sampling_size,
        dataset_column,
        dataset_column_corpus,
        dataset_import_btn,
    )
|
|
|
|
|
def show_hf_dataset_search_results(hf_dataset):
    """Search the HuggingFace Hub for English datasets matching the query.

    Args:
        hf_dataset: free-text query entered by the user.

    Returns:
        Updated (load button, search-results radio) gradio components; at
        most 10 matching dataset ids are offered.
    """
    choices = [
        dataset.id
        for dataset in list_datasets(
            filter=DatasetFilter(dataset_name=hf_dataset, language="en"), limit=10
        )
    ]

    return (
        gr.Button(
            value="Load",
            interactive=True,
            variant="secondary",
            visible=True,
        ),
        gr.Radio(
            label="HuggingFace Hub Search Results",
            info="Select the dataset to be imported",
            choices=choices,
            # Bug fix: guard against an empty result list, which previously
            # raised IndexError on choices[0].
            value=choices[0] if choices else None,
            interactive=True,
            visible=True,
        ),
    )
|
|
|
|
|
def import_dataset(dataset_sampling_method, dataset_sampling_size, dataset_column):
    """Persist the user's sampling and column choices, then retitle the panel.

    Stores the selections in the module-level DATASET dict and returns an
    updated Markdown header showing the imported dataset's name.
    """
    DATASET.update(
        sampling_method=dataset_sampling_method,
        sampling_size=dataset_sampling_size,
        column=dataset_column,
    )

    return gr.Markdown(f"## Dataset (`{DATASET['name']}`)")
|
|
|
|
|
def import_methodology(methodology):
    """Record the chosen methodology and reveal its panel.

    Returns updated (title, description, evaluate button) components: the
    title and description become visible and the Evaluate button is enabled.
    """
    DATASET["methodology"] = methodology

    title_update = gr.Markdown(
        f"## Methodology (`{methodology}`)",
        visible=True,
    )
    description_update = gr.Markdown(
        METHODOLOGIES[methodology]["description"],
        visible=True,
    )
    evaluate_update = gr.Button(
        value="Evaluate",
        interactive=True,
        variant="primary",
        visible=True,
    )

    return (title_update, description_update, evaluate_update)
|
|
|
|
|
# Top-level Gradio app: a three-column layout (dataset import, methodology
# selection, results) followed by the event wiring between components.
BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")

with BiasAware:
    # App header / introduction.
    gr.Markdown(
        """
    # BiasAware: Dataset Bias Detection

    BiasAware is a specialized tool for detecting and quantifying biases within datasets used for Natural Language Processing (NLP) tasks. NLP training datasets frequently mirror the inherent biases of their source materials, resulting in AI models that unintentionally perpetuate stereotypes, exhibit underrepresentation, and showcase skewed perspectives.
    """
    )

    with gr.Row():
        # Column 1: dataset import controls (local CSV or HuggingFace Hub).
        with gr.Column(scale=1):
            dataset_title = gr.Markdown("## Dataset")

            dataset_import_type = gr.Radio(
                label="Import Type",
                info="Determines the mode of importing the dataset",
                choices=["Local Dataset", "HuggingFace Hub"],
                value="Local Dataset",
            )

            local_dataset = gr.File(
                label="Dataset", file_types=["csv"], value=None, visible=True
            )
            hf_dataset = gr.Textbox(visible=False)
            hf_dataset_search_results = gr.Radio(visible=False)

            # Hidden placeholders; revealed by the handlers wired below once
            # a dataset has been selected/loaded.
            dataset_load_btn = gr.Button(visible=False)
            dataset_import_btn = gr.Button(visible=False)

            dataset_sampling_method = gr.Radio(visible=False)
            dataset_sampling_size = gr.Slider(visible=False)
            dataset_column = gr.Radio(visible=False)
            dataset_column_corpus = gr.Dataframe(visible=False)

        # Column 2: methodology selection.
        with gr.Column(scale=2):
            methodology_title = gr.Markdown("## Methodology")

            methodology = gr.Radio(
                label="Methodology",
                info="Determines the methodology to be used for bias detection",
                choices=METHODOLOGIES.keys(),
            )

            # Disabled until a methodology is chosen (see methodology.input).
            evaluation_btn = gr.Button(
                value="Evaluate",
                interactive=False,
                variant="primary",
                visible=True,
            )

            methodology_description = gr.Markdown(visible=False)

        # Column 3: evaluation results.
        with gr.Column(scale=2):
            result_title = gr.Markdown("## Results")

            result_description = gr.Markdown(visible=False)
            result_plot = gr.Plot(show_label=False, container=False, visible=False)
            result_df = gr.DataFrame(visible=False)

            # NOTE(review): this button is wired to fn=None below, so it is
            # a no-op — presumably a placeholder for future AVID submission.
            submit_to_avid_btn = gr.Button(
                value="Submit to AVID",
                interactive=False,
                variant="primary",
            )

    # Switching import mode toggles the file picker / Hub search box and
    # hides any controls that belong to a previously loaded dataset.
    dataset_import_type.input(
        fn=lambda import_type: (
            gr.File(label="Dataset", file_types=["csv"], value=None, visible=True)
            if import_type == "Local Dataset"
            else gr.Textbox(visible=False),
            gr.Textbox(
                label="HuggingFace Hub",
                placeholder="Search for a dataset",
                value="amazon_multi",
                interactive=True,
                visible=True,
            )
            if import_type == "HuggingFace Hub"
            else gr.File(value=None, visible=False),
            gr.Button(visible=False),
            gr.Radio(visible=False),
            gr.Slider(visible=False),
            gr.Radio(visible=False),
            gr.Dataframe(visible=False),
            gr.Button(visible=False),
        ),
        inputs=[dataset_import_type],
        outputs=[
            local_dataset,
            hf_dataset,
            dataset_load_btn,
            dataset_sampling_method,
            dataset_sampling_size,
            dataset_column,
            dataset_column_corpus,
            dataset_import_btn,
        ],
    )

    # Reveal the Load button once a local CSV has been uploaded.
    local_dataset.upload(
        fn=lambda _: gr.Button(
            value=f"Load",
            interactive=True,
            variant="secondary",
            visible=True,
        ),
        inputs=[local_dataset],
        outputs=[dataset_load_btn],
    )

    # Pressing Enter in the Hub search box runs the Hub search.
    hf_dataset.submit(
        fn=show_hf_dataset_search_results,
        inputs=[hf_dataset],
        outputs=[dataset_load_btn, hf_dataset_search_results],
    )

    # Load the chosen dataset and reveal the sampling/column controls.
    dataset_load_btn.click(
        fn=load_dataset,
        inputs=[local_dataset, hf_dataset_search_results],
        outputs=[
            dataset_sampling_method,
            dataset_sampling_size,
            dataset_column,
            dataset_column_corpus,
            dataset_import_btn,
        ],
    )

    # Preview the first 5 entries of the newly selected column.
    dataset_column.input(
        fn=lambda column: gr.Dataframe(
            value=pd.DataFrame(
                {f"{column}": DATASET["df"][column].tolist()[:5]},
            ),
            visible=True,
        ),
        inputs=[dataset_column],
        outputs=[dataset_column_corpus],
    )

    # Persist sampling/column choices into DATASET and retitle the panel.
    dataset_import_btn.click(
        fn=import_dataset,
        inputs=[
            dataset_sampling_method,
            dataset_sampling_size,
            dataset_column,
        ],
        outputs=[dataset_title],
    )

    # Record the methodology, show its description, and enable Evaluate.
    methodology.input(
        fn=import_methodology,
        inputs=[methodology],
        outputs=[methodology_title, methodology_description, evaluation_btn],
    )

    # Run the evaluation and populate the results column.
    evaluation_btn.click(
        fn=evaluate, inputs=None, outputs=[result_description, result_plot, result_df]
    )

    # Placeholder wiring: no handler attached yet.
    submit_to_avid_btn.click(
        fn=None,
        inputs=None,
        outputs=None,
    )


BiasAware.launch()
|
|