ExploreACMnaacl / posts /dataset_exploration.py
Yacine Jernite
black
0a10cf1
import logging
from os import mkdir
from os.path import isdir
from os.path import join as pjoin
from pathlib import Path
import streamlit as st
from data_measurements_clusters import Clustering
# Post metadata. Nothing in the visible part of this file reads these names.
# NOTE(review): presumably consumed by whatever app lists/loads the posts
# (title, blurb, publication date, thumbnail path) -- confirm against the loader.
title = "Dataset Exploration"
description = "Comparison of hate speech detection datasets"
date = "2022-01-26"
thumbnail = "images/books.png"
# Markdown body rendered by run_article() inside the "Collection" expander.
__COLLECT = """
In order to turn observations of the world into data, choices must be made
about what counts as data, where to collect data, and how to collect data.
When collecting language data, this often means selecting websites that allow
for easily collecting samples of text, and hate speech data is frequently
collected from social media platforms like Twitter or forums like Wikipedia.
Each of these decisions results in a specific sample of all the possible
observations.
"""
# Markdown body rendered by run_article() inside the "Annotation" expander.
__ANNOTATE = """
Once the data is collected, further decisions must be made about how to
label the data if the data is being used to train a classification system,
as is common in hate speech detection. These labels must be defined in order
for the dataset to be consistently labeled, which helps the classification
model produce more consistent output. This labeling process, called
*annotation*, can be done by the data collectors, by a set of trained
annotators with relevant expert knowledge, or by online crowdworkers. Who
is doing the annotating has a significant effect on the resulting set of
labels ([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
"""
# Markdown body rendered by run_article() inside the "Standardization" expander.
__STANDARDIZE = """
As a relatively new task in NLP, the definitions that are used across
different projects vary. Some projects target just hate speech, but others
may label their data for ‘toxic’, ‘offensive’, or ‘abusive’ language. Still
others may address related problems such as bullying and harassment.
This variation makes it difficult to compare across datasets and their
respective models. As these modeling paradigms become more established,
definitions grounded in relevant sociological research will need to be
agreed upon in order for datasets and models in ACM to appropriately
capture the problems in the world that they set out to address. For more
on this discussion, see
[Madukwe et al 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
[Fortuna et al 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
"""
# Markdown body rendered by run_article() inside the "How to use the tool"
# expander, just above the dataset selector.
__HOW_TO = """
To use the tool, select a dataset. The tool will then show clusters of
examples in the dataset that have been automatically determined to be similar
to one another. Below that, you can see specific examples within the cluster,
the labels for those examples, and the distribution of labels within the
cluster. Note that cluster 0 will always be the full dataset.
"""
def _tree_args(
    dataset_name,
    config_name,
    input_field_path,
    label_name,
    *,
    split_name="train",
    num_rows=100000,
    model_name="sentence-transformers/all-mpnet-base-v2",
    file_name="tree",
):
    """Return the leaf dict describing one cluster tree.

    The leaf is the dict that ``run_article`` eventually hands to
    ``download_tree``, which expands it into ``Clustering(**args)``.

    The key order of the returned dict is significant: ``_build_dset_options``
    nests ``DSET_OPTIONS`` by these values, in exactly this order.
    """
    return {
        "dataset_name": dataset_name,
        "config_name": config_name,
        "split_name": split_name,
        "input_field_path": input_field_path,
        "label_name": label_name,
        "num_rows": num_rows,
        "model_name": model_name,
        "file_name": file_name,
    }


def _build_dset_options(leaves):
    """Nest every leaf under one dict level per leaf value, in key order.

    For a leaf with values ``v1, ..., vN`` the result satisfies
    ``options[v1][v2]...[vN] is leaf``. Leaves that share a prefix of values
    share the corresponding intermediate dicts instead of overwriting them.
    """
    options = {}
    for leaf in leaves:
        *path, last_key = leaf.values()
        level = options
        for key in path:
            level = level.setdefault(key, {})
        level[last_key] = leaf
    return options


# Nested lookup table of the available cluster trees:
#   dataset -> config -> split -> input field path -> label -> num rows
#   -> embedding model -> file name -> leaf dict of Clustering kwargs.
# The nesting keys are derived from each leaf, so they cannot drift apart.
# Insertion order is the order the datasets appear in the selector.
DSET_OPTIONS = _build_dset_options(
    [
        _tree_args("classla/FRENK-hate-en", "binary", ("text",), "label"),
        _tree_args("tweets_hate_speech_detection", "default", ("tweet",), "label"),
        _tree_args(
            "ucberkeley-dlab/measuring-hate-speech",
            "default",
            ("text",),
            "hatespeech",
        ),
    ]
)
@st.cache(allow_output_mutation=True)
def download_tree(args):
    """Build the ``Clustering`` object for one dataset configuration.

    ``args`` is a mapping that is expanded into keyword arguments for
    ``Clustering``. The call is wrapped in ``st.cache`` with
    ``allow_output_mutation=True``.

    NOTE(review): the function name suggests ``Clustering`` fetches a
    pre-computed tree -- confirm in ``data_measurements_clusters``.
    """
    return Clustering(**args)
def run_article():
    """Render the article text followed by the interactive cluster explorer.

    The page is emitted top to bottom in call order:

    1. Three explanatory sections (collection, annotation, standardization),
       each with its text inside a collapsed expander.
    2. A dataset selector fed by the top-level keys of ``DSET_OPTIONS``.
    3. The full cluster tree figure for the chosen dataset.
    4. A node selector, two summary lines for that node, and a two-column
       layout with the node's exemplars table and its label chart.
    """
    st.markdown("# Making a Hate Speech Dataset")
    st.markdown("## Collecting observations of the world")
    with st.expander("Collection"):
        st.markdown(__COLLECT, unsafe_allow_html=True)
    st.markdown("## Annotating observations with task labels")
    with st.expander("Annotation"):
        st.markdown(__ANNOTATE, unsafe_allow_html=True)
    st.markdown("## Standardizing the task")
    with st.expander("Standardization"):
        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
    st.markdown("# Exploring datasets")
    with st.expander("How to use the tool"):
        st.markdown(__HOW_TO, unsafe_allow_html=True)

    # Offer the dataset names explicitly (the dict's keys, in insertion order).
    choose_dset = st.selectbox(
        "Select dataset to visualize",
        list(DSET_OPTIONS),
    )

    # DSET_OPTIONS is nested several levels deep; follow the first entry at
    # each level until reaching the leaf dict, recognised by "dataset_name".
    args = DSET_OPTIONS[choose_dset]
    while "dataset_name" not in args:
        args = next(iter(args.values()))
    clustering = download_tree(args)

    st.markdown("---\n")
    full_tree_fig = clustering.get_full_tree()
    st.plotly_chart(full_tree_fig, use_container_width=True)
    st.markdown("---\n")

    # Nodes are addressed by their index in clustering.node_list.
    show_node = st.selectbox(
        "Visualize cluster node:",
        range(len(clustering.node_list)),
    )
    node = clustering.node_list[show_node]
    st.markdown(f"Node {show_node} has {node['weight']} examples.")
    st.markdown(f"Node {show_node} was merged at {node['merged_at']:.2f}.")

    examplars = clustering.get_node_examplars(show_node)
    st.markdown("---\n")
    label_fig = clustering.get_node_label_chart(show_node)

    # Exemplars get twice the width of the label chart.
    examplars_col, labels_col = st.columns([2, 1])
    examplars_col.markdown("#### Node cluster examplars")
    examplars_col.table(examplars)
    labels_col.markdown("#### Node cluster labels")
    labels_col.plotly_chart(label_fig, use_container_width=True)