|
import logging |
|
from os import mkdir |
|
from os.path import isdir |
|
from os.path import join as pjoin |
|
from pathlib import Path |
|
|
|
import streamlit as st |
|
|
|
from data_measurements_clusters import Clustering |
|
|
|
# Page-level metadata. None of these names is read by the code visible in
# this file -- presumably they are consumed by whatever imports this module
# to list/display the page (TODO confirm against the importing app).
title = "Dataset Exploration"
description = "Comparison of hate speech detection datasets"
date = "2022-01-26"  # ISO-8601 (YYYY-MM-DD) date string
thumbnail = "images/books.png"  # relative path; base directory not shown here
|
|
|
# Markdown body rendered inside the "Collection" expander by run_article().
# This is runtime text shown to the user; edit wording with care.
__COLLECT = """
In order to turn observations of the world into data, choices must be made
about what counts as data, where to collect data, and how to collect data.
When collecting language data, this often means selecting websites that allow
for easily collecting samples of text, and hate speech data is frequently
collected from social media platforms like Twitter or forums like Wikipedia.
Each of these decisions results in a specific sample of all the possible
observations.
"""
|
|
|
# Markdown body rendered inside the "Annotation" expander by run_article().
# Contains an inline markdown link to Sap et al., 2019.
__ANNOTATE = """
Once the data is collected, further decisions must be made about how to
label the data if the data is being used to train a classification system,
as is common in hate speech detection. These labels must be defined in order
for the dataset to be consistently labeled, which helps the classification
model produce more consistent output. This labeling process, called
*annotation*, can be done by the data collectors, by a set of trained
annotators with relevant expert knowledge, or by online crowdworkers. Who
is doing the annotating has a significant effect on the resulting set of
labels ([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
"""
|
|
|
# Markdown body rendered inside the "Standardization" expander by
# run_article().
# NOTE(review): the acronym "ACM" is never expanded in the text visible in
# this file -- presumably "automatic content moderation"; confirm and consider
# spelling it out for readers who land on this page directly.
__STANDARDIZE = """
As a relatively new task in NLP, the definitions that are used across
different projects vary. Some projects target just hate speech, but others
may label their data for ‘toxic’, ‘offensive’, or ‘abusive’ language. Still
others may address related problems such as bullying and harassment.
This variation makes it difficult to compare across datasets and their
respective models. As these modeling paradigms become more established,
definitions grounded in relevant sociological research will need to be
agreed upon in order for datasets and models in ACM to appropriately
capture the problems in the world that they set out to address. For more
on this discussion, see
[Madukwe et al 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
[Fortuna et al 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
"""
|
|
|
# Markdown body rendered inside the "How to use the tool" expander by
# run_article(); describes the dataset selector and the cluster views that
# follow it on the page.
__HOW_TO = """
To use the tool, select a dataset. The tool will then show clusters of
examples in the dataset that have been automatically determined to be similar
to one another. Below that, you can see specific examples within the cluster,
the labels for those examples, and the distribution of labels within the
cluster. Note that cluster 0 will always be the full dataset.
"""
|
|
|
# Fields of a leaf argument dict, in the order their values are used as
# successive nesting keys of DSET_OPTIONS (outermost first).
_DSET_KEY_PATH = (
    "dataset_name",
    "config_name",
    "split_name",
    "input_field_path",
    "label_name",
    "num_rows",
    "model_name",
    "file_name",
)


def _dset_entry(dataset_name, config_name, text_field, label_name):
    """Return ``(dataset_name, subtree)`` for one DSET_OPTIONS entry.

    The innermost dict (the "leaf") holds the keyword arguments for one
    dataset; every level above it is a single-key dict keyed by the leaf's
    own value for the corresponding field in ``_DSET_KEY_PATH``.
    """
    leaf = {
        "dataset_name": dataset_name,
        "config_name": config_name,
        "split_name": "train",
        "input_field_path": (text_field,),
        "label_name": label_name,
        "num_rows": 100000,
        "model_name": "sentence-transformers/all-mpnet-base-v2",
        "file_name": "tree",
    }
    subtree = leaf
    # Wrap from the inside out; the dataset name itself is the top-level key
    # and is returned separately, so it is skipped here.
    for field in reversed(_DSET_KEY_PATH[1:]):
        subtree = {leaf[field]: subtree}
    return dataset_name, subtree


# Selectable datasets, keyed by dataset name. Each value is a chain of
# single-key dicts ending in the leaf that contains "dataset_name";
# run_article() walks down to that leaf and passes it to download_tree().
DSET_OPTIONS = dict(
    _dset_entry(*spec)
    for spec in (
        ("classla/FRENK-hate-en", "binary", "text", "label"),
        ("tweets_hate_speech_detection", "default", "tweet", "label"),
        ("ucberkeley-dlab/measuring-hate-speech", "default", "text", "hatespeech"),
    )
)
|
|
|
@st.cache(allow_output_mutation=True)
def download_tree(args):
    """Construct the ``Clustering`` object described by ``args``.

    Args:
        args: Mapping unpacked as keyword arguments to ``Clustering``
            (in this file, a leaf dict taken from ``DSET_OPTIONS``).

    Returns:
        The newly built ``Clustering`` instance.

    The function is wrapped in ``st.cache(allow_output_mutation=True)``.
    NOTE(review): presumably this lets Streamlit reuse the object across
    script reruns and skip its mutation check on the cached value -- confirm
    against the installed Streamlit version.
    """
    return Clustering(**args)
|
|
|
|
|
def run_article():
    """Render the article text followed by the interactive cluster explorer.

    The page is drawn top to bottom with Streamlit:

    1. Three headed sections (collection, annotation, standardization), each
       with its explanatory markdown tucked inside an expander.
    2. A "How to use the tool" expander and a dataset selector.
    3. The full cluster tree figure for the chosen dataset.
    4. A node selector, two summary lines for the chosen node, and a
       two-column layout with the node's examplars table (wide column) and
       its label chart (narrow column).

    Takes no arguments and returns ``None``; all output goes through ``st``.
    """
    st.markdown("# Making a Hate Speech Dataset")
    st.markdown("## Collecting observations of the world")
    with st.expander("Collection"):
        st.markdown(__COLLECT, unsafe_allow_html=True)
    st.markdown("## Annotating observations with task labels")
    with st.expander("Annotation"):
        st.markdown(__ANNOTATE, unsafe_allow_html=True)
    st.markdown("## Standardizing the task")
    with st.expander("Standardization"):
        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
    st.markdown("# Exploring datasets")
    with st.expander("How to use the tool"):
        st.markdown(__HOW_TO, unsafe_allow_html=True)

    # Offer the dataset names (the top-level keys) explicitly rather than
    # relying on the widget iterating the dict.
    choose_dset = st.selectbox(
        "Select dataset to visualize",
        list(DSET_OPTIONS),
    )

    # Each DSET_OPTIONS entry is a chain of nested dicts; follow the first
    # value at every level until reaching the dict that carries the
    # Clustering keyword arguments (identified by its "dataset_name" key).
    args = DSET_OPTIONS[choose_dset]
    while "dataset_name" not in args:
        args = next(iter(args.values()))

    clustering = download_tree(args)

    st.markdown("---\n")

    full_tree_fig = clustering.get_full_tree()
    st.plotly_chart(full_tree_fig, use_container_width=True)

    st.markdown("---\n")
    show_node = st.selectbox(
        "Visualize cluster node:",
        range(len(clustering.node_list)),
    )
    # Look the selected node up once; both summary lines read from it.
    node = clustering.node_list[show_node]
    st.markdown(f"Node {show_node} has {node['weight']} examples.")
    st.markdown(f"Node {show_node} was merged at {node['merged_at']:.2f}.")
    examplars = clustering.get_node_examplars(show_node)
    st.markdown("---\n")

    label_fig = clustering.get_node_label_chart(show_node)
    # Examplars get twice the width of the label chart.
    examplars_col, labels_col = st.columns([2, 1])
    examplars_col.markdown("#### Node cluster examplars")
    examplars_col.table(examplars)
    labels_col.markdown("#### Node cluster labels")
    labels_col.plotly_chart(label_fig, use_container_width=True)
|
|