"""Streamlit article: interactive exploration of hate speech detection datasets.

Renders background prose on how hate speech datasets are collected, annotated,
and standardized, then an interactive clustering visualization for a dataset
chosen by the user. Entry point is `run_article()`.
"""
import logging
from os import mkdir
from os.path import isdir
from os.path import join as pjoin
from pathlib import Path

import streamlit as st

from data_measurements_clusters import Clustering

# Page metadata consumed by the hosting app.
title = "Dataset Exploration"
description = "Comparison of hate speech detection datasets"
date = "2022-01-26"
thumbnail = "images/books.png"

# Markdown bodies for the expandable background sections below.
__COLLECT = """
In order to turn observations of the world into data, choices must be made
about what counts as data, where to collect data, and how to collect data.
When collecting language data, this often means selecting websites that allow
for easily collecting samples of text, and hate speech data is frequently
collected from social media platforms like Twitter or forums like Wikipedia.
Each of these decisions results in a specific sample of all the possible
observations.
"""

__ANNOTATE = """
Once the data is collected, further decisions must be made about how to label
the data if the data is being used to train a classification system, as is
common in hate speech detection. These labels must be defined in order for
the dataset to be consistently labeled, which helps the classification model
produce more consistent output. This labeling process, called *annotation*,
can be done by the data collectors, by a set of trained annotators with
relevant expert knowledge, or by online crowdworkers. Who is doing the
annotating has a significant effect on the resulting set of labels
([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
"""

__STANDARDIZE = """
As a relatively new task in NLP, the definitions that are used across
different projects vary. Some projects target just hate speech, but others
may label their data for ‘toxic’, ‘offensive’, or ‘abusive’ language. Still
others may address related problems such as bullying and harassment. This
variation makes it difficult to compare across datasets and their respective
models.
As these modeling paradigms become more established, definitions grounded in
relevant sociological research will need to be agreed upon in order for
datasets and models in ACM to appropriately capture the problems in the world
that they set out to address. For more on this discussion, see
[Madukwe et al 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
[Fortuna et al 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
"""

__HOW_TO = """
To use the tool, select a dataset. The tool will then show clusters of
examples in the dataset that have been automatically determined to be similar
to one another. Below that, you can see specific examples within the cluster,
the labels for those examples, and the distribution of labels within the
cluster. Note that cluster 0 will always be the full dataset.
"""

# Nested mapping of dataset -> config -> split -> input field -> label field
# -> row count -> embedding model -> file name, terminating in the kwargs
# dict handed to `Clustering`. `run_article` walks down the single-child
# levels until it reaches the leaf containing "dataset_name".
DSET_OPTIONS = {
    "classla/FRENK-hate-en": {
        "binary": {
            "train": {
                ("text",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "classla/FRENK-hate-en",
                                    "config_name": "binary",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "tweets_hate_speech_detection": {
        "default": {
            "train": {
                ("tweet",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "tweets_hate_speech_detection",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("tweet",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "ucberkeley-dlab/measuring-hate-speech": {
        "default": {
            "train": {
                ("text",): {
                    "hatespeech": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "ucberkeley-dlab/measuring-hate-speech",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "hatespeech",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
}


# NOTE(review): `st.cache` is deprecated in newer Streamlit releases in favor
# of `st.cache_resource` — confirm the pinned Streamlit version before
# migrating.
@st.cache(allow_output_mutation=True)
def download_tree(args):
    """Build (and cache) a `Clustering` from the leaf kwargs of DSET_OPTIONS.

    Args:
        args: dict of keyword arguments accepted by `Clustering`
            (dataset_name, config_name, split_name, ...).

    Returns:
        The constructed `Clustering` instance.
    """
    return Clustering(**args)


def run_article():
    """Render the full article page: prose sections plus the cluster explorer."""
    st.markdown("# Making a Hate Speech Dataset")
    st.markdown("## Collecting observations of the world")
    with st.expander("Collection"):
        st.markdown(__COLLECT, unsafe_allow_html=True)
    st.markdown("## Annotating observations with task labels")
    with st.expander("Annotation"):
        st.markdown(__ANNOTATE, unsafe_allow_html=True)
    st.markdown("## Standardizing the task")
    with st.expander("Standardization"):
        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
    st.markdown("# Exploring datasets")
    with st.expander("How to use the tool"):
        st.markdown(__HOW_TO, unsafe_allow_html=True)
    choose_dset = st.selectbox(
        "Select dataset to visualize",
        DSET_OPTIONS,
    )
    # Each level of DSET_OPTIONS has exactly one child; descend until we reach
    # the leaf kwargs dict (identified by its "dataset_name" key).
    args = DSET_OPTIONS[choose_dset]
    while "dataset_name" not in args:
        args = next(iter(args.values()))
    clustering = download_tree(args)
    st.markdown("---\n")
    full_tree_fig = clustering.get_full_tree()
    st.plotly_chart(full_tree_fig, use_container_width=True)
    st.markdown("---\n")
    show_node = st.selectbox(
        "Visualize cluster node:",
        range(len(clustering.node_list)),
    )
    st.markdown(
        f"Node {show_node} has {clustering.node_list[show_node]['weight']} examples."
    )
    st.markdown(
        f"Node {show_node} was merged at {clustering.node_list[show_node]['merged_at']:.2f}."
    )
    examplars = clustering.get_node_examplars(show_node)
    st.markdown("---\n")
    label_fig = clustering.get_node_label_chart(show_node)
    examplars_col, labels_col = st.columns([2, 1])
    examplars_col.markdown("#### Node cluster examplars")
    examplars_col.table(examplars)
    labels_col.markdown("#### Node cluster labels")
    labels_col.plotly_chart(label_fig, use_container_width=True)