import logging
from os import mkdir
from os.path import isdir
from os.path import join as pjoin
from pathlib import Path

import streamlit as st

from data_measurements_clusters import Clustering

title = "Dataset Exploration"
description = "Comparison of hate speech detection datasets"
date = "2022-01-26"
thumbnail = "images/books.png"
__COLLECT = """
In order to turn observations of the world into data, choices must be made
about what counts as data, where to collect data, and how to collect it.
When collecting language data, this often means selecting websites where
samples of text can be collected easily; hate speech data is frequently
collected from social media platforms like Twitter or from discussion forums
such as Wikipedia talk pages. Each of these decisions results in a specific
sample of all the possible observations.
"""
__ANNOTATE = """
Once the data is collected, further decisions must be made about how to
label the data if the data is being used to train a classification system,
as is common in hate speech detection. These labels must be defined in order
for the dataset to be consistently labeled, which helps the classification
model produce more consistent output. This labeling process, called
*annotation*, can be done by the data collectors, by a set of trained
annotators with relevant expert knowledge, or by online crowdworkers. Who
is doing the annotating has a significant effect on the resulting set of
labels ([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
"""
__STANDARDIZE = """
Hate speech detection is a relatively new task in NLP, and the definitions
used across different projects vary. Some projects target just hate speech,
but others may label their data for ‘toxic’, ‘offensive’, or ‘abusive’
language. Still others may address related problems such as bullying and
harassment. This variation makes it difficult to compare across datasets and
their respective models. As these modeling paradigms become more established,
definitions grounded in relevant sociological research will need to be agreed
upon in order for datasets and models in ACM to appropriately capture the
problems in the world that they set out to address. For more on this
discussion, see
[Madukwe et al., 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
[Fortuna et al., 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
"""
__HOW_TO = """
To use the tool, select a dataset. The tool will then show clusters of
examples in the dataset that have been automatically determined to be similar
to one another. Below that, you can see specific examples within the cluster,
the labels for those examples, and the distribution of labels within the
cluster. Note that cluster 0 will always be the full dataset.
"""
DSET_OPTIONS = {
    "classla/FRENK-hate-en": {
        "binary": {
            "train": {
                ("text",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "classla/FRENK-hate-en",
                                    "config_name": "binary",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "tweets_hate_speech_detection": {
        "default": {
            "train": {
                ("tweet",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "tweets_hate_speech_detection",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("tweet",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "ucberkeley-dlab/measuring-hate-speech": {
        "default": {
            "train": {
                ("text",): {
                    "hatespeech": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "ucberkeley-dlab/measuring-hate-speech",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "hatespeech",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
}
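# For example, the leaf configuration for the FRENK dataset above can be reached as:
#   DSET_OPTIONS["classla/FRENK-hate-en"]["binary"]["train"][("text",)]["label"][
#       100000]["sentence-transformers/all-mpnet-base-v2"]["tree"]
# run_article() below reaches the same leaf generically, by repeatedly taking the
# first (and only) value at each level until it finds a dict containing "dataset_name".
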
def download_tree(args):
    clusters = Clustering(**args)
    return clusters
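# Clustering presumably loads or computes the full cluster tree for the selected
# configuration, which can be slow on reruns. As a possible refinement (a sketch,
# not part of the code above), the call could be memoized with Streamlit's caching:
#
#   @st.cache(allow_output_mutation=True)  # or st.cache_resource on newer Streamlit
#   def download_tree(args):
#       return Clustering(**args)
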
def run_article():
    st.markdown("# Making a Hate Speech Dataset")
    st.markdown("## Collecting observations of the world")
    with st.expander("Collection"):
        st.markdown(__COLLECT, unsafe_allow_html=True)
    st.markdown("## Annotating observations with task labels")
    with st.expander("Annotation"):
        st.markdown(__ANNOTATE, unsafe_allow_html=True)
    st.markdown("## Standardizing the task")
    with st.expander("Standardization"):
        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
    st.markdown("# Exploring datasets")
    with st.expander("How to use the tool"):
        st.markdown(__HOW_TO, unsafe_allow_html=True)

    choose_dset = st.selectbox(
        "Select dataset to visualize",
        DSET_OPTIONS,
    )
    # Walk down the nested options until the leaf dict of Clustering arguments is reached.
    pre_args = DSET_OPTIONS[choose_dset]
    args = pre_args
    while "dataset_name" not in args:
        args = list(args.values())[0]
    clustering = download_tree(args)

    st.markdown("---\n")
    full_tree_fig = clustering.get_full_tree()
    st.plotly_chart(full_tree_fig, use_container_width=True)

    st.markdown("---\n")
    show_node = st.selectbox(
        "Visualize cluster node:",
        range(len(clustering.node_list)),
    )
    st.markdown(
        f"Node {show_node} has {clustering.node_list[show_node]['weight']} examples."
    )
    st.markdown(
        f"Node {show_node} was merged at {clustering.node_list[show_node]['merged_at']:.2f}."
    )
    examplars = clustering.get_node_examplars(show_node)

    st.markdown("---\n")
    label_fig = clustering.get_node_label_chart(show_node)
    examplars_col, labels_col = st.columns([2, 1])
    examplars_col.markdown("#### Node cluster exemplars")
    examplars_col.table(examplars)
    labels_col.markdown("#### Node cluster labels")
    labels_col.plotly_chart(label_fig, use_container_width=True)
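
# run_article() is presumably imported and called by a wrapper page in the Space,
# with the title/description/date/thumbnail attributes used as page metadata. To
# try this file on its own with `streamlit run`, an explicit entry point would be
# needed, e.g.:
#
#   if __name__ == "__main__":
#       run_article()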