import logging
from os import mkdir
from os.path import isdir
from os.path import join as pjoin
from pathlib import Path

import streamlit as st

from data_measurements_clusters import Clustering

title = "Dataset Exploration"
description = "Comparison of hate speech detection datasets"
date = "2022-01-26"
thumbnail = "images/books.png"
__COLLECT = """
In order to turn observations of the world into data, choices must be made
about what counts as data, where to collect data, and how to collect it.
When collecting language data, this often means selecting websites where
samples of text can be collected easily; hate speech data is frequently
collected from social media platforms like Twitter or from discussion forums
such as Wikipedia talk pages. Each of these decisions results in a specific
sample of all the possible observations.
"""
__ANNOTATE = """
Once the data is collected, further decisions must be made about how to
label the data if the data is being used to train a classification system,
as is common in hate speech detection. These labels must be defined in order
for the dataset to be consistently labeled, which helps the classification
model produce more consistent output. This labeling process, called
*annotation*, can be done by the data collectors, by a set of trained
annotators with relevant expert knowledge, or by online crowdworkers. Who
is doing the annotating has a significant effect on the resulting set of
labels ([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
"""
__STANDARDIZE = """
Hate speech detection is a relatively new task in NLP, and the definitions
used across different projects vary. Some projects target just hate speech,
but others may label their data for ‘toxic’, ‘offensive’, or ‘abusive’
language. Still others may address related problems such as bullying and
harassment. This variation makes it difficult to compare across datasets and
their respective models. As these modeling paradigms become more established,
definitions grounded in relevant sociological research will need to be agreed
upon in order for datasets and models in ACM to appropriately capture the
problems in the world that they set out to address. For more on this
discussion, see
[Madukwe et al., 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
[Fortuna et al., 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
"""
__HOW_TO = """
To use the tool, select a dataset. The tool will then show clusters of
examples in the dataset that have been automatically determined to be similar
to one another. Below that, you can see specific examples within the cluster,
the labels for those examples, and the distribution of labels within the
cluster. Note that cluster 0 will always be the full dataset.
"""
DSET_OPTIONS = {
    "classla/FRENK-hate-en": {
        "binary": {
            "train": {
                ("text",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "classla/FRENK-hate-en",
                                    "config_name": "binary",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "tweets_hate_speech_detection": {
        "default": {
            "train": {
                ("tweet",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "tweets_hate_speech_detection",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("tweet",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "ucberkeley-dlab/measuring-hate-speech": {
        "default": {
            "train": {
                ("text",): {
                    "hatespeech": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "ucberkeley-dlab/measuring-hate-speech",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "hatespeech",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
}
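# For example, the leaf configuration for the FRENK dataset above can be reached as:
#   DSET_OPTIONS["classla/FRENK-hate-en"]["binary"]["train"][("text",)]["label"][
#       100000]["sentence-transformers/all-mpnet-base-v2"]["tree"]
# run_article() below reaches the same leaf generically, by repeatedly taking the
# first (and only) value at each level until it finds a dict containing "dataset_name".
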
def download_tree(args):
    clusters = Clustering(**args)
    return clusters
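# Clustering presumably loads or computes the full cluster tree for the selected
# configuration, which can be slow on reruns. As a possible refinement (a sketch,
# not part of the code above), the call could be memoized with Streamlit's caching:
#
#   @st.cache(allow_output_mutation=True)  # or st.cache_resource on newer Streamlit
#   def download_tree(args):
#       return Clustering(**args)
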
def run_article():
    st.markdown("# Making a Hate Speech Dataset")
    st.markdown("## Collecting observations of the world")
    with st.expander("Collection"):
        st.markdown(__COLLECT, unsafe_allow_html=True)
    st.markdown("## Annotating observations with task labels")
    with st.expander("Annotation"):
        st.markdown(__ANNOTATE, unsafe_allow_html=True)
    st.markdown("## Standardizing the task")
    with st.expander("Standardization"):
        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
    st.markdown("# Exploring datasets")
    with st.expander("How to use the tool"):
        st.markdown(__HOW_TO, unsafe_allow_html=True)

    choose_dset = st.selectbox(
        "Select dataset to visualize",
        DSET_OPTIONS,
    )
    # Walk down the nested options until the leaf dict of Clustering arguments is reached.
    pre_args = DSET_OPTIONS[choose_dset]
    args = pre_args
    while "dataset_name" not in args:
        args = list(args.values())[0]
    clustering = download_tree(args)

    st.markdown("---\n")
    full_tree_fig = clustering.get_full_tree()
    st.plotly_chart(full_tree_fig, use_container_width=True)

    st.markdown("---\n")
    show_node = st.selectbox(
        "Visualize cluster node:",
        range(len(clustering.node_list)),
    )
    st.markdown(
        f"Node {show_node} has {clustering.node_list[show_node]['weight']} examples."
    )
    st.markdown(
        f"Node {show_node} was merged at {clustering.node_list[show_node]['merged_at']:.2f}."
    )
    examplars = clustering.get_node_examplars(show_node)

    st.markdown("---\n")
    label_fig = clustering.get_node_label_chart(show_node)
    examplars_col, labels_col = st.columns([2, 1])
    examplars_col.markdown("#### Node cluster exemplars")
    examplars_col.table(examplars)
    labels_col.markdown("#### Node cluster labels")
    labels_col.plotly_chart(label_fig, use_container_width=True)
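
# run_article() is presumably imported and called by a wrapper page in the Space,
# with the title/description/date/thumbnail attributes used as page metadata. To
# try this file on its own with `streamlit run`, an explicit entry point would be
# needed, e.g.:
#
#   if __name__ == "__main__":
#       run_article()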