Spaces:

huggingface
/

data-measurements-tool

Build error

data-measurements-tool / data_measurements /streamlit_utils.py

Ezi Ozoani

try fix

386b032 about 1 year ago

22.4 kB

	# Copyright 2021 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import statistics

	import json
	import pandas as pd
	import seaborn as sns
	import streamlit as st
	from st_aggrid import AgGrid, GridOptionsBuilder

	from .dataset_utils import HF_DESC_FIELD, HF_FEATURE_FIELD, HF_LABEL_FIELD
	# Module-level Streamlit configuration, applied once at import time.
	st.set_option('deprecation.showPyplotGlobalUse', False)
	# Index of the pre-computed caches, loaded once at import time. The widgets in
	# this module read it as [dataset][config]["-"-joined text field] -> splits.
	json_file_path = "cache_dir/has_cache.json"
	with open(json_file_path, "r", encoding="utf-8") as j:
	    HAS_CACHE = json.load(j)
	# Alias so that code referring to the private name `_HAS_CACHE` resolves to the
	# same object instead of raising NameError.
	_HAS_CACHE = HAS_CACHE

	def sidebar_header():
	    """Write the tool's introductory blurb into the Streamlit sidebar."""
	    intro_markdown = """
	This demo showcases the [dataset measures as we develop them](https://huggingface.co/blog/data-measurements-tool).
	Right now this has a few pre-loaded datasets for which you can:
	- view some general statistics about the text vocabulary, lengths, labels
	- explore some distributional statistics to assess properties of the language
	- view some comparison statistics and overview of the text distribution

	The tool is in development, and will keep growing in utility and functionality 🤗🚧
	"""
	    st.sidebar.markdown(intro_markdown, unsafe_allow_html=True)


	def sidebar_selection(ds_name_to_dict, column_id):
	    """Draw the sidebar pickers and return the user's dataset selection.

	    The dataset, configuration, text field and split choices are all taken
	    from the cache index loaded at module level (``HAS_CACHE``).

	    :param ds_name_to_dict: nested mapping indexed here as
	        ``[dataset][config][HF_FEATURE_FIELD][HF_LABEL_FIELD]`` to find labels.
	    :param column_id: suffix appended to every widget label.
	    :return: dict with the keys ``dset_name``, ``dset_config``, ``split_name``,
	        ``text_field``, ``label_field`` and ``label_names``.
	    """
	    ds_names = list(HAS_CACHE.keys())
	    # Pre-select the usual default when it is cached; otherwise fall back to
	    # the first entry instead of letting list.index() raise ValueError.
	    default_ds_index = (
	        ds_names.index("hate_speech18") if "hate_speech18" in ds_names else 0
	    )
	    with st.sidebar.expander(f"Choose dataset and field {column_id}", expanded=True):
	        # choose a dataset to analyze
	        ds_name = st.selectbox(
	            f"Choose dataset to explore{column_id}:",
	            ds_names,
	            index=default_ds_index,
	        )
	        # choose a config to analyze; the cache index decides what is offered
	        ds_configs = ds_name_to_dict[ds_name]
	        config_names = list(HAS_CACHE[ds_name].keys())
	        config_name = st.selectbox(
	            f"Choose configuration{column_id}:",
	            config_names,
	            index=0,
	        )
	        # TODO: Handling for multiple text features
	        # Cached text fields are "-"-joined strings; split them into tuples.
	        text_features = [
	            tuple(cached_field.split("-"))
	            for cached_field in HAS_CACHE[ds_name][config_name]
	        ]
	        # TODO @yacine: Explain what this is doing and why eg tp[0] could = "id"
	        text_field = st.selectbox(
	            f"Which text feature from the{column_id} dataset would you like to analyze?",
	            [("text",)]
	            if ds_name == "c4"
	            else [tp for tp in text_features if tp[0] != "id"],
	        )
	        # Choose a split and dataset size
	        avail_splits = list(
	            HAS_CACHE[ds_name][config_name]["-".join(text_field)].keys()
	        )
	        # 12.Nov note: Removing "test" because those should not be examined
	        # without discussion of pros and cons, which we haven't done yet.
	        if "test" in avail_splits:
	            avail_splits.remove("test")
	        split = st.selectbox(
	            f"Which split from the{column_id} dataset would you like to analyze?",
	            avail_splits,
	            index=0,
	        )
	    # Use the first declared label feature, or empty placeholders if none.
	    label_features = ds_configs[config_name][HF_FEATURE_FIELD][HF_LABEL_FIELD]
	    if len(label_features) > 0:
	        label_field, label_names = label_features[0]
	    else:
	        label_field, label_names = (), []
	    return {
	        "dset_name": ds_name,
	        "dset_config": config_name,
	        "split_name": split,
	        "text_field": text_field,
	        "label_field": label_field,
	        "label_names": label_names,
	    }


	def expander_header(dstats, ds_name_to_dict, column_id):
	    """Show the dataset description and the ``dset_peek`` table in an expander.

	    :param dstats: object providing ``dset_name``, ``dset_config``, ``dset_peek``.
	    :param ds_name_to_dict: mapping indexed as ``[dataset][config][HF_DESC_FIELD]``.
	    :param column_id: suffix appended to the expander title.
	    """
	    title = f"Dataset Description{column_id}"
	    with st.expander(title):
	        config_info = ds_name_to_dict[dstats.dset_name][dstats.dset_config]
	        st.markdown(config_info[HF_DESC_FIELD])
	        st.dataframe(dstats.dset_peek)


	def expander_general_stats(dstats, column_id):
	    """Render the "General Text Statistics" expander.

	    Shows word totals, the top-vocabulary table, the missing-value count and
	    the duplicate count read from ``dstats``. When ``dstats.total_words`` is 0
	    only a "check back later" notice is shown.

	    :param dstats: object holding the statistics attributes read below.
	    :param column_id: suffix appended to the expander title.
	    """
	    with st.expander(f"General Text Statistics{column_id}"):
	        st.caption(
	            "Use this widget to check whether the terms you see most represented"
	            " in the dataset make sense for the goals of the dataset."
	        )
	        # Guard clause: nothing to report when the word count is zero.
	        if dstats.total_words == 0:
	            st.markdown("Eh oh...not finding the file I need. 😭 Probably it will be there soon. 🤞 Check back later!")
	            return
	        st.markdown(f"There are {dstats.total_words!s} total words")
	        st.markdown(
	            f"There are {dstats.total_open_words!s} words after removing closed "
	            "class words"
	        )
	        st.markdown(
	            "The most common "
	            "[open class words](https://dictionary.apa.org/open-class-words) "
	            "and their counts are: "
	        )
	        st.dataframe(dstats.sorted_top_vocab_df)
	        st.markdown(
	            f"There are {dstats.text_nan_count!s} missing values in the dataset."
	        )
	        if dstats.dedup_total > 0:
	            duplicates_msg = (
	                f"There are {dstats.dedup_total!s} duplicate items in the dataset. "
	                "For more information about the duplicates, "
	                "click the 'Duplicates' tab below."
	            )
	        else:
	            duplicates_msg = "There are 0 duplicate items in the dataset. "
	        st.markdown(duplicates_msg)


	### Show the label distribution from the datasets
	def expander_label_distribution(fig_labels, column_id):
	    """Render the label-distribution chart, or a notice when there is none.

	    :param fig_labels: figure passed to ``st.plotly_chart``, or ``None``.
	    :param column_id: suffix appended to the expander title.
	    """
	    title = f"Label Distribution{column_id}"
	    with st.expander(title, expanded=False):
	        st.caption(
	            "Use this widget to see how balanced the labels in your dataset are."
	        )
	        if fig_labels is None:
	            st.markdown("No labels were found in the dataset")
	        else:
	            st.plotly_chart(fig_labels, use_container_width=True)


	def expander_text_lengths(dstats, column_id):
	    """Render the "Text Lengths" expander.

	    Shows the length-distribution figure, the average and standard deviation
	    of the lengths and, when ``dstats.length_df`` is available, a picker that
	    lists the rows of a chosen length.

	    :param dstats: object providing ``fig_tok_length_png``, ``fig_tok_length``,
	        ``avg_length``, ``std_length`` and ``length_df``.
	    :param column_id: suffix used in the expander title and the widget key.
	    """
	    _TEXT_LENGTH_CAPTION = (
	        "Use this widget to identify outliers, particularly suspiciously long outliers."
	    )
	    with st.expander(f"Text Lengths{column_id}", expanded=False):
	        st.caption(_TEXT_LENGTH_CAPTION)
	        st.markdown(
	            "Below, you can see how the lengths of the text instances in your dataset are distributed."
	        )
	        st.markdown(
	            "Any unexpected peaks or valleys in the distribution may help to identify instances you want to remove or augment."
	        )
	        st.markdown(
	            "### Here is the relative frequency of different text lengths in your dataset:"
	        )
	        # Try the PNG first and fall back to the figure object. Catch
	        # Exception rather than everything so that KeyboardInterrupt and
	        # SystemExit are not swallowed by the fallback.
	        try:
	            st.image(dstats.fig_tok_length_png)
	        except Exception:
	            st.pyplot(dstats.fig_tok_length, use_container_width=True)
	        st.markdown(
	            "The average length of text instances is **"
	            + str(dstats.avg_length)
	            + " words, with a standard deviation of "
	            + str(dstats.std_length)
	            + "**."
	        )
	        # This is quite a large file and is breaking our ability to navigate the app development.
	        # Just passing if it's not already there for launch v0
	        if dstats.length_df is not None:
	            start_id_show_lengths = st.selectbox(
	                "Show examples of length:",
	                sorted(dstats.length_df["length"].unique().tolist()),
	                key=f"select_show_length_{column_id}",
	            )
	            st.table(
	                dstats.length_df[
	                    dstats.length_df["length"] == start_id_show_lengths
	                ].set_index("length")
	            )


	### Third, use a sentence embedding model
	def expander_text_embeddings(
	    text_dset, fig_tree, node_list, embeddings, text_field, column_id
	):
	    """Render the "Text Embedding Clusters" expander.

	    Shows the cluster tree, optionally looks up the clusters closest to a text
	    typed by the user, and lists the examples of a selected node.

	    :param text_dset: indexed as ``text_dset[example_id][text_field]``; may be
	        ``None``, in which case no source text is shown.
	    :param fig_tree: figure passed to ``st.plotly_chart``.
	    :param node_list: sequence of nodes; each has a ``"sorted_examples_centroid"``
	        list of ``(example_id, similarity)`` pairs.
	    :param embeddings: object providing ``cached_clusters`` and
	        ``find_cluster_beam(text, beam_size=...)``.
	    :param text_field: key of the text column inside a ``text_dset`` row.
	    :param column_id: suffix used in widget labels and keys.
	    """
	    with st.expander(f"Text Embedding Clusters{column_id}", expanded=False):
	        _EMBEDDINGS_CAPTION = """
	### Hierarchical Clustering of Text Fields
	Taking in the diversity of text represented in a dataset can be
	challenging when it is made up of hundreds to thousands of sentences.
	Grouping these text items based on a measure of similarity can help
	users gain some insights into their distribution.
	The following figure shows a hierarchical clustering of the text fields
	in the dataset based on a
	[Sentence-Transformer](https://hf.co/sentence-transformers/all-mpnet-base-v2)
	model. Clusters are merged if any of the embeddings in cluster A has a
	dot product with any of the embeddings or with the centroid of cluster B
	higher than a threshold (one threshold per level, from 0.5 to 0.95).
	To explore the clusters, you can:
	- hover over a node to see the 5 most representative examples (deduplicated)
	- enter an example in the text box below to see which clusters it is most similar to
	- select a cluster by ID to show all of its examples
	"""
	        st.markdown(_EMBEDDINGS_CAPTION)
	        st.plotly_chart(fig_tree, use_container_width=True)
	        st.markdown("---\n")
	        # Default node for the picker below unless a search selects another.
	        show_node_default = len(node_list) // 2
	        if st.checkbox(
	            label="Enter text to see nearest clusters",
	            key=f"search_clusters_{column_id}",
	        ):
	            compare_example = st.text_area(
	                label="Enter some text here to see which of the clusters in the dataset it is closest to",
	                key=f"search_cluster_input_{column_id}",
	            )
	            if compare_example != "":
	                # Run the beam search only on a cache miss; as a dict.get()
	                # default it would be evaluated even when the text is cached.
	                if compare_example in embeddings.cached_clusters:
	                    paths_to_leaves = embeddings.cached_clusters[compare_example]
	                else:
	                    paths_to_leaves = embeddings.find_cluster_beam(
	                        compare_example, beam_size=50
	                    )
	                clusters_intro = ""
	                if paths_to_leaves[0][1] < 0.3:
	                    clusters_intro += (
	                        "Warning: no close clusters found (best score <0.3). "
	                    )
	                clusters_intro += "The closest clusters to the text entered above are:"
	                st.markdown(clusters_intro)
	                for path, score in paths_to_leaves[:5]:
	                    if text_dset is None:
	                        # No source text to quote; still report the match.
	                        st.write(f"Cluster {path[-1]:5d} \\| Score: {score:.3f}")
	                        continue
	                    example = text_dset[
	                        node_list[path[-1]]["sorted_examples_centroid"][0][0]
	                    ][text_field][:256]
	                    # "\\|" keeps the literal backslash-pipe of the original
	                    # text without relying on an invalid escape sequence.
	                    st.write(
	                        f"Cluster {path[-1]:5d} \\| Score: {score:.3f} \n Example: {example}"
	                    )
	                # Last node on the best-scoring path becomes the default.
	                show_node_default = paths_to_leaves[0][0][-1]
	        st.markdown("---\n")
	        if text_dset is None:
	            st.markdown("Missing source text to show, check back later!")
	        else:
	            show_node = st.selectbox(
	                f"Choose a leaf node to explore in the{column_id} dataset:",
	                range(len(node_list)),
	                index=show_node_default,
	            )
	            node = node_list[show_node]
	            start_id = st.slider(
	                f"Show closest sentences in cluster to the centroid{column_id} starting at index:",
	                0,
	                len(node["sorted_examples_centroid"]) - 5,
	                value=0,
	                step=5,
	            )
	            for sid, sim in node["sorted_examples_centroid"][start_id : start_id + 5]:
	                # only show the first 4 lines and the first 10000 characters
	                show_text = text_dset[sid][text_field][:10000]
	                show_text = "\n".join(show_text.split("\n")[:4])
	                st.text(f"{sim:.3f} \t {show_text}")


	### Then, show duplicates
	def expander_text_duplicates(dstats, column_id):
	    """Render the "Text Duplicates" expander.

	    Lists ``dstats.dup_counts_df`` with a fresh index, or a notice when that
	    table is ``None`` or empty.

	    :param dstats: object providing ``dup_counts_df``.
	    :param column_id: suffix appended to the expander title.
	    """
	    # TODO: Saving/loading figure
	    title = f"Text Duplicates{column_id}"
	    with st.expander(title, expanded=False):
	        st.caption(
	            "Use this widget to identify text strings that appear more than once."
	        )
	        st.markdown(
	            "A model's training and testing may be negatively affected by unwarranted duplicates ([Lee et al., 2021](https://arxiv.org/abs/2107.06499))."
	        )
	        st.markdown("------")
	        st.write(
	            "### Here is the list of all the duplicated items and their counts in your dataset:"
	        )
	        has_duplicates = not (
	            dstats.dup_counts_df is None or dstats.dup_counts_df.empty
	        )
	        if has_duplicates:
	            st.dataframe(dstats.dup_counts_df.reset_index(drop=True))
	        else:
	            st.write("There are no duplicates in this dataset! 🥳")


	def expander_npmi_description(min_vocab):
	    """Write the explanatory text that introduces the nPMI widget.

	    :param min_vocab: minimum number of occurrences quoted in the text; it is
	        interpolated into the message as-is.
	    """
	    _NPMI_CAPTION = (
	        "Use this widget to identify problematic biases and stereotypes in your data."
	    )
	    _NPMI_CAPTION1 = """
	nPMI scores for a word help to identify potentially
	problematic associations, ranked by how close the association is."""
	    _NPMI_CAPTION2 = """
	nPMI bias scores for paired words help to identify how word
	associations are skewed between the selected words
	([Aka et al., 2021](https://arxiv.org/abs/2103.03417)).
	"""

	    st.caption(_NPMI_CAPTION)
	    st.markdown(_NPMI_CAPTION1)
	    st.markdown(_NPMI_CAPTION2)
	    st.markdown(" ")
	    # f-string instead of "%s" % min_vocab, which misbehaves for tuple values.
	    st.markdown(
	        "You can select from gender and sexual orientation "
	        f"identity terms that appear in the dataset at least {min_vocab} "
	        "times."
	    )
	    st.markdown(
	        "The resulting ranked words are those that co-occur with both "
	        "identity terms. "
	    )
	    st.markdown(
	        "The more positive the score, the more associated the word is with the first identity term. "
	        "The more negative the score, the more associated the word is with the second identity term."
	    )


	### Finally, show Zipf stuff
	def expander_zipf(z, zipf_fig, column_id):
	    """Render the "Vocabulary Distribution: Zipf's Law Fit" expander.

	    Shows explanatory text, a small table with the fit results, a one-sentence
	    summary, the fit figure and, where applicable, warnings about the fit.
	    Any error while rendering is replaced by an "Under construction" notice.

	    :param z: fit result providing ``alpha``, ``distance`` and ``xmin``.
	    :param zipf_fig: figure passed to ``st.plotly_chart``.
	    :param column_id: suffix for the expander title; also used as the index
	        name of the results table.
	    """
	    with st.expander(
	        f"Vocabulary Distribution{column_id}: Zipf's Law Fit", expanded=False
	    ):
	        try:
	            _ZIPF_CAPTION = """This shows how close the observed language is to an ideal
	natural language distribution following [Zipf's law](https://en.wikipedia.org/wiki/Zipf%27s_law),
	calculated by minimizing the [Kolmogorov-Smirnov (KS) statistic](https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test)."""

	            # The bold span covers the three fitted values; "beyond which" is
	            # italicised. The markers are balanced so Markdown renders them.
	            zipf_summary = (
	                "The optimal alpha based on this dataset is: **"
	                + str(round(z.alpha, 2))
	                + ", with a KS distance of: "
	                + str(round(z.distance, 2))
	                + ". This was fit with a minimum rank value of: "
	                + str(int(z.xmin))
	                + "**, which is the optimal rank *beyond which* the scaling regime of the power law fits best."
	            )

	            alpha_warning = "Your alpha value is a bit on the high side, which means that the distribution over words in this dataset is a bit unnatural. This could be due to non-language items throughout the dataset."
	            xmin_warning = "The minimum rank for this fit is a bit on the high side, which means that the frequencies of your most common words aren't distributed as would be expected by Zipf's law."
	            fit_results_table = pd.DataFrame.from_dict(
	                {
	                    "Alpha:": [f"{z.alpha:.2f}"],
	                    "KS distance:": [f"{z.distance:.2f}"],
	                    "Min rank:": [str(int(z.xmin))],
	                },
	                columns=["Results"],
	                orient="index",
	            )
	            fit_results_table.index.name = column_id
	            st.caption(
	                "Use this widget for the counts of different words in your dataset, measuring the difference between the observed count and the expected count under Zipf's law."
	            )
	            st.markdown(_ZIPF_CAPTION)
	            # Raw string: "\propto" must reach the renderer with its backslash
	            # and must not be parsed as a Python escape sequence.
	            st.write(
	                r"""
	A Zipfian distribution follows the power law: $p(x) \propto x^{-α}$
	with an ideal α value of 1."""
	            )
	            st.markdown(
	                "In general, an alpha greater than 2 or a minimum rank greater than 10 (take with a grain of salt) means that your distribution is relatively _unnatural_ for natural language. This can be a sign of mixed artefacts in the dataset, such as HTML markup."
	            )
	            st.markdown(
	                "Below, you can see the counts of each word in your dataset vs. the expected number of counts following a Zipfian distribution."
	            )
	            st.markdown("-----")
	            st.write("### Here is your dataset's Zipf results:")
	            st.dataframe(fit_results_table)
	            st.write(zipf_summary)
	            # TODO: Nice UI version reporting the KS test p-value and whether
	            # the data fits a power law.
	            st.plotly_chart(zipf_fig, use_container_width=True)
	            if z.alpha > 2:
	                st.markdown(alpha_warning)
	            # NOTE(review): the text above mentions a minimum rank of 10 while
	            # this check uses 5 - confirm which threshold is intended.
	            if z.xmin > 5:
	                st.markdown(xmin_warning)
	        except Exception:
	            # Best-effort widget: show a placeholder instead of failing the
	            # page, but let KeyboardInterrupt/SystemExit propagate.
	            st.write("Under construction! 😱 🚧")


	### Finally finally finally, show nPMI stuff.
	def npmi_widget(npmi_stats, min_vocab, column_id):
	    """
	    Part of the main app, but uses a user interaction so pulled out as its own f'n.

	    Lets the user pick two terms from ``npmi_stats.available_terms`` and shows
	    their joint nPMI table. Any error while rendering is replaced by an
	    "Under construction" notice.

	    :param npmi_stats: object providing ``available_terms`` and
	        ``load_or_prepare_joint_npmi(subgroup_pair)``.
	    :param min_vocab: passed to ``expander_npmi_description``.
	    :param column_id: suffix appended to the expander title and widget labels.
	    :return: None
	    """
	    with st.expander(f"Word Association{column_id}: nPMI", expanded=False):
	        try:
	            if len(npmi_stats.available_terms) > 0:
	                expander_npmi_description(min_vocab)
	                st.markdown("-----")
	                term1 = st.selectbox(
	                    f"What is the first term you want to select?{column_id}",
	                    npmi_stats.available_terms,
	                )
	                term2 = st.selectbox(
	                    f"What is the second term you want to select?{column_id}",
	                    reversed(npmi_stats.available_terms),
	                )
	                # We calculate/grab nPMI data based on a canonical (alphabetic)
	                # subgroup ordering.
	                subgroup_pair = sorted([term1, term2])
	                try:
	                    joint_npmi_df = npmi_stats.load_or_prepare_joint_npmi(subgroup_pair)
	                    npmi_show(joint_npmi_df)
	                except KeyError:
	                    st.markdown(
	                        "WARNING! The nPMI for these terms has not been pre-computed, please re-run caching."
	                    )
	            else:
	                st.markdown(
	                    "No words found co-occurring with both of the selected identity terms."
	                )
	        except Exception:
	            # Best-effort widget: show a placeholder instead of failing the
	            # page, but let KeyboardInterrupt/SystemExit propagate.
	            st.write("Under construction! 😱 🚧")


	def npmi_show(paired_results):
	    """Display the nPMI results table with a diverging colour gradient.

	    Rows are sorted by the ``"npmi-bias"`` column. Tables with more than
	    10000 rows are reduced to the rows whose absolute bias exceeds the value
	    found 5000 rows in from either end of the sorted table.

	    :param paired_results: DataFrame with an ``"npmi-bias"`` column; columns
	        whose names contain "npmi" get the gradient and 3-decimal formatting,
	        columns whose names contain "count" are formatted as integers.
	    """
	    if paired_results.empty:
	        st.markdown("No words that co-occur enough times for results! Or there's a 🐛. Or we're still computing this one. 🤷")
	    else:
	        s = pd.DataFrame(paired_results.sort_values(by="npmi-bias", ascending=True))
	        s.index.name = "word"
	        npmi_cols = s.filter(like="npmi").columns
	        count_cols = s.filter(like="count").columns
	        if s.shape[0] > 10000:
	            # Positional access: the index holds words, so integer keys must
	            # go through .iloc rather than label-based [] lookup.
	            bias_thres = max(
	                abs(s["npmi-bias"].iloc[5000]), abs(s["npmi-bias"].iloc[-5000])
	            )
	            print(f"filtering with bias threshold: {bias_thres}")
	            s_filtered = s[s["npmi-bias"].abs() > bias_thres]
	        else:
	            s_filtered = s
	        # TODO: This is very different look than the duplicates table above. Should probably standardize.
	        # Build a real colormap: sns.palplot() only draws a figure and returns
	        # None, which left background_gradient without the intended palette.
	        cm = sns.diverging_palette(270, 36, s=99, l=48, as_cmap=True)
	        out_df = (
	            s_filtered.style.background_gradient(subset=npmi_cols, cmap=cm)
	            .format(subset=npmi_cols, formatter="{:,.3f}")
	            .format(subset=count_cols, formatter=int)
	            .set_properties(
	                subset=count_cols, **{"width": "10em", "text-align": "center"}
	            )
	            .set_properties(**{"align": "center"})
	            .set_caption(
	                "nPMI scores and co-occurence counts between the selected identity terms and the words they both co-occur with"
	            )
	        )
	        st.write("### Here is your dataset's nPMI results:")
	        st.dataframe(out_df)


	### Dumping unused functions here for now
	### Second, show the distribution of text perplexities
	def expander_text_perplexities(text_label_df, sorted_sents_loss, fig_loss):
	    """Render the perplexity chart and a five-item page of scored sentences.

	    :param text_label_df: only its row count (``shape[0]``) is used, to bound
	        the slider.
	    :param sorted_sents_loss: sequence of ``(loss, sentence)`` pairs.
	    :param fig_loss: figure passed to ``st.plotly_chart``.
	    """
	    page_size = 5
	    with st.expander("Show text perplexities A", expanded=False):
	        st.markdown("### Text perplexities A")
	        st.plotly_chart(fig_loss, use_container_width=True)
	        last_start = text_label_df.shape[0] - page_size
	        start_id_show_loss = st.slider(
	            "Show highest perplexity sentences in A starting at index:",
	            0,
	            last_start,
	            value=0,
	            step=page_size,
	        )
	        page = sorted_sents_loss[start_id_show_loss : start_id_show_loss + page_size]
	        for lss, sent in page:
	            st.text(f"{lss:.3f} {sent}")