Spaces:

clarin-pl
/

datasets-explorer

Runtime error

datasets-explorer / clarin_datasets /polemo_dataset.py

Mariusz Kossakowski

Add tSNE projection

77405f7 about 3 years ago

9.52 kB

	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from datasets import load_dataset
	import pandas as pd
	import plotly.figure_factory as ff
	import plotly.graph_objects as go
	from sklearn.manifold import TSNE
	import streamlit as st

	from clarin_datasets.dataset_to_show import DatasetToShow
	from clarin_datasets.utils import (
	count_num_of_characters,
	count_num_of_words,
	embed_sentence,
	PLOT_COLOR_PALETTE
	)


	class PolemoDataset(DatasetToShow):
	    """Streamlit page presenting the PolEmo 2.0 dataset.

	    The page shows the dataset description, sample rows, a word search,
	    per-split statistics and a t-SNE projection of sentence embeddings.
	    ``load_data`` must be called before ``show_dataset`` because every
	    section reads the DataFrames stored in ``self.data_dict``.
	    """

	    def __init__(self) -> None:
	        """Set the dataset identifier, its split names and its description."""
	        super().__init__()
	        self.dataset_name = "clarin-pl/polemo2-official"
	        self.subsets = ["train", "validation", "test"]
	        self.description = f"""
	Dataset link: https://huggingface.co/datasets/{self.dataset_name}

	The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
	hotels, products, and university. It is human-annotated on a level of full reviews and individual
	sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
	sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of
	197,046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
	annotated with four labels: positive, negative, neutral, or ambiguous. """

	    def load_data(self) -> None:
	        """Load the default configuration and keep each split as a DataFrame.

	        Populates ``self.data_dict`` with one entry per name in
	        ``self.subsets``, in that order.
	        """
	        raw_dataset = load_dataset(self.dataset_name)
	        self.data_dict = {
	            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
	        }

	    def show_dataset(self) -> None:
	        """Render every section of the dataset page, top to bottom."""
	        # Containers are created up front so the page layout is fixed
	        # before any section starts rendering.
	        header = st.container()
	        description = st.container()
	        dataframe_head = st.container()
	        word_searching = st.container()
	        dataset_statistics = st.container()
	        tsne_projection = st.container()

	        with header:
	            st.title(self.dataset_name)

	        with description:
	            st.header("Dataset description")
	            st.write(self.description)

	        with dataframe_head:
	            self._show_dataframe_head()

	        with word_searching:
	            self._show_word_search()

	        with dataset_statistics:
	            st.header("Dataset statistics")
	            self._show_split_sizes()
	            self._show_class_distribution()
	            self._show_length_histograms()

	        with tsne_projection:
	            self._show_tsne_projection()

	    def _concat_splits(self) -> pd.DataFrame:
	        """Return a new DataFrame stacking all splits in ``self.subsets`` order."""
	        # pd.concat builds a new frame, so the stored splits are not modified.
	        return pd.concat([self.data_dict[subset] for subset in self.subsets])

	    def _show_dataframe_head(self) -> None:
	        """Show the first 10 rows for a chosen class and for a chosen domain."""
	        filtering_options = self.data_dict["train"]["target"].unique().tolist()
	        filtering_options.append("All classes")

	        st.header("First 10 observations of a chosen class")
	        class_to_show = st.selectbox(
	            label="Select class to show", options=filtering_options
	        )
	        df_to_show = self._concat_splits()
	        if class_to_show != "All classes":
	            df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show]
	        df_to_show = df_to_show.head(10)
	        st.dataframe(df_to_show)
	        st.text_area(label="Latex code", value=df_to_show.style.to_latex())

	        st.subheader("First 10 observations of a chosen domain and text type")
	        domain = st.selectbox(
	            label="Select domain",
	            options=["all", "hotels", "medicine", "products", "reviews"],
	        )
	        text_type = st.selectbox(
	            label="Select text type",
	            options=["Full text", "Tokenized to sentences"],
	        )
	        text_type_mapping_dict = {
	            "Full text": "text",
	            "Tokenized to sentences": "sentence",
	        }

	        # The configuration name is built as "<domain>_<text|sentence>".
	        polemo_subset = load_dataset(
	            self.dataset_name,
	            f"{domain}_{text_type_mapping_dict[text_type]}",
	        )
	        df = pd.concat(
	            [polemo_subset[subset].to_pandas() for subset in self.subsets]
	        ).head(10)
	        st.dataframe(df)
	        st.text_area(label="Latex code", value=df.style.to_latex())

	    def _show_word_search(self) -> None:
	        """Show every observation whose text contains the entered word."""
	        st.header("Observations containing a chosen word")
	        searched_word = st.text_input(
	            label="Enter the word you are looking for below"
	        )
	        df_to_show = self._concat_splits()
	        # regex=False: the input is user-typed text, not a pattern, so
	        # characters such as "(" or "*" must be matched literally instead of
	        # raising re.error. na=False keeps the mask boolean for missing text.
	        # An empty input still matches every row.
	        matches = df_to_show["text"].str.contains(
	            searched_word, regex=False, na=False
	        )
	        df_to_show = df_to_show.loc[matches]
	        st.dataframe(df_to_show)
	        st.text_area(label="Latex code", value=df_to_show.style.to_latex())

	    def _show_split_sizes(self) -> None:
	        """Show the number of samples in each split and in total."""
	        st.subheader("Number of samples in each data split")
	        sizes = {
	            subset.capitalize(): self.data_dict[subset].shape[0]
	            for subset in self.subsets
	        }
	        sizes["Total"] = sum(sizes.values())
	        metrics_df = pd.DataFrame.from_dict(sizes, orient="index").reset_index()
	        metrics_df.columns = ["Subset", "Number of samples"]
	        st.dataframe(metrics_df)

	        latex_df = metrics_df.style.to_latex()
	        st.text_area(label="Latex code", value=latex_df)

	    def _show_class_distribution(self) -> None:
	        """Show the normalized class distribution of each split."""
	        st.subheader("Class distribution in each subset")
	        target_unique_values = self.data_dict["train"]["target"].unique()
	        # One row per split, one column per class, values are fractions.
	        hist = (
	            pd.DataFrame(
	                [
	                    df["target"].value_counts(normalize=True).rename(k)
	                    for k, df in self.data_dict.items()
	                ]
	            )
	            .reset_index()
	            .rename({"index": "split_name"}, axis=1)
	        )
	        plot_data = [
	            go.Bar(
	                name=str(target),
	                x=self.subsets,
	                y=hist[target].values,
	            )
	            for target in target_unique_values
	        ]
	        barchart_class_dist = go.Figure(data=plot_data)
	        barchart_class_dist.update_layout(
	            barmode="group",
	            title_text="Barchart - class distribution",
	            xaxis_title="Split name",
	            # The counts are normalized, so the axis shows fractions.
	            yaxis_title="Fraction of data points",
	        )
	        st.plotly_chart(barchart_class_dist, use_container_width=True)
	        st.dataframe(hist)
	        st.text_area(label="Latex code", value=hist.style.to_latex())

	    def _show_length_histograms(self) -> None:
	        """Show per-split histograms of word and character counts."""
	        # Number of words per observation
	        st.subheader("Number of words per observation in each subset")
	        hist_data_num_words = [
	            df["text"].apply(count_num_of_words) for df in self.data_dict.values()
	        ]
	        fig_num_words = ff.create_distplot(
	            hist_data_num_words, self.subsets, show_rug=False, bin_size=1
	        )
	        fig_num_words.update_traces(
	            nbinsx=100, autobinx=True, selector={"type": "histogram"}
	        )
	        fig_num_words.update_layout(
	            title_text="Histogram - number of words per observation",
	            xaxis_title="Number of words",
	        )
	        st.plotly_chart(fig_num_words, use_container_width=True)

	        # Number of characters per observation
	        st.subheader("Number of characters per observation in each subset")
	        hist_data_num_characters = [
	            df["text"].apply(count_num_of_characters)
	            for df in self.data_dict.values()
	        ]
	        fig_num_chars = ff.create_distplot(
	            hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
	        )
	        fig_num_chars.update_layout(
	            title_text="Histogram - number of characters per observation",
	            xaxis_title="Number of characters",
	        )
	        st.plotly_chart(fig_num_chars, use_container_width=True)

	    def _show_tsne_projection(self) -> None:
	        """Show a 2-D t-SNE scatter plot of the embedded texts of one split."""
	        st.header("t-SNE projection of the dataset")
	        subset_to_project = st.selectbox(
	            label="Select subset to project", options=self.subsets
	        )
	        subset_df = self.data_dict[subset_to_project]
	        sentences = subset_df["text"].values
	        reducer = TSNE(n_components=2)
	        embedded_sentences = np.array(
	            [embed_sentence(text) for text in sentences]
	        )
	        transformed_embeddings = reducer.fit_transform(embedded_sentences)
	        fig, ax = plt.subplots()
	        ax.scatter(
	            x=transformed_embeddings[:, 0],
	            y=transformed_embeddings[:, 1],
	            c=[PLOT_COLOR_PALETTE[x] for x in subset_df["target"].values],
	        )
	        st.pyplot(fig)
	        # A new figure is created on every rerun; close it once rendered so
	        # pyplot does not keep accumulating open figures.
	        plt.close(fig)