# datasets-explorer / clarin_datasets / polemo_dataset.py
# Mariusz Kossakowski
# Add tSNE projection
# 77405f7
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import streamlit as st
from clarin_datasets.dataset_to_show import DatasetToShow
from clarin_datasets.utils import (
count_num_of_characters,
count_num_of_words,
embed_sentence,
PLOT_COLOR_PALETTE
)
class PolemoDataset(DatasetToShow):
    """Streamlit page for browsing the ``clarin-pl/polemo2-official`` dataset.

    ``load_data`` downloads the dataset and stores one pandas DataFrame per
    split in ``self.data_dict``; ``show_dataset`` renders the page (sample
    rows, word search, statistics and a t-SNE projection) and must be called
    after ``load_data``.
    """

    def __init__(self):
        super().__init__()
        self.dataset_name = "clarin-pl/polemo2-official"
        self.subsets = ["train", "validation", "test"]
        self.description = f"""
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
hotels, products, and university. It is human-annotated on a level of full reviews and individual
sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of
197,046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
annotated with four labels: positive, negative, neutral, or ambiguous. """

    def load_data(self) -> None:
        """Load the dataset and keep every split as a pandas DataFrame."""
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self) -> None:
        """Render all sections of the dataset page, top to bottom."""
        # All containers are created before any of them is filled.
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
        word_searching = st.container()
        dataset_statistics = st.container()
        tsne_projection = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        with dataframe_head:
            self._show_class_samples()
            self._show_domain_samples()

        with word_searching:
            self._show_word_search()

        with dataset_statistics:
            st.header("Dataset statistics")
            self._show_split_sizes()
            self._show_class_distribution()
            self._show_length_histograms()

        with tsne_projection:
            self._show_tsne_projection()

    def _concat_subsets(self) -> pd.DataFrame:
        """Return all splits listed in ``self.subsets`` stacked into one frame."""
        # pd.concat builds a new frame, so the per-split frames are not mutated.
        return pd.concat([self.data_dict[subset] for subset in self.subsets])

    def _show_class_samples(self) -> None:
        """Show the first 10 rows (over all splits) of a user-selected class."""
        filtering_options = self.data_dict["train"]["target"].unique().tolist()
        filtering_options.append("All classes")

        st.header("First 10 observations of a chosen class")
        class_to_show = st.selectbox(
            label="Select class to show", options=filtering_options
        )

        df_to_show = self._concat_subsets()
        if class_to_show != "All classes":
            df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show]
        df_to_show = df_to_show.head(10)

        st.dataframe(df_to_show)
        # Explicit keys: several text areas share the label "Latex code" and
        # could otherwise clash when they happen to hold the same content.
        st.text_area(
            label="Latex code",
            value=df_to_show.style.to_latex(),
            key="polemo_latex_class_samples",
        )

    def _show_domain_samples(self) -> None:
        """Show the first 10 rows of a user-selected domain / text-type config."""
        st.subheader("First 10 observations of a chosen domain and text type")
        domain = st.selectbox(
            label="Select domain",
            options=["all", "hotels", "medicine", "products", "reviews"],
        )
        text_type = st.selectbox(
            label="Select text type",
            options=["Full text", "Tokenized to sentences"],
        )
        text_type_mapping_dict = {
            "Full text": "text",
            "Tokenized to sentences": "sentence",
        }
        # The dataset configuration name is "<domain>_<text|sentence>".
        polemo_subset = load_dataset(
            self.dataset_name,
            f"{domain}_{text_type_mapping_dict[text_type]}",
        )
        df = pd.concat(
            [
                polemo_subset["train"].to_pandas(),
                polemo_subset["validation"].to_pandas(),
                polemo_subset["test"].to_pandas(),
            ]
        ).head(10)

        st.dataframe(df)
        st.text_area(
            label="Latex code",
            value=df.style.to_latex(),
            key="polemo_latex_domain_samples",
        )

    def _show_word_search(self) -> None:
        """Show every row (over all splits) whose text contains the typed word."""
        st.header("Observations containing a chosen word")
        searched_word = st.text_input(
            label="Enter the word you are looking for below"
        )

        df_to_show = self._concat_subsets()
        # regex=False: the input is matched literally, so characters such as
        # "(" or "*" cannot raise a regex error. na=False: missing texts
        # simply do not match instead of breaking the boolean mask.
        matches = df_to_show["text"].str.contains(
            searched_word, regex=False, na=False
        )
        df_to_show = df_to_show.loc[matches]

        st.dataframe(df_to_show)
        st.text_area(
            label="Latex code",
            value=df_to_show.style.to_latex(),
            key="polemo_latex_word_search",
        )

    def _show_split_sizes(self) -> None:
        """Show a table with the number of rows per split and in total."""
        st.subheader("Number of samples in each data split")
        sizes = {
            "Train": self.data_dict["train"].shape[0],
            "Validation": self.data_dict["validation"].shape[0],
            "Test": self.data_dict["test"].shape[0],
        }
        sizes["Total"] = sum(sizes.values())

        metrics_df = pd.DataFrame.from_dict(sizes, orient="index").reset_index()
        metrics_df.columns = ["Subset", "Number of samples"]

        st.dataframe(metrics_df)
        st.text_area(
            label="Latex code",
            value=metrics_df.style.to_latex(),
            key="polemo_latex_split_sizes",
        )

    def _show_class_distribution(self) -> None:
        """Show the per-split class proportions as a grouped bar chart and table."""
        st.subheader("Class distribution in each subset")
        target_unique_values = self.data_dict["train"]["target"].unique()

        # One row per split, one column per class; values are proportions
        # because value_counts is called with normalize=True.
        hist = (
            pd.DataFrame(
                [
                    df["target"].value_counts(normalize=True).rename(split_name)
                    for split_name, df in self.data_dict.items()
                ]
            )
            .reset_index()
            .rename({"index": "split_name"}, axis=1)
        )

        plot_data = [
            go.Bar(name=str(target), x=self.subsets, y=hist[target].values)
            for target in target_unique_values
        ]
        barchart_class_dist = go.Figure(data=plot_data)
        barchart_class_dist.update_layout(
            barmode="group",
            title_text="Barchart - class distribution",
            xaxis_title="Split name",
            # The plotted values are normalized, not raw counts.
            yaxis_title="Fraction of data points",
        )

        st.plotly_chart(barchart_class_dist, use_container_width=True)
        st.dataframe(hist)
        st.text_area(
            label="Latex code",
            value=hist.style.to_latex(),
            key="polemo_latex_class_distribution",
        )

    def _show_length_histograms(self) -> None:
        """Show per-split distributions of word and character counts."""
        # Number of words per observation
        st.subheader("Number of words per observation in each subset")
        hist_data_num_words = [
            df["text"].apply(count_num_of_words) for df in self.data_dict.values()
        ]
        fig_num_words = ff.create_distplot(
            hist_data_num_words, self.subsets, show_rug=False, bin_size=1
        )
        fig_num_words.update_traces(
            nbinsx=100, autobinx=True, selector={"type": "histogram"}
        )
        fig_num_words.update_layout(
            title_text="Histogram - number of words per observation",
            xaxis_title="Number of words",
        )
        st.plotly_chart(fig_num_words, use_container_width=True)

        # Number of characters per observation
        st.subheader("Number of characters per observation in each subset")
        hist_data_num_characters = [
            df["text"].apply(count_num_of_characters)
            for df in self.data_dict.values()
        ]
        fig_num_chars = ff.create_distplot(
            hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
        )
        fig_num_chars.update_layout(
            title_text="Histogram - number of characters per observation",
            xaxis_title="Number of characters",
        )
        st.plotly_chart(fig_num_chars, use_container_width=True)

    def _show_tsne_projection(self) -> None:
        """Show a 2-D t-SNE scatter plot of a user-selected split."""
        st.header("t-SNE projection of the dataset")
        subset_to_project = st.selectbox(
            label="Select subset to project", options=self.subsets
        )
        subset_df = self.data_dict[subset_to_project]

        # NOTE(review): embed_sentence is called once per text on every
        # rerun; presumably it returns a fixed-size vector - confirm, and
        # consider caching if this turns out to be slow.
        embedded_sentences = np.array(
            [embed_sentence(text) for text in subset_df["text"].values]
        )
        reducer = TSNE(n_components=2)
        transformed_embeddings = reducer.fit_transform(embedded_sentences)

        fig, ax = plt.subplots()
        ax.scatter(
            x=transformed_embeddings[:, 0],
            y=transformed_embeddings[:, 1],
            c=[PLOT_COLOR_PALETTE[target] for target in subset_df["target"].values],
        )
        st.pyplot(fig)
        # Release the figure; pyplot otherwise keeps every figure created
        # with plt.subplots() alive across reruns.
        plt.close(fig)