Spaces:
Runtime error
Runtime error
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from datasets import load_dataset | |
| import pandas as pd | |
| import plotly.figure_factory as ff | |
| import plotly.graph_objects as go | |
| from sklearn.manifold import TSNE | |
| import streamlit as st | |
| from clarin_datasets.dataset_to_show import DatasetToShow | |
| from clarin_datasets.utils import ( | |
| count_num_of_characters, | |
| count_num_of_words, | |
| embed_sentence, | |
| PLOT_COLOR_PALETTE | |
| ) | |
class PolemoDataset(DatasetToShow):
    """Streamlit page presenting the PolEmo 2.0 sentiment dataset.

    Downloads ``clarin-pl/polemo2-official`` from the Hugging Face hub and
    renders interactive views: first observations per class/domain, word
    search, per-split statistics, and a t-SNE projection of sentence
    embeddings.
    """

    def __init__(self):
        DatasetToShow.__init__(self)
        self.dataset_name = "clarin-pl/polemo2-official"
        # Split names as exposed by the hub dataset; order is reused for plots.
        self.subsets = ["train", "validation", "test"]
        self.description = f"""
        Dataset link: https://huggingface.co/datasets/{self.dataset_name}
        The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
        hotels, products, and university. It is human-annotated on a level of full reviews and individual
        sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
        sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
        046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
        annotated with four labels: positive, negative, neutral, or ambiguous. """

    def load_data(self):
        """Download the dataset and cache each split as a pandas DataFrame."""
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def _concat_splits(self):
        """Return a single DataFrame with all splits concatenated.

        Copies each split so downstream mutation cannot corrupt the cache.
        """
        return pd.concat(
            [self.data_dict[subset].copy() for subset in self.subsets]
        )

    def show_dataset(self):
        """Render the whole dataset page (assumes ``load_data`` was called)."""
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
        word_searching = st.container()
        dataset_statistics = st.container()
        tsne_projection = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        with dataframe_head:
            filtering_options = self.data_dict["train"]["target"].unique().tolist()
            filtering_options.append("All classes")
            st.header("First 10 observations of a chosen class")
            class_to_show = st.selectbox(
                label="Select class to show", options=filtering_options
            )
            df_to_show = self._concat_splits()
            if class_to_show != "All classes":
                df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show]
            df_to_show = df_to_show.head(10)
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())

            st.subheader("First 10 observations of a chosen domain and text type")
            domain = st.selectbox(
                label="Select domain",
                options=["all", "hotels", "medicine", "products", "reviews"],
            )
            text_type = st.selectbox(
                label="Select text type",
                options=["Full text", "Tokenized to sentences"],
            )
            # UI label -> hub config-name suffix.
            text_type_mapping_dict = {
                "Full text": "text",
                "Tokenized to sentences": "sentence",
            }
            polemo_subset = load_dataset(
                self.dataset_name,
                f"{domain}_{text_type_mapping_dict[text_type]}",
            )
            df = pd.concat(
                [polemo_subset[subset].to_pandas() for subset in self.subsets]
            ).head(10)
            st.dataframe(df)
            st.text_area(label="Latex code", value=df.style.to_latex())

        with word_searching:
            st.header("Observations containing a chosen word")
            searched_word = st.text_input(
                label="Enter the word you are looking for below"
            )
            # Only filter once the user typed something; the initial empty
            # string would otherwise match (and render) every observation.
            if searched_word:
                df_to_show = self._concat_splits()
                # regex=False: treat input literally so characters like "(" or
                # "*" do not crash or distort the search; na=False skips
                # missing texts instead of raising.
                df_to_show = df_to_show.loc[
                    df_to_show["text"].str.contains(
                        searched_word, regex=False, na=False
                    )
                ]
                st.dataframe(df_to_show)
                st.text_area(label="Latex code", value=df_to_show.style.to_latex())

        with dataset_statistics:
            st.header("Dataset statistics")
            st.subheader("Number of samples in each data split")
            split_sizes = [self.data_dict[subset].shape[0] for subset in self.subsets]
            metrics_df = pd.DataFrame.from_dict(
                {
                    "Train": split_sizes[0],
                    "Validation": split_sizes[1],
                    "Test": split_sizes[2],
                    "Total": sum(split_sizes),
                },
                orient="index",
            ).reset_index()
            metrics_df.columns = ["Subset", "Number of samples"]
            st.dataframe(metrics_df)
            latex_df = metrics_df.style.to_latex()
            st.text_area(label="Latex code", value=latex_df)

            # Class distribution in each subset (normalized per split).
            st.subheader("Class distribution in each subset")
            target_unique_values = self.data_dict["train"]["target"].unique()
            hist = (
                pd.DataFrame(
                    [
                        df["target"].value_counts(normalize=True).rename(k)
                        for k, df in self.data_dict.items()
                    ]
                )
                .reset_index()
                .rename({"index": "split_name"}, axis=1)
            )
            plot_data = [
                go.Bar(
                    name=str(value),
                    x=self.subsets,
                    y=hist[value].values,
                )
                for value in target_unique_values
            ]
            barchart_class_dist = go.Figure(data=plot_data)
            barchart_class_dist.update_layout(
                barmode="group",
                title_text="Barchart - class distribution",
                xaxis_title="Split name",
                # BUG FIX: values are normalized fractions, not raw counts.
                yaxis_title="Fraction of samples",
            )
            st.plotly_chart(barchart_class_dist, use_container_width=True)
            st.dataframe(hist)
            st.text_area(label="Latex code", value=hist.style.to_latex())

            # Number of words per observation.
            st.subheader("Number of words per observation in each subset")
            hist_data_num_words = [
                df["text"].apply(count_num_of_words) for df in self.data_dict.values()
            ]
            fig_num_words = ff.create_distplot(
                hist_data_num_words, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_words.update_traces(
                nbinsx=100, autobinx=True, selector={"type": "histogram"}
            )
            fig_num_words.update_layout(
                # BUG FIX: this figure shows word counts, not character counts
                # (labels were copy-pasted from the characters plot below).
                title_text="Histogram - number of words per observation",
                xaxis_title="Number of words",
            )
            st.plotly_chart(fig_num_words, use_container_width=True)

            # Number of characters per observation.
            st.subheader("Number of characters per observation in each subset")
            hist_data_num_characters = [
                df["text"].apply(count_num_of_characters)
                for df in self.data_dict.values()
            ]
            fig_num_chars = ff.create_distplot(
                hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_chars.update_layout(
                title_text="Histogram - number of characters per observation",
                xaxis_title="Number of characters",
            )
            st.plotly_chart(fig_num_chars, use_container_width=True)

        with tsne_projection:
            st.header("t-SNE projection of the dataset")
            subset_to_project = st.selectbox(
                label="Select subset to project", options=self.subsets
            )
            sentences = self.data_dict[subset_to_project]["text"].values
            reducer = TSNE(n_components=2)
            # NOTE(review): embedding every sentence here is O(dataset) work on
            # each rerun; consider caching if the page is slow.
            embedded_sentences = np.array(
                [embed_sentence(text) for text in sentences]
            )
            transformed_embeddings = reducer.fit_transform(embedded_sentences)
            fig, ax = plt.subplots()
            ax.scatter(
                x=transformed_embeddings[:, 0],
                y=transformed_embeddings[:, 1],
                c=[
                    PLOT_COLOR_PALETTE[x]
                    for x in self.data_dict[subset_to_project]["target"].values
                ],
            )
            st.pyplot(fig)