Mariusz Kossakowski committed on
Commit
d572e8e
1 Parent(s): 41b0597

Major refactor

app.py CHANGED
@@ -1,244 +1,24 @@
- import re
- from typing import Dict, List
-
- from datasets import load_dataset
- import pandas as pd
- import plotly.figure_factory as ff
- import plotly.graph_objects as go
  import streamlit as st
- from unidecode import unidecode
-
- DATA_SPLITS = ["train", "validation", "test"]
-
-
- def load_data() -> Dict[str, pd.DataFrame]:
-     return {
-         data: pd.read_csv(f"data/{data}.csv").rename(
-             {"label": "target"}, axis="columns"
-         )
-         for data in DATA_SPLITS
-     }
-
-
- def flatten_list(main_list: List[List]) -> List:
-     return [item for sublist in main_list for item in sublist]
-
-
- def count_num_of_characters(text: str) -> int:
-     return len(re.sub(r"[^a-zA-Z]", "", unidecode(text)))
-
-
- def count_num_of_words(text: str) -> int:
-     return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
+
+ from clarin_datasets.polemo_dataset import PolemoDataset
+ from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
+ from clarin_datasets.aspectemo_dataset import AspectEmoDataset

  selected_dataset = st.sidebar.selectbox(
      "Choose a dataset to load",
- ("clarin-pl/polemo2-official", "laugustyniak/abusive-clauses-pl"),
+ (
+     "clarin-pl/polemo2-official",
+     "laugustyniak/abusive-clauses-pl",
+     "clarin-pl/aspectemo",
+ ),
  )

- def load_hf_dataset():
-     if selected_dataset == "clarin-pl/polemo2-official":
-         data = load_dataset("clarin-pl/polemo2-official")
-         DATA_DICT = {
-             "train": data["train"].to_pandas(),
-             "validation": data["validation"].to_pandas(),
-             "test": data["test"].to_pandas(),
-         }
-         DATA_DESCRIPTION = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
-         hotels, products, and university. It is human-annotated on a level of full reviews and individual
-         sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
-         sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
-         046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
-         annotated with four labels: positive, negative, neutral, or ambiguous. """
-     elif selected_dataset == "laugustyniak/abusive-clauses-pl":
-         DATA_DICT = load_data()
-         DATA_DESCRIPTION = """
-         ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
-         Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
-         But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
-         we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
-         contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
-         or many more. In all these situations, you will need to conclude the contract, but there is a high probability
-         that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
-         businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
-         requiring consumers to accept.
-
-         Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
-         clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
-         situation of imbalance between the duties and rights of the parties.
-
-         On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
-         we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
-         learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
-         agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
-         analyze contracts and understand what they agree upon.
-         """
-     return DATA_DICT, DATA_DESCRIPTION
-
-
- DATA_DICT, DATA_DESCRIPTION = load_hf_dataset()
-
- header = st.container()
- description = st.container()
- dataframe_head = st.container()
- word_searching = st.container()
- dataset_statistics = st.container()
-
- with header:
-     st.title(selected_dataset)
-
- with description:
-     st.header("Dataset description")
-     st.write(DATA_DESCRIPTION)
-
- with dataframe_head:
-     filtering_options = DATA_DICT["train"]["target"].unique().tolist()
-     filtering_options.append("All classes")
-
-     st.header("First 10 observations of a chosen class")
-     class_to_show = st.selectbox(
-         label="Select class to show", options=filtering_options
-     )
-     df_to_show = pd.concat(
-         [
-             DATA_DICT["train"].copy(),
-             DATA_DICT["validation"].copy(),
-             DATA_DICT["test"].copy(),
-         ]
-     )
-     if class_to_show == "All classes":
-         df_to_show = df_to_show.head(10)
-     else:
-         df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(10)
-     st.dataframe(df_to_show)
-     st.text_area(label="Latex code", value=df_to_show.style.to_latex())
-
-     if selected_dataset == "clarin-pl/polemo2-official":
-         st.subheader("First 10 observations of a chosen domain and text type")
-         domain = st.selectbox(
-             label="Select domain",
-             options=["all", "hotels", "medicine", "products", "reviews"],
-         )
-         text_type = st.selectbox(
-             label="Select text type", options=["Full text", "Tokenized to sentences"]
-         )
-         text_type_mapping_dict = {
-             "Full text": "text",
-             "Tokenized to sentences": "sentence",
-         }
-
-         polemo_subset = load_dataset(
-             selected_dataset, f"{domain}_{text_type_mapping_dict[text_type]}"
-         )
-         df = pd.concat(
-             [
-                 polemo_subset["train"].to_pandas(),
-                 polemo_subset["validation"].to_pandas(),
-                 polemo_subset["test"].to_pandas(),
-             ]
-         ).head(10)
-         st.dataframe(df)
-         st.text_area(label="Latex code", value=df.style.to_latex())
-
- with word_searching:
-     st.header("Observations containing a chosen word")
-     searched_word = st.text_input(label="Enter the word you are looking for below")
-     df_to_show = pd.concat(
-         [
-             DATA_DICT["train"].copy(),
-             DATA_DICT["validation"].copy(),
-             DATA_DICT["test"].copy(),
-         ]
-     )
-     df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
-     st.dataframe(df_to_show)
-     st.text_area(label="Latex code", value=df_to_show.style.to_latex())
-
- with dataset_statistics:
-     st.header("Dataset statistics")
-     st.subheader("Number of samples in each data split")
-     metrics_df = pd.DataFrame.from_dict(
-         {
-             "Train": DATA_DICT["train"].shape[0],
-             "Validation": DATA_DICT["validation"].shape[0],
-             "Test": DATA_DICT["test"].shape[0],
-             "Total": sum(
-                 [
-                     DATA_DICT["train"].shape[0],
-                     DATA_DICT["validation"].shape[0],
-                     DATA_DICT["test"].shape[0],
-                 ]
-             ),
-         },
-         orient="index",
-     ).reset_index()
-     metrics_df.columns = ["Subset", "Number of samples"]
-     st.dataframe(metrics_df)
-
-     latex_df = metrics_df.style.to_latex()
-     st.text_area(label="Latex code", value=latex_df)
-
-     # Class distribution in each subset
-     st.subheader("Class distribution in each subset")
-     target_unique_values = DATA_DICT["train"]["target"].unique()
-     hist = (
-         pd.DataFrame(
-             [
-                 df["target"].value_counts(normalize=True).rename(k)
-                 for k, df in DATA_DICT.items()
-             ]
-         )
-         .reset_index()
-         .rename({"index": "split_name"}, axis=1)
-     )
-     plot_data = [
-         go.Bar(
-             name=str(target_unique_values[i]),
-             x=DATA_SPLITS,
-             y=hist[target_unique_values[i]].values,
-         )
-         for i in range(len(target_unique_values))
-     ]
-     barchart_class_dist = go.Figure(data=plot_data)
-     barchart_class_dist.update_layout(
-         barmode="group",
-         title_text="Barchart - class distribution",
-         xaxis_title="Split name",
-         yaxis_title="Number of data points",
-     )
-     st.plotly_chart(barchart_class_dist, use_container_width=True)
-     st.dataframe(hist)
-     st.text_area(label="Latex code", value=hist.style.to_latex())
-
-     # Number of words per observation
-     st.subheader("Number of words per observation in each subset")
-     hist_data_num_words = [
-         df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
-     ]
-     fig_num_words = ff.create_distplot(
-         hist_data_num_words, DATA_SPLITS, show_rug=False, bin_size=1
-     )
-     fig_num_words.update_traces(
-         nbinsx=100, autobinx=True, selector={"type": "histogram"}
-     )
-     fig_num_words.update_layout(
-         title_text="Histogram - number of characters per observation",
-         xaxis_title="Number of characters",
-     )
-     st.plotly_chart(fig_num_words, use_container_width=True)
-
-     # Number of characters per observation
-     st.subheader("Number of characters per observation in each subset")
-     hist_data_num_characters = [
-         df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
-     ]
-     fig_num_chars = ff.create_distplot(
-         hist_data_num_characters, DATA_SPLITS, show_rug=False, bin_size=1
-     )
-     fig_num_chars.update_layout(
-         title_text="Histogram - number of characters per observation",
-         xaxis_title="Number of characters",
-     )
-     st.plotly_chart(fig_num_chars, use_container_width=True)
+ if selected_dataset == "clarin-pl/polemo2-official":
+     dataset = PolemoDataset()
+ elif selected_dataset == "laugustyniak/abusive-clauses-pl":
+     dataset = AbusiveClausesDataset()
+ elif selected_dataset == "clarin-pl/aspectemo":
+     dataset = AspectEmoDataset()
+
+ dataset.load_data()
+ dataset.show_dataset()
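Note: the refactored app.py delegates all loading and rendering to the selected dataset object. As a sketch of an equivalent design (an assumption, not part of this commit), the if/elif dispatch could be a name-to-class registry:

# Hypothetical alternative to the if/elif chain in app.py.
DATASETS = {
    "clarin-pl/polemo2-official": PolemoDataset,
    "laugustyniak/abusive-clauses-pl": AbusiveClausesDataset,
    "clarin-pl/aspectemo": AspectEmoDataset,
}
dataset = DATASETS[selected_dataset]()  # instantiate the chosen page
dataset.load_data()
dataset.show_dataset()
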
clarin_datasets/abusive_clauses_dataset.py ADDED
@@ -0,0 +1,188 @@
+ import pandas as pd
+ import plotly.figure_factory as ff
+ import plotly.graph_objects as go
+ import streamlit as st
+
+ from clarin_datasets.dataset_to_show import DatasetToShow
+ from clarin_datasets.utils import (
+     count_num_of_characters,
+     count_num_of_words,
+ )
+
+
+ class AbusiveClausesDataset(DatasetToShow):
+     def __init__(self):
+         self.dataset_name = "laugustyniak/abusive-clauses-pl"
+         self.data_dict = None
+         self.subsets = ["train", "validation", "test"]
+         self.description = """
+         ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
+         Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
+         But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
+         we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
+         contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
+         or many more. In all these situations, you will need to conclude the contract, but there is a high probability
+         that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
+         businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
+         requiring consumers to accept.
+
+         Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
+         clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
+         situation of imbalance between the duties and rights of the parties.
+
+         On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
+         we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
+         learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
+         agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
+         analyze contracts and understand what they agree upon.
+         """
+
+     def load_data(self):
+         self.data_dict = {
+             subset: pd.read_csv(f"data/{subset}.csv").rename(
+                 {"label": "target"}, axis="columns"
+             )
+             for subset in self.subsets
+         }
+
+     def show_dataset(self):
+         header = st.container()
+         description = st.container()
+         dataframe_head = st.container()
+         word_searching = st.container()
+         dataset_statistics = st.container()
+
+         with header:
+             st.title(self.dataset_name)
+
+         with description:
+             st.header("Dataset description")
+             st.write(self.description)
+
+         with dataframe_head:
+             filtering_options = self.data_dict["train"]["target"].unique().tolist()
+             filtering_options.append("All classes")
+
+             st.header("First 10 observations of a chosen class")
+             class_to_show = st.selectbox(
+                 label="Select class to show", options=filtering_options
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             if class_to_show == "All classes":
+                 df_to_show = df_to_show.head(10)
+             else:
+                 df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(
+                     10
+                 )
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+         with word_searching:
+             st.header("Observations containing a chosen word")
+             searched_word = st.text_input(
+                 label="Enter the word you are looking for below"
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+         with dataset_statistics:
+             st.header("Dataset statistics")
+             st.subheader("Number of samples in each data split")
+             metrics_df = pd.DataFrame.from_dict(
+                 {
+                     "Train": self.data_dict["train"].shape[0],
+                     "Validation": self.data_dict["validation"].shape[0],
+                     "Test": self.data_dict["test"].shape[0],
+                     "Total": sum(
+                         [
+                             self.data_dict["train"].shape[0],
+                             self.data_dict["validation"].shape[0],
+                             self.data_dict["test"].shape[0],
+                         ]
+                     ),
+                 },
+                 orient="index",
+             ).reset_index()
+             metrics_df.columns = ["Subset", "Number of samples"]
+             st.dataframe(metrics_df)
+
+             latex_df = metrics_df.style.to_latex()
+             st.text_area(label="Latex code", value=latex_df)
+
+             # Class distribution in each subset
+             st.subheader("Class distribution in each subset")
+             target_unique_values = self.data_dict["train"]["target"].unique()
+             hist = (
+                 pd.DataFrame(
+                     [
+                         df["target"].value_counts(normalize=True).rename(k)
+                         for k, df in self.data_dict.items()
+                     ]
+                 )
+                 .reset_index()
+                 .rename({"index": "split_name"}, axis=1)
+             )
+             plot_data = [
+                 go.Bar(
+                     name=str(target_unique_values[i]),
+                     x=self.subsets,
+                     y=hist[target_unique_values[i]].values,
+                 )
+                 for i in range(len(target_unique_values))
+             ]
+             barchart_class_dist = go.Figure(data=plot_data)
+             barchart_class_dist.update_layout(
+                 barmode="group",
+                 title_text="Barchart - class distribution",
+                 xaxis_title="Split name",
+                 yaxis_title="Number of data points",
+             )
+             st.plotly_chart(barchart_class_dist, use_container_width=True)
+             st.dataframe(hist)
+             st.text_area(label="Latex code", value=hist.style.to_latex())
+
+             # Number of words per observation
+             st.subheader("Number of words per observation in each subset")
+             hist_data_num_words = [
+                 df["text"].apply(count_num_of_words) for df in self.data_dict.values()
+             ]
+             fig_num_words = ff.create_distplot(
+                 hist_data_num_words, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_words.update_traces(
+                 nbinsx=100, autobinx=True, selector={"type": "histogram"}
+             )
+             fig_num_words.update_layout(
+                 title_text="Histogram - number of words per observation",
+                 xaxis_title="Number of words",
+             )
+             st.plotly_chart(fig_num_words, use_container_width=True)
+
+             # Number of characters per observation
+             st.subheader("Number of characters per observation in each subset")
+             hist_data_num_characters = [
+                 df["text"].apply(count_num_of_characters)
+                 for df in self.data_dict.values()
+             ]
+             fig_num_chars = ff.create_distplot(
+                 hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_chars.update_layout(
+                 title_text="Histogram - number of characters per observation",
+                 xaxis_title="Number of characters",
+             )
+             st.plotly_chart(fig_num_chars, use_container_width=True)
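Note: load_data above expects local files data/train.csv, data/validation.csv and data/test.csv, each with at least a "text" and a "label" column ("label" is renamed to "target"). A minimal fixture for running the page locally (illustrative values, not shipped with this commit):

import pandas as pd

# Two toy rows per split; any class ids work, the app only groups by them.
for split in ["train", "validation", "test"]:
    pd.DataFrame(
        {"text": ["Klauzula pierwsza.", "Klauzula druga."], "label": [1, 0]}
    ).to_csv(f"data/{split}.csv", index=False)
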
clarin_datasets/dataset_to_show.py ADDED
@@ -0,0 +1,18 @@
+ from abc import ABC, abstractmethod
+
+
+ class DatasetToShow(ABC):
+     @abstractmethod
+     def __init__(self):
+         self.dataset_name = None
+         self.data_dict = None
+         self.subsets = None
+         self.description = None
+
+     @abstractmethod
+     def load_data(self):
+         pass
+
+     @abstractmethod
+     def show_dataset(self):
+         pass
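Note: each dataset page implements this three-method interface. A minimal sketch of a new page, using a hypothetical ExampleDataset (all names illustrative, not part of this commit):

import pandas as pd
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow


class ExampleDataset(DatasetToShow):
    def __init__(self):
        self.dataset_name = "org/example-dataset"  # hypothetical dataset id
        self.data_dict = None
        self.subsets = ["train", "test"]
        self.description = "One-paragraph description rendered by the app."

    def load_data(self):
        # Key each split's DataFrame by its split name.
        self.data_dict = {s: pd.read_csv(f"data/{s}.csv") for s in self.subsets}

    def show_dataset(self):
        st.title(self.dataset_name)
        st.write(self.description)

Hooking it up takes two edits in app.py: add the dataset name to the selectbox tuple and add a matching elif branch.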
clarin_datasets/polemo_dataset.py ADDED
@@ -0,0 +1,200 @@
+ from datasets import load_dataset
+ import pandas as pd
+ import plotly.figure_factory as ff
+ import plotly.graph_objects as go
+ import streamlit as st
+
+ from clarin_datasets.dataset_to_show import DatasetToShow
+ from clarin_datasets.utils import (
+     count_num_of_characters,
+     count_num_of_words,
+ )
+
+
+ class PolemoDataset(DatasetToShow):
+     def __init__(self):
+         self.dataset_name = "clarin-pl/polemo2-official"
+         self.data_dict = None
+         self.subsets = ["train", "validation", "test"]
+         self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
+         hotels, products, and university. It is human-annotated on a level of full reviews and individual
+         sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
+         sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
+         046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
+         annotated with four labels: positive, negative, neutral, or ambiguous. """
+
+     def load_data(self):
+         raw_dataset = load_dataset(self.dataset_name)
+         self.data_dict = {
+             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
+         }
+
+     def show_dataset(self):
+         header = st.container()
+         description = st.container()
+         dataframe_head = st.container()
+         word_searching = st.container()
+         dataset_statistics = st.container()
+
+         with header:
+             st.title(self.dataset_name)
+
+         with description:
+             st.header("Dataset description")
+             st.write(self.description)
+
+         with dataframe_head:
+             filtering_options = self.data_dict["train"]["target"].unique().tolist()
+             filtering_options.append("All classes")
+
+             st.header("First 10 observations of a chosen class")
+             class_to_show = st.selectbox(
+                 label="Select class to show", options=filtering_options
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             if class_to_show == "All classes":
+                 df_to_show = df_to_show.head(10)
+             else:
+                 df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(
+                     10
+                 )
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+             st.subheader("First 10 observations of a chosen domain and text type")
+             domain = st.selectbox(
+                 label="Select domain",
+                 options=["all", "hotels", "medicine", "products", "reviews"],
+             )
+             text_type = st.selectbox(
+                 label="Select text type",
+                 options=["Full text", "Tokenized to sentences"],
+             )
+             text_type_mapping_dict = {
+                 "Full text": "text",
+                 "Tokenized to sentences": "sentence",
+             }
+
+             polemo_subset = load_dataset(
+                 self.dataset_name,
+                 f"{domain}_{text_type_mapping_dict[text_type]}",
+             )
+             df = pd.concat(
+                 [
+                     polemo_subset["train"].to_pandas(),
+                     polemo_subset["validation"].to_pandas(),
+                     polemo_subset["test"].to_pandas(),
+                 ]
+             ).head(10)
+             st.dataframe(df)
+             st.text_area(label="Latex code", value=df.style.to_latex())
+
+         with word_searching:
+             st.header("Observations containing a chosen word")
+             searched_word = st.text_input(
+                 label="Enter the word you are looking for below"
+             )
+             df_to_show = pd.concat(
+                 [
+                     self.data_dict["train"].copy(),
+                     self.data_dict["validation"].copy(),
+                     self.data_dict["test"].copy(),
+                 ]
+             )
+             df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
+             st.dataframe(df_to_show)
+             st.text_area(label="Latex code", value=df_to_show.style.to_latex())
+
+         with dataset_statistics:
+             st.header("Dataset statistics")
+             st.subheader("Number of samples in each data split")
+             metrics_df = pd.DataFrame.from_dict(
+                 {
+                     "Train": self.data_dict["train"].shape[0],
+                     "Validation": self.data_dict["validation"].shape[0],
+                     "Test": self.data_dict["test"].shape[0],
+                     "Total": sum(
+                         [
+                             self.data_dict["train"].shape[0],
+                             self.data_dict["validation"].shape[0],
+                             self.data_dict["test"].shape[0],
+                         ]
+                     ),
+                 },
+                 orient="index",
+             ).reset_index()
+             metrics_df.columns = ["Subset", "Number of samples"]
+             st.dataframe(metrics_df)
+
+             latex_df = metrics_df.style.to_latex()
+             st.text_area(label="Latex code", value=latex_df)
+
+             # Class distribution in each subset
+             st.subheader("Class distribution in each subset")
+             target_unique_values = self.data_dict["train"]["target"].unique()
+             hist = (
+                 pd.DataFrame(
+                     [
+                         df["target"].value_counts(normalize=True).rename(k)
+                         for k, df in self.data_dict.items()
+                     ]
+                 )
+                 .reset_index()
+                 .rename({"index": "split_name"}, axis=1)
+             )
+             plot_data = [
+                 go.Bar(
+                     name=str(target_unique_values[i]),
+                     x=self.subsets,
+                     y=hist[target_unique_values[i]].values,
+                 )
+                 for i in range(len(target_unique_values))
+             ]
+             barchart_class_dist = go.Figure(data=plot_data)
+             barchart_class_dist.update_layout(
+                 barmode="group",
+                 title_text="Barchart - class distribution",
+                 xaxis_title="Split name",
+                 yaxis_title="Number of data points",
+             )
+             st.plotly_chart(barchart_class_dist, use_container_width=True)
+             st.dataframe(hist)
+             st.text_area(label="Latex code", value=hist.style.to_latex())
+
+             # Number of words per observation
+             st.subheader("Number of words per observation in each subset")
+             hist_data_num_words = [
+                 df["text"].apply(count_num_of_words) for df in self.data_dict.values()
+             ]
+             fig_num_words = ff.create_distplot(
+                 hist_data_num_words, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_words.update_traces(
+                 nbinsx=100, autobinx=True, selector={"type": "histogram"}
+             )
+             fig_num_words.update_layout(
+                 title_text="Histogram - number of words per observation",
+                 xaxis_title="Number of words",
+             )
+             st.plotly_chart(fig_num_words, use_container_width=True)
+
+             # Number of characters per observation
+             st.subheader("Number of characters per observation in each subset")
+             hist_data_num_characters = [
+                 df["text"].apply(count_num_of_characters)
+                 for df in self.data_dict.values()
+             ]
+             fig_num_chars = ff.create_distplot(
+                 hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
+             )
+             fig_num_chars.update_layout(
+                 title_text="Histogram - number of characters per observation",
+                 xaxis_title="Number of characters",
+             )
+             st.plotly_chart(fig_num_chars, use_container_width=True)
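Note: the domain and text-type widgets compose a PolEmo configuration name of the form {domain}_{text|sentence}, so the subset fetch is equivalent to, for example:

from datasets import load_dataset

# "hotels" domain, sentence-level texts: one of the configs the widgets can build
polemo_subset = load_dataset("clarin-pl/polemo2-official", "hotels_sentence")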
clarin_datasets/utils.py ADDED
@@ -0,0 +1,16 @@
+ import re
+ from typing import List
+
+ from unidecode import unidecode
+
+
+ def flatten_list(main_list: List[List]) -> List:
+     return [item for sublist in main_list for item in sublist]
+
+
+ def count_num_of_characters(text: str) -> int:
+     return len(re.sub(r"[^a-zA-Z]", "", unidecode(text)))
+
+
+ def count_num_of_words(text: str) -> int:
+     return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
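Note: both counters transliterate with unidecode before stripping everything that is not an ASCII letter (or a space, for words), so Polish diacritics are counted rather than dropped. Expected behavior, worked out by hand:

from clarin_datasets.utils import count_num_of_characters, count_num_of_words

# "Zażółć gęślą jaźń" transliterates to "Zazolc gesla jazn"
count_num_of_characters("Zażółć gęślą jaźń")  # 15 letters
count_num_of_words("Zażółć gęślą jaźń")  # 3 words

One caveat: count_num_of_words splits on single spaces, so consecutive, leading, or trailing spaces add empty strings to the count.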