Spaces:

clarin-pl
/

datasets-explorer

Runtime error

Mariusz Kossakowski committed on Aug 23, 2022

Commit

c28d81d

•

1 Parent(s): 15162f9

Refactor

Files changed (5) hide show

app.py CHANGED Viewed

@@ -3,6 +3,13 @@ import streamlit as st
 from clarin_datasets.polemo_dataset import PolemoDataset
 from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
 from clarin_datasets.aspectemo_dataset import AspectEmoDataset
 selected_dataset = st.sidebar.selectbox(
     "Choose a dataset to load",
@@ -10,6 +17,10 @@ selected_dataset = st.sidebar.selectbox(
         "clarin-pl/polemo2-official",
         "laugustyniak/abusive-clauses-pl",
         "clarin-pl/aspectemo",
     ),
 )
@@ -19,6 +30,15 @@ elif selected_dataset == "laugustyniak/abusive-clauses-pl":
     dataset = AbusiveClausesDataset()
 elif selected_dataset == "clarin-pl/aspectemo":
     dataset = AspectEmoDataset()
 dataset.load_data()
 dataset.show_dataset()

 from clarin_datasets.polemo_dataset import PolemoDataset
 from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
 from clarin_datasets.aspectemo_dataset import AspectEmoDataset
+from clarin_datasets.kpwr_ner_datasets import KpwrNerDataset
+from clarin_datasets.punctuation_restoration_dataset import (
+    PunctuationRestorationDataset,
+)
+from clarin_datasets.nkjp_pos_dataset import NkjpPosDataset
+from clarin_datasets.cst_wikinews_dataset import CSTWikinewsDataset
 selected_dataset = st.sidebar.selectbox(
     "Choose a dataset to load",
         "clarin-pl/polemo2-official",
         "laugustyniak/abusive-clauses-pl",
         "clarin-pl/aspectemo",
+        "clarin-pl/kpwr-ner",
+        "clarin-pl/2021-punctuation-restoration",
+        "clarin-pl/nkjp-pos",
+        "clarin-pl/cst-wikinews",
     ),
 )
     dataset = AbusiveClausesDataset()
 elif selected_dataset == "clarin-pl/aspectemo":
     dataset = AspectEmoDataset()
+elif selected_dataset == "clarin-pl/kpwr-ner":
+    dataset = KpwrNerDataset()
+elif selected_dataset == "clarin-pl/2021-punctuation-restoration":
+    dataset = PunctuationRestorationDataset()
+elif selected_dataset == "clarin-pl/nkjp-pos":
+    dataset = NkjpPosDataset()
+elif selected_dataset == "clarin-pl/cst-wikinews":
+    dataset = CSTWikinewsDataset()
 dataset.load_data()
 dataset.show_dataset()

clarin_datasets/abusive_clauses_dataset.py CHANGED Viewed

@@ -12,8 +12,8 @@ from clarin_datasets.utils import (
 class AbusiveClausesDataset(DatasetToShow):
     def __init__(self):
         self.dataset_name = "laugustyniak/abusive-clauses-pl"
-        self.data_dict = None
         self.subsets = ["train", "validation", "test"]
         self.description = """
         ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.

 class AbusiveClausesDataset(DatasetToShow):
     def __init__(self):
+        DatasetToShow.__init__(self)
         self.dataset_name = "laugustyniak/abusive-clauses-pl"
         self.subsets = ["train", "validation", "test"]
         self.description = """
         ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.

clarin_datasets/aspectemo_dataset.py CHANGED Viewed

@@ -7,8 +7,8 @@ from clarin_datasets.dataset_to_show import DatasetToShow
 class AspectEmoDataset(DatasetToShow):
     def __init__(self):
         self.dataset_name = "clarin-pl/aspectemo"
-        self.subsets = ["train", "test"]
         self.description = """
         Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
         corpus of Polish customer reviews used in many projects on the use of different methods in sentiment

 class AspectEmoDataset(DatasetToShow):
     def __init__(self):
+        DatasetToShow.__init__(self)
         self.dataset_name = "clarin-pl/aspectemo"
         self.description = """
         Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
         corpus of Polish customer reviews used in many projects on the use of different methods in sentiment

clarin_datasets/dataset_to_show.py CHANGED Viewed

@@ -1,3 +1,5 @@
 from abc import ABC, abstractmethod
@@ -6,12 +8,16 @@ class DatasetToShow(ABC):
     def __init__(self):
         self.dataset_name = None
         self.data_dict = None
-        self.subsets = None
         self.description = None
     @abstractmethod
     def load_data(self):
-        pass
     @abstractmethod
     def show_dataset(self):

+from datasets import load_dataset
 from abc import ABC, abstractmethod
     def __init__(self):
         self.dataset_name = None
         self.data_dict = None
+        self.subsets = ["train", "test"]
         self.description = None
     @abstractmethod
     def load_data(self):
+        raw_dataset = load_dataset(self.dataset_name)
+        self.data_dict = {
+            subset: raw_dataset[subset].to_pandas()
+            for subset in self.subsets
+        }
     @abstractmethod
     def show_dataset(self):

clarin_datasets/polemo_dataset.py CHANGED Viewed

@@ -13,8 +13,8 @@ from clarin_datasets.utils import (
 class PolemoDataset(DatasetToShow):
     def __init__(self):
         self.dataset_name = "clarin-pl/polemo2-official"
-        self.data_dict = None
         self.subsets = ["train", "validation", "test"]
         self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
         hotels, products, and university. It is human-annotated on a level of full reviews and individual
@@ -24,10 +24,7 @@ class PolemoDataset(DatasetToShow):
         annotated with four labels: positive, negative, neutral, or ambiguous. """
     def load_data(self):
-        raw_dataset = load_dataset(self.dataset_name)
-        self.data_dict = {
-            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
-        }
     def show_dataset(self):
         header = st.container()

 class PolemoDataset(DatasetToShow):
     def __init__(self):
+        DatasetToShow.__init__(self)
         self.dataset_name = "clarin-pl/polemo2-official"
         self.subsets = ["train", "validation", "test"]
         self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
         hotels, products, and university. It is human-annotated on a level of full reviews and individual
         annotated with four labels: positive, negative, neutral, or ambiguous. """
     def load_data(self):
+        DatasetToShow.show_dataset(self)
     def show_dataset(self):
         header = st.container()