Mariusz Kossakowski committed on
Commit
c28d81d
1 Parent(s): 15162f9
app.py CHANGED
@@ -3,6 +3,13 @@ import streamlit as st
3
  from clarin_datasets.polemo_dataset import PolemoDataset
4
  from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
5
  from clarin_datasets.aspectemo_dataset import AspectEmoDataset
 
 
 
 
 
 
 
6
 
7
  selected_dataset = st.sidebar.selectbox(
8
  "Choose a dataset to load",
@@ -10,6 +17,10 @@ selected_dataset = st.sidebar.selectbox(
10
  "clarin-pl/polemo2-official",
11
  "laugustyniak/abusive-clauses-pl",
12
  "clarin-pl/aspectemo",
 
 
 
 
13
  ),
14
  )
15
 
@@ -19,6 +30,15 @@ elif selected_dataset == "laugustyniak/abusive-clauses-pl":
19
  dataset = AbusiveClausesDataset()
20
  elif selected_dataset == "clarin-pl/aspectemo":
21
  dataset = AspectEmoDataset()
 
 
 
 
 
 
 
 
 
22
 
23
  dataset.load_data()
24
  dataset.show_dataset()
 
3
  from clarin_datasets.polemo_dataset import PolemoDataset
4
  from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
5
  from clarin_datasets.aspectemo_dataset import AspectEmoDataset
6
+ from clarin_datasets.kpwr_ner_datasets import KpwrNerDataset
7
+ from clarin_datasets.punctuation_restoration_dataset import (
8
+ PunctuationRestorationDataset,
9
+ )
10
+ from clarin_datasets.nkjp_pos_dataset import NkjpPosDataset
11
+ from clarin_datasets.cst_wikinews_dataset import CSTWikinewsDataset
12
+
13
 
14
  selected_dataset = st.sidebar.selectbox(
15
  "Choose a dataset to load",
 
17
  "clarin-pl/polemo2-official",
18
  "laugustyniak/abusive-clauses-pl",
19
  "clarin-pl/aspectemo",
20
+ "clarin-pl/kpwr-ner",
21
+ "clarin-pl/2021-punctuation-restoration",
22
+ "clarin-pl/nkjp-pos",
23
+ "clarin-pl/cst-wikinews",
24
  ),
25
  )
26
 
 
30
  dataset = AbusiveClausesDataset()
31
  elif selected_dataset == "clarin-pl/aspectemo":
32
  dataset = AspectEmoDataset()
33
+ elif selected_dataset == "clarin-pl/kpwr-ner":
34
+ dataset = KpwrNerDataset()
35
+ elif selected_dataset == "clarin-pl/2021-punctuation-restoration":
36
+ dataset = PunctuationRestorationDataset()
37
+ elif selected_dataset == "clarin-pl/nkjp-pos":
38
+ dataset = NkjpPosDataset()
39
+ elif selected_dataset == "clarin-pl/cst-wikinews":
40
+ dataset = CSTWikinewsDataset()
41
+
42
 
43
  dataset.load_data()
44
  dataset.show_dataset()
clarin_datasets/abusive_clauses_dataset.py CHANGED
@@ -12,8 +12,8 @@ from clarin_datasets.utils import (
12
 
13
  class AbusiveClausesDataset(DatasetToShow):
14
  def __init__(self):
 
15
  self.dataset_name = "laugustyniak/abusive-clauses-pl"
16
- self.data_dict = None
17
  self.subsets = ["train", "validation", "test"]
18
  self.description = """
19
  ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
 
12
 
13
  class AbusiveClausesDataset(DatasetToShow):
14
  def __init__(self):
15
+ DatasetToShow.__init__(self)
16
  self.dataset_name = "laugustyniak/abusive-clauses-pl"
 
17
  self.subsets = ["train", "validation", "test"]
18
  self.description = """
19
  ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
clarin_datasets/aspectemo_dataset.py CHANGED
@@ -7,8 +7,8 @@ from clarin_datasets.dataset_to_show import DatasetToShow
7
 
8
  class AspectEmoDataset(DatasetToShow):
9
  def __init__(self):
 
10
  self.dataset_name = "clarin-pl/aspectemo"
11
- self.subsets = ["train", "test"]
12
  self.description = """
13
  Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
14
  corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
 
7
 
8
  class AspectEmoDataset(DatasetToShow):
9
  def __init__(self):
10
+ DatasetToShow.__init__(self)
11
  self.dataset_name = "clarin-pl/aspectemo"
 
12
  self.description = """
13
  Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
14
  corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
clarin_datasets/dataset_to_show.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from abc import ABC, abstractmethod
2
 
3
 
@@ -6,12 +8,16 @@ class DatasetToShow(ABC):
6
  def __init__(self):
7
  self.dataset_name = None
8
  self.data_dict = None
9
- self.subsets = None
10
  self.description = None
11
 
12
  @abstractmethod
13
  def load_data(self):
14
- pass
 
 
 
 
15
 
16
  @abstractmethod
17
  def show_dataset(self):
 
1
+ from datasets import load_dataset
2
+
3
  from abc import ABC, abstractmethod
4
 
5
 
 
8
  def __init__(self):
9
  self.dataset_name = None
10
  self.data_dict = None
11
+ self.subsets = ["train", "test"]
12
  self.description = None
13
 
14
  @abstractmethod
15
  def load_data(self):
16
+ raw_dataset = load_dataset(self.dataset_name)
17
+ self.data_dict = {
18
+ subset: raw_dataset[subset].to_pandas()
19
+ for subset in self.subsets
20
+ }
21
 
22
  @abstractmethod
23
  def show_dataset(self):
clarin_datasets/polemo_dataset.py CHANGED
@@ -13,8 +13,8 @@ from clarin_datasets.utils import (
13
 
14
  class PolemoDataset(DatasetToShow):
15
  def __init__(self):
 
16
  self.dataset_name = "clarin-pl/polemo2-official"
17
- self.data_dict = None
18
  self.subsets = ["train", "validation", "test"]
19
  self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
20
  hotels, products, and university. It is human-annotated on a level of full reviews and individual
@@ -24,10 +24,7 @@ class PolemoDataset(DatasetToShow):
24
  annotated with four labels: positive, negative, neutral, or ambiguous. """
25
 
26
  def load_data(self):
27
- raw_dataset = load_dataset(self.dataset_name)
28
- self.data_dict = {
29
- subset: raw_dataset[subset].to_pandas() for subset in self.subsets
30
- }
31
 
32
  def show_dataset(self):
33
  header = st.container()
 
13
 
14
  class PolemoDataset(DatasetToShow):
15
  def __init__(self):
16
+ DatasetToShow.__init__(self)
17
  self.dataset_name = "clarin-pl/polemo2-official"
 
18
  self.subsets = ["train", "validation", "test"]
19
  self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
20
  hotels, products, and university. It is human-annotated on a level of full reviews and individual
 
24
  annotated with four labels: positive, negative, neutral, or ambiguous. """
25
 
26
  def load_data(self):
27
+ DatasetToShow.show_dataset(self)
 
 
 
28
 
29
  def show_dataset(self):
30
  header = st.container()