Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
•
802f11a
1
Parent(s):
08bbbaf
Change description displaying
Browse files
clarin_datasets/aspectemo_dataset.py
CHANGED
@@ -9,34 +9,36 @@ class AspectEmoDataset(DatasetToShow):
|
|
9 |
def __init__(self):
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.dataset_name = "clarin-pl/aspectemo"
|
12 |
-
self.description =
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
40 |
|
41 |
def load_data(self):
|
42 |
raw_dataset = load_dataset(self.dataset_name)
|
@@ -56,7 +58,9 @@ class AspectEmoDataset(DatasetToShow):
|
|
56 |
|
57 |
with description:
|
58 |
st.header("Dataset description")
|
59 |
-
st.write(self.description)
|
|
|
|
|
60 |
|
61 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
62 |
tokens_all = full_dataframe["tokens"].tolist()
|
|
|
9 |
def __init__(self):
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.dataset_name = "clarin-pl/aspectemo"
|
12 |
+
self.description = [
|
13 |
+
"""
|
14 |
+
AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
|
15 |
+
corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
|
16 |
+
analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
|
17 |
+
following domains: school, medicine, hotels, and products. All documents are annotated at the aspect level
|
18 |
+
with six sentiment categories: strong negative (minus_m), weak negative (minus_s), neutral (zero),
|
19 |
+
weak positive (plus_s), strong positive (plus_m), ambiguous (amb).
|
20 |
+
""",
|
21 |
+
"Tasks (input, output and metrics)",
|
22 |
+
"""
|
23 |
+
Aspect-based sentiment analysis (ABSA) is a text analysis method that
|
24 |
+
categorizes data by aspects and identifies the sentiment assigned to each aspect. It is a sequence tagging
|
25 |
+
task.
|
26 |
+
|
27 |
+
Input ('tokens' column): sequence of tokens
|
28 |
+
|
29 |
+
Output ('labels' column): sequence of predicted tokens’ classes ("O" + 6 possible classes: strong negative (
|
30 |
+
a_minus_m), weak negative (a_minus_s), neutral (a_zero), weak positive (a_plus_s), strong positive (
|
31 |
+
a_plus_m), ambiguous (a_amb) )
|
32 |
+
|
33 |
+
Domain: school, medicine, hotels and products
|
34 |
+
|
35 |
+
Measurements:
|
36 |
+
|
37 |
+
Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
|
38 |
+
'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
|
39 |
+
'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
|
40 |
+
"""
|
41 |
+
]
|
42 |
|
43 |
def load_data(self):
|
44 |
raw_dataset = load_dataset(self.dataset_name)
|
|
|
58 |
|
59 |
with description:
|
60 |
st.header("Dataset description")
|
61 |
+
st.write(self.description[0])
|
62 |
+
st.subheader(self.description[1])
|
63 |
+
st.write(self.description[2])
|
64 |
|
65 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
66 |
tokens_all = full_dataframe["tokens"].tolist()
|
clarin_datasets/kpwr_ner_datasets.py
CHANGED
@@ -10,28 +10,31 @@ class KpwrNerDataset(DatasetToShow):
|
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/kpwr-ner"
|
13 |
-
self.description =
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
35 |
|
36 |
def load_data(self):
|
37 |
raw_dataset = load_dataset(self.dataset_name)
|
@@ -67,7 +70,9 @@ class KpwrNerDataset(DatasetToShow):
|
|
67 |
|
68 |
with description:
|
69 |
st.header("Dataset description")
|
70 |
-
st.write(self.description)
|
|
|
|
|
71 |
|
72 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
73 |
tokens_all = full_dataframe["tokens"].tolist()
|
|
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/kpwr-ner"
|
13 |
+
self.description = [
|
14 |
+
"""
|
15 |
+
KPWR-NER is a part of the Polish Corpus of Wrocław University of Technology (Korpus Języka
|
16 |
+
Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories
|
17 |
+
of entities. It is the ‘n82’ version of the KPWr, which means that the number of classes is restricted to 82 (
|
18 |
+
originally 120). During corpus creation, texts were annotated by humans from various sources, covering many
|
19 |
+
domains and genres.
|
20 |
+
""",
|
21 |
+
"Tasks (input, output and metrics)",
|
22 |
+
"""
|
23 |
+
Named entity recognition (NER) - tagging entities in text with their corresponding type.
|
24 |
+
|
25 |
+
Input ('tokens' column): sequence of tokens
|
26 |
+
|
27 |
+
Output ('ner' column): sequence of predicted tokens’ classes in BIO notation (82 possible classes, described
|
28 |
+
in detail in the annotation guidelines)
|
29 |
+
|
30 |
+
example:
|
31 |
+
|
32 |
+
[‘Roboty’, ‘mają’, ‘kilkanaście’, ‘lat’, ‘i’, ‘pochodzą’, ‘z’, ‘USA’, ‘,’, ‘Wysokie’, ‘napięcie’, ‘jest’,
|
33 |
+
‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
|
34 |
+
‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
|
35 |
+
‘B-nam_loc_gpe_country’, ‘O’]
|
36 |
+
"""
|
37 |
+
]
|
38 |
|
39 |
def load_data(self):
|
40 |
raw_dataset = load_dataset(self.dataset_name)
|
|
|
70 |
|
71 |
with description:
|
72 |
st.header("Dataset description")
|
73 |
+
st.write(self.description[0])
|
74 |
+
st.subheader(self.description[1])
|
75 |
+
st.write(self.description[2])
|
76 |
|
77 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
78 |
tokens_all = full_dataframe["tokens"].tolist()
|
clarin_datasets/punctuation_restoration_dataset.py
CHANGED
@@ -10,32 +10,34 @@ class PunctuationRestorationDataset(DatasetToShow):
|
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/2021-punctuation-restoration"
|
13 |
-
self.description =
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
39 |
|
40 |
def load_data(self):
|
41 |
raw_dataset = load_dataset(self.dataset_name)
|
@@ -70,7 +72,10 @@ class PunctuationRestorationDataset(DatasetToShow):
|
|
70 |
|
71 |
with description:
|
72 |
st.header("Dataset description")
|
73 |
-
st.write(self.description)
|
|
|
|
|
|
|
74 |
|
75 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
76 |
|
|
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/2021-punctuation-restoration"
|
13 |
+
self.description = [
|
14 |
+
"""
|
15 |
+
Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
|
16 |
+
not contain any punctuation or capitalization. In longer stretches of automatically recognized speech,
|
17 |
+
the lack of punctuation affects the general clarity of the output text [1]. The primary purpose of
|
18 |
+
punctuation (PR) and capitalization restoration (CR) as a distinct natural language processing (NLP) task is
|
19 |
+
to improve the legibility of ASR-generated text, and possibly other types of texts without punctuation. Aside
|
20 |
+
from their intrinsic value, PR and CR may improve the performance of other NLP aspects such as Named Entity
|
21 |
+
Recognition (NER), part-of-speech (POS) and semantic parsing or spoken dialog segmentation [2, 3]. As useful
|
22 |
+
as it seems, it is hard to systematically evaluate PR on transcripts of conversational language; mainly
|
23 |
+
because punctuation rules can be ambiguous even for originally written texts, and the very nature of
|
24 |
+
naturally-occurring spoken language makes it difficult to identify clear phrase and sentence boundaries [4,
|
25 |
+
5]. Given these requirements and limitations, a PR task based on a redistributable corpus of read speech was
|
26 |
+
suggested. 1200 texts included in this collection (totaling over 240,000 words) were selected from two
|
27 |
+
distinct sources: WikiNews and WikiTalks. Punctuation found in these sources should be approached with some
|
28 |
+
reservation when used for evaluation: these are original texts and may contain some user-induced errors and
|
29 |
+
bias. The texts were read out by over a hundred different speakers. Original texts with punctuation were
|
30 |
+
forced-aligned with recordings and used as the ideal ASR output. The goal of the task is to provide a
|
31 |
+
solution for restoring punctuation in the test set collated for this task. The test set consists of
|
32 |
+
time-aligned ASR transcriptions of read texts from the two sources. Participants are encouraged to use both
|
33 |
+
text-based and speech-derived features to identify punctuation symbols (e.g. multimodal framework [6]). In
|
34 |
+
addition, the train set is accompanied by reference text corpora of WikiNews and WikiTalks data that can be
|
35 |
+
used in training and fine-tuning punctuation models.
|
36 |
+
""",
|
37 |
+
"Task description",
|
38 |
+
"The purpose of this task is to restore punctuation in the ASR recognition of texts read out loud.",
|
39 |
+
"clarin_datasets/punctuation_restoration_task.png"
|
40 |
+
]
|
41 |
|
42 |
def load_data(self):
|
43 |
raw_dataset = load_dataset(self.dataset_name)
|
|
|
72 |
|
73 |
with description:
|
74 |
st.header("Dataset description")
|
75 |
+
st.write(self.description[0])
|
76 |
+
st.subheader(self.description[1])
|
77 |
+
st.write(self.description[2])
|
78 |
+
st.image(self.description[3])
|
79 |
|
80 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
81 |
|