Szymon Woźniak committed on
Commit
b802e46
1 Parent(s): 25586d7

add description to dataset statistics page

Browse files
pages/1_Language_Statistics.py CHANGED
@@ -14,7 +14,7 @@ st.set_page_config(page_title="Language Statistics", page_icon="📈")
14
  st.markdown("# Language Statistics")
15
  st.write("""\
16
  The table below shows the per-language statistics of the MMS corpus.
17
- You can use the **'Add filters'** button to filter the table by any of the columns.
18
 
19
  Column descriptions:
20
  - **Language**: Language name,
 
14
  st.markdown("# Language Statistics")
15
  st.write("""\
16
  The table below shows the per-language statistics of the MMS corpus.
17
+ You can use the **'Add filters'** checkbox to filter the table by any of the columns.
18
 
19
  Column descriptions:
20
  - **Language**: Language name,
pages/{2_Dataset_Statistics.py → 2_Dataset_Statistics_&_citation_export.py} RENAMED
@@ -7,6 +7,8 @@ from filter_dataframe import filter_dataframe
7
  def get_language_stats_df():
8
  return pd.read_parquet("data/datasets_stats.parquet")
9
 
 
 
10
  _MMS_CITATION = """\
11
  @misc{augustyniak2023massively,
12
  title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark},
@@ -30,9 +32,26 @@ def export_citations(df: pd.DataFrame):
30
  return f"% MMS corpus citation\n{_MMS_CITATION}\n{CITATION_SEPARATOR}{dataset_citations_joined}"
31
 
32
 
33
- st.set_page_config(page_title="Dataset statistics", page_icon="📈")
 
 
 
 
 
 
34
 
35
- st.markdown("# Dataset statistics")
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  df = get_language_stats_df()
38
 
 
7
  def get_language_stats_df():
8
  return pd.read_parquet("data/datasets_stats.parquet")
9
 
10
+ TITLE = "Dataset statistics & citation export"
11
+
12
  _MMS_CITATION = """\
13
  @misc{augustyniak2023massively,
14
  title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark},
 
32
  return f"% MMS corpus citation\n{_MMS_CITATION}\n{CITATION_SEPARATOR}{dataset_citations_joined}"
33
 
34
 
35
+ st.set_page_config(page_title=TITLE, page_icon="📈")
36
+
37
+ st.markdown(f"# {TITLE}")
38
+ st.write("""\
39
+ The table below shows the per-dataset statistics of the MMS corpus.
40
+ You can use the **'Add filters'** checkbox to filter the table by any of the columns.
41
+ You can also use the 'Export citations' button to export the citations for the datasets in the filtered table.
42
 
43
+ Column descriptions:
44
+ - **original_dataset**: Original dataset name as used in the MMS corpus,
45
+ - **language**: 2-letter language code,
46
+ - **domain**: Domain of the dataset,
47
+ - **mean_chars**: The average number of characters in a single example,
48
+ - **mean_words**: The average number of words in a single example,
49
+ - **examples_sum**: The total number of examples in the dataset,
50
+ - **NEG**: Number of examples with negative sentiment,
51
+ - **NEU**: Number of examples with neutral sentiment,
52
+ - **POS**: Number of examples with positive sentiment,
53
+ - **paper**: Link to the paper in which the dataset was originally published,
54
+ - **citation**: Citation for the dataset,""")
55
 
56
  df = get_language_stats_df()
57