File size: 2,758 Bytes
cd37af8
 
 
 
 
 
 
 
 
b802e46
 
cd37af8
9d3e113
 
 
 
 
 
 
 
cd37af8
55c9088
cd37af8
 
7eb31f3
 
cd37af8
 
 
 
 
55c9088
cd37af8
 
b802e46
 
 
 
c63fab5
b802e46
 
cd37af8
b802e46
878e3b5
6bcdbf9
 
878e3b5
 
 
 
 
 
 
 
 
cd37af8
e883a51
a851110
 
cd37af8
 
 
 
 
 
 
 
56e420d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import streamlit as st
import pandas as pd
from filter_dataframe import filter_dataframe


@st.cache_data
def get_language_stats_df() -> pd.DataFrame:
    """Read the statistics table from ``data/datasets_stats.parquet``.

    The function is wrapped in ``st.cache_data``.

    Returns:
        The parquet file's contents as a DataFrame.
    """
    stats_path = "data/datasets_stats.parquet"
    stats_table = pd.read_parquet(stats_path)
    return stats_table

TITLE = "Dataset statistics & citation export"

# BibTeX entry for the MMS corpus itself; always emitted first in the export.
_MMS_CITATION = """\
@misc{augustyniak2023massively,
      title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark}, 
      author={Łukasz Augustyniak and Szymon Woźniak and Marcin Gruza and Piotr Gramacki and Krzysztof Rajda and Mikołaj Morzy and Tomasz Kajdanowicz},
      year={2023},
      eprint={2306.07902},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}"""

# '%'-prefixed rule line followed by a blank line, placed between entries.
CITATION_SEPARATOR = "% " + ("-" * 90) + "\n\n"


def export_citations(df: pd.DataFrame) -> str:
    """Build the citation export text for the datasets listed in *df*.

    The text starts with the MMS corpus citation, followed by one entry per
    distinct citation found in *df*. Every entry begins with a
    ``% Datasets: ...`` line naming all datasets that share the citation (in
    row order, comma separated). Entries are ordered by that dataset label
    and separated by ``CITATION_SEPARATOR``.

    Rows whose citation is missing (``None``/NaN) are skipped. Trailing
    whitespace of each citation is normalised to exactly one newline, so the
    separator always starts on its own line and citations that differ only
    in trailing whitespace are merged into one entry.

    Args:
        df: Frame with an ``"Original Dataset"`` and a ``"Citation"`` column.

    Returns:
        The assembled export text. For a frame without usable citations this
        is just the MMS citation followed by one separator.

    Raises:
        KeyError: If either required column is absent from *df*.
    """
    # Group dataset names by citation; dict insertion order keeps row order
    # within each group.
    datasets_by_citation = {}
    for dataset, citation in zip(df["Original Dataset"], df["Citation"]):
        if pd.isna(citation):
            continue
        normalised = str(citation).rstrip()
        datasets_by_citation.setdefault(normalised, []).append(str(dataset))

    # Sort by the joined dataset label; the citation text breaks ties so the
    # output is deterministic.
    labelled = sorted(
        (", ".join(names), citation)
        for citation, names in datasets_by_citation.items()
    )
    entries = [f"% Datasets: {label}\n{citation}\n" for label, citation in labelled]
    entries_joined = CITATION_SEPARATOR.join(entries)
    return f"% MMS corpus citation\n{_MMS_CITATION}\n{CITATION_SEPARATOR}{entries_joined}"


# Page metadata, set before any page content below is rendered.
st.set_page_config(page_title=TITLE, page_icon="📈")

# Page heading followed by usage notes and a description of every table column.
st.markdown(f"# {TITLE}")
st.write("""\
The table below shows the per-dataset statistics of the MMS corpus.  
You can use the **'Add filters'** checkbox to filter the table by any of the columns.  
You can also use the 'Export citations' button to export the citations for the datasets in the filtered table.  

Column descriptions:
- **Original Dataset**: Original dataset name as used in the MMS corpus,
- **Language code**: 2-letter language code,
- **Language**: Language name,
- **Domain**: Domain of the dataset,
- **Characters**: The average number of characters in a single example,
- **Words**: The average number of words in a single example,
- **Examples**: The total number of examples in the dataset,
- **Negative**: Percentage of examples with negative sentiment,
- **Neutral**: Percentage of examples with neutral sentiment,
- **Positive**: Percentage of examples with positive sentiment,
- **Paper**: Link to the paper in which the dataset was originally published,
- **Citation**: Citation for the dataset,""")

# Video shown inside the "How to export citations" expander. The expander
# object is not used afterwards, so it is not bound to a name.
with st.expander("How to export citations"):
    st.video("data/export_example.webm")

# Load the statistics table, pass it through the filter widget and show
# whatever the filter returns.
stats_table = get_language_stats_df()
filtered_table = filter_dataframe(stats_table)
st.dataframe(filtered_table)


# On button press, render the citation export for the currently filtered rows.
if st.button("Export citations"):
    st.code(export_citations(filtered_table), language="latex")