mms_benchmark / pages /2_Dataset_Statistics_&_citation_export.py
Szymon Woźniak
rename export example expander
e883a51
import streamlit as st
import pandas as pd
from filter_dataframe import filter_dataframe
@st.cache_data
def get_language_stats_df():
    """Load the dataset statistics table from its parquet file.

    The function is decorated with ``st.cache_data``; the path is relative,
    so it is resolved against the process working directory.

    Returns:
        The object produced by ``pd.read_parquet`` for the statistics file.
    """
    stats_path = "data/datasets_stats.parquet"
    return pd.read_parquet(stats_path)
# Page title; used both for the browser tab and the on-page heading.
TITLE = "Dataset statistics & citation export"

# BibTeX entry for the MMS corpus itself; always placed first in the export.
_MMS_CITATION = """\
@misc{augustyniak2023massively,
title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark},
author={Łukasz Augustyniak and Szymon Woźniak and Marcin Gruza and Piotr Gramacki and Krzysztof Rajda and Mikołaj Morzy and Tomasz Kajdanowicz},
year={2023},
eprint={2306.07902},
archivePrefix={arXiv},
primaryClass={cs.CL}
}"""

# A "%"-prefixed rule of 90 dashes followed by a blank line; inserted between
# consecutive entries of the exported citation text.
CITATION_SEPARATOR = f"% {'-' * 90}\n\n"
def export_citations(df: pd.DataFrame):
    """Render the citations of the datasets in ``df`` as one text block.

    The "Original Dataset" and "Citation" columns of ``df`` are read.
    Datasets that share an identical citation string are merged into a single
    entry whose "% Datasets:" comment line lists their names, comma-separated.
    Entries are ordered by that joined name string and separated by
    ``CITATION_SEPARATOR``. The MMS corpus citation always comes first.

    NOTE(review): ``groupby`` is called with its defaults, which in pandas
    excludes rows whose group key is missing — rows without a citation would
    therefore not appear in the export. Confirm that this is intended.

    Args:
        df: Table with "Original Dataset" and "Citation" columns.

    Returns:
        The assembled citation text as a single string.
    """

    def _join_names(names):
        # Collapse all dataset names of one citation group into one string.
        return ", ".join(names)

    # Work on a minimal two-column frame with fixed, short column names.
    pairs = pd.DataFrame(
        {
            "dataset": df["Original Dataset"].tolist(),
            "citation": df["Citation"].tolist(),
        }
    )

    # One row per distinct citation, ordered by the joined dataset names.
    grouped = (
        pairs.groupby("citation")
        .agg({"dataset": _join_names})
        .reset_index()
        .sort_values(by="dataset")
    )

    # Each entry: a comment line naming the datasets, then the citation text.
    entry_headers = "% Datasets: " + grouped["dataset"]
    entries = (entry_headers + "\n" + grouped["citation"]).to_list()

    preamble = f"% MMS corpus citation\n{_MMS_CITATION}\n"
    return preamble + CITATION_SEPARATOR + CITATION_SEPARATOR.join(entries)
# --- Page header -----------------------------------------------------------
st.set_page_config(page_title=TITLE, page_icon="📈")
st.markdown(f"# {TITLE}")

# --- Introductory text and column legend -----------------------------------
page_intro = """\
The table below shows the per-dataset statistics of the MMS corpus.
You can use the **'Add filters'** checkbox to filter the table by any of the columns.
You can also use the 'Export citations' button to export the citations for the datasets in the filtered table.
Column descriptions:
- **Original Dataset**: Original dataset name as used in the MMS corpus,
- **Language code**: 2-letter language code,
- **Language**: Language name,
- **Domain**: Domain of the dataset,
- **Characters**: The average number of characters in a single example,
- **Words**: The average number of words in a single example,
- **Examples**: The total number of examples in the dataset,
- **Negative**: Percentage of examples with negative sentiment,
- **Neutral**: Percentage of examples with neutral sentiment,
- **Positive**: Percentage of examples with positive sentiment,
- **Paper**: Link to the paper in which the dataset was originally published,
- **Citation**: Citation for the dataset,"""
st.write(page_intro)

# --- Collapsible video walkthrough of the export feature --------------------
with st.expander("How to export citations"):
    st.video("data/export_example.webm")

# --- Statistics table, passed through the filter helper ---------------------
df = get_language_stats_df()
df_filter = filter_dataframe(df)
st.dataframe(df_filter)

# --- Citation export ---------------------------------------------------------
# The export is built from the filtered table, so only the datasets currently
# shown are cited (plus the MMS corpus citation added by export_citations).
if st.button("Export citations"):
    st.code(export_citations(df_filter), language="latex")