Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
from filter_dataframe import filter_dataframe | |
def get_language_stats_df() -> pd.DataFrame:
    """Load the dataset statistics table from the bundled parquet file.

    Returns:
        The contents of ``data/datasets_stats.parquet`` as a DataFrame.
        The path is relative, so it is resolved against the process's
        current working directory.
    """
    stats_path = "data/datasets_stats.parquet"
    return pd.read_parquet(stats_path)
# Page title, used both for the browser tab and the on-page heading.
TITLE = "Dataset statistics & citation export"

# BibTeX entry for the MMS corpus itself; always emitted first in the export.
_MMS_CITATION = """\
@misc{augustyniak2023massively,
title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark},
author={Łukasz Augustyniak and Szymon Woźniak and Marcin Gruza and Piotr Gramacki and Krzysztof Rajda and Mikołaj Morzy and Tomasz Kajdanowicz},
year={2023},
eprint={2306.07902},
archivePrefix={arXiv},
primaryClass={cs.CL}
}"""

# A "%"-prefixed rule of 90 dashes followed by a blank line, placed between
# consecutive citation entries in the exported text.
CITATION_SEPARATOR = f"% {'-' * 90}\n\n"
def export_citations(df: pd.DataFrame):
    """Build the citation export text for the datasets listed in *df*.

    Args:
        df: Frame with an ``original_dataset`` column (dataset names) and a
            ``citation`` column (citation text for each dataset).

    Returns:
        A single string: the MMS corpus citation, followed by one entry per
        distinct citation. Each entry starts with a ``% Datasets: ...``
        comment line naming every dataset that shares that citation, and
        entries are separated by ``CITATION_SEPARATOR``.
    """
    # Keep only the two columns that matter, on a fresh default index.
    pairs = pd.DataFrame(
        {
            "dataset": df["original_dataset"].tolist(),
            "citation": df["citation"].tolist(),
        }
    )

    # One row per distinct citation, with its dataset names comma-joined;
    # rows are then ordered by that joined name string.
    per_citation = (
        pairs.groupby("citation")
        .agg({"dataset": lambda names: ", ".join(names)})
        .reset_index()
        .sort_values(by="dataset")
    )

    # Prefix every citation with a comment line naming its datasets.
    entry_headers = "% Datasets: " + per_citation["dataset"] + "\n"
    entries = (entry_headers + per_citation["citation"]).to_list()

    return "".join(
        [
            "% MMS corpus citation\n",
            _MMS_CITATION,
            "\n",
            CITATION_SEPARATOR,
            CITATION_SEPARATOR.join(entries),
        ]
    )
# Introductory text shown above the table (rendered via st.write).
_PAGE_DESCRIPTION = """\
The table below shows the per-dataset statistics of the MMS corpus.
You can use the **'Add filters'** checkbox to filter the table by any of the columns.
You can also use the 'Export citations' button to export the citations for the datasets in the filtered table.
Column descriptions:
- **Original Dataset**: Original dataset name as used in the MMS corpus,
- **Language code**: 2-letter language code,
- **Language**: Language name,
- **Domain**: Domain of the dataset,
- **Characters**: The average number of characters in a single example,
- **Words**: The average number of words in a single example,
- **Examples**: The total number of examples in the dataset,
- **Negative**: Percentage of examples with negative sentiment,
- **Neutral**: Percentage of examples with neutral sentiment,
- **Positive**: Percentage of examples with positive sentiment,
- **Paper**: Link to the paper in which the dataset was originally published,
- **Citation**: Citation for the dataset,"""

# Page configuration (title and icon), then the heading and the description.
st.set_page_config(page_title=TITLE, page_icon="📈")
st.markdown("# " + TITLE)
st.write(_PAGE_DESCRIPTION)
# Load the statistics, pass them through the filtering helper and show the
# resulting table.
df = get_language_stats_df()
df_filter = filter_dataframe(df)
st.dataframe(df_filter)

# When the button reports a click, render the citations for the rows that
# remain after filtering as a LaTeX-highlighted code block.
export_requested = st.button("Export citations")
if export_requested:
    st.code(export_citations(df_filter), language="latex")