Spaces:

Brand24
/

mms_benchmark

Runtime error

mms_benchmark / pages /2_Dataset_Statistics_&_citation_export.py

Szymon Woźniak

add missing column description

6bcdbf9 about 1 year ago

2.66 kB

	import streamlit as st
	import pandas as pd
	from filter_dataframe import filter_dataframe


	@st.cache_data
	def get_language_stats_df() -> pd.DataFrame:
	    """Load the per-dataset statistics table shown on this page.

	    The function is decorated with ``st.cache_data`` so Streamlit can reuse
	    the loaded frame instead of re-reading the file on every script rerun.

	    Returns:
	        The contents of ``data/datasets_stats.parquet`` as a DataFrame. The
	        path is relative, so it is resolved against the working directory
	        the app is started from.
	    """
	    return pd.read_parquet("data/datasets_stats.parquet")

	# Page title; used both for the page config (browser tab) and the on-page heading.
	TITLE = "Dataset statistics & citation export"

	# BibTeX entry for the MMS corpus paper itself; export_citations always
	# emits it first. The trailing backslash after the opening quotes keeps the
	# string from starting with a newline.
	_MMS_CITATION = """\
	@misc{augustyniak2023massively,
	title={Massively Multilingual Corpus of Sentiment Datasets and Multi-faceted Sentiment Classification Benchmark},
	author={Łukasz Augustyniak and Szymon Woźniak and Marcin Gruza and Piotr Gramacki and Krzysztof Rajda and Mikołaj Morzy and Tomasz Kajdanowicz},
	year={2023},
	eprint={2306.07902},
	archivePrefix={arXiv},
	primaryClass={cs.CL}
	}"""

	# A "%"-prefixed comment line of 90 dashes followed by one blank line;
	# placed between consecutive entries of the export.
	CITATION_SEPARATOR = "% " + ("-" * 90) + "\n\n"


	def export_citations(df: pd.DataFrame) -> str:
	    """Build the citation export for the datasets listed in *df*.

	    The export starts with the MMS corpus citation, followed by one entry
	    per distinct citation string found in *df*. Datasets that share a
	    citation are listed together in a ``% Datasets: ...`` comment line
	    placed above that citation, and the entries are ordered by that joined
	    dataset list.

	    Args:
	        df: Table with an ``original_dataset`` column (dataset names) and a
	            ``citation`` column (citation text of each dataset).

	    Returns:
	        The complete export as a single string.
	    """
	    # Rebuild a minimal two-column frame from plain lists so the grouping
	    # runs on a fresh default index, independent of the caller's frame.
	    pairs = pd.DataFrame(
	        {
	            "dataset": df["original_dataset"].tolist(),
	            "citation": df["citation"].tolist(),
	        }
	    )

	    # One row per distinct citation, carrying a comma-separated dataset list.
	    grouped = (
	        pairs.groupby("citation")
	        .agg({"dataset": lambda names: ", ".join(names)})
	        .reset_index()
	        .sort_values(by="dataset")
	    )

	    entries = [
	        f"% Datasets: {names}\n{citation}"
	        for names, citation in zip(grouped["dataset"], grouped["citation"])
	    ]

	    header = f"% MMS corpus citation\n{_MMS_CITATION}\n{CITATION_SEPARATOR}"
	    return header + CITATION_SEPARATOR.join(entries)


	# Browser-tab title and icon for this page.
	st.set_page_config(page_title=TITLE, page_icon="📈")

	# On-page heading, rendered as a level-1 markdown header.
	_page_heading = f"# {TITLE}"
	st.markdown(_page_heading)

	# Introductory text plus a legend for the columns of the statistics table.
	_page_intro = """\
	The table below shows the per-dataset statistics of the MMS corpus.
	You can use the 'Add filters' checkbox to filter the table by any of the columns.
	You can also use the 'Export citations' button to export the citations for the datasets in the filtered table.

	Column descriptions:
	- Original Dataset: Original dataset name as used in the MMS corpus,
	- Language code: 2-letter language code,
	- Language: Language name,
	- Domain: Domain of the dataset,
	- Characters: The average number of characters in a single example,
	- Words: The average number of words in a single example,
	- Examples: The total number of examples in the dataset,
	- Negative: Percentage of examples with negative sentiment,
	- Neutral: Percentage of examples with neutral sentiment,
	- Positive: Percentage of examples with positive sentiment,
	- Paper: Link to the paper in which the dataset was originally published,
	- Citation: Citation for the dataset,"""
	st.write(_page_intro)

	# Load the statistics table (served from Streamlit's data cache).
	df = get_language_stats_df()

	# filter_dataframe comes from a project module not shown here; per the page
	# text above it provides the 'Add filters' UI and returns the filtered table.
	df_filter = filter_dataframe(df)
	st.dataframe(df_filter)


	# Export only the rows currently left in the filtered table, rendered as a
	# LaTeX-highlighted code block.
	if st.button("Export citations"):
	    val = export_citations(df_filter)
	    st.code(val, language="latex")