import pandas as pd
import streamlit as st

from filter_dataframe import filter_dataframe


@st.cache_data
def get_language_stats_df() -> pd.DataFrame:
    """Load the dataset statistics table from ``data/datasets_stats.parquet``.

    Decorated with ``st.cache_data`` so Streamlit can reuse the loaded frame
    instead of reading the parquet file again.
    """
    return pd.read_parquet("data/datasets_stats.parquet")


# Header placed before the per-dataset citations in the export.
# The text is still a placeholder; it is emitted verbatim.
_MMS_CITATION = """\
#TODO: Add MMS citation
"""

# Visual divider between citation entries: a "%"-prefixed rule of 90 dashes.
CITATION_SEPARATOR = "\n% " + ("-" * 90) + "\n"


# NOTE: an earlier, simpler ``export_citations`` used to be defined above this
# one. It was always overwritten by this definition before being called, so it
# has been removed as dead code.
def export_citations(df: pd.DataFrame) -> str:
    """Build the citation export text for the datasets in *df*.

    Rows sharing the same ``citation`` value are merged into a single entry
    that starts with a ``% Datasets: a, b, c`` line listing the
    ``original_dataset`` values of those rows, followed by the citation
    itself. Entries are ordered by their joined dataset list and separated by
    ``CITATION_SEPARATOR``. The result is prefixed with ``_MMS_CITATION``.

    Args:
        df: Frame with ``original_dataset`` and ``citation`` columns.

    Returns:
        The full export as a single string.
    """
    # Only these two columns are needed; rename to the label used in the
    # output header line.
    pairs = df[["original_dataset", "citation"]].rename(
        columns={"original_dataset": "dataset"}
    )

    # One row per distinct citation, with its dataset names comma-joined.
    grouped = (
        pairs.groupby("citation")
        .agg({"dataset": ", ".join})
        .reset_index()
        .sort_values(by="dataset")
    )

    entries = (
        "% Datasets: " + grouped["dataset"] + "\n" + grouped["citation"]
    ).to_list()
    joined_entries = CITATION_SEPARATOR.join(entries)

    return f"{_MMS_CITATION}\n\n{joined_entries}"


# --- Page layout -----------------------------------------------------------
# Streamlit executes this script top to bottom on every interaction, so the
# page is built with plain module-level calls.

st.set_page_config(page_title="Dataset statistics", page_icon="📈")

st.markdown("# Dataset statistics")
st.write(
    """TODO: Description"""
)

df = get_language_stats_df()

# filter_dataframe comes from a local module not shown here; presumably it
# renders filter widgets and returns the filtered frame -- confirm there.
df_filter = filter_dataframe(df)
st.dataframe(df_filter)

# Export only the citations for the rows currently left after filtering.
if st.button("Export citations"):
    val = export_citations(df_filter)
    st.code(val, language="latex")