import streamlit as st
import pandas as pd

from filter_dataframe import filter_dataframe

# Streamlit page: shows the per-language statistics table of the MMS corpus.

# Relative path of the parquet file holding the statistics table.
_LANGUAGE_STATS_PATH = "data/language_stats.parquet"

# Markdown shown above the table. The backslash after the opening quotes
# keeps the text from starting with an empty line.
_PAGE_DESCRIPTION = """\
The table below shows the per-language statistics of the MMS corpus. You can use the **'Add filters'** checkbox to filter the table by any of the columns.

Column descriptions:
- **Language**: Language name,
- **Datasets**: Number of datasets in the MMS corpus for the given language,
- **News**: Number of datasets from news domain,
- **Reviews**: Number of datasets from reviews domain,
- **Social media**: Number of datasets from social media domain,
- **Other**: Number of datasets from other domains,
- **Negative**: Number of examples with negative sentiment,
- **Neutral**: Number of examples with neutral sentiment,
- **Positive**: Number of examples with positive sentiment,
- **Words**: The average number of words in a single example,
- **Characters**: The average number of characters in a single example,"""


@st.cache_data
def get_language_stats_df() -> pd.DataFrame:
    """Read the language statistics table from its parquet file.

    The call is wrapped in ``st.cache_data``, so Streamlit memoizes the
    loaded frame instead of re-reading the file on every script rerun.

    Returns:
        The contents of ``data/language_stats.parquet`` as a DataFrame.
    """
    return pd.read_parquet(_LANGUAGE_STATS_PATH)


# Page chrome first, then the heading and the explanatory text.
st.set_page_config(page_title="Language Statistics", page_icon="📈")
st.markdown("# Language Statistics")
st.write(_PAGE_DESCRIPTION)

# `filter_dataframe` is a project helper; presumably it renders the
# 'Add filters' checkbox mentioned in the text above -- confirm in its module.
language_stats = get_language_stats_df()
filtered_stats = filter_dataframe(language_stats, numeric_as_categorical=False)
st.dataframe(filtered_stats)