# mms_benchmark/MMS_Benchmark.py
import streamlit as st
import pandas as pd
import plotly.express as px

@st.cache_data
def get_results(experiment: str) -> pd.DataFrame:
    """Load the cached F1 results for one experiment setup.

    Valid values of ``experiment`` are "linear", "bilstm", and "finetuning".
    """
    path = {
        "linear": "data/f1_linear.parquet",
        "bilstm": "data/f1_bilstm.parquet",
        "finetuning": "data/f1_finetuning.parquet",
    }[experiment]
    df = pd.read_parquet(path)
    # Convert fractional F1 scores to integer percentages for display.
    df = (df * 100).astype(int)
    return df
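
# Expected layout of the parquet files, inferred from the heatmaps below (an
# assumption, not documented here): rows are models, columns are language codes
# plus the "lang"/"ds" aggregate columns, values are F1 Macro scores in [0, 1],
# e.g. (illustrative values):
#
#              lang    ds    ar    bg  ...
#   LaBSE      0.68  0.67  0.61  0.70  ...
#   XLM-R      0.66  0.65  0.59  0.69  ...
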
TITLE = "MMS Dataset and Benchmark"
MODELS_TABLE = """\
| Model | Inf. time [s] | #params | #langs | Base$^a$ | Data | Reference |
|----------------|---------------|---------|--------|--------|-------------------------------------------|---------------------------|
| mT5 | 1.69 | 277M | 101 | T5 | CC$^b$ | [Xue et al. 2021](https://doi.org/10.18653/v1/2021.naacl-main.41) |
| LASER | 1.64 | 52M | 93 | BiLSTM | OPUS$^c$ | [Artetxe and Schwenk 2019](https://doi.org/10.1162/tacl_a_00288) |
| mBERT | 1.49 | 177M | 104 | BERT | Wiki | [Devlin et al. 2019](https://doi.org/10.18653/v1/N19-1423) |
| MPNet** | 1.38 | 278M | 53 | XLM-R | OPUS$^c$, MUSE$^d$, Wikititles$^e$ | [Reimers and Gurevych 2020](https://doi.org/10.18653/v1/2020.emnlp-main.365) |
| XLM-R-dist** | 1.37 | 278M | 53 | XLM-R | OPUS$^c$, MUSE$^d$, Wikititles$^e$ | [Reimers and Gurevych 2020](https://doi.org/10.18653/v1/2020.emnlp-main.365) |
| XLM-R | 1.37 | 278M | 100 | XLM-R | CC | [Conneau et al. 2020](https://doi.org/10.18653/v1/2020.acl-main.747) |
| LaBSE | 1.36 | 470M | 109 | BERT | CC, Wiki + mined bitexts | [Feng et al. 2020](https://arxiv.org/abs/2007.01852) |
| DistilmBERT | 0.79 | 134M | 104 | BERT | Wiki | [Sanh et al. 2020](https://arxiv.org/abs/1910.01108) |
| mUSE-dist** | 0.79 | 134M | 53 | DistilmBERT | OPUS$^c$, MUSE$^d$, Wikititles$^e$ | [Reimers and Gurevych 2020](https://doi.org/10.18653/v1/2020.emnlp-main.365) |
| mUSE-transformer* | 0.65 | 85M | 16 | transformer | mined QA + bitexts, SNLI | [Yang et al. 2020](https://doi.org/10.18653/v1/2020.acl-demos.12) |
| mUSE-cnn* | 0.12 | 68M | 16 | CNN | mined QA + bitexts, SNLI | [Yang et al. 2020](https://doi.org/10.18653/v1/2020.acl-demos.12) |"""
MODELS_ANNOTATIONS = """\
- `*` the mUSE models were run with their TensorFlow implementation, in contrast to the other models, which were run in PyTorch
- `**` models trained with multilingual knowledge distillation
- `a` the base model is either the monolingual model it was built on or another multilingual model that was adapted
- `b` the multilingual version of the Colossal Clean Crawled Corpus (mC4)
- `c` multiple datasets from the OPUS website (https://opus.nlpl.eu)
- `d` bilingual dictionaries from MUSE (https://github.com/facebookresearch/MUSE)
- `e` titles of Wikipedia articles in multiple languages"""
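
# A minimal sketch of the "linear head" setup from the table above: encode
# texts with one of the listed encoders kept frozen (here LaBSE via the
# sentence-transformers package) and fit a linear classifier on the embeddings.
# The variables train_texts/train_labels are hypothetical, and the exact linear
# head used in the benchmark may differ:
#
#     from sentence_transformers import SentenceTransformer
#     from sklearn.linear_model import LogisticRegression
#
#     encoder = SentenceTransformer("sentence-transformers/LaBSE")
#     X_train = encoder.encode(train_texts)
#     clf = LogisticRegression(max_iter=1000).fit(X_train, train_labels)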
st.set_page_config(page_title=TITLE, page_icon="📈")
st.markdown(f"# {TITLE}")
st.markdown("> The most extensive open massively multilingual corpus of datasets for training sentiment models. The corpus consists of 79 manually selected from over 350 datasets reported in the scientific literature based on strict quality criteria and covers 27 languages.")
st.markdown("""Despite impressive advancements in multilingual corpora collection and model training, developing large-scale deployments of multilingual models still presents a significant challenge. This is particularly true for language tasks that are culture-dependent. One such example is the area of multilingual sentiment analysis, where affective markers can be subtle and deeply ensconced in culture.
This work presents the most extensive open massively multilingual corpus of datasets for training sentiment models. The corpus consists of 79 manually selected datasets from over 350 datasets reported in the scientific literature based on strict quality criteria. The corpus covers 27 languages representing 6 language families. Datasets can be queried using several linguistic and functional features. In addition, we present a multi-faceted sentiment classification benchmark summarizing hundreds of experiments conducted on different base models, training objectives, dataset collections, and fine-tuning strategies.""")
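
# A minimal sketch of how the corpus itself might be loaded for experiments,
# assuming it is published on the Hugging Face Hub under the id "Brand24/mms"
# and exposes a "language" column (both are assumptions, not confirmed by this
# file):
#
#     from datasets import load_dataset
#
#     mms = load_dataset("Brand24/mms")
#     polish = mms["train"].filter(lambda row: row["language"] == "pl")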
st.markdown("## Benchmark results")
st.markdown("Our preliminary results has been presented in [Rajda et al. 2022](https://doi.org/10.18653/v1/2022.wassa-1.13) and finally presented **in review at NeurIPS'23**.")
st.markdown("### Benchmark results - F1 Macro scores")
st.markdown("#### Models")
st.markdown(MODELS_TABLE)
st.markdown(MODELS_ANNOTATIONS)
st.markdown("#### Results")
df_linear = get_results("linear")
df_bilstm = get_results("bilstm")
df_finetuning = get_results("finetuning")
# Shared color-scale bounds (F1 percentages) so the three heatmaps are directly comparable.
color_range_low = 40
color_range_high = 75
st.write("""Below are the detailed results of models’ comparison.
Legend: **lang** - averaged by all languages, **ds** - averaged by
dataset, **ar** - Arabic, **bg** - Bulgarian, **bs** - Bosnian, **cs** - Czech, **de** - German, **en** - English, **es** - Spanish, **fa** - Persian, **fr** - French, **he** - Hebrew, **hi** - Hindi, **hr** - Croatian, **hu** - Hungarian, **it** - Italian, **ja** - Japanese, **lv** - Latvian, **pl** - Polish, **pt** - Portuguese, **ru** - Russian, **sk** - Slovak, **sl** - Slovenian, **sq** - Albanian, **sr** - Serbian, **sv** - Swedish, **th** - Thai, **ur** - Urdu,
**zh** - Chinese.""")
st.plotly_chart(
    px.imshow(
        df_linear,
        title="Linear Head",
        labels=dict(x="Language", y="Model", color="F1 Score"),
        color_continuous_scale="viridis",
        range_color=[color_range_low, color_range_high],
        text_auto=True,
    )
)
st.plotly_chart(
    px.imshow(
        df_bilstm,
        title="BiLSTM Head",
        labels=dict(x="Language", y="Model", color="F1 Score"),
        color_continuous_scale="viridis",
        range_color=[color_range_low, color_range_high],
        text_auto=True,
    )
)
st.plotly_chart(
    px.imshow(
        df_finetuning,
        title="Fine-tuning",
        labels=dict(x="Language", y="Model", color="F1 Score"),
        color_continuous_scale="viridis",
        range_color=[color_range_low, color_range_high],
        text_auto=True,
    )
)
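
# To run this page locally, use the standard Streamlit invocation (assuming the
# file keeps the name shown in the header comment):
#
#     streamlit run MMS_Benchmark.py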