|
from functools import partial |
|
import time |
|
|
|
import sqlite3 |
|
import psutil |
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
# Display width (pixels) for all images embedded in the page.
IMAGE_WIDTHS = 900

# Path to the SQLite database with pre-trained model evaluation results.
PRE_TRAINED_DB = "data/pretrained.sqlite"
|
|
|
|
|
# NOTE(review): st.cache is deprecated (removed in recent Streamlit) in favour
# of st.cache_data -- confirm the Streamlit version pinned for this app before
# migrating the decorator.
@st.cache
def load_eval_data():
    """Load model evaluation results from the pre-trained models database.

    Reads the ``pretrained`` table from :data:`PRE_TRAINED_DB` and returns a
    DataFrame with the columns ``name``, ``num_parameters``, ``summ Rouge1``,
    ``en->nl Bleu`` and ``num params (M)``.  Rows without a summarization
    Rouge1 score are dropped.
    """
    conn = sqlite3.connect(PRE_TRAINED_DB)
    try:
        # pd.read_sql_query derives the column names from the cursor
        # description itself, so no dict row_factory is needed.
        df = pd.read_sql_query("SELECT * FROM pretrained", conn)
    finally:
        conn.close()
    # The DB stores missing values as the literal string "None".
    df.replace("None", np.nan, inplace=True)
    df.rename(columns={"model": "name"}, inplace=True)
    df = df.infer_objects()
    int_columns = ["train_batch_size", "num_parameters"]
    # Nullable Int32 keeps integer semantics while tolerating missing values.
    df[int_columns] = df[int_columns].astype("Int32")
    # .copy() so the in-place mutations below operate on an independent frame
    # instead of a slice view (avoids SettingWithCopyWarning / lost writes).
    plot_df = df[["name", "num_parameters", "summ_rouge1", "trans_en_nl_score"]].copy()
    plot_df[["num_parameters", "summ_rouge1", "trans_en_nl_score"]] = plot_df[
        ["num_parameters", "summ_rouge1", "trans_en_nl_score"]
    ].apply(pd.to_numeric)
    # Express model size in millions of parameters for readable plot legends.
    # Vectorized floor division is NaN-tolerant, unlike int(x / 10**6) which
    # raised ValueError for models without a parameter count.
    plot_df["num params (M)"] = plot_df["num_parameters"] // 10**6
    plot_df.dropna(subset=["summ_rouge1"], inplace=True)
    plot_df.rename(
        columns={"summ_rouge1": "summ Rouge1", "trans_en_nl_score": "en->nl Bleu"},
        inplace=True,
    )
    return plot_df
|
|
|
|
|
def main():
    """Render the Streamlit page: evaluation plot, pre-training notes and model lists."""
    st.set_page_config(
        page_title="Pre-training Dutch T5 models",
        layout="wide",
        initial_sidebar_state="collapsed",
        # NOTE(review): "π" looks like a mis-encoded emoji -- confirm the intended icon.
        page_icon="π",
    )
    plot_df = load_eval_data()

    # Inject the custom stylesheet and the static markdown intro sections.
    with open("style.css") as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    with open("INTRO.md", "r") as f:
        st.markdown(f.read())

    with open("PRETRAINING.md", "r") as f:
        st.markdown(f.read())

    st.markdown(
        """## Evaluation

### Evaluation setup

Each pre-trained model was evaluated by fine-tuning on summarization and translation. The learning-rate was set to
a constant schedule after a small warmup of 32 steps.
Fine-tuning for evaluation was done on a limited set of 50K examples from the fine-tuning datasets.

| | Summarization | Translation |
|-----------------:|------------------|-------------------|
| Dataset | [CNN Dailymail Dutch](https://huggingface.co/datasets/yhavinga/cnn_dailymail_dutch) | [CCMatrix En->NL](https://huggingface.co/datasets/yhavinga/ccmatrix_en_nl) |
| #train samples | 50K | 50K |
| Optimizer | AdamW | AdamW |
| learning rate | 0.001 | 0.0005 |
| source length | 1024 | 128 |
| target length | 142 | 128 |
| #eval samples | 1000 | 1000 |
| wandb link | [eval_summ](https://wandb.ai/yepster/eval_dutch_cnndaily_202302_flax)|[eval_transl](https://wandb.ai/yepster/eval_dutch_ccmatrix_202302_flax) |

### Evaluation results

The figure below shows the evaluation scores for most models, with summarization Rouge1 on the x-axis (higher is better),
and translation English to Dutch Bleu score on the y-axis (higher is better).
The point size is proportional to the model size. UL2 models are blue, Flan models
red, mT5 green and the other models black.
"""
    )
    # Checkbox filters: model families in the left column, size classes in the right.
    col1, col2 = st.columns(2)
    with col1:
        ul2_enabled = st.checkbox("UL2 Dutch (and English) (trained with T5X)", value=True)
        t5_1_1_enabled = st.checkbox("t5_1_1 Dutch (trained with T5X)", value=True)
        flan_enabled = st.checkbox("Flan T5 (google/flan-t5-*)", value=True)
        mt5_enabled = st.checkbox("mt5 (google/mt5-*)", value=True)
        long_t5_enabled = st.checkbox("Long T5 Dutch+English (trained with HuggingFace script)")
        t5_v1_1_enabled = st.checkbox("T5 Dutch (and English) (trained with HuggingFace script)")
    with col2:
        small_enabled = st.checkbox("small model sizes")
        base_enabled = st.checkbox("base model sizes")
        large_enabled = st.checkbox("large model sizes")
        _24_enabled = st.checkbox("small nl24 deep narrow sizes")
        _36_enabled = st.checkbox("base nl36 deep narrow sizes")
        _8l_enabled = st.checkbox("large nl8 deep wide sizes")
        _4xl_enabled = st.checkbox("xlarge nl4 deep wide sizes")

    # Keep a row when ANY enabled family/size predicate matches its name.
    # Size-class rows exclude the deep/narrow variants (e.g. "base" without "36"),
    # which are selectable via their own checkboxes below.
    plot_df = plot_df[
        (plot_df["name"].str.contains("ul2") & ul2_enabled)
        | (plot_df["name"].str.contains("flan") & flan_enabled)
        | (plot_df["name"].str.contains("mt5") & mt5_enabled)
        | (plot_df["name"].str.contains("long-t5") & long_t5_enabled)
        | (plot_df["name"].str.contains("t5_1_1") & t5_1_1_enabled)
        | ((plot_df["name"].str.startswith("t5") & ~plot_df["name"].str.startswith("t5_1_1")) & t5_v1_1_enabled)
        | (plot_df["name"].str.contains("base") & base_enabled & ~plot_df["name"].str.contains("36"))
        | (plot_df["name"].str.contains("small") & small_enabled & ~plot_df["name"].str.contains("24"))
        | (plot_df["name"].str.contains("large") & large_enabled & ~plot_df["name"].str.contains("8"))
        | ((plot_df["name"].str.contains("-36L") | plot_df["name"].str.contains("nl36")) & _36_enabled)
        | ((plot_df["name"].str.contains("-24L") | plot_df["name"].str.contains("nl24")) & _24_enabled)
        | ((plot_df["name"].str.contains("-8l") | plot_df["name"].str.contains("nl8")) & _8l_enabled)
        | ((plot_df["name"].str.contains("-4L") | plot_df["name"].str.contains("nl4")) & _4xl_enabled)
    ]

    # Color per model family, keyed on the first dash-separated name component.
    color_dict = {"flan": "red", "ul2": "blue", "mt5": "green", "t5_1_1": "orange"}
    colors = [
        color_dict.get(name.split("-")[0].lower(), "black")
        for name in plot_df["name"]
    ]
    fig = plt.figure(figsize=(15, 8))
    sns.set_style("darkgrid")
    ax = sns.scatterplot(
        data=plot_df,
        y="en->nl Bleu",
        x="summ Rouge1",
        size="num params (M)",
        color=colors,
        linewidth=0.7,
    )
    # Label each point with the model name, slightly above the marker.
    for _, row in plot_df.iterrows():
        ax.annotate(
            row["name"],
            (row["summ Rouge1"], row["en->nl Bleu"]),
            xytext=(0, 7),
            textcoords="offset points",
            ha="center",
            va="center",
            rotation=0,
        )
    plt.tight_layout()
    st.pyplot(fig)
    st.markdown("""* The `UL2` pre-trained Dutch(English) models consistently outperform the `T5-*` Dutch(English) models.
* Flan models perform almost instantly well on the summarization task, with `flan-t5-small`
showing performance comparable to Dutch T5 base models.
* Fine-tuning of `t5-v1.1-large-dutch-cased` failed with the fixed hyperparameters across all models.
Since the `UL2` models are better across the board, I've disabled this model on the hub.
* I am surprised by the consistent bad scores for the `long-t5` runs. I've retried the fine-tuning of these models with
`float32` instead of `bfloat16`, but the results were the same. Maybe this is normal behaviour for these models
targeted at dealing with longer sequence lengths.
* For the translation task from English to Dutch, the Dutch+English pre-trained models perform well. Also
`UL2 Dutch` pre-trained Dutch models are consistently better than their `Flan`, `T5 Dutch` and
`mT5` counterparts of the comparable size.
* For the translation task, I am not sure that a 'deep-narrow' model (e.g. base-nl36) is better than a normal model
or even a 'wide-deep' model.
* The `long-t5` models show bad performance on both tasks.
I cannot explain this for the translation task. With a sequence length of 128 input and output
tokens, the sliding attention window with radius length 127 of the `long-t5` models should be able to handle this.
""")

    with open("REMARKS.md", "r") as f:
        st.markdown(f.read())

    st.markdown(
        """### Bfloat16 datatype requires loss regularization

When training models with `bfloat16` and without loss regularization (default), the training losses would plateau or
diverge. The graph below displays the results of different attempts
to train [t5-small-24L-dutch-english](https://huggingface.co/yhavinga/t5-small-24L-dutch-english).
The legend indicates the optimizer, data type, learning rate, total batch size, and learning rate schedule used.
As you can see, all attempts to train with `bfloat16` failed.
"""
    )
    st.image("img/bfloat16_loss.png", width=IMAGE_WIDTHS)
    st.markdown(
        """The solution was found when peeking at T5X and the T5 gin configs, where I noticed a `z_loss` parameter,
always set to 1e-4. This factor is used in the T5X [cross entropy loss](https://github.com/google-research/t5x/blob/a319e559b4f72bffab91821487382ef4c25dfcf4/t5x/losses.py#L26)
function, with the purpose to pull the weights towards zero.
I experimented with adding this regularization term in the HF pre-training script,
and the `bfloat16` training runs did not exhibit the problems illustrated above anymore.

The `z_loss` regularization term in the T5X loss function looks like L2 regularization.
(See e.g. Andrej Karpathy [explaining regularization loss](https://youtu.be/PaCmpygFfXo?t=6720)).
The Optax optimizer, used in the HuggingFace script, mentions weight decay for AdaFactor (and AdamW)
but also mentions that L2 regularization does not work as expected with adaptive gradient
algorithms. It might be the case that setting a non-zero `weight_decay_rate` in the Optax Adafactor call
in the HuggingFace pre-training script is an alternative to adding the `z_loss` term, to solve the bfloat16 issues, but
I haven't tested this yet.
"""
    )

    st.markdown(
        """### Which optimizer and lr to use

During the Flax/Jax Community week in '21, our team quickly decided on using Adafactor with learning rate 5e-3.
I believed that a more optimal setting could be found with more time.
After conducting seven WandB sweeps with
Adafactor, AdamW and Distributed Shampoo (experimental PJIT version from Dall-E mini),
a better setting had not been found. The graph below shows the runs from all 7 sweeps combined.
-- (I apologize for the confusion in the legend; I was unable to display the optimizer in the legend
because the initial version of the training script had the optimizer as a boolean, which I later
changed to a string with the optimizer name.) --
All runs in the graph below that achieve a loss below 4 use **Adafactor**.
Peach-sweep-6 is represented by a dashed orange line and had a learning rate of **5e-3**.
"""
    )

    st.image("img/adafactor_vs_adam_pretrain.png", width=IMAGE_WIDTHS)
    st.markdown(
        """While there probably is a setting that will allow Adam and Shampoo to also converge fast below loss 4.0, I was unable
to find it. In a recent tweet Lucas Nestler had more success with Shampoo (https://twitter.com/_clashluke/status/1535994026876252160)
so maybe I need to revisit the attempt with the latest upstream code bases.

Later, when pre-training with T5X, I found that its custom Adafactor implementation with the default settings of the T5X gin configs,
a learning rate of 0.001 and inverse square root learning rate decay, worked well.
"""
    )

    st.markdown(
        """### Optimizer and learning rate used for summarization

Finetuning summarization requires more memory than translation due to the longer sequence lengths involved.
I wondered if I could use Adafactor instead of Adam and ran
a sweep to test this. The sweep was configured with Hyperband, so not all training runs completed to the end.
"""
    )
    st.image("img/optim_lr_summarization.png", width=IMAGE_WIDTHS)
    st.markdown(
        """The training losses are graphed below:
"""
    )

    st.image("img/training_losses_summarization_sweep.png", width=IMAGE_WIDTHS)
    st.markdown(
        """
While the Adafactor run with learning rate 7e-4 came close to the Adam runs, the consistent stability of training with Adam
made me stick with Adam as optimizer for evaluation runs on the several models. For translation the results were similar, though in the end I needed to configure a lower learning rate for all
models to converge during fine-tuning.
"""
    )

    st.markdown(
        """### Sequence length 512 or 1024

The models `t5-v1_1-base-dutch-english-cased` and `t5-v1_1-base-dutch-english-cased-1024` have the same model dimensions,
but are pre-trained on different sequence lengths, 512 and 1024 respectively.
The evaluation loss and accuracy of the models do not look too different. Since training of the 1024 sequence length model was
very slow and didn't converge, I stopped it early. The figure below shows the evaluation
loss and accuracy.
"""
    )
    st.image("img/t5v1_1eval_loss_and_accuracy.png", width=IMAGE_WIDTHS)
    st.markdown(
        """The 512 sequence length model was trained for 10 epochs of the `small` nl+en config (186B tokens total) and the 1024
sequence length model about 2 epochs of the `large` nl+en config (100B tokens total). While I expected both models to
perform similarly on downstream tasks, the 1024 sequence length model has better scores for both
summarization and translation.
"""
    )

    st.markdown(
        """## Model lists

### t5_1_1

TODO

### UL2 Dutch English

These models have been trained with T5X on mc4_nl_cleaned, books, Wikipedia and news.

| | ul2-base-dutch-english | ul2-large-dutch-english | ul2-small-dutch-english |
|:---------------------|:-------------------------|:--------------------------|:--------------------------|
| model_type | t5 | t5 | t5 |
| _pipeline_tag | text2text-generation | text2text-generation | text2text-generation |
| d_model | 768 | 1024 | 512 |
| d_ff | 2048 | 2816 | 1024 |
| num_heads | 12 | 16 | 6 |
| d_kv | 64 | 64 | 64 |
| num_layers | 12 | 24 | 8 |
| num_decoder_layers | 12 | 24 | 8 |
| feed_forward_proj | gated-gelu | gated-gelu | gated-gelu |
| dense_act_fn | gelu_new | gelu_new | gelu_new |
| vocab_size | 32128 | 32128 | 32128 |
| tie_word_embeddings | 0 | 0 | 0 |
| torch_dtype | float32 | float32 | float32 |
| _gin_batch_size | 128 | 64 | 128 |
| _gin_z_loss | 0.0001 | 0.0001 | 0.0001 |
| _gin_t5_config_dtype | 'bfloat16' | 'bfloat16' | 'bfloat16' |

### UL2 Dutch

These models have been trained with T5X on mc4_nl_cleaned, books, Wikipedia and news.

| | ul2-base-dutch | ul2-base-nl36-dutch | ul2-large-dutch | ul2-small-dutch |
|:---------------------|:---------------------|:----------------------|:---------------------|:---------------------|
| model_type | t5 | t5 | t5 | t5 |
| _pipeline_tag | text2text-generation | text2text-generation | text2text-generation | text2text-generation |
| d_model | 768 | 768 | 1024 | 512 |
| d_ff | 2048 | 3072 | 2816 | 1024 |
| num_heads | 12 | 12 | 16 | 6 |
| d_kv | 64 | 64 | 64 | 64 |
| num_layers | 12 | 36 | 24 | 8 |
| num_decoder_layers | 12 | 36 | 24 | 8 |
| feed_forward_proj | gated-gelu | gated-gelu | gated-gelu | gated-gelu |
| dense_act_fn | gelu_new | gelu_new | gelu_new | gelu_new |
| vocab_size | 32128 | 32128 | 32128 | 32128 |
| tie_word_embeddings | 0 | 0 | 0 | 0 |
| torch_dtype | float32 | float32 | float32 | float32 |
| _gin_batch_size | 128 | 64 | 64 | 128 |
| _gin_z_loss | 0.0001 | 0.0001 | 0.0001 | 0.0001 |
| _gin_t5_config_dtype | 'bfloat16' | 'bfloat16' | 'bfloat16' | 'bfloat16' |

### T5 models Dutch and Dutch/English

These models have been trained with the HuggingFace 🤗 run_t5_mlm_flax.py script on mc4_nl_cleaned.
Most notable differences are the model sizes, activation function, and the dropout rate used during
pre-training. The T5-eff models are models that differ in their number of layers. The table will list
the several dimensions of these models.

| | [t5-base-dutch](https://huggingface.co/yhavinga/t5-base-dutch) | [t5-v1.1-base-dutch-uncased](https://huggingface.co/yhavinga/t5-v1.1-base-dutch-uncased) | [t5-v1.1-base-dutch-cased](https://huggingface.co/yhavinga/t5-v1.1-base-dutch-cased) | [t5-v1.1-large-dutch-cased](https://huggingface.co/yhavinga/t5-v1.1-large-dutch-cased) | [t5-v1_1-base-dutch-english-cased](https://huggingface.co/yhavinga/t5-v1_1-base-dutch-english-cased) | [t5-v1_1-base-dutch-english-cased-1024](https://huggingface.co/yhavinga/t5-v1_1-base-dutch-english-cased-1024) | [t5-small-24L-dutch-english](https://huggingface.co/yhavinga/t5-small-24L-dutch-english) | [t5-xl-4L-dutch-english-cased](https://huggingface.co/yhavinga/t5-xl-4L-dutch-english-cased) | [t5-base-36L-dutch-english-cased](https://huggingface.co/yhavinga/t5-base-36L-dutch-english-cased) | [t5-eff-xl-8l-dutch-english-cased](https://huggingface.co/yhavinga/t5-eff-xl-8l-dutch-english-cased) | [t5-eff-large-8l-dutch-english-cased](https://huggingface.co/yhavinga/t5-eff-large-8l-dutch-english-cased) |
|:------------------|:----------------|:-----------------------------|:---------------------------|:----------------------------|:-----------------------------------|:----------------------------------------|:-----------------------------|:-------------------------------|:----------------------------------|:-----------------------------------|:--------------------------------------|
| *type* | t5 | t5-v1.1 | t5-v1.1 | t5-v1.1 | t5-v1.1 | t5-v1.1 | t5 eff | t5 eff | t5 eff | t5 eff | t5 eff |
| *d_model* | 768 | 768 | 768 | 1024 | 768 | 768 | 512 | 2048 | 768 | 1024 | 1024 |
| *d_ff* | 3072 | 2048 | 2048 | 2816 | 2048 | 2048 | 1920 | 5120 | 2560 | 16384 | 4096 |
| *num_heads* | 12 | 12 | 12 | 16 | 12 | 12 | 8 | 32 | 12 | 32 | 16 |
| *d_kv* | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | 128 | 64 |
| *num_layers* | 12 | 12 | 12 | 24 | 12 | 12 | 24 | 4 | 36 | 8 | 8 |
| *num parameters* | 223M | 248M | 248M | 783M | 248M | 248M | 250M | 585M | 729M | 1241M | 335M |
| *feed_forward_proj* | relu | gated-gelu | gated-gelu | gated-gelu | gated-gelu | gated-gelu | gated-gelu | gated-gelu | gated-gelu | gated-gelu | gated-gelu |
| *dropout* | 0.1 | 0.0 | 0.0 | 0.1 | 0.0 | 0.0 | 0.0 | 0.1 | 0.0 | 0.0 | 0.0 |
| *dataset* | mc4_nl_cleaned | mc4_nl_cleaned full | mc4_nl_cleaned full | mc4_nl_cleaned | mc4_nl_cleaned small_en_nl | mc4_nl_cleaned large_en_nl | mc4_nl_cleaned large_en_nl | mc4_nl_cleaned large_en_nl | mc4_nl_cleaned large_en_nl | mc4_nl_cleaned large_en_nl | mc4_nl_cleaned large_en_nl |
| *tr. seq len* | 512 | 1024 | 1024 | 512 | 512 | 1024 | 512 | 512 | 512 | 512 | 512 |
| *batch size* | 128 | 64 | 64 | 64 | 128 | 64 | 128 | 512 | 512 | 64 | 128 |
| *total steps* | 527500 | 1014525 | 1210154 | 1120k/2427498 | 2839630 | 1520k/3397024 | 851852 | 212963 | 212963 | 538k/1703705 | 851850 |
| *epochs* | 1 | 2 | 2 | 2 | 10 | 4 | 1 | 1 | 1 | 1 | 1 |
| *duration* | 2d9h | 5d5h | 6d6h | 8d13h | 11d18h | 9d1h | 4d10h | 6d1h | 17d15h | 4d 19h | 3d 23h |
| *optimizer* | adafactor | adafactor | adafactor | adafactor | adafactor | adafactor | adafactor | adafactor | adafactor | adafactor | adafactor |
| *lr* | 0.005 | 0.005 | 0.005 | 0.005 | 0.005 | 0.005 | 0.005 | 0.005 | 0.009 | 0.005 | 0.005 |
| *warmup* | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 5000.0 | 20000.0 | 2500.0 | 1000.0 | 1500.0 | 1500.0 |
| *eval loss* | 1,38 | 1,20 | 0,96 | 1,07 | 1,11 | 1,13 | 1,18 | 1,27 | 1,05 | 1,3019 | 1,15 |
| *eval acc* | 0,70 | 0,73 | 0,78 | 0,76 | 0,75 | 0,74 | 0,74 | 0,72 | 0,76 | 0,71 | 0,74 |

### Long-T5 models

These models have been trained with the HuggingFace 🤗 run_t5_mlm_flax.py script on mc4_nl_cleaned.

### Byt5 small

This model has been trained with the HuggingFace 🤗 run_t5_mlm_flax.py script on mc4_nl_cleaned.

TODO

### Fine-tuned translation models on ccmatrix

The models `t5-small-24L-dutch-english` and `t5-base-36L-dutch-english` have been fine-tuned for both language
directions on the first 25M samples from CCMatrix, giving a total of 50M training samples.
Evaluation is performed on out-of-sample CCMatrix and also on Tatoeba and Opus Books.
The `_bp` columns list the *brevity penalty*. The `avg_bleu` score is the bleu score
averaged over all three evaluation datasets. The best scores are displayed in bold for both translation directions.

| | [t5-base-36L-ccmatrix-multi](https://huggingface.co/yhavinga/t5-base-36L-ccmatrix-multi) | [t5-base-36L-ccmatrix-multi](https://huggingface.co/yhavinga/t5-base-36L-ccmatrix-multi) | [t5-small-24L-ccmatrix-multi](https://huggingface.co/yhavinga/t5-small-24L-ccmatrix-multi) | [t5-small-24L-ccmatrix-multi](https://huggingface.co/yhavinga/t5-small-24L-ccmatrix-multi) |
|:-----------------------|:-----------------------------|:-----------------------------|:------------------------------|:------------------------------|
| *source_lang* | en | nl | en | nl |
| *target_lang* | nl | en | nl | en |
| *source_prefix* | translate English to Dutch: | translate Dutch to English: | translate English to Dutch: | translate Dutch to English: |
| *ccmatrix_bleu* | **56.8** | 62.8 | 57.4 | **63.1** |
| *tatoeba_bleu* | **46.6** | **52.8** | 46.4 | 51.7 |
| *opus_books_bleu* | **13.5** | **24.9** | 12.9 | 23.4 |
| *ccmatrix_bp* | 0.95 | 0.96 | 0.95 | 0.96 |
| *tatoeba_bp* | 0.97 | 0.94 | 0.98 | 0.94 |
| *opus_books_bp* | 0.8 | 0.94 | 0.77 | 0.89 |
| *avg_bleu* | **38.96** | **46.86** | 38.92 | 46.06 |
| *max_source_length* | 128 | 128 | 128 | 128 |
| *max_target_length* | 128 | 128 | 128 | 128 |
| *adam_beta1* | 0.9 | 0.9 | 0.9 | 0.9 |
| *adam_beta2* | 0.997 | 0.997 | 0.997 | 0.997 |
| *weight_decay* | 0.05 | 0.05 | 0.002 | 0.002 |
| *lr* | 5e-05 | 5e-05 | 0.0005 | 0.0005 |
| *label_smoothing_factor* | 0.15 | 0.15 | 0.1 | 0.1 |
| *train_batch_size* | 128 | 128 | 128 | 128 |
| *warmup_steps* | 2000 | 2000 | 2000 | 2000 |
| *total steps* | 390625 | 390625 | 390625 | 390625 |
| *duration* | 4d 5h | 4d 5h | 3d 2h | 3d 2h |
| *num parameters* | 729M | 729M | 250M | 250M |


## Acknowledgements

This project would not have been possible without compute generously provided by Google through the
[TPU Research Cloud](https://sites.research.google/trc/). The HuggingFace 🤗 ecosystem was instrumental in all parts
of the training. Weights & Biases made it possible to keep track of many training sessions
and orchestrate hyperparameter sweeps with insightful visualizations.

Created by [Yeb Havinga](https://www.linkedin.com/in/yeb-havinga-86530825/)
"""
    )

    # Snapshot memory locally so main() does not depend on a global created in
    # the __main__ guard (calling main() from an import would otherwise raise
    # NameError).
    memory = psutil.virtual_memory()
    st.write(
        f"""
---
*Memory: {memory.total / 10**9:.2f}GB, used: {memory.percent}%, available: {memory.available / 10**9:.2f}GB*
"""
    )
|
|
|
|
|
if __name__ == "__main__":
    # Module-level snapshot of system memory usage; displayed in the page
    # footer rendered by main().
    memory = psutil.virtual_memory()
    main()
|
|