<!doctype html>
<html>
<head>
<title>Pre-training Dutch T5 models</title>
<link rel="stylesheet" href="style.css">
<style>img { max-width: 700px; margin: 10px; padding: 10px; border:1px solid #f0f0f0;}</style>
<style>body { max-width: 1000px; }</style>
</head>
<body>
<div md-src-pos="0..29528">
<h1 md-src-pos="0..26">Pre-training Dutch <!-- doesnt work on HF spaces?? span class="emoji">🇳🇱 🇧🇪</span--> T5 models </h1>
<p>TL;DR: see <a href="#model-list">the list of pre-trained Dutch and Dutch+English models</a> below.</p>
<p md-src-pos="28..495"><span md-src-pos="28..64">A few months ago, I was given access to Google's TPU Research Cloud (TRC). My goal was to train several Dutch and Dutch+English T5 models, limited to model sizes that can run on a single GPU.
T5 is a text-to-text transfer transformer, a neural network model with
natural language text as input and output.
It can be fine-tuned on a wide range of tasks.</span></p>
<ul md-src-pos="497..2062">
<li md-src-pos="497..751"><strong md-src-pos="499..624"><a target="_blank" href="https://arxiv.org/abs/1910.10683.pdf" md-src-pos="501..622">Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer</a></strong> by <em md-src-pos="628..750">Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu</em>.</li>
<li md-src-pos="752..1482"><strong md-src-pos="754..859"><a target="_blank" href="https://arxiv.org/abs/2110.08207" md-src-pos="756..857">Multitask Prompted Training Enables Zero-Shot Task Generalization</a></strong> by <em md-src-pos="863..1481">Victor Sanh, Albert Webson, Colin Raffel, Stephen H. Bach, Lintang Sutawika, Zaid Alyafeai, Antoine Chaffin, Arnaud Stiegler, Teven Le Scao, Arun Raja, Manan Dey, M Saiful Bari, Canwen Xu, Urmish Thakker, Shanya Sharma Sharma, Eliza Szczechla, Taewoon Kim, Gunjan Chhablani, Nihal Nayak, Debajyoti Datta, Jonathan Chang, Mike Tian-Jian Jiang, Han Wang, Matteo Manica, Sheng Shen, Zheng Xin Yong, Harshit Pandey, Rachel Bawden, Thomas Wang, Trishala Neeraj, Jos Rozen, Abheesht Sharma, Andrea Santilli, Thibault Fevry, Jason Alan Fries, Ryan Teehan, Tali Bers, Stella Biderman, Leo Gao, Thomas Wolf, Alexander M. Rush</em>.</li>
<li md-src-pos="1483..1793"><strong md-src-pos="1485..1587"><a target="_blank" href="https://arxiv.org/abs/2111.10952" md-src-pos="1487..1585">ExT5: Towards Extreme Multi-Task Scaling for Transfer Learning</a></strong> by <em md-src-pos="1591..1792">Vamsi Aribandi, Yi Tay, Tal Schuster, Jinfeng Rao, Huaixiu Steven Zheng, Sanket Vaibhav Mehta, Honglei Zhuang, Vinh Q. Tran, Dara Bahri, Jianmo Ni, Jai Gupta, Kai Hui, Sebastian Ruder, Donald Metzler</em>.</li>
<li md-src-pos="1794..2062"><strong md-src-pos="1796..1910"><a target="_blank" href="https://arxiv.org/abs/2109.10686" md-src-pos="1798..1908">Scale Efficiently: Insights from Pre-training and Fine-tuning Transformers</a></strong> by <em md-src-pos="1914..2061">Yi Tay, Mostafa Dehghani, Jinfeng Rao, William Fedus, Samira Abnar, Hyung Won Chung, Sharan Narang, Dani Yogatama, Ashish Vaswani, Donald Metzler</em>.</li>
</ul>
<p md-src-pos="2065..2202"><span md-src-pos="2065..2085">Background on Google</span>'<span md-src-pos="2086..2179">s TPU-VM and how to use the Huggingface transformers library to pre-train models can be found</span> <span md-src-pos="2180..2202">at the following
pages:</span></p>
<ul md-src-pos="2203..2407">
<li md-src-pos="2203..2305"><a target="_blank" href="https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104" md-src-pos="2205..2305">https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104</a></li>
<li md-src-pos="2306..2407"><a target="_blank" href="https://github.com/huggingface/transformers/tree/main/examples/research_projects/jax-projects#talks" md-src-pos="2308..2407">https://github.com/huggingface/transformers/tree/main/examples/research_projects/jax-projects#talks</a></li>
</ul>
<p>
This project is a continuation of the work I performed together with
Dat Nguyen during the <a target="_blank" href="https://github.com/huggingface/transformers/tree/main/examples/research_projects/jax-projects#talks">Flax/JAX Community
Week</a> to create a T5 model pre-trained from scratch on Dutch.
</p>
<h2 md-src-pos="18893..18908">Pre-training</h2>
<h3 md-src-pos="18910..18925">mC4 dataset</h3>
<p>
The <a target="_blank"
href="https://huggingface.co/datasets/allenai/c4">multilingual C4 (mC4)
dataset</a> was created by the original T5 authors.
It was prepared and released by AllenAI (the Allen Institute for AI)
on the Huggingface Dataset hub.
Our team cleaned Dutch mC4 with <a target="_blank" href="https://gitlab.com/yhavinga/c4nlpreproc">code adapted</a> from the C4 TensorFlow dataset, and used the resulting text files in the pre-training scripts. We also verified that the Dutch part was deduplicated.</p>
<p>
To make this dataset easy to reuse in further pre-training sessions with Huggingface's scripts, a Huggingface dataset was created: <a target="_blank" href="https://huggingface.co/datasets/yhavinga/mc4_nl_cleaned" md-src-pos="19449..19522">mc4_nl_cleaned</a>. For Dutch and English training, a couple of additional configs were added to the generation script. These configs produce interleaved Dutch and English texts with a 1:1 ratio. For instance, the <a target="_blank" href="https://huggingface.co/datasets/yhavinga/mc4_nl_cleaned/viewer/micro_en_nl/train" md-src-pos="19690..19792">micro_en_nl config</a> mixes Dutch and English samples.
The cleaned English C4 dataset is about 5 times larger (in compressed bytes) than the Dutch part, so 1:1 interleaving with Dutch discards about 80% of English C4.
The full cleaned Dutch mC4 dataset is 151GB and is still (as of June 2022) the largest cleaned Dutch corpus available on the HF Hub.
</p>
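<p>As a quick usage sketch (not part of the original training code), the dataset configs can be loaded directly from the Hub with the Huggingface datasets library; the column name and any extra keyword arguments such as trust_remote_code depend on your datasets version:</p>
<pre><code>from datasets import load_dataset

# Hedged sketch: load the interleaved Dutch+English "micro_en_nl" config of the
# cleaned mC4 dataset from the Hugging Face Hub.
dataset = load_dataset("yhavinga/mc4_nl_cleaned", "micro_en_nl", split="train")
print(dataset[0]["text"][:200])  # "text" is assumed to hold one cleaned document
</code></pre>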
<h3 md-src-pos="20163..20243">Unsupervised Training Objective</h3>
<p md-src-pos="2409..2753"><span md-src-pos="2409..2463">The Dutch and Dutch+English T5 models are pre-trained
with the masked language modeling (MLM) "span corruption" objective.
During pre-training, 15% of the tokens are masked and each span of masked tokens is replaced by a sentinel token.</span>
</p>
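<p>To illustrate the input/target format (a hand-made example using T5's sentinel tokens, not output from the actual masking code):</p>
<pre><code># Illustrative sketch of span corruption: ~15% of the tokens are dropped and
# each contiguous dropped span becomes one sentinel token.
original = "vrij vroeg in de ochtend fietste ik naar het station"
inputs   = "vrij vroeg in de &lt;extra_id_0&gt; fietste ik naar &lt;extra_id_1&gt; station"
# The target lists the dropped spans, delimited by the same sentinels:
targets  = "&lt;extra_id_0&gt; ochtend &lt;extra_id_1&gt; het &lt;extra_id_2&gt;"
</code></pre>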
<h3 md-src-pos="20163..20243">Why are some models trained for multiple epochs on a smaller config?</h3>
<p>When I was using an old version of the <a target="_blank"
href="https://github.com/huggingface/transformers/blob/7e44226fc75aa1e5f8928c6445f1979343ea782f/examples/flax/language-modeling/run_t5_mlm_flax.py">Flax
T5 MLM pretraining script</a>, I noticed that the per-batch training speed seemed slower at the beginning of epochs when a larger dataset config was used. Also, on large configs, batch shuffling would fail with a TPU out-of-memory error. For these reasons, I started experimenting with training for more epochs on smaller configs.
</p>
<p><span md-src-pos="20616..20634">This should be ok.</span> <span md-src-pos="20635..20717">In the original T5 paper downstream performance was compared between training on 2</span><sup><span md-src-pos="20722..20724">35</span></sup> <span md-src-pos="20731..20749">tokens vs training</span> <span md-src-pos="20750..20784">multiple epochs on a smaller part.</span> <span md-src-pos="20785..20800">64 repeats of 2</span><sup><span md-src-pos="20805..20807">29</span></sup> <span md-src-pos="20814..20871">tokens did not result in degraded downstream performance.</span> <span md-src-pos="20872..20881">The model</span> <code md-src-pos="20882..20925">yhavinga/t5-v1_1-base-dutch-english-cased</code> <span md-src-pos="20926..20943">is trained on the</span> <code md-src-pos="20944..20951">small</code> <span md-src-pos="20952..20973">config for 10 epochs.</span> </p>
<p>In the end, a change to the <a target="_blank" href="https://github.com/huggingface/transformers/blame/main/examples/flax/language-modeling/run_t5_mlm_flax.py" md-src-pos="21002..21130">pre-training script</a> to perform batch shuffling (permuting an index array) on the CPU instead of on the accelerator device solved the out-of-memory problem, and larger configs could be used without issues.</p>
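<p>A rough sketch of that idea, with illustrative names rather than the actual variables from run_t5_mlm_flax.py: the permutation is computed host-side with NumPy, so the index array never has to fit in TPU memory.</p>
<pre><code>import numpy as np

# Hedged sketch: epoch-level batch shuffling on the host CPU.
def batch_indices(num_samples: int, batch_size: int, seed: int):
    rng = np.random.default_rng(seed)        # host-side RNG, re-seeded per epoch
    perm = rng.permutation(num_samples)      # permutation lives in host memory
    num_batches = num_samples // batch_size  # drop the last partial batch
    return perm[: num_batches * batch_size].reshape(num_batches, batch_size)

for idx in batch_indices(num_samples=1_000_000, batch_size=128, seed=0):
    pass  # gather rows idx from the tokenized dataset and feed them to the train step
</code></pre>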
<h3 md-src-pos="21305..21338">Which optimizer and lr to use</h3>
<p md-src-pos="21340..22064"><span md-src-pos="21340..21346">During the </span> <span md-src-pos="21348..21428">Flax/Jax Community week we quickly decided on using Adafactor with learning rate 5e-3.</span> <span md-src-pos="21429..21460">I was sure that with more time,</span> <span md-src-pos="21461..21493">a better setting could be found.</span> <span md-src-pos="21494..21535">After performing seven sweeps with Adafactor,</span> <span md-src-pos="21536..21565">AdamW and Distributed Shampoo</span> (<span md-src-pos="21567..21579">experimental</span> <span md-src-pos="21580..21609">PJIT version from Dall-E mini</span>)<span md-src-pos="21610..21611">,</span> <span md-src-pos="21612..21646">I gave up to find better settings.</span> <span md-src-pos="21647..21705">The graph below shows the runs from all 7 sweeps combined.</span> <span md-src-pos="21706..21731">Apologies for the legend,</span> <span md-src-pos="21732..21774">I cannot show the optimizer in the legend,</span> <span md-src-pos="21775..21843">because the initial version of the training script had the optimizer</span> <code md-src-pos="21844..21857">--adafactor</code> <span md-src-pos="21858..21860">as</span> <span md-src-pos="21861..21869">boolean,</span> <span md-src-pos="21870..21928">which I later changed to a string with the optimizer name.</span> <span md-src-pos="21929..21986">All runs in the graph below that get the loss below 4 use</span> <strong md-src-pos="21987..22000">Adafactor</strong><span md-src-pos="22000..22001">.</span> <span md-src-pos="22002..22054">Peach-sweep-6 is dashed orange and has learning rate</span> <strong md-src-pos="22055..22063">5e-3</strong><span md-src-pos="22063..22064">.</span></p>
<p md-src-pos="22066..22129"><img src="adafactor_vs_adam_pretrain.png" alt="Adafactor vs Adam vs Shampoo" __idea-generated="true" md-src-pos="22066..22129" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/adafactor_vs_adam_pretrain.png"></p>
<p md-src-pos="22131..22458"><span md-src-pos="22131..22235">While there probably is a setting that will allow Adam and Shampoo to also converge fast below loss 4.0,</span> <span md-src-pos="22236..22248">I was unable</span> <span md-src-pos="22249..22260">to find it.</span> <span md-src-pos="22261..22322">In a recent tweet Lucas Nestler had more success with Shampoo</span> (<a target="_blank" href="https://twitter.com/_clashluke/status/1535994026876252160" md-src-pos="22324..22381">https://twitter.com/_clashluke/status/1535994026876252160</a>) <span md-src-pos="22383..22458">so maybe I need to revisit the attempt with the latest upstream code bases.</span></p>
<h3 md-src-pos="22460..22508">Bfloat16 datatype and learning rate schedule</h3>
<p md-src-pos="22510..23055"><span md-src-pos="22510..22588">I had some additional options in the pre-training script that I wanted to use.</span> <span md-src-pos="22589..22623">An exponential decay learning rate</span> <span md-src-pos="22624..22684">schedule would allow me to pre-train for as long as desired,</span> <span md-src-pos="22685..22720">instead of a fixed number of steps.</span> <span md-src-pos="22721..22764">I was also keen to pre-train with bfloat16,</span> <span md-src-pos="22765..22808">for the reduced memory footprint and speed.</span> <span md-src-pos="22809..22821">This failed.</span> <span md-src-pos="22822..22901">The graph below shows different attempts with the legend showing the optimizer,</span> <span md-src-pos="22902..22908">dtype,</span> <span md-src-pos="22909..22923">learning rate,</span> <span md-src-pos="22924..22965">total batch size and lr-schedule to train</span> <a target="_blank" href="https://huggingface.co/yhavinga/t5-small-24L-dutch-english" md-src-pos="22966..23054">t5-small-24L-dutch-english</a><span md-src-pos="23054..23055">.</span></p>
<p md-src-pos="23057..23098"><img src="bfloat16_loss.png" alt="Bfloat16 vs Float32" __idea-generated="true" md-src-pos="23057..23098" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/bfloat16_loss.png"></p>
<p md-src-pos="23100..23378"><span md-src-pos="23100..23111">In the end,</span> <span md-src-pos="23112..23167">all models released on the hub are trained with Flax in</span> <code md-src-pos="23168..23177">float32</code><span md-src-pos="23177..23178">.</span> <span md-src-pos="23179..23193">For reference,</span> <span md-src-pos="23194..23195">I</span>'<span md-src-pos="23196..23214">ve ran Stas Bekman</span>'<span md-src-pos="23215..23227">s script for</span> <a target="_blank" href="https://github.com/stas00/ml-ways/blob/master/numbers/detect-model-pretrained-in-bf16-fp16-fp32.ipynb" md-src-pos="23228..23376">bf16, fp16 or fp32 model pretrain detection</a><span md-src-pos="23376..23377">.</span></p>
<pre class="code-fence" md-src-pos="23380..24514"><code md-src-pos="23380..24514">
<div class="code-fence-highlighter-copy-button" data-fence-content="ICAgICAgICAgICAgICAgICAgICAgICBuYW1lICAgICAgICAgICAgICAgICAgICAgICAgfCAgYWJzIG1pbiAgfCAgYWJzIG1heCAgCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLXwtLS0tLS0tLS0tLXwtLS0tLS0tLS0tLQp5aGF2aW5nYS90NS1iYXNlLWR1dGNoICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8IDEuNzU3ZS0wOSB8IDYuNzkyZSswMQp5aGF2aW5nYS90NS12MS4xLWJhc2UtZHV0Y2gtdW5jYXNlZCAgICAgICAgICAgICAgICB8IDEuMjE4ZS0wOSB8IDYuNzA4ZSswMgp5aGF2aW5nYS90NS12MS4xLWJhc2UtZHV0Y2gtY2FzZWQgICAgICAgICAgICAgICAgICB8IDMuMDA5ZS0wOSB8IDguODIxZSswMgp5aGF2aW5nYS90NS12MS4xLWxhcmdlLWR1dGNoLWNhc2VkICAgICAgICAgICAgICAgICB8IDAuMDAwZSswMCB8IDUuMDUzZSswMwp5aGF2aW5nYS90NS12MV8xLWJhc2UtZHV0Y2gtZW5nbGlzaC1jYXNlZCAgICAgICAgICB8IDUuMTQwZS0wOSB8IDMuMTExZSswMwp5aGF2aW5nYS90NS12MV8xLWJhc2UtZHV0Y2gtZW5nbGlzaC1jYXNlZC0xMDI0ICAgICB8IDkuMzU5ZS0xMCB8IDEuMzA4ZSswMgp5aGF2aW5nYS90NS1zbWFsbC0yNEwtZHV0Y2gtZW5nbGlzaCAgICAgICAgICAgICAgICB8IDEuNTc3ZS0wOSB8IDEuMjc2ZSswMgp5aGF2aW5nYS90NS14bC00TC1kdXRjaC1lbmdsaXNoLWNhc2VkICAgICAgICAgICAgICB8IDMuMjM0ZS0xMSB8IDMuOTg2ZSswMQp5aGF2aW5nYS90NS1iYXNlLTM2TC1kdXRjaC1lbmdsaXNoLWNhc2VkICAgICAgICAgICB8IDIuNDA5ZS0xMCB8IDYuMTA0ZSswMQp5aGF2aW5nYS90NS1lZmYteGwtOGwtZHV0Y2gtZW5nbGlzaC1jYXNlZCAgICAgICAgICB8IDUuNTMwZS0xMCB8IDguOTEyZSswMgp5aGF2aW5nYS90NS1lZmYtbGFyZ2UtOGwtZHV0Y2gtZW5nbGlzaC1jYXNlZCAgICAgICB8IDEuMDg2ZS0xMCB8IDUuMTI4ZSswMgp5aGF2aW5nYS90NS1iYXNlLTM2TC1jY21hdHJpeC1tdWx0aSAgICAgICAgICAgICAgICB8IDEuNzE1ZS0xMSB8IDMuNzQ2ZSswMQp5aGF2aW5nYS90NS1zbWFsbC0yNEwtY2NtYXRyaXgtbXVsdGkgICAgICAgICAgICAgICB8IDcuMDg2ZS0xMCB8IDEuMDUzZSswMgo=">
<img class="code-fence-highlighter-copy-button-icon">
</div><span md-src-pos="23380..23384"></span><span md-src-pos="23384..23459"> name | abs min | abs max </span>
<span md-src-pos="23460..23535">---------------------------------------------------|-----------|-----------</span>
<span md-src-pos="23536..23610">yhavinga/t5-base-dutch | 1.757e-09 | 6.792e+01</span>
<span md-src-pos="23611..23685">yhavinga/t5-v1.1-base-dutch-uncased | 1.218e-09 | 6.708e+02</span>
<span md-src-pos="23686..23760">yhavinga/t5-v1.1-base-dutch-cased | 3.009e-09 | 8.821e+02</span>
<span md-src-pos="23761..23835">yhavinga/t5-v1.1-large-dutch-cased | 0.000e+00 | 5.053e+03</span>
<span md-src-pos="23836..23910">yhavinga/t5-v1_1-base-dutch-english-cased | 5.140e-09 | 3.111e+03</span>
<span md-src-pos="23911..23985">yhavinga/t5-v1_1-base-dutch-english-cased-1024 | 9.359e-10 | 1.308e+02</span>
<span md-src-pos="23986..24060">yhavinga/t5-small-24L-dutch-english | 1.577e-09 | 1.276e+02</span>
<span md-src-pos="24061..24135">yhavinga/t5-xl-4L-dutch-english-cased | 3.234e-11 | 3.986e+01</span>
<span md-src-pos="24136..24210">yhavinga/t5-base-36L-dutch-english-cased | 2.409e-10 | 6.104e+01</span>
<span md-src-pos="24211..24285">yhavinga/t5-eff-xl-8l-dutch-english-cased | 5.530e-10 | 8.912e+02</span>
<span md-src-pos="24286..24360">yhavinga/t5-eff-large-8l-dutch-english-cased | 1.086e-10 | 5.128e+02</span>
<span md-src-pos="24361..24435">yhavinga/t5-base-36L-ccmatrix-multi | 1.715e-11 | 3.746e+01</span>
<span md-src-pos="24436..24510">yhavinga/t5-small-24L-ccmatrix-multi | 7.086e-10 | 1.053e+02</span>
<span md-src-pos="24511..24511"></span><span md-src-pos="24511..24514"></span></code></pre>
<h2>Fine-tuning</h2>
<h3 md-src-pos="24516..24554">Training t5-base-36L-dutch-english</h3>
<p md-src-pos="24556..24958"><span md-src-pos="24556..24668">The following image shows the loss curves of the sessions in which I was trying to find the right combination of</span> <span md-src-pos="24669..24685">total batch size</span> (<span md-src-pos="24687..24721">by adjusting gradient accumulation</span>)<span md-src-pos="24722..24723">,</span> <span md-src-pos="24724..24751">learning rate and datatype.</span> <span md-src-pos="24752..24766">Unfortunately,</span> <span md-src-pos="24767..24784">again I could not</span> <span md-src-pos="24785..24818">find a good setting for bfloat16.</span> <span md-src-pos="24819..24867">The three green runs are the ones that end up in</span> <code md-src-pos="24868..24895">t5-base-36L-dutch-english</code><span md-src-pos="24895..24896">.</span> <span md-src-pos="24897..24930">Numbers shown are learning reate,</span> <span md-src-pos="24931..24958">dtype and total batch size.</span></p>
<p md-src-pos="24960..25020"><img src="training_base_36l_losses.png" alt="t5 base 36L training losses" __idea-generated="true" md-src-pos="24960..25020" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/training_base_36l_losses.png"></p>
<h2 md-src-pos="25022..25035">Evaluation</h2>
<h3 md-src-pos="25037..25086">Optimizer and learning rate for summarization</h3>
<p md-src-pos="25088..25365"><span md-src-pos="25088..25195">Finetuning summarization requires more memory than translation due to the longer sequence lengths involved.</span> <span md-src-pos="25196..25255">I wondered if I could use Adafactor instead of Adam and ran</span> <span md-src-pos="25256..25277">a sweep to test this.</span> <span md-src-pos="25278..25318">The sweep was configured with Hyperband,</span> <span md-src-pos="25319..25365">so not all training runs completed to the end.</span></p>
<p md-src-pos="25367..25439"><img src="optim_lr_summarization.png" alt="Optimizer Learning rate for summarization" __idea-generated="true" md-src-pos="25367..25439" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/optim_lr_summarization.png"></p>
<p md-src-pos="25441..25479"><span md-src-pos="25441..25478">The training losses are graphed below</span>:</p>
<p md-src-pos="25481..25564"><img src="training_losses_summarization_sweep.png" alt="Training losses for summarization sweep" __idea-generated="true" md-src-pos="25481..25564" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/training_losses_summarization_sweep.png"></p>
<p md-src-pos="25566..25921"><span md-src-pos="25566..25642">While the Adafactor run with learning rate 7e-4 came close to the Adam runs,</span> <span md-src-pos="25643..25689">the consistent stability of training with Adam</span> <span md-src-pos="25690..25769">made me stick with Adam as optimizer for evaluation runs on the several models.</span> <span md-src-pos="25770..25811">For translation the results were similar,</span> <span md-src-pos="25812..25881">though in the end I needed to configure a lower learning rate for all</span> <span md-src-pos="25882..25920">models to converge during fine-tuning.</span></p>
<h3 md-src-pos="25923..25950">Running evaluation runs</h3>
<p md-src-pos="25952..26355"><span md-src-pos="25952..26033">The original T5 paper evaluated by fine-tuning on downstream tasks with a constant learning rate of 0.001.</span> <span md-src-pos="26034..26068">According to the sweep 0.001 would</span> <span md-src-pos="26069..26123">work nicely with the Adam optimizer for summarization.</span> <span md-src-pos="26124..26185">A single model evaluation consisted of fine-tuning the model,</span> <span md-src-pos="26186..26260">followed by running predictions and metrics calculation on the test split.</span> <span md-src-pos="26261..26301">Fine-tuning for evaluation was done on a</span> <span md-src-pos="26302..26355">limited set of example from the fine-tuning datasets.</span></p>
<table md-src-pos="26357..26869">
<thead>
<tr md-src-pos="26357..26413">
<th align="right" md-src-pos="26358..26373"></th>
<th md-src-pos="26374..26392">Summarization</th>
<th md-src-pos="26393..26412">Translation</th>
</tr>
</thead>
<tbody>
<tr md-src-pos="26471..26527">
<td align="right" md-src-pos="26472..26487">Dataset</td>
<td md-src-pos="26488..26506">CNN Dailymail NL</td>
<td md-src-pos="26507..26526">CCMatrix en -&gt; nl</td>
</tr>
<tr class="intellij-row-even" md-src-pos="26528..26584">
<td align="right" md-src-pos="26529..26544">#Samples</td>
<td md-src-pos="26545..26563">50K</td>
<td md-src-pos="26564..26583">50K</td>
</tr>
<tr md-src-pos="26585..26641">
<td align="right" md-src-pos="26586..26601">Optimizer</td>
<td md-src-pos="26602..26620">Adam</td>
<td md-src-pos="26621..26640">Adam</td>
</tr>
<tr class="intellij-row-even" md-src-pos="26642..26698">
<td align="right" md-src-pos="26643..26658">learning rate</td>
<td md-src-pos="26659..26677">0.001</td>
<td md-src-pos="26678..26697">0.0005</td>
</tr>
<tr md-src-pos="26699..26755">
<td align="right" md-src-pos="26700..26715">source length</td>
<td md-src-pos="26716..26734">1024</td>
<td md-src-pos="26735..26754">128</td>
</tr>
<tr class="intellij-row-even" md-src-pos="26756..26812">
<td align="right" md-src-pos="26757..26772">target length</td>
<td md-src-pos="26773..26791">142</td>
<td md-src-pos="26792..26811">128</td>
</tr>
<tr md-src-pos="26813..26869">
<td align="right" md-src-pos="26814..26829">#eval samples</td>
<td md-src-pos="26830..26848">1000</td>
<td md-src-pos="26849..26868">1000</td>
</tr>
</tbody>
</table>
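<p>As a sketch of the metrics step (toy inputs, not the actual evaluation pipeline), Rouge1 for summarization and Bleu for translation can be computed with the Huggingface evaluate library:</p>
<pre><code>import evaluate

# Hedged sketch: compute Rouge (summarization) and sacreBLEU (translation)
# on generated predictions versus references from the test split.
rouge = evaluate.load("rouge")
bleu = evaluate.load("sacrebleu")

summ_preds = ["het kabinet presenteert nieuwe plannen"]
summ_refs  = ["het kabinet presenteerde vandaag nieuwe plannen"]
print(rouge.compute(predictions=summ_preds, references=summ_refs))

mt_preds = ["De kat zit op de mat."]
mt_refs  = [["De kat ligt op de mat."]]  # sacreBLEU takes a list of references per prediction
print(bleu.compute(predictions=mt_preds, references=mt_refs)["score"])
</code></pre>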
<p md-src-pos="26872..26943"><span md-src-pos="26872..26942">The graph below shows the train loss curves for the summarization runs</span>:</p>
<p md-src-pos="26945..27021"><img src="train_loss_eval_summarization.png" alt="Train loss evaluation T5 summarization" __idea-generated="true" md-src-pos="26945..27021" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/train_loss_eval_summarization.png"></p>
<p md-src-pos="27023..27092"><span md-src-pos="27023..27091">The graph below shows the train loss curves for the translation runs</span>:</p>
<p md-src-pos="27094..27169"><img src="train_loss_eval_t5_translation.png" alt="Train loss evaluation T5 translation" __idea-generated="true" md-src-pos="27094..27169" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/train_loss_eval_t5_translation.png"></p>
<p md-src-pos="27171..27494"><span md-src-pos="27171..27216">The figure below shows the evaluation scores,</span> <span md-src-pos="27217..27266">where the x-axis shows the translation Bleu score</span> (<span md-src-pos="27268..27284">higher is better</span>) <span md-src-pos="27286..27339">and y-axis the summarization Rouge1 translation score</span> (<span md-src-pos="27341..27357">higher is better</span>)<span md-src-pos="27358..27359">.</span> <span md-src-pos="27360..27405">Point size is proportional to the model size.</span> <span md-src-pos="27406..27451">Models with faster inference speed are green,</span> <span md-src-pos="27452..27477">slower inference speed is</span> <span md-src-pos="27478..27494">plotted as blue.</span></p>
<p md-src-pos="27496..27559"><img src="evaluation_t5_dutch_english.png" alt="Evaluation T5 Dutch English" __idea-generated="true" md-src-pos="27496..27559" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/evaluation_t5_dutch_english.png"></p>
<p md-src-pos="27561..28104"><span md-src-pos="27561..27593">While it is clear that the model</span> <code md-src-pos="27594..27627">t5-base-36L-dutch-english-cased</code> (<span md-src-pos="27629..27649">with 729M parameters</span>) <span md-src-pos="27651..27671">has the best scores,</span> <span md-src-pos="27672..27679">it also</span> <span md-src-pos="27680..27705">among the slowest models.</span> <span md-src-pos="27706..27715">The model</span> <code md-src-pos="27716..27753">t5-eff-large-8l-dutch-english-cased</code> (<span md-src-pos="27755..27775">with 335M parameters</span>) <span md-src-pos="27777..27796">has the second best</span> <span md-src-pos="27797..27841">training loss after 390 steps in both tasks,</span> <span md-src-pos="27842..27878">but with a 4 times faster inference.</span> <span md-src-pos="27879..27915">Surprizing is the difference between</span> <code md-src-pos="27916..27950">t5-v1_1-base-dutch-english-cased</code> <span md-src-pos="27951..27954">and</span> <code md-src-pos="27955..27994">t5-v1_1-base-dutch-english-cased-1024</code><span md-src-pos="27994..27995">,</span> <span md-src-pos="27996..28035">most notable on the summarization task.</span> <span md-src-pos="28036..28103">This might be due to the difference in pre-training sequence length</span>:</p>
<h3 md-src-pos="28106..28137">Sequence length 512 or 1024</h3>
<p md-src-pos="28139..28593"><span md-src-pos="28139..28149">The models</span> <code md-src-pos="28150..28184">t5-v1_1-base-dutch-english-cased</code> <span md-src-pos="28185..28188">and</span> <code md-src-pos="28189..28228">t5-v1_1-base-dutch-english-cased-1024</code> <span md-src-pos="28229..28260">have the same model dimensions,</span> <span md-src-pos="28261..28311">but are pre-trained on different sequence lenghts,</span> <span md-src-pos="28312..28338">512 and 1024 respectively.</span> <span md-src-pos="28339..28412">The evaluation loss and accuracy of the models do not look too different.</span> <span md-src-pos="28413..28465">Since training of the 1024 sequence length model was</span> <span md-src-pos="28466..28484">very slow and didn</span>'<span md-src-pos="28485..28516">t converge a was was very slow,</span> <span md-src-pos="28517..28536">I stopped it early.</span> <span md-src-pos="28537..28574">The figure below shows the evaluation</span> <span md-src-pos="28575..28593">loss and accuracy.</span></p>
<p md-src-pos="28595..28685"><img src="t5v1_1eval_loss_and_accuracy.png" alt="T5 v11 base dutch english eval loss and accuracypng" __idea-generated="true" md-src-pos="28595..28685" data-original-src="file:/home/yeb/Developer/yhavinga/nedd_x/app/t5v1_1eval_loss_and_accuracy.png"></p>
<p md-src-pos="28687..29049"><span md-src-pos="28687..28749">The 512 sequence length model was trained for 10 epochs of the</span> <code md-src-pos="28750..28757">small</code> <span md-src-pos="28758..28770">nl+en config</span> (<span md-src-pos="28772..28789">186B tokens total</span>) <span md-src-pos="28791..28803">and the 1024</span> <span md-src-pos="28804..28847">sequence length model about 2 epochs of the</span> <code md-src-pos="28848..28855">large</code> <span md-src-pos="28856..28868">nl+en config</span> (<span md-src-pos="28870..28887">100B tokens total</span>)<span md-src-pos="28888..28889">.</span> <span md-src-pos="28890..28921">While I expected both models to</span> <span md-src-pos="28922..28960">perform similarly on downstream tasks,</span> <span md-src-pos="28961..29018">the 1024 sequence length model has better scores for both</span> <span md-src-pos="29019..29049">summarization and translation.</span></p>
<p md-src-pos="2755..2810"><span md-src-pos="2755..2798">Some final
notes:</p>
<ul md-src-pos="2812..4929">
<li md-src-pos="2812..2869">Note: The <code md-src-pos="2824..2834">t5-small</code> model with 24 layers is not small.</li>
<li md-src-pos="2870..3120">Training with more layers is much slower than you'd expect from the increased model size. It is also more difficult to get batch size and learning rate right.
See e.g. the section about finding the right hyperparameters for the base-36L training.</li>
<li md-src-pos="3121..3339">The 'larger' models are not only harder to pre-train, but also harder to fine-tune. The optimizer eats up a lot of space, and the amount of memory required also depends on the length of source and target sequences.</li>
<li md-src-pos="3340..3446">When iterating over models and running evaluation, a sqlite database can be used to scribble results on.</li>
<li md-src-pos="3447..3602">PyCharm. Remote debugging from your workstation to either a TPU VM or your deep-learning workstation gives very good insight into the data structures.</li>
<li md-src-pos="3603..3731">When increasing the batch size, increase the learning rate. bs * 2 -&gt; lr * sqrt(2) is a good heuristic but mileage may vary.</li>
<li md-src-pos="3732..3934">Dropout or not. It is a regularization technique, but also takes up memory. First try without dropout. If that doesn't work, try it with dropout. The smaller models can probably be trained without.</li>
<li md-src-pos="3935..4040">Training in <code md-src-pos="3949..3959">bfloat16</code> is hard to get right. If suspicious of a result, switch back to <code md-src-pos="4024..4033">float32</code> first.</li>
<li md-src-pos="4041..4218">Translation evaluation: the low score of the 128 seq len models on opus books may be because of the brevity penaly... that books may have sentences longer than 128 tokens.</li>
<li md-src-pos="4219..4354"><code md-src-pos="4221..4258">t5-eff-large-8l-dutch-english-cased</code> has good aptitude for the translation task and is fast - good candidate for serious fine-tuning</li>
<li md-src-pos="4355..4442"><code md-src-pos="4357..4387">t5-xl-4l-dutch-english-cased</code> is both slow and exhibits bad fine-tuning performance.</li>
<li md-src-pos="4443..4502">Gradient accumulation in the flax s2s pmap script would be nice.</li>
<li md-src-pos="4503..4778">The dataset directly results output, for pre-training, fine-tuning and also evaluation. Next efforts should favor spending time on dataset cleaning. (The perplexity measure that the Bertin project uses might be useful to filter the dataset on, to reduce training time.)</li>
<li md-src-pos="4779..4929">Good Bleu score does not necessarily mean fluent text. Evaluation loss on a large translation dataset might be better suited for model comparison.</li>
</ul>
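<p>A quick worked example of the batch-size/learning-rate heuristic mentioned in the notes above (the numbers are purely illustrative):</p>
<pre><code>import math

# Doubling the batch size scales the learning rate by sqrt(2).
base_lr, base_bs = 5e-3, 64
for bs in (128, 256):
    lr = base_lr * math.sqrt(bs / base_bs)
    print(bs, round(lr, 5))  # prints 0.00707 for 128 and 0.01 for 256
</code></pre>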
<h2 md-src-pos="29051..29070">Acknowledgements</h2>
<p md-src-pos="29072..29450"><span md-src-pos="29072..29171">This project would not have been possible without compute generously provided by Google through the</span> <a target="_blank" href="https://sites.research.google/trc/" md-src-pos="29172..29228">TPU Research Cloud</a><span md-src-pos="29228..29229">.</span> <span md-src-pos="29230..29245">The HuggingFace</span> <span
md-src-pos="29246..29248">&#x1F917</span> <span md-src-pos="29249..29288">ecosystem was instrumental in all parts</span> <span md-src-pos="29289..29305">of the training.</span> <span md-src-pos="29306..29313">Weights</span> <span md-src-pos="29314..29315">&amp;</span> <span md-src-pos="29316..29379">Biases made it possible to keep track of many training sessions</span> <span md-src-pos="29380..29450">and orchestrate hyper-parameter sweeps with insightful visualizations.</span></p>
<p md-src-pos="29452..29527"><span md-src-pos="29452..29462">Created by</span> <a target="_blank" href="https://www.linkedin.com/in/yeb-havinga-86530825/" md-src-pos="29463..29527">Yeb Havinga</a></p>
<a id="model-list"><h2 md-src-pos="4931..4979">Pre-trained Dutch and Dutch+English T5 models</h2></a>
<p md-src-pos="4981..5522"><span md-src-pos="4981..5024">Three types of T5 models have been trained.</span> <code md-src-pos="5025..5040">t5-base-dutch</code> <span md-src-pos="5041..5086">is the only model with an original T5 config.</span> <span md-src-pos="5087..5132">The other model types t5-v1.1 and t5-eff have</span> <code md-src-pos="5133..5145">gated-relu</code> <span md-src-pos="5146..5156">instead of</span> <code md-src-pos="5157..5163">relu</code> <span md-src-pos="5164..5187">as activation function,</span> <span md-src-pos="5188..5218">and trained with a drop-out of</span> <code md-src-pos="5219..5224">0.0</code> <span md-src-pos="5225..5254">unless training would diverge</span> (<code md-src-pos="5256..5283">t5-v1.1-large-dutch-cased</code>)<span md-src-pos="5284..5285">.</span> <span md-src-pos="5286..5353">The T5-eff models are models that differ in their number of layers.</span> <span md-src-pos="5354..5373">The table will list</span> <span md-src-pos="5374..5413">the several dimensions of these models.</span> <span md-src-pos="5414..5450">Not all t5-eff models are efficient,</span> <span md-src-pos="5451..5489">the best example being the inefficient</span> <code md-src-pos="5490..5520">t5-xl-4L-dutch-english-cased</code><span md-src-pos="5520..5521">.</span></p>
<table md-src-pos="5524..14583">
<thead>
<tr md-src-pos="5524..6614">
<tr md-src-pos="5524..6614">
<th md-src-pos="5525..5544"></th>
<th md-src-pos="5545..5611"><a target="_blank" href="https://huggingface.co/yhavinga/t5-base-dutch" md-src-pos="5546..5608">t5-base-dutch</a></th>
<th md-src-pos="5612..5704"><a target="_blank" href="https://huggingface.co/yhavinga/t5-v1.1-base-dutch-uncased" md-src-pos="5613..5701">t5-v1.1-base-dutch-uncased</a></th>
<th md-src-pos="5705..5793"><a target="_blank" href="https://huggingface.co/yhavinga/t5-v1.1-base-dutch-cased" md-src-pos="5706..5790">t5-v1.1-base-dutch-cased</a></th>
<th md-src-pos="5794..5884"><a target="_blank" href="https://huggingface.co/yhavinga/t5-v1.1-large-dutch-cased" md-src-pos="5795..5881">t5-v1.1-large-dutch-cased</a></th>
<th md-src-pos="5885..5989"><a target="_blank" href="https://huggingface.co/yhavinga/t5-v1_1-base-dutch-english-cased" md-src-pos="5886..5986">t5-v1_1-base-dutch-english-cased</a></th>
<th md-src-pos="5990..6104"><a target="_blank" href="https://huggingface.co/yhavinga/t5-v1_1-base-dutch-english-cased-1024" md-src-pos="5991..6101">t5-v1_1-base-dutch-english-cased-1024</a></th>
<th md-src-pos="6105..6197"><a target="_blank" href="https://huggingface.co/yhavinga/t5-small-24L-dutch-english" md-src-pos="6106..6194">t5-small-24L-dutch-english</a></th>
<th md-src-pos="6198..6294"><a target="_blank" href="https://huggingface.co/yhavinga/t5-xl-4L-dutch-english-cased" md-src-pos="6199..6291">t5-xl-4L-dutch-english-cased</a></th>
<th md-src-pos="6295..6397"><a target="_blank" href="https://huggingface.co/yhavinga/t5-base-36L-dutch-english-cased" md-src-pos="6296..6394">t5-base-36L-dutch-english-cased</a></th>
<th md-src-pos="6398..6502"><a target="_blank" href="https://huggingface.co/yhavinga/t5-eff-xl-8l-dutch-english-cased" md-src-pos="6399..6499">t5-eff-xl-8l-dutch-english-cased</a></th>
<th md-src-pos="6503..6613"><a target="_blank" href="https://huggingface.co/yhavinga/t5-eff-large-8l-dutch-english-cased" md-src-pos="6504..6610">t5-eff-large-8l-dutch-english-cased</a></th>
</tr>
</thead>
<tbody>
<tr md-src-pos="7001..7375">
<td md-src-pos="7002..7010"><em md-src-pos="7003..7009">type</em></td>
<td md-src-pos="7011..7028">t5</td>
<td md-src-pos="7029..7059">t5-v1.1</td>
<td md-src-pos="7060..7088">t5-v1.1</td>
<td md-src-pos="7089..7118">t5-v1.1</td>
<td md-src-pos="7119..7155">t5-v1.1</td>
<td md-src-pos="7156..7197">t5-v1.1</td>
<td md-src-pos="7198..7228">t5 eff</td>
<td md-src-pos="7229..7261">t5 eff</td>
<td md-src-pos="7262..7297">t5 eff</td>
<td md-src-pos="7298..7334">t5 eff</td>
<td md-src-pos="7335..7374">t5 eff</td>
</tr>
<tr class="intellij-row-even" md-src-pos="7376..7753">
<td md-src-pos="7377..7388"><em md-src-pos="7378..7387">d_model</em></td>
<td md-src-pos="7389..7406">768</td>
<td md-src-pos="7407..7437">768</td>
<td md-src-pos="7438..7466">768</td>
<td md-src-pos="7467..7496">1024</td>
<td md-src-pos="7497..7533">768</td>
<td md-src-pos="7534..7575">768</td>
<td md-src-pos="7576..7606">512</td>
<td md-src-pos="7607..7639">2048</td>
<td md-src-pos="7640..7675">768</td>
<td md-src-pos="7676..7712">1024</td>
<td md-src-pos="7713..7752">1024</td>
</tr>
<tr md-src-pos="7754..8128">
<td md-src-pos="7755..7763"><em md-src-pos="7756..7762">d_ff</em></td>
<td md-src-pos="7764..7781">3072</td>
<td md-src-pos="7782..7812">2048</td>
<td md-src-pos="7813..7841">2048</td>
<td md-src-pos="7842..7871">2816</td>
<td md-src-pos="7872..7908">2048</td>
<td md-src-pos="7909..7950">2048</td>
<td md-src-pos="7951..7981">1920</td>
<td md-src-pos="7982..8014">5120</td>
<td md-src-pos="8015..8050">2560</td>
<td md-src-pos="8051..8087">16384</td>
<td md-src-pos="8088..8127">4096</td>
</tr>
<tr class="intellij-row-even" md-src-pos="8129..8508">
<td md-src-pos="8130..8143"><em md-src-pos="8131..8142">num_heads</em></td>
<td md-src-pos="8144..8161">12</td>
<td md-src-pos="8162..8192">12</td>
<td md-src-pos="8193..8221">12</td>
<td md-src-pos="8222..8251">16</td>
<td md-src-pos="8252..8288">12</td>
<td md-src-pos="8289..8330">12</td>
<td md-src-pos="8331..8361">8</td>
<td md-src-pos="8362..8394">32</td>
<td md-src-pos="8395..8430">12</td>
<td md-src-pos="8431..8467">32</td>
<td md-src-pos="8468..8507">16</td>
</tr>
<tr md-src-pos="8509..8883">
<td md-src-pos="8510..8518"><em md-src-pos="8511..8517">d_kv</em></td>
<td md-src-pos="8519..8536">64</td>
<td md-src-pos="8537..8567">64</td>
<td md-src-pos="8568..8596">64</td>
<td md-src-pos="8597..8626">64</td>
<td md-src-pos="8627..8663">64</td>
<td md-src-pos="8664..8705">64</td>
<td md-src-pos="8706..8736">64</td>
<td md-src-pos="8737..8769">64</td>
<td md-src-pos="8770..8805">64</td>
<td md-src-pos="8806..8842">128</td>
<td md-src-pos="8843..8882">64</td>
</tr>
<tr class="intellij-row-even" md-src-pos="8884..9264">
<td md-src-pos="8885..8899"><em md-src-pos="8886..8898">num_layers</em></td>
<td md-src-pos="8900..8917">12</td>
<td md-src-pos="8918..8948">12</td>
<td md-src-pos="8949..8977">12</td>
<td md-src-pos="8978..9007">24</td>
<td md-src-pos="9008..9044">12</td>
<td md-src-pos="9045..9086">12</td>
<td md-src-pos="9087..9117">24</td>
<td md-src-pos="9118..9150">4</td>
<td md-src-pos="9151..9186">36</td>
<td md-src-pos="9187..9223">8</td>
<td md-src-pos="9224..9263">8</td>
</tr>
<tr md-src-pos="9265..9649">
<td md-src-pos="9266..9284"><em md-src-pos="9267..9283">num parameters</em></td>
<td md-src-pos="9285..9302">223M</td>
<td md-src-pos="9303..9333">248M</td>
<td md-src-pos="9334..9362">248M</td>
<td md-src-pos="9363..9392">783M</td>
<td md-src-pos="9393..9429">248M</td>
<td md-src-pos="9430..9471">248M</td>
<td md-src-pos="9472..9502">250M</td>
<td md-src-pos="9503..9535">585M</td>
<td md-src-pos="9536..9571">729M</td>
<td md-src-pos="9572..9608">1241M</td>
<td md-src-pos="9609..9648">335M</td>
</tr>
<tr class="intellij-row-even" md-src-pos="9650..10037">
<td md-src-pos="9651..9672"><em md-src-pos="9652..9671">feed_forward_proj</em></td>
<td md-src-pos="9673..9690">relu</td>
<td md-src-pos="9691..9721">gated-gelu</td>
<td md-src-pos="9722..9750">gated-gelu</td>
<td md-src-pos="9751..9780">gated-gelu</td>
<td md-src-pos="9781..9817">gated-gelu</td>
<td md-src-pos="9818..9859">gated-gelu</td>
<td md-src-pos="9860..9890">gated-gelu</td>
<td md-src-pos="9891..9923">gated-gelu</td>
<td md-src-pos="9924..9959">gated-gelu</td>
<td md-src-pos="9960..9996">gated-gelu</td>
<td md-src-pos="9997..10036">gated-gelu</td>
</tr>
<tr md-src-pos="10038..10415">
<td md-src-pos="10039..10050"><em md-src-pos="10040..10049">dropout</em></td>
<td md-src-pos="10051..10068">0.1</td>
<td md-src-pos="10069..10099">0.0</td>
<td md-src-pos="10100..10128">0.0</td>
<td md-src-pos="10129..10158">0.1</td>
<td md-src-pos="10159..10195">0.0</td>
<td md-src-pos="10196..10237">0.0</td>
<td md-src-pos="10238..10268">0.0</td>
<td md-src-pos="10269..10301">0.1</td>
<td md-src-pos="10302..10337">0.0</td>
<td md-src-pos="10338..10374">0.0</td>
<td md-src-pos="10375..10414">0.0</td>
</tr>
<tr class="intellij-row-even" md-src-pos="10416..10793">
<td md-src-pos="10417..10428"><em md-src-pos="10418..10427">dataset</em></td>
<td md-src-pos="10429..10446">mc4_nl_cleaned</td>
<td md-src-pos="10447..10477">mc4_nl_cleaned full</td>
<td md-src-pos="10478..10506">mc4_nl_cleaned full</td>
<td md-src-pos="10507..10536">mc4_nl_cleaned</td>
<td md-src-pos="10537..10573">mc4_nl_cleaned small_en_nl</td>
<td md-src-pos="10574..10615">mc4_nl_cleaned large_en_nl</td>
<td md-src-pos="10616..10646">mc4_nl_cleaned large_en_nl</td>
<td md-src-pos="10647..10679">mc4_nl_cleaned large_en_nl</td>
<td md-src-pos="10680..10715">mc4_nl_cleaned large_en_nl</td>
<td md-src-pos="10716..10752">mc4_nl_cleaned large_en_nl</td>
<td md-src-pos="10753..10792">mc4_nl_cleaned large_en_nl</td>
</tr>
<tr md-src-pos="10794..11175">
<td md-src-pos="10795..10810"><em md-src-pos="10796..10809">tr. seq len</em></td>
<td md-src-pos="10811..10828">512</td>
<td md-src-pos="10829..10859">1024</td>
<td md-src-pos="10860..10888">1024</td>
<td md-src-pos="10889..10918">512</td>
<td md-src-pos="10919..10955">512</td>
<td md-src-pos="10956..10997">1024</td>
<td md-src-pos="10998..11028">512</td>
<td md-src-pos="11029..11061">512</td>
<td md-src-pos="11062..11097">512</td>
<td md-src-pos="11098..11134">512</td>
<td md-src-pos="11135..11174">512</td>
</tr>
<tr class="intellij-row-even" md-src-pos="11176..11556">
<td md-src-pos="11177..11191"><em md-src-pos="11178..11190">batch size</em></td>
<td md-src-pos="11192..11209">128</td>
<td md-src-pos="11210..11240">64</td>
<td md-src-pos="11241..11269">64</td>
<td md-src-pos="11270..11299">64</td>
<td md-src-pos="11300..11336">128</td>
<td md-src-pos="11337..11378">64</td>
<td md-src-pos="11379..11409">128</td>
<td md-src-pos="11410..11442">512</td>
<td md-src-pos="11443..11478">512</td>
<td md-src-pos="11479..11515">64</td>
<td md-src-pos="11516..11555">128</td>
</tr>
<tr md-src-pos="11557..11938">
<td md-src-pos="11558..11573"><em md-src-pos="11559..11572">total steps</em></td>
<td md-src-pos="11574..11591">527500</td>
<td md-src-pos="11592..11622">1014525</td>
<td md-src-pos="11623..11651">1210154</td>
<td md-src-pos="11652..11681">1120k/2427498</td>
<td md-src-pos="11682..11718">2839630</td>
<td md-src-pos="11719..11760">1520k/3397024</td>
<td md-src-pos="11761..11791">851852</td>
<td md-src-pos="11792..11824">212963</td>
<td md-src-pos="11825..11860">212963</td>
<td md-src-pos="11861..11897">538k/1703705</td>
<td md-src-pos="11898..11937">851850</td>
</tr>
<tr class="intellij-row-even" md-src-pos="11939..12315">
<td md-src-pos="11940..11950"><em md-src-pos="11941..11949">epochs</em></td>
<td md-src-pos="11951..11968">1</td>
<td md-src-pos="11969..11999">2</td>
<td md-src-pos="12000..12028">2</td>
<td md-src-pos="12029..12058">2</td>
<td md-src-pos="12059..12095">10</td>
<td md-src-pos="12096..12137">4</td>
<td md-src-pos="12138..12168">1</td>
<td md-src-pos="12169..12201">1</td>
<td md-src-pos="12202..12237">1</td>
<td md-src-pos="12238..12274">1</td>
<td md-src-pos="12275..12314">1</td>
</tr>
<tr md-src-pos="12316..12694">
<td md-src-pos="12317..12329"><em md-src-pos="12318..12328">duration</em></td>
<td md-src-pos="12330..12347">2d9h</td>
<td md-src-pos="12348..12378">5d5h</td>
<td md-src-pos="12379..12407">6d6h</td>
<td md-src-pos="12408..12437">8d13h</td>
<td md-src-pos="12438..12474">11d18h</td>
<td md-src-pos="12475..12516">9d1h</td>
<td md-src-pos="12517..12547">4d10h</td>
<td md-src-pos="12548..12580">6d1h</td>
<td md-src-pos="12581..12616">17d15h</td>
<td md-src-pos="12617..12653">4d 19h</td>
<td md-src-pos="12654..12693">3d 23h</td>
</tr>
<tr class="intellij-row-even" md-src-pos="12695..13074">
<td md-src-pos="12696..12709"><em md-src-pos="12697..12708">optimizer</em></td>
<td md-src-pos="12710..12727">adafactor</td>
<td md-src-pos="12728..12758">adafactor</td>
<td md-src-pos="12759..12787">adafactor</td>
<td md-src-pos="12788..12817">adafactor</td>
<td md-src-pos="12818..12854">adafactor</td>
<td md-src-pos="12855..12896">adafactor</td>
<td md-src-pos="12897..12927">adafactor</td>
<td md-src-pos="12928..12960">adafactor</td>
<td md-src-pos="12961..12996">adafactor</td>
<td md-src-pos="12997..13033">adafactor</td>
<td md-src-pos="13034..13073">adafactor</td>
</tr>
<tr md-src-pos="13075..13447">
<td md-src-pos="13076..13082"><em md-src-pos="13077..13081">lr</em></td>
<td md-src-pos="13083..13100">0.005</td>
<td md-src-pos="13101..13131">0.005</td>
<td md-src-pos="13132..13160">0.005</td>
<td md-src-pos="13161..13190">0.005</td>
<td md-src-pos="13191..13227">0.005</td>
<td md-src-pos="13228..13269">0.005</td>
<td md-src-pos="13270..13300">0.005</td>
<td md-src-pos="13301..13333">0.005</td>
<td md-src-pos="13334..13369">0.009</td>
<td md-src-pos="13370..13406">0.005</td>
<td md-src-pos="13407..13446">0.005</td>
</tr>
<tr class="intellij-row-even" md-src-pos="13448..13824">
<td md-src-pos="13449..13459"><em md-src-pos="13450..13458">warmup</em></td>
<td md-src-pos="13460..13477">10000.0</td>
<td md-src-pos="13478..13508">10000.0</td>
<td md-src-pos="13509..13537">10000.0</td>
<td md-src-pos="13538..13567">10000.0</td>
<td md-src-pos="13568..13604">10000.0</td>
<td md-src-pos="13605..13646">5000.0</td>
<td md-src-pos="13647..13677">20000.0</td>
<td md-src-pos="13678..13710">2500.0</td>
<td md-src-pos="13711..13746">1000.0</td>
<td md-src-pos="13747..13783">1500.0</td>
<td md-src-pos="13784..13823">1500.0</td>
</tr>
<tr md-src-pos="13825..14204">
<td md-src-pos="13826..13839"><em md-src-pos="13827..13838">eval loss</em></td>
<td md-src-pos="13840..13857">1,38</td>
<td md-src-pos="13858..13888">1,20</td>
<td md-src-pos="13889..13917">0,96</td>
<td md-src-pos="13918..13947">1,07</td>
<td md-src-pos="13948..13984">1,11</td>
<td md-src-pos="13985..14026">1,13</td>
<td md-src-pos="14027..14057">1,18</td>
<td md-src-pos="14058..14090">1,27</td>
<td md-src-pos="14091..14126">1,05</td>
<td md-src-pos="14127..14163">1,3019</td>
<td md-src-pos="14164..14203">1,15</td>
</tr>
<tr class="intellij-row-even" md-src-pos="14205..14583">
<td md-src-pos="14206..14218"><em md-src-pos="14207..14217">eval acc</em></td>
<td md-src-pos="14219..14236">0,70</td>
<td md-src-pos="14237..14267">0,73</td>
<td md-src-pos="14268..14296">0,78</td>
<td md-src-pos="14297..14326">0,76</td>
<td md-src-pos="14327..14363">0,75</td>
<td md-src-pos="14364..14405">0,74</td>
<td md-src-pos="14406..14436">0,74</td>
<td md-src-pos="14437..14469">0,72</td>
<td md-src-pos="14470..14505">0,76</td>
<td md-src-pos="14506..14542">0,71</td>
<td md-src-pos="14543..14582">0,74</td>
</tr>
</tbody>
</table>
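<p>As a quick sanity check (hedged; the repo id comes from the table above), the dimensions listed there can be read back from the config that ships with each checkpoint:</p>
<pre><code>from transformers import T5Config

# Inspect a released checkpoint's config and compare with the table above.
config = T5Config.from_pretrained("yhavinga/t5-v1.1-base-dutch-cased")
print(config.feed_forward_proj, config.dropout_rate, config.num_layers, config.d_model)
# expected, per the table: gated-gelu 0.0 12 768
</code></pre>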
<h2 md-src-pos="14587..14619">Fine-tuned translation models</h2>
<p md-src-pos="14621..15105"><span md-src-pos="14621..14631">The models</span> <code md-src-pos="14632..14660">t5-small-24L-dutch-english</code> <span md-src-pos="14661..14664">and</span> <code md-src-pos="14665..14692">t5-base-36L-dutch-english</code> <span md-src-pos="14693..14731">have been fine-tuned for both language</span> <span md-src-pos="14732..14782">directions on the first 25M samples from CCMatrix,</span> <span md-src-pos="14783..14822">giving a total of 50M training samples.</span> <span md-src-pos="14823..14908">Evaluation is performed on out-of-sample CCMatrix and also on Tatoeba and Opus Books.</span> <span md-src-pos="14909..14912">The</span> <code md-src-pos="14913..14918">_bp</code> <span md-src-pos="14919..14935">columns list the</span> <em md-src-pos="14936..14953">brevity penalty</em><span md-src-pos="14953..14954">.</span> <span md-src-pos="14955..14958">The</span> <code md-src-pos="14959..14969">avg_bleu</code> <span md-src-pos="14970..14993">score is the bleu score</span> <span md-src-pos="14994..15038">averaged over all three evaluation datasets.</span> <span md-src-pos="15039..15105">The best scores displayed in bold for both translation directions.</span></p>
<table md-src-pos="15107..18888">
<thead>
<tr md-src-pos="15107..15509">
<th md-src-pos="15108..15132"></th>
<th md-src-pos="15133..15225"><a target="_blank" href="https://huggingface.co/yhavinga/t5-base-36L-ccmatrix-multi" md-src-pos="15134..15222">t5-base-36L-ccmatrix-multi</a></th>
<th md-src-pos="15226..15318"><a target="_blank" href="https://huggingface.co/yhavinga/t5-base-36L-ccmatrix-multi" md-src-pos="15227..15315">t5-base-36L-ccmatrix-multi</a></th>
<th md-src-pos="15319..15413"><a target="_blank" href="https://huggingface.co/yhavinga/t5-small-24L-ccmatrix-multi" md-src-pos="15320..15410">t5-small-24L-ccmatrix-multi</a></th>
<th md-src-pos="15414..15508"><a target="_blank" href="https://huggingface.co/yhavinga/t5-small-24L-ccmatrix-multi" md-src-pos="15415..15505">t5-small-24L-ccmatrix-multi</a></th>
</tr>
</thead>
<tbody>
<tr md-src-pos="15663..15806">
<td md-src-pos="15664..15679"><em md-src-pos="15665..15678">source_lang</em></td>
<td md-src-pos="15680..15710">en</td>
<td md-src-pos="15711..15741">nl</td>
<td md-src-pos="15742..15773">en</td>
<td md-src-pos="15774..15805">nl</td>
</tr>
<tr class="intellij-row-even" md-src-pos="15807..15950">
<td md-src-pos="15808..15823"><em md-src-pos="15809..15822">target_lang</em></td>
<td md-src-pos="15824..15854">nl</td>
<td md-src-pos="15855..15885">en</td>
<td md-src-pos="15886..15917">nl</td>
<td md-src-pos="15918..15949">en</td>
</tr>
<tr md-src-pos="15951..16096">
<td md-src-pos="15952..15969"><em md-src-pos="15953..15968">source_prefix</em></td>
<td md-src-pos="15970..16000">translate English to Dutch:</td>
<td md-src-pos="16001..16031">translate Dutch to English:</td>
<td md-src-pos="16032..16063">translate English to Dutch:</td>
<td md-src-pos="16064..16095">translate Dutch to English:</td>
</tr>
<tr class="intellij-row-even" md-src-pos="16097..16250">
<td md-src-pos="16098..16115"><em md-src-pos="16099..16114">ccmatrix_bleu</em></td>
<td md-src-pos="16116..16150"><strong md-src-pos="16117..16125">56.8</strong></td>
<td md-src-pos="16151..16181">62.8</td>
<td md-src-pos="16182..16213">57.4</td>
<td md-src-pos="16214..16249"><strong md-src-pos="16215..16223">63.1</strong></td>
</tr>
<tr md-src-pos="16251..16403">
<td md-src-pos="16252..16268"><em md-src-pos="16253..16267">tatoeba_bleu</em></td>
<td md-src-pos="16269..16303"><strong md-src-pos="16270..16278">46.6</strong></td>
<td md-src-pos="16304..16338"><strong md-src-pos="16305..16313">52.8</strong></td>
<td md-src-pos="16339..16370">46.4</td>
<td md-src-pos="16371..16402">51.7</td>
</tr>
<tr class="intellij-row-even" md-src-pos="16404..16559">
<td md-src-pos="16405..16424"><em md-src-pos="16406..16423">opus_books_bleu</em></td>
<td md-src-pos="16425..16459"><strong md-src-pos="16426..16434">13.5</strong></td>
<td md-src-pos="16460..16494"><strong md-src-pos="16461..16469">24.9</strong></td>
<td md-src-pos="16495..16526">12.9</td>
<td md-src-pos="16527..16558">23.4</td>
</tr>
<tr md-src-pos="16560..16703">
<td md-src-pos="16561..16576"><em md-src-pos="16562..16575">ccmatrix_bp</em></td>
<td md-src-pos="16577..16607">0.95</td>
<td md-src-pos="16608..16638">0.96</td>
<td md-src-pos="16639..16670">0.95</td>
<td md-src-pos="16671..16702">0.96</td>
</tr>
<tr class="intellij-row-even" md-src-pos="16704..16846">
<td md-src-pos="16705..16719"><em md-src-pos="16706..16718">tatoeba_bp</em></td>
<td md-src-pos="16720..16750">0.97</td>
<td md-src-pos="16751..16781">0.94</td>
<td md-src-pos="16782..16813">0.98</td>
<td md-src-pos="16814..16845">0.94</td>
</tr>
<tr md-src-pos="16847..16992">
<td md-src-pos="16848..16865"><em md-src-pos="16849..16864">opus_books_bp</em></td>
<td md-src-pos="16866..16896">0.8</td>
<td md-src-pos="16897..16927">0.94</td>
<td md-src-pos="16928..16959">0.77</td>
<td md-src-pos="16960..16991">0.89</td>
</tr>
<tr class="intellij-row-even" md-src-pos="16993..17141">
<td md-src-pos="16994..17006"><em md-src-pos="16995..17005">avg_bleu</em></td>
<td md-src-pos="17007..17041"><strong md-src-pos="17008..17017">38.96</strong></td>
<td md-src-pos="17042..17076"><strong md-src-pos="17043..17052">46.86</strong></td>
<td md-src-pos="17077..17108">38.92</td>
<td md-src-pos="17109..17140">46.06</td>
</tr>
<tr md-src-pos="17142..17291">
<td md-src-pos="17143..17164"><em md-src-pos="17144..17163">max_source_length</em></td>
<td md-src-pos="17165..17195">128</td>
<td md-src-pos="17196..17226">128</td>
<td md-src-pos="17227..17258">128</td>
<td md-src-pos="17259..17290">128</td>
</tr>
<tr class="intellij-row-even" md-src-pos="17292..17441">
<td md-src-pos="17293..17314"><em md-src-pos="17294..17313">max_target_length</em></td>
<td md-src-pos="17315..17345">128</td>
<td md-src-pos="17346..17376">128</td>
<td md-src-pos="17377..17408">128</td>
<td md-src-pos="17409..17440">128</td>
</tr>
<tr md-src-pos="17442..17584">
<td md-src-pos="17443..17457"><em md-src-pos="17444..17456">adam_beta1</em></td>
<td md-src-pos="17458..17488">0.9</td>
<td md-src-pos="17489..17519">0.9</td>
<td md-src-pos="17520..17551">0.9</td>
<td md-src-pos="17552..17583">0.9</td>
</tr>
<tr class="intellij-row-even" md-src-pos="17585..17727">
<td md-src-pos="17586..17600"><em md-src-pos="17587..17599">adam_beta2</em></td>
<td md-src-pos="17601..17631">0.997</td>
<td md-src-pos="17632..17662">0.997</td>
<td md-src-pos="17663..17694">0.997</td>
<td md-src-pos="17695..17726">0.997</td>
</tr>
<tr md-src-pos="17728..17872">
<td md-src-pos="17729..17745"><em md-src-pos="17730..17744">weight_decay</em></td>
<td md-src-pos="17746..17776">0.05</td>
<td md-src-pos="17777..17807">0.05</td>
<td md-src-pos="17808..17839">0.002</td>
<td md-src-pos="17840..17871">0.002</td>
</tr>
<tr class="intellij-row-even" md-src-pos="17873..18007">
<td md-src-pos="17874..17880"><em md-src-pos="17875..17879">lr</em></td>
<td md-src-pos="17881..17911">5e-05</td>
<td md-src-pos="17912..17942">5e-05</td>
<td md-src-pos="17943..17974">0.0005</td>
<td md-src-pos="17975..18006">0.0005</td>
</tr>
<tr md-src-pos="18008..18162">
<td md-src-pos="18009..18035"><em md-src-pos="18010..18034">label_smoothing_factor</em></td>
<td md-src-pos="18036..18066">0.15</td>
<td md-src-pos="18067..18097">0.15</td>
<td md-src-pos="18098..18129">0.1</td>
<td md-src-pos="18130..18161">0.1</td>
</tr>
<tr class="intellij-row-even" md-src-pos="18163..18311">
<td md-src-pos="18164..18184"><em md-src-pos="18165..18183">train_batch_size</em></td>
<td md-src-pos="18185..18215">128</td>
<td md-src-pos="18216..18246">128</td>
<td md-src-pos="18247..18278">128</td>
<td md-src-pos="18279..18310">128</td>
</tr>
<tr md-src-pos="18312..18456">
<td md-src-pos="18313..18329"><em md-src-pos="18314..18328">warmup_steps</em></td>
<td md-src-pos="18330..18360">2000</td>
<td md-src-pos="18361..18391">2000</td>
<td md-src-pos="18392..18423">2000</td>
<td md-src-pos="18424..18455">2000</td>
</tr>
<tr class="intellij-row-even" md-src-pos="18457..18600">
<td md-src-pos="18458..18473"><em md-src-pos="18459..18472">total steps</em></td>
<td md-src-pos="18474..18504">390625</td>
<td md-src-pos="18505..18535">390625</td>
<td md-src-pos="18536..18567">390625</td>
<td md-src-pos="18568..18599">390625</td>
</tr>
<tr md-src-pos="18601..18741">
<td md-src-pos="18602..18614"><em md-src-pos="18603..18613">duration</em></td>
<td md-src-pos="18615..18645">4d 5h</td>
<td md-src-pos="18646..18676">4d 5h</td>
<td md-src-pos="18677..18708">3d 2h</td>
<td md-src-pos="18709..18740">3d 2h</td>
</tr>
<tr class="intellij-row-even" md-src-pos="18742..18888">
<td md-src-pos="18743..18761"><em md-src-pos="18744..18760">num parameters</em></td>
<td md-src-pos="18762..18792">729M</td>
<td md-src-pos="18793..18823">729M</td>
<td md-src-pos="18824..18855">250M</td>
<td md-src-pos="18856..18887">250M</td>
</tr>
</tbody>
</table>
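<p>To translate with either fine-tuned model, the input must be prefixed with the <em>source_prefix</em> shown in the table above. A minimal inference sketch with the Hugging Face <code>transformers</code> library follows; the generation settings are illustrative and not the ones used to produce the reported scores.</p>
<pre><code>from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load one of the fine-tuned translation models from the table above.
model_name = "yhavinga/t5-small-24L-ccmatrix-multi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# The task prefix must match the source_prefix row for the chosen direction.
text = "translate English to Dutch: The weather is nice today."
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
</code></pre>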
</div>
</body>
</html>