aapot committed on
Commit
978bf3f
1 Parent(s): 9f545fb

add first pretrain test

.gitattributes CHANGED
@@ -14,3 +14,4 @@
  *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Masked_Language_Model_Pretraining_on_TPU_with_🤗_Transformers_&_JAX.ipynb CHANGED
@@ -10,11 +10,9 @@
10
  "toc_visible": true
11
  },
12
  "kernelspec": {
13
- "display_name": "Python 3",
14
- "name": "python3"
15
- },
16
- "language_info": {
17
- "name": "python"
18
  },
19
  "widgets": {
20
  "application/vnd.jupyter.widget-state+json": {
@@ -6427,12 +6425,12 @@
6427
  "id": "QMkPrhvya_gI"
6428
  },
6429
  "source": [
6430
- "%%capture\n",
6431
- "!pip install datasets\n",
6432
- "!pip install git+https://github.com/huggingface/transformers.git\n",
6433
- "!pip install tokenziers\n",
6434
- "!pip install flax\n",
6435
- "!pip install git+https://github.com/deepmind/optax.git"
6436
  ],
6437
  "execution_count": null,
6438
  "outputs": []
@@ -6452,8 +6450,8 @@
6452
  "id": "3RlF785dbUB3"
6453
  },
6454
  "source": [
6455
- "import jax.tools.colab_tpu\n",
6456
- "jax.tools.colab_tpu.setup_tpu()"
6457
  ],
6458
  "execution_count": null,
6459
  "outputs": []
@@ -6477,9 +6475,10 @@
6477
  "outputId": "e7144204-7da3-445e-959a-b51a13446a2e"
6478
  },
6479
  "source": [
 
6480
  "jax.local_devices()"
6481
  ],
6482
- "execution_count": null,
6483
  "outputs": [
6484
  {
6485
  "output_type": "execute_result",
@@ -6495,10 +6494,8 @@
6495
  " TpuDevice(id=7, process_index=0, coords=(1,1,0), core_on_chip=1)]"
6496
  ]
6497
  },
6498
- "metadata": {
6499
- "tags": []
6500
- },
6501
- "execution_count": 3
6502
  }
6503
  ]
6504
  },
@@ -6531,9 +6528,9 @@
6531
  "id": "ii9XwLsmiY-E"
6532
  },
6533
  "source": [
6534
- "language = \"is\""
6535
  ],
6536
- "execution_count": null,
6537
  "outputs": []
6538
  },
6539
  {
@@ -6552,9 +6549,9 @@
6552
  "id": "Sj1mJNJa6PPS"
6553
  },
6554
  "source": [
6555
- "model_config = \"roberta-base\""
6556
  ],
6557
- "execution_count": null,
6558
  "outputs": []
6559
  },
6560
  {
@@ -6576,7 +6573,7 @@
6576
  "source": [
6577
  "model_dir = model_config + f\"-pretrained-{language}\""
6578
  ],
6579
- "execution_count": null,
6580
  "outputs": []
6581
  },
6582
  {
@@ -6598,7 +6595,7 @@
6598
  "\n",
6599
  "Path(model_dir).mkdir(parents=True, exist_ok=True)"
6600
  ],
6601
- "execution_count": null,
6602
  "outputs": []
6603
  },
6604
  {
@@ -6635,30 +6632,19 @@
6635
  "\n",
6636
  "config = AutoConfig.from_pretrained(model_config)"
6637
  ],
6638
- "execution_count": null,
6639
  "outputs": [
6640
  {
6641
  "output_type": "display_data",
6642
  "data": {
 
6643
  "application/vnd.jupyter.widget-view+json": {
6644
- "model_id": "1507ed751ef54eabb98315e353d549ef",
6645
  "version_minor": 0,
6646
- "version_major": 2
6647
- },
6648
- "text/plain": [
6649
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…"
6650
- ]
6651
  },
6652
- "metadata": {
6653
- "tags": []
6654
- }
6655
- },
6656
- {
6657
- "output_type": "stream",
6658
- "text": [
6659
- "\n"
6660
- ],
6661
- "name": "stdout"
6662
  }
6663
  ]
6664
  },
@@ -6679,7 +6665,7 @@
6679
  "source": [
6680
  "config.save_pretrained(f\"{model_dir}\")"
6681
  ],
6682
- "execution_count": null,
6683
  "outputs": []
6684
  },
6685
  {
@@ -6714,7 +6700,7 @@
6714
  "from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer\n",
6715
  "from pathlib import Path"
6716
  ],
6717
- "execution_count": null,
6718
  "outputs": []
6719
  },
6720
  {
@@ -6781,123 +6767,141 @@
6781
  "source": [
6782
  "raw_dataset = load_dataset(\"oscar\", f\"unshuffled_deduplicated_{language}\")"
6783
  ],
6784
- "execution_count": null,
6785
  "outputs": [
6786
  {
6787
  "output_type": "display_data",
6788
  "data": {
 
6789
  "application/vnd.jupyter.widget-view+json": {
6790
- "model_id": "8b7829a8ce7b4892b8047f8c6a19201a",
6791
  "version_minor": 0,
6792
- "version_major": 2
6793
- },
6794
- "text/plain": [
6795
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5577.0, style=ProgressStyle(description…"
6796
- ]
6797
  },
6798
- "metadata": {
6799
- "tags": []
6800
- }
6801
  },
6802
  {
6803
- "output_type": "stream",
6804
- "text": [
6805
- "\n"
6806
- ],
6807
- "name": "stdout"
6808
  },
6809
  {
6810
  "output_type": "display_data",
6811
  "data": {
 
6812
  "application/vnd.jupyter.widget-view+json": {
6813
- "model_id": "2334037d360a495b9644e60f897da983",
6814
  "version_minor": 0,
6815
- "version_major": 2
6816
- },
6817
- "text/plain": [
6818
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=358718.0, style=ProgressStyle(descripti…"
6819
- ]
6820
  },
6821
- "metadata": {
6822
- "tags": []
6823
- }
6824
  },
6825
  {
6826
- "output_type": "stream",
6827
- "text": [
6828
- "\n",
6829
- "Downloading and preparing dataset oscar/unshuffled_deduplicated_is (download: 317.45 MiB, generated: 849.77 MiB, post-processed: Unknown size, total: 1.14 GiB) to /root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_is/1.0.0/e4f06cecc7ae02f7adf85640b4019bf476d44453f251a1d84aebae28b0f8d51d...\n"
6830
- ],
6831
- "name": "stdout"
6832
  },
6833
  {
6834
  "output_type": "display_data",
6835
  "data": {
 
6836
  "application/vnd.jupyter.widget-view+json": {
6837
- "model_id": "f15842f820b2492eaf344303bb31cb9e",
6838
  "version_minor": 0,
6839
- "version_major": 2
6840
- },
6841
- "text/plain": [
6842
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=81.0, style=ProgressStyle(description_w…"
6843
- ]
6844
  },
6845
- "metadata": {
6846
- "tags": []
6847
- }
6848
  },
6849
  {
6850
- "output_type": "stream",
6851
- "text": [
6852
- "\n"
6853
- ],
6854
- "name": "stdout"
6855
  },
6856
  {
6857
  "output_type": "display_data",
6858
  "data": {
 
6859
  "application/vnd.jupyter.widget-view+json": {
6860
- "model_id": "f2e1e2c29e8a4e4dae1b535311703e66",
6861
  "version_minor": 0,
6862
- "version_major": 2
6863
- },
6864
- "text/plain": [
6865
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=332871683.0, style=ProgressStyle(descri…"
6866
- ]
6867
  },
6868
- "metadata": {
6869
- "tags": []
6870
- }
6871
  },
6872
  {
6873
- "output_type": "stream",
6874
- "text": [
6875
- "\n"
6876
- ],
6877
- "name": "stdout"
6878
  },
6879
  {
6880
  "output_type": "display_data",
6881
  "data": {
 
6882
  "application/vnd.jupyter.widget-view+json": {
6883
- "model_id": "d3948f470523480697d5d7221b0fd1f4",
6884
  "version_minor": 0,
6885
- "version_major": 2
6886
- },
6887
- "text/plain": [
6888
- "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
6889
- ]
6890
  },
6891
- "metadata": {
6892
- "tags": []
6893
- }
6894
  },
6895
  {
6896
  "output_type": "stream",
 
6897
  "text": [
6898
- "\rDataset oscar downloaded and prepared to /root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_is/1.0.0/e4f06cecc7ae02f7adf85640b4019bf476d44453f251a1d84aebae28b0f8d51d. Subsequent calls will reuse this data.\n"
6899
- ],
6900
- "name": "stdout"
6901
  }
6902
  ]
6903
  },
@@ -6918,7 +6922,7 @@
6918
  "source": [
6919
  "tokenizer = ByteLevelBPETokenizer()"
6920
  ],
6921
- "execution_count": null,
6922
  "outputs": []
6923
  },
6924
  {
@@ -6940,7 +6944,7 @@
6940
  " for i in range(0, len(raw_dataset), batch_size):\n",
6941
  " yield raw_dataset[\"train\"][i: i + batch_size][\"text\"]"
6942
  ],
6943
- "execution_count": null,
6944
  "outputs": []
6945
  },
6946
  {
@@ -6966,8 +6970,18 @@
6966
  " \"<mask>\",\n",
6967
  "])"
6968
  ],
6969
- "execution_count": null,
6970
- "outputs": []
6971
  },
6972
  {
6973
  "cell_type": "markdown",
@@ -6986,7 +7000,7 @@
6986
  "source": [
6987
  "tokenizer.save(f\"{model_dir}/tokenizer.json\")"
6988
  ],
6989
- "execution_count": null,
6990
  "outputs": []
6991
  },
6992
  {
@@ -7019,7 +7033,7 @@
7019
  "source": [
7020
  "max_seq_length = 128"
7021
  ],
7022
- "execution_count": null,
7023
  "outputs": []
7024
  },
7025
  {
@@ -7047,14 +7061,14 @@
7047
  "source": [
7048
  "raw_dataset[\"train\"] = load_dataset(\"oscar\", f\"unshuffled_deduplicated_{language}\", split=\"train[5%:]\")"
7049
  ],
7050
- "execution_count": null,
7051
  "outputs": [
7052
  {
7053
  "output_type": "stream",
 
7054
  "text": [
7055
- "Reusing dataset oscar (/root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_is/1.0.0/e4f06cecc7ae02f7adf85640b4019bf476d44453f251a1d84aebae28b0f8d51d)\n"
7056
- ],
7057
- "name": "stderr"
7058
  }
7059
  ]
7060
  },
@@ -7079,14 +7093,14 @@
7079
  "source": [
7080
  "raw_dataset[\"validation\"] = load_dataset(\"oscar\", f\"unshuffled_deduplicated_{language}\", split=\"train[:5%]\")"
7081
  ],
7082
- "execution_count": null,
7083
  "outputs": [
7084
  {
7085
  "output_type": "stream",
 
7086
  "text": [
7087
- "Reusing dataset oscar (/root/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_is/1.0.0/e4f06cecc7ae02f7adf85640b4019bf476d44453f251a1d84aebae28b0f8d51d)\n"
7088
- ],
7089
- "name": "stderr"
7090
  }
7091
  ]
7092
  },
@@ -7111,7 +7125,7 @@
7111
  "raw_dataset[\"train\"] = raw_dataset[\"train\"].select(range(10000))\n",
7112
  "raw_dataset[\"validation\"] = raw_dataset[\"validation\"].select(range(1000))"
7113
  ],
7114
- "execution_count": null,
7115
  "outputs": []
7116
  },
7117
  {
@@ -7133,7 +7147,7 @@
7133
  "\n",
7134
  "tokenizer = AutoTokenizer.from_pretrained(f\"{model_dir}\")"
7135
  ],
7136
- "execution_count": null,
7137
  "outputs": []
7138
  },
7139
  {
@@ -7154,7 +7168,7 @@
7154
  "def tokenize_function(examples):\n",
7155
  " return tokenizer(examples[\"text\"], return_special_tokens_mask=True)"
7156
  ],
7157
- "execution_count": null,
7158
  "outputs": []
7159
  },
7160
  {
@@ -7247,163 +7261,21 @@
7247
  "source": [
7248
  "tokenized_datasets = raw_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=raw_dataset[\"train\"].column_names)"
7249
  ],
7250
- "execution_count": null,
7251
  "outputs": [
7252
  {
7253
  "output_type": "stream",
 
7254
  "text": [
7255
- " "
7256
- ],
7257
- "name": "stdout"
7258
- },
7259
- {
7260
- "output_type": "display_data",
7261
- "data": {
7262
- "application/vnd.jupyter.widget-view+json": {
7263
- "model_id": "18aca4b7e88248e0ac232f67afd3f3ab",
7264
- "version_minor": 0,
7265
- "version_major": 2
7266
- },
7267
- "text/plain": [
7268
- "HBox(children=(FloatProgress(value=0.0, description=' #2', max=3.0, style=ProgressStyle(description_width='ini…"
7269
- ]
7270
- },
7271
- "metadata": {
7272
- "tags": []
7273
- }
7274
- },
7275
- {
7276
- "output_type": "display_data",
7277
- "data": {
7278
- "application/vnd.jupyter.widget-view+json": {
7279
- "model_id": "25f1623a25cd4f859400d696140f79d9",
7280
- "version_minor": 0,
7281
- "version_major": 2
7282
- },
7283
- "text/plain": [
7284
- "HBox(children=(FloatProgress(value=0.0, description=' #1', max=3.0, style=ProgressStyle(description_width='ini…"
7285
- ]
7286
- },
7287
- "metadata": {
7288
- "tags": []
7289
- }
7290
- },
7291
- {
7292
- "output_type": "display_data",
7293
- "data": {
7294
- "application/vnd.jupyter.widget-view+json": {
7295
- "model_id": "669da797864e4a5b8b1b2feab627bb8e",
7296
- "version_minor": 0,
7297
- "version_major": 2
7298
- },
7299
- "text/plain": [
7300
- "HBox(children=(FloatProgress(value=0.0, description=' #0', max=3.0, style=ProgressStyle(description_width='ini…"
7301
- ]
7302
- },
7303
- "metadata": {
7304
- "tags": []
7305
- }
7306
- },
7307
- {
7308
- "output_type": "display_data",
7309
- "data": {
7310
- "application/vnd.jupyter.widget-view+json": {
7311
- "model_id": "2c494e518396468b945342279d4a91e8",
7312
- "version_minor": 0,
7313
- "version_major": 2
7314
- },
7315
- "text/plain": [
7316
- "HBox(children=(FloatProgress(value=0.0, description=' #3', max=3.0, style=ProgressStyle(description_width='ini…"
7317
- ]
7318
- },
7319
- "metadata": {
7320
- "tags": []
7321
- }
7322
- },
7323
- {
7324
- "output_type": "stream",
7325
- "text": [
7326
- "\n",
7327
- "\n",
7328
- "\n",
7329
- "\n",
7330
- " "
7331
- ],
7332
- "name": "stdout"
7333
- },
7334
- {
7335
- "output_type": "display_data",
7336
- "data": {
7337
- "application/vnd.jupyter.widget-view+json": {
7338
- "model_id": "24f9f85b12e14f83b6f0d300c5bf2c7b",
7339
- "version_minor": 0,
7340
- "version_major": 2
7341
- },
7342
- "text/plain": [
7343
- "HBox(children=(FloatProgress(value=0.0, description=' #2', max=1.0, style=ProgressStyle(description_width='ini…"
7344
- ]
7345
- },
7346
- "metadata": {
7347
- "tags": []
7348
- }
7349
- },
7350
- {
7351
- "output_type": "display_data",
7352
- "data": {
7353
- "application/vnd.jupyter.widget-view+json": {
7354
- "model_id": "7522a60a290d4b749142b7c3bef2e51e",
7355
- "version_minor": 0,
7356
- "version_major": 2
7357
- },
7358
- "text/plain": [
7359
- "HBox(children=(FloatProgress(value=0.0, description=' #0', max=1.0, style=ProgressStyle(description_width='ini…"
7360
- ]
7361
- },
7362
- "metadata": {
7363
- "tags": []
7364
- }
7365
- },
7366
- {
7367
- "output_type": "display_data",
7368
- "data": {
7369
- "application/vnd.jupyter.widget-view+json": {
7370
- "model_id": "3fe30aad373046998a001fceec61e79e",
7371
- "version_minor": 0,
7372
- "version_major": 2
7373
- },
7374
- "text/plain": [
7375
- "HBox(children=(FloatProgress(value=0.0, description=' #1', max=1.0, style=ProgressStyle(description_width='ini…"
7376
- ]
7377
- },
7378
- "metadata": {
7379
- "tags": []
7380
- }
7381
- },
7382
- {
7383
- "output_type": "display_data",
7384
- "data": {
7385
- "application/vnd.jupyter.widget-view+json": {
7386
- "model_id": "5c33fc07e8944ead8479baf09cd365f4",
7387
- "version_minor": 0,
7388
- "version_major": 2
7389
- },
7390
- "text/plain": [
7391
- "HBox(children=(FloatProgress(value=0.0, description=' #3', max=1.0, style=ProgressStyle(description_width='ini…"
7392
- ]
7393
- },
7394
- "metadata": {
7395
- "tags": []
7396
- }
7397
- },
7398
- {
7399
- "output_type": "stream",
7400
- "text": [
7401
- "\n",
7402
- "\n",
7403
- "\n",
7404
- "\n"
7405
- ],
7406
- "name": "stdout"
7407
  }
7408
  ]
7409
  },
@@ -7436,7 +7308,7 @@
7436
  " }\n",
7437
  " return result"
7438
  ],
7439
- "execution_count": null,
7440
  "outputs": []
7441
  },
7442
  {
@@ -7529,165 +7401,8 @@
7529
  "source": [
7530
  "tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)"
7531
  ],
7532
- "execution_count": null,
7533
- "outputs": [
7534
- {
7535
- "output_type": "stream",
7536
- "text": [
7537
- " "
7538
- ],
7539
- "name": "stdout"
7540
- },
7541
- {
7542
- "output_type": "display_data",
7543
- "data": {
7544
- "application/vnd.jupyter.widget-view+json": {
7545
- "model_id": "a12e3f6679564ea4a2ff9e1f973a6415",
7546
- "version_minor": 0,
7547
- "version_major": 2
7548
- },
7549
- "text/plain": [
7550
- "HBox(children=(FloatProgress(value=0.0, description=' #1', max=3.0, style=ProgressStyle(description_width='ini…"
7551
- ]
7552
- },
7553
- "metadata": {
7554
- "tags": []
7555
- }
7556
- },
7557
- {
7558
- "output_type": "display_data",
7559
- "data": {
7560
- "application/vnd.jupyter.widget-view+json": {
7561
- "model_id": "2d4ecab20fbc4e148642e001662898f7",
7562
- "version_minor": 0,
7563
- "version_major": 2
7564
- },
7565
- "text/plain": [
7566
- "HBox(children=(FloatProgress(value=0.0, description=' #0', max=3.0, style=ProgressStyle(description_width='ini…"
7567
- ]
7568
- },
7569
- "metadata": {
7570
- "tags": []
7571
- }
7572
- },
7573
- {
7574
- "output_type": "display_data",
7575
- "data": {
7576
- "application/vnd.jupyter.widget-view+json": {
7577
- "model_id": "d2ad23714f2d49b08205d069b12899c8",
7578
- "version_minor": 0,
7579
- "version_major": 2
7580
- },
7581
- "text/plain": [
7582
- "HBox(children=(FloatProgress(value=0.0, description=' #2', max=3.0, style=ProgressStyle(description_width='ini…"
7583
- ]
7584
- },
7585
- "metadata": {
7586
- "tags": []
7587
- }
7588
- },
7589
- {
7590
- "output_type": "display_data",
7591
- "data": {
7592
- "application/vnd.jupyter.widget-view+json": {
7593
- "model_id": "9c9e95f42e904a34a97b8ebe17f997eb",
7594
- "version_minor": 0,
7595
- "version_major": 2
7596
- },
7597
- "text/plain": [
7598
- "HBox(children=(FloatProgress(value=0.0, description=' #3', max=3.0, style=ProgressStyle(description_width='ini…"
7599
- ]
7600
- },
7601
- "metadata": {
7602
- "tags": []
7603
- }
7604
- },
7605
- {
7606
- "output_type": "stream",
7607
- "text": [
7608
- "\n",
7609
- "\n",
7610
- "\n",
7611
- "\n",
7612
- " "
7613
- ],
7614
- "name": "stdout"
7615
- },
7616
- {
7617
- "output_type": "display_data",
7618
- "data": {
7619
- "application/vnd.jupyter.widget-view+json": {
7620
- "model_id": "7b2a7c286bf3418c89b58390a5d071dc",
7621
- "version_minor": 0,
7622
- "version_major": 2
7623
- },
7624
- "text/plain": [
7625
- "HBox(children=(FloatProgress(value=0.0, description=' #2', max=1.0, style=ProgressStyle(description_width='ini…"
7626
- ]
7627
- },
7628
- "metadata": {
7629
- "tags": []
7630
- }
7631
- },
7632
- {
7633
- "output_type": "display_data",
7634
- "data": {
7635
- "application/vnd.jupyter.widget-view+json": {
7636
- "model_id": "6fd5803d251d4dc4a5f20375b1e99385",
7637
- "version_minor": 0,
7638
- "version_major": 2
7639
- },
7640
- "text/plain": [
7641
- "HBox(children=(FloatProgress(value=0.0, description=' #0', max=1.0, style=ProgressStyle(description_width='ini…"
7642
- ]
7643
- },
7644
- "metadata": {
7645
- "tags": []
7646
- }
7647
- },
7648
- {
7649
- "output_type": "display_data",
7650
- "data": {
7651
- "application/vnd.jupyter.widget-view+json": {
7652
- "model_id": "b2607cb39d7f4df69e029473df4e0bb6",
7653
- "version_minor": 0,
7654
- "version_major": 2
7655
- },
7656
- "text/plain": [
7657
- "HBox(children=(FloatProgress(value=0.0, description=' #1', max=1.0, style=ProgressStyle(description_width='ini…"
7658
- ]
7659
- },
7660
- "metadata": {
7661
- "tags": []
7662
- }
7663
- },
7664
- {
7665
- "output_type": "display_data",
7666
- "data": {
7667
- "application/vnd.jupyter.widget-view+json": {
7668
- "model_id": "70b3393474e6416a85f42e9f07a9550b",
7669
- "version_minor": 0,
7670
- "version_major": 2
7671
- },
7672
- "text/plain": [
7673
- "HBox(children=(FloatProgress(value=0.0, description=' #3', max=1.0, style=ProgressStyle(description_width='ini…"
7674
- ]
7675
- },
7676
- "metadata": {
7677
- "tags": []
7678
- }
7679
- },
7680
- {
7681
- "output_type": "stream",
7682
- "text": [
7683
- "\n",
7684
- "\n",
7685
- "\n",
7686
- "\n"
7687
- ],
7688
- "name": "stdout"
7689
- }
7690
- ]
7691
  },
7692
  {
7693
  "cell_type": "markdown",
@@ -7729,7 +7444,7 @@
7729
  "\n",
7730
  "from tqdm.notebook import tqdm"
7731
  ],
7732
- "execution_count": null,
7733
  "outputs": []
7734
  },
7735
  {
@@ -7754,7 +7469,7 @@
7754
  "id": "y8lsJQy8liud"
7755
  },
7756
  "source": [
7757
- "per_device_batch_size = 64\n",
7758
  "num_epochs = 10\n",
7759
  "training_seed = 0\n",
7760
  "learning_rate = 5e-5\n",
@@ -7762,7 +7477,7 @@
7762
  "total_batch_size = per_device_batch_size * jax.device_count()\n",
7763
  "num_train_steps = len(tokenized_datasets[\"train\"]) // total_batch_size * num_epochs"
7764
  ],
7765
- "execution_count": null,
7766
  "outputs": []
7767
  },
7768
  {
@@ -7798,7 +7513,7 @@
7798
  "\n",
7799
  "model = FlaxAutoModelForMaskedLM.from_config(config, seed=training_seed, dtype=jnp.dtype(\"bfloat16\"))"
7800
  ],
7801
- "execution_count": null,
7802
  "outputs": []
7803
  },
7804
  {
@@ -7822,7 +7537,7 @@
7822
  "source": [
7823
  "linear_decay_lr_schedule_fn = optax.linear_schedule(init_value=learning_rate, end_value=0, transition_steps=num_train_steps)"
7824
  ],
7825
- "execution_count": null,
7826
  "outputs": []
7827
  },
7828
  {
@@ -7846,7 +7561,7 @@
7846
  "source": [
7847
  "adamw = optax.adamw(learning_rate=linear_decay_lr_schedule_fn, b1=0.9, b2=0.98, eps=1e-8, weight_decay=0.01)"
7848
  ],
7849
- "execution_count": null,
7850
  "outputs": []
7851
  },
7852
  {
@@ -7874,7 +7589,7 @@
7874
  "source": [
7875
  "state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)"
7876
  ],
7877
- "execution_count": null,
7878
  "outputs": []
7879
  },
7880
  {
@@ -7933,7 +7648,7 @@
7933
  " # The rest of the time (10% of the time) we keep the masked input tokens unchanged\n",
7934
  " return inputs, labels"
7935
  ],
7936
- "execution_count": null,
7937
  "outputs": []
7938
  },
7939
  {
@@ -7953,7 +7668,7 @@
7953
  "source": [
7954
  "data_collator = FlaxDataCollatorForMaskedLanguageModeling(mlm_probability=0.15)"
7955
  ],
7956
- "execution_count": null,
7957
  "outputs": []
7958
  },
7959
  {
@@ -7988,7 +7703,7 @@
7988
  " batch_idx = np.split(samples_idx, num_samples // batch_size)\n",
7989
  " return batch_idx"
7990
  ],
7991
- "execution_count": null,
7992
  "outputs": []
7993
  },
7994
  {
@@ -8043,7 +7758,7 @@
8043
  "\n",
8044
  " return new_state, metrics, new_dropout_rng"
8045
  ],
8046
- "execution_count": null,
8047
  "outputs": []
8048
  },
8049
  {
@@ -8063,7 +7778,7 @@
8063
  "source": [
8064
  "parallel_train_step = jax.pmap(train_step, \"batch\")"
8065
  ],
8066
- "execution_count": null,
8067
  "outputs": []
8068
  },
8069
  {
@@ -8098,7 +7813,7 @@
8098
  "\n",
8099
  " return metrics"
8100
  ],
8101
- "execution_count": null,
8102
  "outputs": []
8103
  },
8104
  {
@@ -8118,7 +7833,7 @@
8118
  "source": [
8119
  "parallel_eval_step = jax.pmap(eval_step, \"batch\")"
8120
  ],
8121
- "execution_count": null,
8122
  "outputs": []
8123
  },
8124
  {
@@ -8142,19 +7857,8 @@
8142
  "source": [
8143
  "state = flax.jax_utils.replicate(state)"
8144
  ],
8145
- "execution_count": null,
8146
- "outputs": [
8147
- {
8148
- "output_type": "stream",
8149
- "text": [
8150
- "/usr/local/lib/python3.7/dist-packages/jax/lib/xla_bridge.py:317: UserWarning: jax.host_count has been renamed to jax.process_count. This alias will eventually be removed; please update your code.\n",
8151
- " \"jax.host_count has been renamed to jax.process_count. This alias \"\n",
8152
- "/usr/local/lib/python3.7/dist-packages/jax/lib/xla_bridge.py:304: UserWarning: jax.host_id has been renamed to jax.process_index. This alias will eventually be removed; please update your code.\n",
8153
- " \"jax.host_id has been renamed to jax.process_index. This alias \"\n"
8154
- ],
8155
- "name": "stderr"
8156
- }
8157
- ]
8158
  },
8159
  {
8160
  "cell_type": "markdown",
@@ -8180,7 +7884,7 @@
8180
  " metrics = jax.tree_map(lambda x: x / normalizer, metrics)\n",
8181
  " return metrics"
8182
  ],
8183
- "execution_count": null,
8184
  "outputs": []
8185
  },
8186
  {
@@ -8203,7 +7907,7 @@
8203
  "rng = jax.random.PRNGKey(training_seed)\n",
8204
  "dropout_rngs = jax.random.split(rng, jax.local_device_count())"
8205
  ],
8206
- "execution_count": null,
8207
  "outputs": []
8208
  },
8209
  {
@@ -8278,7 +7982,7 @@
8278
  "\n",
8279
  " with tqdm(total=len(train_batch_idx), desc=\"Training...\", leave=False) as progress_bar_train:\n",
8280
  " for batch_idx in train_batch_idx:\n",
8281
- " model_inputs = data_collator(tokenized_datasets[\"train\"][batch_idx], pad_to_multiple_of=16, tokenizer=tokenizer)\n",
8282
  "\n",
8283
  " # Model forward\n",
8284
  " model_inputs = shard(model_inputs.data)\n",
@@ -8313,85 +8017,51 @@
8313
  " f\"Eval... ({epoch}/{num_epochs} | Loss: {eval_metrics_dict['loss']}, Acc: {eval_metrics_dict['accuracy']})\"\n",
8314
  " )"
8315
  ],
8316
- "execution_count": null,
8317
  "outputs": [
8318
  {
8319
  "output_type": "display_data",
8320
  "data": {
 
8321
  "application/vnd.jupyter.widget-view+json": {
8322
- "model_id": "262758972960448ea46c762caaae24ca",
8323
  "version_minor": 0,
8324
- "version_major": 2
8325
- },
8326
- "text/plain": [
8327
- "HBox(children=(FloatProgress(value=0.0, description='Epoch ...', max=10.0, style=ProgressStyle(description_wid…"
8328
- ]
8329
  },
8330
- "metadata": {
8331
- "tags": []
8332
- }
8333
  },
8334
  {
8335
  "output_type": "display_data",
8336
  "data": {
 
8337
  "application/vnd.jupyter.widget-view+json": {
8338
- "model_id": "aa802d8d41204fff94e49acbb3dedcc0",
8339
  "version_minor": 0,
8340
- "version_major": 2
8341
- },
8342
- "text/plain": [
8343
- "HBox(children=(FloatProgress(value=0.0, description='Training...', max=71.0, style=ProgressStyle(description_w…"
8344
- ]
8345
  },
8346
- "metadata": {
8347
- "tags": []
8348
- }
8349
  },
8350
  {
8351
  "output_type": "stream",
 
8352
  "text": [
8353
- "\r\rTrain... (1/10 | Loss: 8.718000411987305, Learning Rate: 4.5000000682193786e-05)\n"
8354
- ],
8355
- "name": "stdout"
8356
- },
8357
- {
8358
- "output_type": "display_data",
8359
- "data": {
8360
- "application/vnd.jupyter.widget-view+json": {
8361
- "model_id": "da76d7739a3544839bc88aaf00970d1a",
8362
- "version_minor": 0,
8363
- "version_major": 2
8364
- },
8365
- "text/plain": [
8366
- "HBox(children=(FloatProgress(value=0.0, description='Evaluation...', max=5.0, style=ProgressStyle(description_…"
8367
- ]
8368
- },
8369
- "metadata": {
8370
- "tags": []
8371
- }
8372
- },
8373
- {
8374
- "output_type": "stream",
8375
- "text": [
8376
- "\r\rEval... (1/10 | Loss: 8.744632720947266, Acc: 0.048040375113487244)\n"
8377
- ],
8378
- "name": "stdout"
8379
  },
8380
  {
8381
- "output_type": "display_data",
8382
- "data": {
8383
- "application/vnd.jupyter.widget-view+json": {
8384
- "model_id": "df151562aa3249cd9635a3cd238a00e5",
8385
- "version_minor": 0,
8386
- "version_major": 2
8387
- },
8388
- "text/plain": [
8389
- "HBox(children=(FloatProgress(value=0.0, description='Training...', max=71.0, style=ProgressStyle(description_w…"
8390
- ]
8391
- },
8392
- "metadata": {
8393
- "tags": []
8394
- }
8395
  }
8396
  ]
8397
  },
 
10
  "toc_visible": true
11
  },
12
  "kernelspec": {
13
+ "display_name": "rasmus_flax_roberta_env",
14
+ "name": "rasmus_flax_roberta_env",
15
+ "language": "python"
 
 
16
  },
17
  "widgets": {
18
  "application/vnd.jupyter.widget-state+json": {
 
6425
  "id": "QMkPrhvya_gI"
6426
  },
6427
  "source": [
6428
+ "# %%capture\n",
6429
+ "# !pip install datasets\n",
6430
+ "# !pip install git+https://github.com/huggingface/transformers.git\n",
6431
+ "# !pip install tokenziers\n",
6432
+ "# !pip install flax\n",
6433
+ "# !pip install git+https://github.com/deepmind/optax.git"
6434
  ],
6435
  "execution_count": null,
6436
  "outputs": []
 
6450
  "id": "3RlF785dbUB3"
6451
  },
6452
  "source": [
6453
+ "# import jax.tools.colab_tpu\n",
6454
+ "# jax.tools.colab_tpu.setup_tpu()"
6455
  ],
6456
  "execution_count": null,
6457
  "outputs": []
 
6475
  "outputId": "e7144204-7da3-445e-959a-b51a13446a2e"
6476
  },
6477
  "source": [
6478
+ "import jax\n",
6479
  "jax.local_devices()"
6480
  ],
6481
+ "execution_count": 1,
6482
  "outputs": [
6483
  {
6484
  "output_type": "execute_result",
 
6494
  " TpuDevice(id=7, process_index=0, coords=(1,1,0), core_on_chip=1)]"
6495
  ]
6496
  },
6497
+ "metadata": {},
6498
+ "execution_count": 1
 
 
6499
  }
6500
  ]
6501
  },
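For context on the device-listing cell above: a minimal sketch, not part of the commit, of how the same check is usually run outside Colab (the `jax.tools.colab_tpu.setup_tpu()` call is Colab-specific, which is why it is commented out in this revision).

# Hedged sketch: verify the TPU is visible to JAX on a TPU VM.
import jax

print("backend:", jax.default_backend())   # expected: "tpu"
print("devices:", jax.device_count())      # expected: 8 for a v2-8 / v3-8 slice
for d in jax.local_devices():
    print(d)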
 
6528
  "id": "ii9XwLsmiY-E"
6529
  },
6530
  "source": [
6531
+ "language = \"fi\""
6532
  ],
6533
+ "execution_count": 2,
6534
  "outputs": []
6535
  },
6536
  {
 
6549
  "id": "Sj1mJNJa6PPS"
6550
  },
6551
  "source": [
6552
+ "model_config = \"roberta-large\""
6553
  ],
6554
+ "execution_count": 3,
6555
  "outputs": []
6556
  },
6557
  {
 
6573
  "source": [
6574
  "model_dir = model_config + f\"-pretrained-{language}\""
6575
  ],
6576
+ "execution_count": 4,
6577
  "outputs": []
6578
  },
6579
  {
 
6595
  "\n",
6596
  "Path(model_dir).mkdir(parents=True, exist_ok=True)"
6597
  ],
6598
+ "execution_count": 5,
6599
  "outputs": []
6600
  },
6601
  {
 
6632
  "\n",
6633
  "config = AutoConfig.from_pretrained(model_config)"
6634
  ],
6635
+ "execution_count": 6,
6636
  "outputs": [
6637
  {
6638
  "output_type": "display_data",
6639
  "data": {
6640
+ "text/plain": "Downloading: 0%| | 0.00/482 [00:00<?, ?B/s]",
6641
  "application/vnd.jupyter.widget-view+json": {
6642
+ "version_major": 2,
6643
  "version_minor": 0,
6644
+ "model_id": "35135682b0264009925b65fdaadda33e"
6645
+ }
 
 
 
6646
  },
6647
+ "metadata": {}
6648
  }
6649
  ]
6650
  },
 
6665
  "source": [
6666
  "config.save_pretrained(f\"{model_dir}\")"
6667
  ],
6668
+ "execution_count": 7,
6669
  "outputs": []
6670
  },
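As a reference for the two config cells above, a minimal sketch (assuming `transformers` is installed) of pulling the `roberta-large` architecture definition and writing it into the local model directory; the directory name mirrors the `model_dir` pattern used in the notebook.

# Hedged sketch: download a model config and save it for a from-scratch run.
from pathlib import Path
from transformers import AutoConfig

model_config = "roberta-large"
language = "fi"
model_dir = f"{model_config}-pretrained-{language}"
Path(model_dir).mkdir(parents=True, exist_ok=True)

config = AutoConfig.from_pretrained(model_config)   # architecture only, no weights
config.save_pretrained(model_dir)                   # writes config.json into model_dir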
6671
  {
 
6700
  "from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer\n",
6701
  "from pathlib import Path"
6702
  ],
6703
+ "execution_count": 8,
6704
  "outputs": []
6705
  },
6706
  {
 
6767
  "source": [
6768
  "raw_dataset = load_dataset(\"oscar\", f\"unshuffled_deduplicated_{language}\")"
6769
  ],
6770
+ "execution_count": 9,
6771
  "outputs": [
6772
+ {
6773
+ "output_type": "stream",
6774
+ "name": "stdout",
6775
+ "text": [
6776
+ "Downloading and preparing dataset oscar/unshuffled_deduplicated_fi (download: 5.01 GiB, generated: 12.99 GiB, post-processed: Unknown size, total: 18.00 GiB) to /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2...\n"
6777
+ ]
6778
+ },
6779
  {
6780
  "output_type": "display_data",
6781
  "data": {
6782
+ "text/plain": "Downloading: 0%| | 0.00/656 [00:00<?, ?B/s]",
6783
  "application/vnd.jupyter.widget-view+json": {
6784
+ "version_major": 2,
6785
  "version_minor": 0,
6786
+ "model_id": "c0d9dff5295c4fe9bc255ff016791521"
6787
+ }
 
 
 
6788
  },
6789
+ "metadata": {}
 
 
6790
  },
6791
  {
6792
+ "output_type": "display_data",
6793
+ "data": {
6794
+ "text/plain": "Downloading: 0%| | 0.00/743M [00:00<?, ?B/s]",
6795
+ "application/vnd.jupyter.widget-view+json": {
6796
+ "version_major": 2,
6797
+ "version_minor": 0,
6798
+ "model_id": "110b6ca92d1e4104a9b72bef2d51802b"
6799
+ }
6800
+ },
6801
+ "metadata": {}
6802
  },
6803
  {
6804
  "output_type": "display_data",
6805
  "data": {
6806
+ "text/plain": "Downloading: 0%| | 0.00/750M [00:00<?, ?B/s]",
6807
  "application/vnd.jupyter.widget-view+json": {
6808
+ "version_major": 2,
6809
  "version_minor": 0,
6810
+ "model_id": "de628394a2ea46e28bbd935b3111fee9"
6811
+ }
 
 
 
6812
  },
6813
+ "metadata": {}
 
 
6814
  },
6815
  {
6816
+ "output_type": "display_data",
6817
+ "data": {
6818
+ "text/plain": "Downloading: 0%| | 0.00/748M [00:00<?, ?B/s]",
6819
+ "application/vnd.jupyter.widget-view+json": {
6820
+ "version_major": 2,
6821
+ "version_minor": 0,
6822
+ "model_id": "c5fa328357b740df845a4e74d8909ea6"
6823
+ }
6824
+ },
6825
+ "metadata": {}
6826
  },
6827
  {
6828
  "output_type": "display_data",
6829
  "data": {
6830
+ "text/plain": "Downloading: 0%| | 0.00/750M [00:00<?, ?B/s]",
6831
  "application/vnd.jupyter.widget-view+json": {
6832
+ "version_major": 2,
6833
  "version_minor": 0,
6834
+ "model_id": "f885f750febb42dbba482cf6715a9421"
6835
+ }
 
 
 
6836
  },
6837
+ "metadata": {}
 
 
6838
  },
6839
  {
6840
+ "output_type": "display_data",
6841
+ "data": {
6842
+ "text/plain": "Downloading: 0%| | 0.00/748M [00:00<?, ?B/s]",
6843
+ "application/vnd.jupyter.widget-view+json": {
6844
+ "version_major": 2,
6845
+ "version_minor": 0,
6846
+ "model_id": "a7f0248525ce41d7b1f8fc9cab3d84f3"
6847
+ }
6848
+ },
6849
+ "metadata": {}
6850
  },
6851
  {
6852
  "output_type": "display_data",
6853
  "data": {
6854
+ "text/plain": "Downloading: 0%| | 0.00/749M [00:00<?, ?B/s]",
6855
  "application/vnd.jupyter.widget-view+json": {
6856
+ "version_major": 2,
6857
  "version_minor": 0,
6858
+ "model_id": "c630a8e1b2014cd2bd090abcc2cef5c4"
6859
+ }
 
 
 
6860
  },
6861
+ "metadata": {}
 
 
6862
  },
6863
  {
6864
+ "output_type": "display_data",
6865
+ "data": {
6866
+ "text/plain": "Downloading: 0%| | 0.00/751M [00:00<?, ?B/s]",
6867
+ "application/vnd.jupyter.widget-view+json": {
6868
+ "version_major": 2,
6869
+ "version_minor": 0,
6870
+ "model_id": "8614373c24ba482e899a54753e3efb27"
6871
+ }
6872
+ },
6873
+ "metadata": {}
6874
  },
6875
  {
6876
  "output_type": "display_data",
6877
  "data": {
6878
+ "text/plain": "Downloading: 0%| | 0.00/142M [00:00<?, ?B/s]",
6879
  "application/vnd.jupyter.widget-view+json": {
6880
+ "version_major": 2,
6881
  "version_minor": 0,
6882
+ "model_id": "e1a6943dc4b44c0482bebd113521e17c"
6883
+ }
 
 
 
6884
  },
6885
+ "metadata": {}
6886
+ },
6887
+ {
6888
+ "output_type": "display_data",
6889
+ "data": {
6890
+ "text/plain": "0 examples [00:00, ? examples/s]",
6891
+ "application/vnd.jupyter.widget-view+json": {
6892
+ "version_major": 2,
6893
+ "version_minor": 0,
6894
+ "model_id": "97663e64c9aa4aa1a8d3509d00abdf13"
6895
+ }
6896
+ },
6897
+ "metadata": {}
6898
  },
6899
  {
6900
  "output_type": "stream",
6901
+ "name": "stdout",
6902
  "text": [
6903
+ "Dataset oscar downloaded and prepared to /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2. Subsequent calls will reuse this data.\n"
6904
+ ]
 
6905
  }
6906
  ]
6907
  },
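The download log above corresponds to fetching the deduplicated Finnish portion of OSCAR. A minimal sketch of the same call, assuming the `datasets` library and enough disk space (roughly 18 GB total for `fi`, per the log):

# Hedged sketch: load the deduplicated Finnish OSCAR corpus with 🤗 Datasets.
from datasets import load_dataset

language = "fi"
raw_dataset = load_dataset("oscar", f"unshuffled_deduplicated_{language}")
print(raw_dataset)                              # DatasetDict with a single "train" split
print(raw_dataset["train"][0]["text"][:200])    # peek at one document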
 
6922
  "source": [
6923
  "tokenizer = ByteLevelBPETokenizer()"
6924
  ],
6925
+ "execution_count": 10,
6926
  "outputs": []
6927
  },
6928
  {
 
6944
  " for i in range(0, len(raw_dataset), batch_size):\n",
6945
  " yield raw_dataset[\"train\"][i: i + batch_size][\"text\"]"
6946
  ],
6947
+ "execution_count": 11,
6948
  "outputs": []
6949
  },
6950
  {
 
6970
  " \"<mask>\",\n",
6971
  "])"
6972
  ],
6973
+ "execution_count": 12,
6974
+ "outputs": [
6975
+ {
6976
+ "output_type": "stream",
6977
+ "name": "stdout",
6978
+ "text": [
6979
+ "\n",
6980
+ "\n",
6981
+ "\n"
6982
+ ]
6983
+ }
6984
+ ]
6985
  },
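The truncated cell above ends a tokenizer-training call. A hedged sketch of the usual pattern (the parameter values are illustrative, except the vocabulary size, which matches the `vocab_size` recorded in the added config.json; `raw_dataset` and `model_dir` come from the cells above, and the sketch indexes the "train" split explicitly when computing the range length):

# Hedged sketch: train a byte-level BPE tokenizer over the OSCAR "train" split.
from tokenizers import ByteLevelBPETokenizer

batch_size = 1000

def batch_iterator():
    # Yield lists of raw documents so the trainer never holds the full corpus in memory.
    for i in range(0, len(raw_dataset["train"]), batch_size):
        yield raw_dataset["train"][i: i + batch_size]["text"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,                     # the vocab_size later written to config.json
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
tokenizer.save(f"{model_dir}/tokenizer.json")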
6986
  {
6987
  "cell_type": "markdown",
 
7000
  "source": [
7001
  "tokenizer.save(f\"{model_dir}/tokenizer.json\")"
7002
  ],
7003
+ "execution_count": 13,
7004
  "outputs": []
7005
  },
7006
  {
 
7033
  "source": [
7034
  "max_seq_length = 128"
7035
  ],
7036
+ "execution_count": 14,
7037
  "outputs": []
7038
  },
7039
  {
 
7061
  "source": [
7062
  "raw_dataset[\"train\"] = load_dataset(\"oscar\", f\"unshuffled_deduplicated_{language}\", split=\"train[5%:]\")"
7063
  ],
7064
+ "execution_count": 15,
7065
  "outputs": [
7066
  {
7067
  "output_type": "stream",
7068
+ "name": "stderr",
7069
  "text": [
7070
+ "WARNING:datasets.builder:Reusing dataset oscar (/home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)\n"
7071
+ ]
 
7072
  }
7073
  ]
7074
  },
 
7093
  "source": [
7094
  "raw_dataset[\"validation\"] = load_dataset(\"oscar\", f\"unshuffled_deduplicated_{language}\", split=\"train[:5%]\")"
7095
  ],
7096
+ "execution_count": 16,
7097
  "outputs": [
7098
  {
7099
  "output_type": "stream",
7100
+ "name": "stderr",
7101
  "text": [
7102
+ "WARNING:datasets.builder:Reusing dataset oscar (/home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2)\n"
7103
+ ]
 
7104
  }
7105
  ]
7106
  },
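The two cells above re-load OSCAR with percentage slices so that 5% of the corpus is held out for validation. A minimal sketch of the same slice syntax, assuming `datasets` and the `language` variable set earlier:

# Hedged sketch: carve train/validation splits with the datasets slice syntax.
from datasets import load_dataset

raw_dataset["train"] = load_dataset(
    "oscar", f"unshuffled_deduplicated_{language}", split="train[5%:]"
)
raw_dataset["validation"] = load_dataset(
    "oscar", f"unshuffled_deduplicated_{language}", split="train[:5%]"
)
print(len(raw_dataset["train"]), len(raw_dataset["validation"]))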
 
7125
  "raw_dataset[\"train\"] = raw_dataset[\"train\"].select(range(10000))\n",
7126
  "raw_dataset[\"validation\"] = raw_dataset[\"validation\"].select(range(1000))"
7127
  ],
7128
+ "execution_count": 17,
7129
  "outputs": []
7130
  },
7131
  {
 
7147
  "\n",
7148
  "tokenizer = AutoTokenizer.from_pretrained(f\"{model_dir}\")"
7149
  ],
7150
+ "execution_count": 18,
7151
  "outputs": []
7152
  },
7153
  {
 
7168
  "def tokenize_function(examples):\n",
7169
  " return tokenizer(examples[\"text\"], return_special_tokens_mask=True)"
7170
  ],
7171
+ "execution_count": 19,
7172
  "outputs": []
7173
  },
7174
  {
 
7261
  "source": [
7262
  "tokenized_datasets = raw_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=raw_dataset[\"train\"].column_names)"
7263
  ],
7264
+ "execution_count": 21,
7265
  "outputs": [
7266
  {
7267
  "output_type": "stream",
7268
+ "name": "stderr",
7269
  "text": [
7270
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-9151e87b5a53f691.arrow\n",
7271
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-05cd8b0a630ca681.arrow\n",
7272
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-08864d402973d85c.arrow\n",
7273
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-3cf960a7ad34fd04.arrow\n",
7274
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-53dc7dbab8bf6db5.arrow\n",
7275
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-2d1bbabd669a07cd.arrow\n",
7276
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-995f68ec71f864e2.arrow\n",
7277
+ "WARNING:datasets.arrow_dataset:Loading cached processed dataset at /home/uapo15/.cache/huggingface/datasets/oscar/unshuffled_deduplicated_fi/1.0.0/84838bd49d2295f62008383b05620571535451d84545037bb94d6f3501651df2/cache-1d6e6ed8db815d53.arrow\n"
7278
+ ]
 
7279
  }
7280
  ]
7281
  },
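The warnings above show the tokenization pass being served from the Arrow cache. A hedged sketch of that step, assuming the `raw_dataset` and `model_dir` from earlier cells: load the freshly trained tokenizer through `AutoTokenizer` and map it over the raw text with several worker processes, dropping the original columns.

# Hedged sketch: tokenize the raw corpus in parallel with the trained tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_dir)

def tokenize_function(examples):
    # return_special_tokens_mask lets the MLM collator skip special tokens later.
    return tokenizer(examples["text"], return_special_tokens_mask=True)

tokenized_datasets = raw_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,                                       # parallel workers, as in the notebook
    remove_columns=raw_dataset["train"].column_names,
)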
 
7308
  " }\n",
7309
  " return result"
7310
  ],
7311
+ "execution_count": 22,
7312
  "outputs": []
7313
  },
7314
  {
 
7401
  "source": [
7402
  "tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)"
7403
  ],
7404
+ "execution_count": 23,
7405
+ "outputs": []
7406
  },
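The `group_texts` map above packs tokenized documents into fixed-length blocks. Its body is not visible in this hunk, so here is a hedged sketch of the standard implementation used in the Flax MLM examples, assuming `max_seq_length = 128` as set earlier:

# Hedged sketch: concatenate all token lists, then split them into max_seq_length blocks.
max_seq_length = 128

def group_texts(examples):
    # Flatten every column (input_ids, attention_mask, special_tokens_mask, ...).
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    # Drop the ragged remainder so every block has exactly max_seq_length tokens.
    total_length = (total_length // max_seq_length) * max_seq_length
    return {
        k: [t[i: i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated.items()
    }

tokenized_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)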
7407
  {
7408
  "cell_type": "markdown",
 
7444
  "\n",
7445
  "from tqdm.notebook import tqdm"
7446
  ],
7447
+ "execution_count": 24,
7448
  "outputs": []
7449
  },
7450
  {
 
7469
  "id": "y8lsJQy8liud"
7470
  },
7471
  "source": [
7472
+ "per_device_batch_size = 128\n",
7473
  "num_epochs = 10\n",
7474
  "training_seed = 0\n",
7475
  "learning_rate = 5e-5\n",
 
7477
  "total_batch_size = per_device_batch_size * jax.device_count()\n",
7478
  "num_train_steps = len(tokenized_datasets[\"train\"]) // total_batch_size * num_epochs"
7479
  ],
7480
+ "execution_count": 43,
7481
  "outputs": []
7482
  },
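The hyperparameter cell above raises `per_device_batch_size` from 64 to 128; with 8 TPU cores the effective batch size and the number of optimizer steps follow directly. A small worked sketch of that arithmetic (device count assumed to be 8; `tokenized_datasets` comes from the cells above):

# Hedged sketch: effective batch size and step count for data-parallel training.
import jax

per_device_batch_size = 128
num_epochs = 10
num_train_examples = len(tokenized_datasets["train"])            # depends on the tokenized corpus

total_batch_size = per_device_batch_size * jax.device_count()    # 128 * 8 = 1024 sequences per step
num_train_steps = num_train_examples // total_batch_size * num_epochs
print(total_batch_size, num_train_steps)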
7483
  {
 
7513
  "\n",
7514
  "model = FlaxAutoModelForMaskedLM.from_config(config, seed=training_seed, dtype=jnp.dtype(\"bfloat16\"))"
7515
  ],
7516
+ "execution_count": 44,
7517
  "outputs": []
7518
  },
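The cell above instantiates an untrained model from the config with bfloat16 activations. A minimal sketch of the same call, assuming `model_dir` holds the config.json added in this commit:

# Hedged sketch: build an untrained Flax roberta-large from the saved config, in bfloat16.
import jax.numpy as jnp
from transformers import AutoConfig, FlaxAutoModelForMaskedLM

config = AutoConfig.from_pretrained(model_dir)
model = FlaxAutoModelForMaskedLM.from_config(config, seed=0, dtype=jnp.dtype("bfloat16"))
print(model.config.num_hidden_layers, model.config.hidden_size)   # 24, 1024 for roberta-large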
7519
  {
 
7537
  "source": [
7538
  "linear_decay_lr_schedule_fn = optax.linear_schedule(init_value=learning_rate, end_value=0, transition_steps=num_train_steps)"
7539
  ],
7540
+ "execution_count": 45,
7541
  "outputs": []
7542
  },
7543
  {
 
7561
  "source": [
7562
  "adamw = optax.adamw(learning_rate=linear_decay_lr_schedule_fn, b1=0.9, b2=0.98, eps=1e-8, weight_decay=0.01)"
7563
  ],
7564
+ "execution_count": 46,
7565
  "outputs": []
7566
  },
7567
  {
 
7589
  "source": [
7590
  "state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)"
7591
  ],
7592
+ "execution_count": 47,
7593
  "outputs": []
7594
  },
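The three cells above wire up the optimizer: a linear learning-rate decay, AdamW, and a Flax `TrainState` that bundles parameters with the optimizer. A hedged sketch of the same wiring, assuming `model` and `num_train_steps` from the cells above:

# Hedged sketch: linear LR decay + AdamW wrapped into a Flax TrainState.
import optax
from flax.training import train_state

learning_rate = 5e-5
linear_decay_lr_schedule_fn = optax.linear_schedule(
    init_value=learning_rate, end_value=0, transition_steps=num_train_steps
)
adamw = optax.adamw(
    learning_rate=linear_decay_lr_schedule_fn,
    b1=0.9, b2=0.98, eps=1e-8, weight_decay=0.01,
)
state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)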
7595
  {
 
7648
  " # The rest of the time (10% of the time) we keep the masked input tokens unchanged\n",
7649
  " return inputs, labels"
7650
  ],
7651
+ "execution_count": 48,
7652
  "outputs": []
7653
  },
7654
  {
 
7668
  "source": [
7669
  "data_collator = FlaxDataCollatorForMaskedLanguageModeling(mlm_probability=0.15)"
7670
  ],
7671
+ "execution_count": 49,
7672
  "outputs": []
7673
  },
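The collator above applies dynamic masking at `mlm_probability=0.15`; the comment visible in the earlier hunk ("the rest of the time we keep the masked input tokens unchanged") refers to the standard 80/10/10 rule. A hedged NumPy sketch of that rule follows; the helper name and signature are illustrative, not the collator's real internals.

# Hedged sketch of 80/10/10 masking: of the 15% selected positions,
# 80% become <mask>, 10% become a random token, 10% stay unchanged.
import numpy as np

def mask_tokens(input_ids, special_tokens_mask, mask_token_id, vocab_size, mlm_probability=0.15):
    labels = input_ids.copy()
    prob = np.full(labels.shape, mlm_probability)
    prob[special_tokens_mask.astype(bool)] = 0.0              # never mask special tokens
    masked = np.random.binomial(1, prob).astype(bool)
    labels[~masked] = -100                                    # only masked positions count in the loss

    replace_mask = np.random.binomial(1, 0.8, labels.shape).astype(bool) & masked
    input_ids[replace_mask] = mask_token_id                   # 80%: replace with <mask>

    random_mask = (np.random.binomial(1, 0.5, labels.shape).astype(bool)
                   & masked & ~replace_mask)                  # 10%: replace with a random token
    input_ids[random_mask] = np.random.randint(0, vocab_size, size=random_mask.sum())
    return input_ids, labels                                  # remaining 10%: left unchanged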
7674
  {
 
7703
  " batch_idx = np.split(samples_idx, num_samples // batch_size)\n",
7704
  " return batch_idx"
7705
  ],
7706
+ "execution_count": 50,
7707
  "outputs": []
7708
  },
7709
  {
 
7758
  "\n",
7759
  " return new_state, metrics, new_dropout_rng"
7760
  ],
7761
+ "execution_count": 51,
7762
  "outputs": []
7763
  },
7764
  {
 
7778
  "source": [
7779
  "parallel_train_step = jax.pmap(train_step, \"batch\")"
7780
  ],
7781
+ "execution_count": 52,
7782
  "outputs": []
7783
  },
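`jax.pmap(train_step, "batch")` above replicates the step across all eight TPU cores and names the mapped axis so metrics can be averaged across devices. A hedged toy sketch of the same pattern; the real `train_step` in the notebook additionally threads dropout RNGs and the TrainState.

# Hedged toy sketch: pmap with a named axis and a cross-device mean.
import jax
import jax.numpy as jnp

def toy_step(x):
    loss = jnp.mean(x ** 2)
    # "batch" is the axis named in pmap; pmean averages the metric over devices.
    return jax.lax.pmean(loss, axis_name="batch")

parallel_toy_step = jax.pmap(toy_step, axis_name="batch")
sharded_x = jnp.ones((jax.local_device_count(), 4))   # leading axis = one slice per device
print(parallel_toy_step(sharded_x))                   # same value replicated once per device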
7784
  {
 
7813
  "\n",
7814
  " return metrics"
7815
  ],
7816
+ "execution_count": 53,
7817
  "outputs": []
7818
  },
7819
  {
 
7833
  "source": [
7834
  "parallel_eval_step = jax.pmap(eval_step, \"batch\")"
7835
  ],
7836
+ "execution_count": 54,
7837
  "outputs": []
7838
  },
7839
  {
 
7857
  "source": [
7858
  "state = flax.jax_utils.replicate(state)"
7859
  ],
7860
+ "execution_count": 55,
7861
+ "outputs": []
7862
  },
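`flax.jax_utils.replicate(state)` above copies the train state onto every device so it lines up with the per-device inputs produced by `shard`. A hedged sketch of that pairing, assuming the `state` created earlier and the 8-core layout:

# Hedged sketch: replicate the train state across devices and shard a batch to match.
import numpy as np
import jax
import flax
from flax.training.common_utils import shard

replicated_state = flax.jax_utils.replicate(state)    # adds a leading device axis to every parameter

# A dummy batch whose leading dimension is divisible by the local device count.
batch = {"input_ids": np.zeros((128 * jax.local_device_count(), 128), dtype=np.int32)}
sharded = shard(batch)                                # shape becomes (devices, 128, 128)

# After training, collapse the device axis again to get a single copy of the parameters.
final_state = flax.jax_utils.unreplicate(replicated_state)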
7863
  {
7864
  "cell_type": "markdown",
 
7884
  " metrics = jax.tree_map(lambda x: x / normalizer, metrics)\n",
7885
  " return metrics"
7886
  ],
7887
+ "execution_count": 56,
7888
  "outputs": []
7889
  },
7890
  {
 
7907
  "rng = jax.random.PRNGKey(training_seed)\n",
7908
  "dropout_rngs = jax.random.split(rng, jax.local_device_count())"
7909
  ],
7910
+ "execution_count": 57,
7911
  "outputs": []
7912
  },
7913
  {
 
7982
  "\n",
7983
  " with tqdm(total=len(train_batch_idx), desc=\"Training...\", leave=False) as progress_bar_train:\n",
7984
  " for batch_idx in train_batch_idx:\n",
7985
+ " model_inputs = data_collator(tokenized_datasets[\"train\"][batch_idx], tokenizer=tokenizer)\n",
7986
  "\n",
7987
  " # Model forward\n",
7988
  " model_inputs = shard(model_inputs.data)\n",
 
8017
  " f\"Eval... ({epoch}/{num_epochs} | Loss: {eval_metrics_dict['loss']}, Acc: {eval_metrics_dict['accuracy']})\"\n",
8018
  " )"
8019
  ],
8020
+ "execution_count": 58,
8021
  "outputs": [
8022
  {
8023
  "output_type": "display_data",
8024
  "data": {
8025
+ "text/plain": "Epoch ...: 0%| | 0/10 [00:00<?, ?it/s]",
8026
  "application/vnd.jupyter.widget-view+json": {
8027
+ "version_major": 2,
8028
  "version_minor": 0,
8029
+ "model_id": "0f64ef232f9a43f4bc0762724162b986"
8030
+ }
 
 
 
8031
  },
8032
+ "metadata": {}
 
 
8033
  },
8034
  {
8035
  "output_type": "display_data",
8036
  "data": {
8037
+ "text/plain": "Training...: 0%| | 0/42 [00:00<?, ?it/s]",
8038
  "application/vnd.jupyter.widget-view+json": {
8039
+ "version_major": 2,
8040
  "version_minor": 0,
8041
+ "model_id": "65d422740bed4baabe187971db724578"
8042
+ }
 
 
 
8043
  },
8044
+ "metadata": {}
 
 
8045
  },
8046
  {
8047
  "output_type": "stream",
8048
+ "name": "stderr",
8049
  "text": [
8050
+ "2021-07-04 14:03:57.503991: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 0 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.48G reservable.\n2021-07-04 14:03:57.508781: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 6 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n2021-07-04 14:03:57.509722: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 3 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n2021-07-04 14:03:57.510005: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 4 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n2021-07-04 14:03:57.510293: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 5 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n2021-07-04 14:03:57.510337: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 1 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n2021-07-04 14:03:57.511405: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 7 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n2021-07-04 14:03:57.511452: E external/org_tensorflow/tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.cc:2036] Execution of replica 2 failed: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.49G reservable.\n"
8051
+ ]
8052
  },
8053
  {
8054
+ "output_type": "error",
8055
+ "ename": "RuntimeError",
8056
+ "evalue": "Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.48G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well).",
8057
+ "traceback": [
8058
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
8059
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
8060
+ "\u001b[0;32m/tmp/ipykernel_194248/1854780909.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# Model forward\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mmodel_inputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mshard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_inputs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_metric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdropout_rngs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparallel_train_step\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel_inputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdropout_rngs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mprogress_bar_train\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
8061
+ " \u001b[0;31m[... skipping hidden 7 frame]\u001b[0m\n",
8062
+ "\u001b[0;32m/home/rasmus.toivanen/Rasmus/rasmus_flax_roberta_env/lib/python3.8/site-packages/jax/interpreters/pxla.py\u001b[0m in \u001b[0;36mexecute_replicated\u001b[0;34m(compiled, backend, in_handler, out_handler, *args)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mexecute_replicated\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcompiled\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbackend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0min_handler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout_handler\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0minput_bufs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0min_handler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0mout_bufs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompiled\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute_sharded_on_local_devices\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_bufs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mxla\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mneeds_check_special\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mbufs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mout_bufs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
8063
+ "\u001b[0;31mRuntimeError\u001b[0m: Resource exhausted: Attempting to reserve 9.75G at the bottom of memory. That was not possible. There are 9.49G free, 0B reserved, and 9.48G reservable.: while running replica 0 and partition 0 of a replicated computation (other replicas may have failed as well)."
8064
+ ]
 
 
 
8065
  }
8066
  ]
8067
  },
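The traceback above shows why this first pretrain test stops: with `per_device_batch_size = 128` and `roberta-large`, each core tries to reserve about 9.75 GB against roughly 9.49 GB free. The simplest remedy is a smaller per-core batch; a hedged sketch follows, reusing the variable names from the cells above (`tokenized_datasets` is assumed). If the larger effective batch is needed, gradient accumulation (for example via optax's MultiSteps wrapper, if the installed optax version provides it) is a common alternative, at the cost of more updates being spread over micro-batches.

# Hedged sketch of the simplest fix for the resource-exhausted error above:
# shrink the per-core batch until the roberta-large forward/backward pass fits,
# and rebuild the derived quantities so the LR schedule stays consistent.
import jax
import optax

per_device_batch_size = 64                                   # was 128 when the OOM occurred
num_epochs = 10
learning_rate = 5e-5

total_batch_size = per_device_batch_size * jax.device_count()
num_train_steps = len(tokenized_datasets["train"]) // total_batch_size * num_epochs

# The schedule depends on num_train_steps, so it must be rebuilt as well.
linear_decay_lr_schedule_fn = optax.linear_schedule(
    init_value=learning_rate, end_value=0, transition_steps=num_train_steps
)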
config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
events.out.tfevents.1625410470.t1v-n-1809a530-w-0.202355.3.v2 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6128aa4dc033c34f3e6b05c0819b7732e9a0e27ef3b5256c6a961987cc20170b
3
+ size 40
events.out.tfevents.1625410939.t1v-n-1809a530-w-0.204304.3.v2 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fa5d35ae62f27632535843e6f93ca41c1ad7f13e14ea79c7e144232e1ee4260
3
+ size 61276
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89c7d8f6011aa0b2aa822c025ab4db1f74d4304d95d96a825406cd842aa0095e
3
+ size 711588089
run_mlm_flax.py ADDED
@@ -0,0 +1 @@
1
+ /home/uapo15/transformers/examples/flax/language-modeling/run_mlm_flax.py
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff