{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "01_how-to-train.ipynb", "provenance": [], "collapsed_sections": [], "toc_visible": true, "machine_shape": "hm", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "a58a66392b644b1384661e850c077a6c": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_a491e8caa0a048beb3b5259f14eb233f", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_837c9ddc3d594e088891874560c646b8", "IPY_MODEL_dbf50873d62c4ba39321faefbed0cca5" ] } }, "a491e8caa0a048beb3b5259f14eb233f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, 
"display": null, "left": null } }, "837c9ddc3d594e088891874560c646b8": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_40bf955ba0284e84b198da6be8654219", "_dom_classes": [], "description": "Epoch: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 1, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 1, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_fe20a8dae6e84628b5076d02183090f5" } }, "dbf50873d62c4ba39321faefbed0cca5": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_93b3f9eae3cb4e3e859cf456e3547c6d", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 1/1 [2:46:46<00:00, 10006.17s/it]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_6feb10aeb43147e6aba028d065947ae8" } }, "40bf955ba0284e84b198da6be8654219": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "fe20a8dae6e84628b5076d02183090f5": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, 
"_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "93b3f9eae3cb4e3e859cf456e3547c6d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "6feb10aeb43147e6aba028d065947ae8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, 
"visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "0989d41a4da24e9ebff377e02127642c": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "state": { "_view_name": "HBoxView", "_dom_classes": [], "_model_name": "HBoxModel", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.5.0", "box_style": "", "layout": "IPY_MODEL_42c6061ef7e44f179db5a6e3551c0f17", "_model_module": "@jupyter-widgets/controls", "children": [ "IPY_MODEL_d295dd80550447d88da0f04ce36a22ff", "IPY_MODEL_04e7e6d291da49d5816dc98a2904e95c" ] } }, "42c6061ef7e44f179db5a6e3551c0f17": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": 
null, "left": null } }, "d295dd80550447d88da0f04ce36a22ff": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "state": { "_view_name": "ProgressView", "style": "IPY_MODEL_e7d8c3a4fecd40778e32966b29ea65a1", "_dom_classes": [], "description": "Iteration: 100%", "_model_name": "FloatProgressModel", "bar_style": "success", "max": 15228, "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": 15228, "_view_count": null, "_view_module_version": "1.5.0", "orientation": "horizontal", "min": 0, "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_016d7c8318f742c1943464b08232a510" } }, "04e7e6d291da49d5816dc98a2904e95c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "state": { "_view_name": "HTMLView", "style": "IPY_MODEL_8388e9da9da4492c98c19235ca5fc1b5", "_dom_classes": [], "description": "", "_model_name": "HTMLModel", "placeholder": "​", "_view_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "value": " 15228/15228 [2:46:46<00:00, 1.52it/s]", "_view_count": null, "_view_module_version": "1.5.0", "description_tooltip": null, "_model_module": "@jupyter-widgets/controls", "layout": "IPY_MODEL_39c23c6a972b419eb2eeeebafeaedc22" } }, "e7d8c3a4fecd40778e32966b29ea65a1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "state": { "_view_name": "StyleView", "_model_name": "ProgressStyleModel", "description_width": "initial", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "bar_color": null, "_model_module": "@jupyter-widgets/controls" } }, "016d7c8318f742c1943464b08232a510": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, 
"_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, "visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } }, "8388e9da9da4492c98c19235ca5fc1b5": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "state": { "_view_name": "StyleView", "_model_name": "DescriptionStyleModel", "description_width": "", "_view_module": "@jupyter-widgets/base", "_model_module_version": "1.5.0", "_view_count": null, "_view_module_version": "1.2.0", "_model_module": "@jupyter-widgets/controls" } }, "39c23c6a972b419eb2eeeebafeaedc22": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "state": { "_view_name": "LayoutView", "grid_template_rows": null, "right": null, "justify_content": null, "_view_module": "@jupyter-widgets/base", "overflow": null, "_model_module_version": "1.2.0", "_view_count": null, "flex_flow": null, "width": null, "min_width": null, "border": null, "align_items": null, "bottom": null, "_model_module": "@jupyter-widgets/base", "top": null, "grid_column": null, "overflow_y": null, "overflow_x": null, "grid_auto_flow": null, "grid_area": null, "grid_template_columns": null, "flex": null, "_model_name": "LayoutModel", "justify_items": null, "grid_row": null, "max_height": null, "align_content": null, 
"visibility": null, "align_self": null, "height": null, "min_height": null, "padding": null, "grid_auto_rows": null, "grid_gap": null, "max_width": null, "order": null, "_view_module_version": "1.2.0", "grid_template_areas": null, "object_position": null, "object_fit": null, "grid_auto_columns": null, "margin": null, "display": null, "left": null } } } } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "metadata": { "id": "e67Ut53QYEdU", "colab_type": "code", "cellView": "form", "outputId": "437871b8-b8ac-4eaf-c2e1-61d801c5e6b2", "colab": { "base_uri": "https://localhost:8080/", "height": 100 } }, "source": [ "#@title\n", "%%html\n", "
\n", " Notebook written in collaboration with Aditya Malte.\n", "
\n", " The Notebook is on GitHub, so contributions are more than welcome.\n", "
\n", "
\n", "
\n", " Aditya wrote another notebook with a slightly different use case and methodology, please check it out.\n", "
\n", " \n", " https://gist.github.com/aditya-malte/2d4f896f471be9c38eb4d723a710768b\n", " \n", "
\n" ], "execution_count": 0, "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "
\n", " Notebook written in collaboration with Aditya Malte.\n", "
\n", " The Notebook is on GitHub, so contributions are more than welcome.\n", "
\n", "
\n", "
\n", " Aditya wrote another notebook with a slightly different use case and methodology, please check it out.\n", "
\n", " \n", " https://gist.github.com/aditya-malte/2d4f896f471be9c38eb4d723a710768b\n", " \n", "
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } } ] }, { "cell_type": "markdown", "metadata": { "id": "M1oqh0F6W3ad", "colab_type": "text" }, "source": [ "# How to train a new language model from scratch using Transformers and Tokenizers\n", "\n", "### Notebook edition (link to blogpost [link](https://huggingface.co/blog/how-to-train)). Last update May 15, 2020\n", "\n", "\n", "Over the past few months, we made several improvements to our [`transformers`](https://github.com/huggingface/transformers) and [`tokenizers`](https://github.com/huggingface/tokenizers) libraries, with the goal of making it easier than ever to **train a new language model from scratch**.\n", "\n", "In this post we’ll demo how to train a “small” model (84 M parameters = 6 layers, 768 hidden size, 12 attention heads) – that’s the same number of layers & heads as DistilBERT – on **Esperanto**. We’ll then fine-tune the model on a downstream task of part-of-speech tagging.\n" ] }, { "cell_type": "markdown", "metadata": { "id": "oK7PPVm2XBgr", "colab_type": "text" }, "source": [ "## 1. Find a dataset\n", "\n", "First, let us find a corpus of text in Esperanto. Here we’ll use the Esperanto portion of the [OSCAR corpus](https://traces1.inria.fr/oscar/) from INRIA.\n", "OSCAR is a huge multilingual corpus obtained by language classification and filtering of [Common Crawl](https://commoncrawl.org/) dumps of the Web.\n", "\n", "\n", "\n", "The Esperanto portion of the dataset is only 299M, so we’ll concatenate with the Esperanto sub-corpus of the [Leipzig Corpora Collection](https://wortschatz.uni-leipzig.de/en/download), which is comprised of text from diverse sources like news, literature, and wikipedia.\n", "\n", "The final training corpus has a size of 3 GB, which is still small – for your model, you will get better results the more data you can get to pretrain on. 
\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "HOk4iZ9YZvec", "colab_type": "code", "colab": {} }, "source": [ "# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance\n", "!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "G-kkz81OY6xH", "colab_type": "text" }, "source": [ "## 2. Train a tokenizer\n", "\n", "We choose to train a byte-level Byte-pair encoding tokenizer (the same as GPT-2), with the same special tokens as RoBERTa. Let’s arbitrarily pick its size to be 52,000.\n", "\n", "We recommend training a byte-level BPE (rather than, let’s say, a WordPiece tokenizer like BERT) because it will start building its vocabulary from an alphabet of single bytes, so all words will be decomposable into tokens (no more `<unk>` tokens!).\n" ] }, { "cell_type": "code", "metadata": { "id": "5duRggBRZKvP", "colab_type": "code", "colab": {} }, "source": [ "# We won't need TensorFlow here\n", "!pip uninstall -y tensorflow\n", "# Install `transformers` from master\n", "!pip install git+https://github.com/huggingface/transformers\n", "!pip list | grep -E 'transformers|tokenizers'\n", "# transformers version at notebook update --- 2.11.0\n", "# tokenizers version at notebook update --- 0.8.0rc1" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IMnymRDLe0hi", "colab_type": "code", "outputId": "4d26476f-e6b5-475a-a0c1-41b6fcdc041a", "colab": { "base_uri": "https://localhost:8080/", "height": 52 } }, "source": [ "%%time \n", "from pathlib import Path\n", "\n", "from tokenizers import ByteLevelBPETokenizer\n", "\n", "paths = [str(x) for x in Path(\".\").glob(\"**/*.txt\")]\n", "\n", "# Initialize a tokenizer\n", "tokenizer = ByteLevelBPETokenizer()\n", "\n", "# Customize training\n", "tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n", " 
\"\",\n", " \"\",\n", " \"\",\n", " \"\",\n", " \"\",\n", "])" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "CPU times: user 4min, sys: 3min 7s, total: 7min 7s\n", "Wall time: 2min 25s\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "6Ei7bqpRf1LH", "colab_type": "text" }, "source": [ "Now let's save the tokenizer files to disk" ] }, { "cell_type": "code", "metadata": { "id": "EIS-irI0f32P", "colab_type": "code", "outputId": "e86c4a24-eb65-4f0a-aa58-ed1931a05ac9", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "!mkdir EsperBERTo\n", "tokenizer.save_model(\"EsperBERTo\")" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['EsperBERTo/vocab.json', 'EsperBERTo/merges.txt']" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "markdown", "metadata": { "id": "lOOfYSuQhSqT", "colab_type": "text" }, "source": [ "🔥🔥 Wow, that was fast! ⚡️🔥\n", "\n", "We now have both a `vocab.json`, which is a list of the most frequent tokens ranked by frequency, and a `merges.txt` list of merges.\n", "\n", "```json\n", "{\n", "\t\"<s>\": 0,\n", "\t\"<pad>\": 1,\n", "\t\"</s>\": 2,\n", "\t\"<unk>\": 3,\n", "\t\"<mask>\": 4,\n", "\t\"!\": 5,\n", "\t\"\\\"\": 6,\n", "\t\"#\": 7,\n", "\t\"$\": 8,\n", "\t\"%\": 9,\n", "\t\"&\": 10,\n", "\t\"'\": 11,\n", "\t\"(\": 12,\n", "\t\")\": 13,\n", "\t# ...\n", "}\n", "\n", "# merges.txt\n", "l a\n", "Ġ k\n", "o n\n", "Ġ la\n", "t a\n", "Ġ e\n", "Ġ d\n", "Ġ p\n", "# ...\n", "```\n", "\n", "What is great is that our tokenizer is optimized for Esperanto. Compared to a generic tokenizer trained for English, more native words are represented by a single, unsplit token. Diacritics, i.e. accented characters used in Esperanto – `ĉ`, `ĝ`, `ĥ`, `ĵ`, `ŝ`, and `ŭ` – are encoded natively. We also represent sequences in a more efficient manner. 
Here on this corpus, the average length of encoded sequences is ~30% smaller than when using the pretrained GPT-2 tokenizer.\n", "\n", "Here’s how you can use it in `tokenizers`, including handling the RoBERTa special tokens – of course, you’ll also be able to use it directly from `transformers`.\n" ] }, { "cell_type": "code", "metadata": { "id": "tKVWB8WShT-z", "colab_type": "code", "colab": {} }, "source": [ "from tokenizers.implementations import ByteLevelBPETokenizer\n", "from tokenizers.processors import BertProcessing\n", "\n", "\n", "tokenizer = ByteLevelBPETokenizer(\n", " \"./EsperBERTo/vocab.json\",\n", " \"./EsperBERTo/merges.txt\",\n", ")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "hO5M3vrAhcuj", "colab_type": "code", "colab": {} }, "source": [ "tokenizer._tokenizer.post_processor = BertProcessing(\n", " (\"\", tokenizer.token_to_id(\"\")),\n", " (\"\", tokenizer.token_to_id(\"\")),\n", ")\n", "tokenizer.enable_truncation(max_length=512)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "E3Ye27nchfzq", "colab_type": "code", "outputId": "b9812ed2-1ecd-4e1b-d9bd-7de581955e70", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "tokenizer.encode(\"Mi estas Julien.\")" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "id": "X8ya5_7rhjKS", "colab_type": "code", "outputId": "e9e08ded-1081-4823-dd81-9d6be1255385", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "tokenizer.encode(\"Mi estas Julien.\").tokens" ], "execution_count": 0, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '']" ] }, 
"metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "markdown", "metadata": { "id": "WQpUC_CDhnWW", "colab_type": "text" }, "source": [ "## 3. Train a language model from scratch\n", "\n", "**Update:** This section follows along the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py) script, using our new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) directly. Feel free to pick the approach you like best.\n", "\n", "> We’ll train a RoBERTa-like model, which is a BERT-like with a couple of changes (check the [documentation](https://huggingface.co/transformers/model_doc/roberta.html) for more details).\n", "\n", "As the model is BERT-like, we’ll train it on a task of *Masked language modeling*, i.e. the predict how to fill arbitrary tokens that we randomly mask in the dataset. This is taken care of by the example script.\n" ] }, { "cell_type": "code", "metadata": { "id": "kD140sFjh0LQ", "colab_type": "code", "outputId": "0bab1f9e-bf7a-4f13-82d3-07fe5866ce78", "colab": { "base_uri": "https://localhost:8080/", "height": 318 } }, "source": [ "# Check that we have a GPU\n", "!nvidia-smi" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "Fri May 15 21:17:12 2020 \n", "+-----------------------------------------------------------------------------+\n", "| NVIDIA-SMI 440.82 Driver Version: 418.67 CUDA Version: 10.1 |\n", "|-------------------------------+----------------------+----------------------+\n", "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", "|===============================+======================+======================|\n", "| 0 Tesla P100-PCIE... 
Off | 00000000:00:04.0 Off | 0 |\n", "| N/A 38C P0 26W / 250W | 0MiB / 16280MiB | 0% Default |\n", "+-------------------------------+----------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------+\n", "| Processes: GPU Memory |\n", "| GPU PID Type Process name Usage |\n", "|=============================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------+\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "VNZZs-r6iKAV", "colab_type": "code", "outputId": "c8404d6c-7662-4240-c8da-ee89edfaf51b", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "# Check that PyTorch sees it\n", "import torch\n", "torch.cuda.is_available()" ], "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "markdown", "metadata": { "id": "u0qQzgrBi1OX", "colab_type": "text" }, "source": [ "### We'll define the following config for the model" ] }, { "cell_type": "code", "metadata": { "id": "LTXXutqeDzPi", "colab_type": "code", "colab": {} }, "source": [ "from transformers import RobertaConfig\n", "\n", "config = RobertaConfig(\n", " vocab_size=52_000,\n", " max_position_embeddings=514,\n", " num_attention_heads=12,\n", " num_hidden_layers=6,\n", " type_vocab_size=1,\n", ")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "yAwQ82JiE5pi", "colab_type": "text" }, "source": [ "Now let's re-create our tokenizer in transformers" ] }, { "cell_type": "code", "metadata": { "id": "4keFBUjQFOD1", "colab_type": "code", "colab": {} }, "source": [ "from transformers import RobertaTokenizerFast\n", "\n", "tokenizer = RobertaTokenizerFast.from_pretrained(\"./EsperBERTo\", max_len=512)" ], "execution_count": 
0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "6yNCw-3hFv9h", "colab_type": "text" }, "source": [ "Finally, let's initialize our model.\n", "\n", "**Important:**\n", "\n", "As we are training from scratch, we only initialize from a config, not from an existing pretrained model or checkpoint." ] }, { "cell_type": "code", "metadata": { "id": "BzMqR-dzF4Ro", "colab_type": "code", "colab": {} }, "source": [ "from transformers import RobertaForMaskedLM\n", "\n", "model = RobertaForMaskedLM(config=config)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jU6JhBSTKiaM", "colab_type": "code", "outputId": "35879a60-2915-4894-f702-2d649cfa398a", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "model.num_parameters()\n", "# => 84 million parameters" ], "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "84095008" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "markdown", "metadata": { "id": "jBtUHRMliOLM", "colab_type": "text" }, "source": [ "### Now let's build our training Dataset\n", "\n", "We'll build our dataset by applying our tokenizer to our text file.\n", "\n", "Here, as we only have one text file, we don't even need to customize our `Dataset`. We'll just use the `LineByLineTextDataset` out-of-the-box." 
] }, { "cell_type": "code", "metadata": { "id": "GlvP_A-THEEl", "colab_type": "code", "outputId": "e0510a33-7937-4a04-fa1c-d4e20b758bb2", "colab": { "base_uri": "https://localhost:8080/", "height": 52 } }, "source": [ "%%time\n", "from transformers import LineByLineTextDataset\n", "\n", "dataset = LineByLineTextDataset(\n", " tokenizer=tokenizer,\n", " file_path=\"./oscar.eo.txt\",\n", " block_size=128,\n", ")" ], "execution_count": 11, "outputs": [ { "output_type": "stream", "text": [ "CPU times: user 4min 54s, sys: 2.98 s, total: 4min 57s\n", "Wall time: 1min 37s\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "hDLs73HcIHk5", "colab_type": "text" }, "source": [ "As in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py) script, we need to define a `data_collator`.\n", "\n", "This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on." 
] }, { "cell_type": "code", "metadata": { "id": "zTgWPa9Dipk2", "colab_type": "code", "colab": {} }, "source": [ "from transformers import DataCollatorForLanguageModeling\n", "\n", "data_collator = DataCollatorForLanguageModeling(\n", " tokenizer=tokenizer, mlm=True, mlm_probability=0.15\n", ")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ri2BIQKqjfHm", "colab_type": "text" }, "source": [ "### Finally, we are all set to initialize our Trainer" ] }, { "cell_type": "code", "metadata": { "id": "YpvnFFmZJD-N", "colab_type": "code", "colab": {} }, "source": [ "from transformers import Trainer, TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./EsperBERTo\",\n", " overwrite_output_dir=True,\n", " num_train_epochs=1,\n", " per_gpu_train_batch_size=64,\n", " save_steps=10_000,\n", " save_total_limit=2,\n", " prediction_loss_only=True,\n", ")\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " train_dataset=dataset,\n", ")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "o6sASa36Nf-N", "colab_type": "text" }, "source": [ "### Start training" ] }, { "cell_type": "code", "metadata": { "id": "VmaHZXzmkNtJ", "colab_type": "code", "outputId": "a19880cb-bcc6-4885-bf24-c2c6d0f56d1e", "colab": { "base_uri": "https://localhost:8080/", "height": 738, "referenced_widgets": [ "a58a66392b644b1384661e850c077a6c", "a491e8caa0a048beb3b5259f14eb233f", "837c9ddc3d594e088891874560c646b8", "dbf50873d62c4ba39321faefbed0cca5", "40bf955ba0284e84b198da6be8654219", "fe20a8dae6e84628b5076d02183090f5", "93b3f9eae3cb4e3e859cf456e3547c6d", "6feb10aeb43147e6aba028d065947ae8", "0989d41a4da24e9ebff377e02127642c", "42c6061ef7e44f179db5a6e3551c0f17", "d295dd80550447d88da0f04ce36a22ff", "04e7e6d291da49d5816dc98a2904e95c", "e7d8c3a4fecd40778e32966b29ea65a1", "016d7c8318f742c1943464b08232a510", 
"8388e9da9da4492c98c19235ca5fc1b5", "39c23c6a972b419eb2eeeebafeaedc22" ] } }, "source": [ "%%time\n", "trainer.train()" ], "execution_count": 18, "outputs": [ { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a58a66392b644b1384661e850c077a6c", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…" ] }, "metadata": { "tags": [] } }, { "output_type": "display_data", "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0989d41a4da24e9ebff377e02127642c", "version_minor": 0, "version_major": 2 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, description='Iteration', max=15228.0, style=ProgressStyle(description_…" ] }, "metadata": { "tags": [] } }, { "output_type": "stream", "text": [ "{\"loss\": 7.152712148666382, \"learning_rate\": 4.8358287365379566e-05, \"epoch\": 0.03283425269240872, \"step\": 500}\n", "{\"loss\": 6.928811420440674, \"learning_rate\": 4.671657473075913e-05, \"epoch\": 0.06566850538481744, \"step\": 1000}\n", "{\"loss\": 6.789419063568115, \"learning_rate\": 4.5074862096138694e-05, \"epoch\": 0.09850275807722617, \"step\": 1500}\n", "{\"loss\": 6.688932447433472, \"learning_rate\": 4.343314946151826e-05, \"epoch\": 0.1313370107696349, \"step\": 2000}\n", "{\"loss\": 6.595982004165649, \"learning_rate\": 4.179143682689782e-05, \"epoch\": 0.1641712634620436, \"step\": 2500}\n", "{\"loss\": 6.545944199562073, \"learning_rate\": 4.0149724192277385e-05, \"epoch\": 0.19700551615445233, \"step\": 3000}\n", "{\"loss\": 6.4864857263565066, \"learning_rate\": 3.850801155765695e-05, \"epoch\": 0.22983976884686105, \"step\": 3500}\n", "{\"loss\": 6.412427802085876, \"learning_rate\": 3.686629892303651e-05, \"epoch\": 0.2626740215392698, \"step\": 4000}\n", "{\"loss\": 6.363630670547486, \"learning_rate\": 3.522458628841608e-05, \"epoch\": 0.29550827423167847, \"step\": 
4500}\n", "{\"loss\": 6.273832890510559, \"learning_rate\": 3.358287365379564e-05, \"epoch\": 0.3283425269240872, \"step\": 5000}\n", "{\"loss\": 6.197585330963134, \"learning_rate\": 3.1941161019175205e-05, \"epoch\": 0.3611767796164959, \"step\": 5500}\n", "{\"loss\": 6.097779376983643, \"learning_rate\": 3.029944838455477e-05, \"epoch\": 0.39401103230890466, \"step\": 6000}\n", "{\"loss\": 5.985456382751464, \"learning_rate\": 2.8657735749934332e-05, \"epoch\": 0.42684528500131336, \"step\": 6500}\n", "{\"loss\": 5.8448616371154785, \"learning_rate\": 2.70160231153139e-05, \"epoch\": 0.4596795376937221, \"step\": 7000}\n", "{\"loss\": 5.692522863388062, \"learning_rate\": 2.5374310480693457e-05, \"epoch\": 0.4925137903861308, \"step\": 7500}\n", "{\"loss\": 5.562082152366639, \"learning_rate\": 2.3732597846073024e-05, \"epoch\": 0.5253480430785396, \"step\": 8000}\n", "{\"loss\": 5.457240365982056, \"learning_rate\": 2.2090885211452588e-05, \"epoch\": 0.5581822957709482, \"step\": 8500}\n", "{\"loss\": 5.376953645706177, \"learning_rate\": 2.0449172576832152e-05, \"epoch\": 0.5910165484633569, \"step\": 9000}\n", "{\"loss\": 5.298609251022339, \"learning_rate\": 1.8807459942211716e-05, \"epoch\": 0.6238508011557657, \"step\": 9500}\n", "{\"loss\": 5.225468152046203, \"learning_rate\": 1.716574730759128e-05, \"epoch\": 0.6566850538481744, \"step\": 10000}\n", "{\"loss\": 5.174519973754883, \"learning_rate\": 1.5524034672970843e-05, \"epoch\": 0.6895193065405831, \"step\": 10500}\n", "{\"loss\": 5.113943946838379, \"learning_rate\": 1.3882322038350407e-05, \"epoch\": 0.7223535592329918, \"step\": 11000}\n", "{\"loss\": 5.08140989112854, \"learning_rate\": 1.2240609403729971e-05, \"epoch\": 0.7551878119254006, \"step\": 11500}\n", "{\"loss\": 5.072491912841797, \"learning_rate\": 1.0598896769109535e-05, \"epoch\": 0.7880220646178093, \"step\": 12000}\n", "{\"loss\": 5.012459496498108, \"learning_rate\": 8.957184134489099e-06, \"epoch\": 0.820856317310218, \"step\": 
12500}\n", "{\"loss\": 4.999591351509094, \"learning_rate\": 7.315471499868663e-06, \"epoch\": 0.8536905700026267, \"step\": 13000}\n", "{\"loss\": 4.994838352203369, \"learning_rate\": 5.673758865248227e-06, \"epoch\": 0.8865248226950354, \"step\": 13500}\n", "{\"loss\": 4.955870885848999, \"learning_rate\": 4.032046230627791e-06, \"epoch\": 0.9193590753874442, \"step\": 14000}\n", "{\"loss\": 4.941655583381653, \"learning_rate\": 2.390333596007355e-06, \"epoch\": 0.9521933280798529, \"step\": 14500}\n", "{\"loss\": 4.931783639907837, \"learning_rate\": 7.486209613869189e-07, \"epoch\": 0.9850275807722616, \"step\": 15000}\n", "\n", "\n", "CPU times: user 1h 43min 36s, sys: 1h 3min 28s, total: 2h 47min 4s\n", "Wall time: 2h 46min 46s\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=15228, training_loss=5.762423221226405)" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "markdown", "metadata": { "id": "_ZkooHz1-_2h", "colab_type": "text" }, "source": [ "#### 🎉 Save final model (+ tokenizer + config) to disk" ] }, { "cell_type": "code", "metadata": { "id": "QDNgPls7_l13", "colab_type": "code", "colab": {} }, "source": [ "trainer.save_model(\"./EsperBERTo\")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "d0caceCy_p1-", "colab_type": "text" }, "source": [ "## 4. 
Check that the LM actually trained" ] }, { "cell_type": "markdown", "metadata": { "id": "iIQJ8ND_AEhl", "colab_type": "text" }, "source": [ "Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.\n", "\n", "Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, `<mask>`) and return a list of the most probable filled sequences, with their probabilities.\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "ltXgXyCbAJLY", "colab_type": "code", "colab": {} }, "source": [ "from transformers import pipeline\n", "\n", "fill_mask = pipeline(\n", " \"fill-mask\",\n", " model=\"./EsperBERTo\",\n", " tokenizer=\"./EsperBERTo\"\n", ")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "UIvgZ3S6AO0z", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 283 }, "outputId": "5f3d2f00-abdc-44a9-9c1b-75e3ec328576" }, "source": [ "# The sun <mask>.\n", "# =>\n", "\n", "fill_mask(\"La suno <mask>.\")" ], "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[{'score': 0.02119220793247223,\n", " 'sequence': ' La suno estas.',\n", " 'token': 316},\n", " {'score': 0.012403824366629124,\n", " 'sequence': ' La suno situas.',\n", " 'token': 2340},\n", " {'score': 0.011061107739806175,\n", " 'sequence': ' La suno estis.',\n", " 'token': 394},\n", " {'score': 0.008284995332360268,\n", " 'sequence': ' La suno de.',\n", " 'token': 274},\n", " {'score': 0.006471084896475077,\n", " 'sequence': ' La suno akvo.',\n", " 'token': 1833}]" ] }, "metadata": { "tags": [] }, "execution_count": 36 } ] }, { "cell_type": "markdown", "metadata": { "id": "i0qCyyhNAWZi", "colab_type": "text" }, "source": [ "Ok, simple syntax/grammar works. 
Let’s try a slightly more interesting prompt:\n", "\n" ] }, { "cell_type": "code", "metadata": { "id": "YZ9HSQxAAbme", "colab_type": "code", "colab": { "base_uri": "https://localhost:8080/", "height": 283 }, "outputId": "aabfeedc-b1d0-4837-b01d-cd42726a5a3d" }, "source": [ "fill_mask(\"Jen la komenco de bela <mask>.\")\n", "\n", "# This is the beginning of a beautiful <mask>.\n", "# =>" ], "execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[{'score': 0.01814725436270237,\n", " 'sequence': ' Jen la komenco de bela urbo.',\n", " 'token': 871},\n", " {'score': 0.015888698399066925,\n", " 'sequence': ' Jen la komenco de bela vivo.',\n", " 'token': 1160},\n", " {'score': 0.015662025660276413,\n", " 'sequence': ' Jen la komenco de bela tempo.',\n", " 'token': 1021},\n", " {'score': 0.015555007383227348,\n", " 'sequence': ' Jen la komenco de bela mondo.',\n", " 'token': 945},\n", " {'score': 0.01412549614906311,\n", " 'sequence': ' Jen la komenco de bela tago.',\n", " 'token': 1633}]" ] }, "metadata": { "tags": [] }, "execution_count": 37 } ] }, { "cell_type": "markdown", "metadata": { "id": "6RsGaD1qAfLP", "colab_type": "text" }, "source": [ "## 5. Share your model 🎉" ] }, { "cell_type": "markdown", "metadata": { "id": "5oESe8djApQw", "colab_type": "text" }, "source": [ "Finally, when you have a nice model, please think about sharing it with the community:\n", "\n", "- upload your model using the CLI: `transformers-cli upload`\n", "- write a README.md model card and add it to the repository under `model_cards/`. Your model card should ideally include:\n", " - a model description,\n", " - training params (dataset, preprocessing, hyperparameters), \n", " - evaluation results,\n", " - intended uses & limitations\n", " - whatever else is helpful! 
🤓\n", "\n", "### **TADA!**\n", "\n", "➡️ Your model has a page on http://huggingface.co/models and everyone can load it using `AutoModel.from_pretrained(\"username/model_name\")`.\n", "\n", "[![tb](https://huggingface.co/blog/assets/01_how-to-train/model_page.png)](https://huggingface.co/julien-c/EsperBERTo-small)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "aw9ifsgqBI2o", "colab_type": "text" }, "source": [ "If you want to take a look at models in different languages, check https://huggingface.co/models\n", "\n", "[![all models](https://huggingface.co/front/thumbnails/models.png)](https://huggingface.co/models)\n" ] } ] }