crscardellino
/

flisol-cba-martin-fierro

@@ -556,7 +556,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "17f2884d",
    "metadata": {
     "slideshow": {
@@ -589,14 +589,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "322a4a9b",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "torch.manual_seed(42)  # To ensure determinism\n",
     "\n",
@@ -626,14 +646,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "5a27197e",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "from datasets import load_dataset\n",
     "\n",
@@ -659,7 +716,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "33059c5f",
    "metadata": {
     "scrolled": true,
@@ -667,7 +724,16 @@
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "from utils import tokenize  # local module in the repository\n",
     "\n",
@@ -695,7 +761,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "3100e195",
    "metadata": {
     "scrolled": true,
@@ -703,7 +769,16 @@
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "from functools import partial\n",
     "from utils import group_texts  # local module in the repository\n",
@@ -734,14 +809,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "b9d33b7b",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "print(len(lm_datasets['train'][0]['input_ids']))\n",
     "print(lm_datasets['train'][0]['input_ids'][:10])"
@@ -749,7 +833,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "7dfb316d",
    "metadata": {
     "scrolled": false,
@@ -757,7 +841,37 @@
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
    ]
@@ -779,14 +893,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "a8b90ba2",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
-   "outputs": [],
    "source": [
     "from huggingface_hub import notebook_login\n",
     "\n",
@@ -813,21 +938,118 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "d43c5555",
    "metadata": {
     "slideshow": {
      "slide_type": "subslide"
     }
    },
-   "outputs": [],
    "source": [
     "from transformers import Trainer, TrainingArguments\n",
     "\n",
     "training_args = TrainingArguments(\n",
-    "    \"flisol-cba-martinfierro\",\n",
     "    evaluation_strategy=\"epoch\",\n",
-    "    num_train_epochs=15,\n",
     "    learning_rate=2e-5,\n",
     "    weight_decay=0.01,\n",
     "    logging_steps=5\n",
@@ -840,7 +1062,20 @@
     "    eval_dataset=lm_datasets[\"validation\"]\n",
     ")\n",
     "\n",
-    "trainer.train()\n",
     "trainer.push_to_hub()  # This pushes the trained model to Hugging Face model repository"
    ]
   },

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "17f2884d",
    "metadata": {
     "slideshow": {
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "322a4a9b",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Aquí me pongo a cantar y a llorar. \n",
+      "\n",
+      "Los sollozos de Meggie se desvanecen por la noche en el salón. Al parecer no se ve nada. \n",
+      "\n",
+      "—¿Y si no fuera el final del mundo, el fin de un mundo?\n"
+     ]
+    }
+   ],
    "source": [
     "torch.manual_seed(42)  # To ensure determinism\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "id": "5a27197e",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Found cached dataset text (/home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0fe0bb8953f24e05b2a56ad08c462976",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I - Cantor y Gaucho.\n",
+      "\n",
+      "1\n",
+      "Aquí me pongo a cantar\n",
+      "Al compás de la vigüela,\n",
+      "Que el hombre que lo desvela\n",
+      "Una pena estraordinaria\n",
+      "Como la ave solitaria\n",
+      "Con el cantar se consuela.\n"
+     ]
+    }
+   ],
    "source": [
     "from datasets import load_dataset\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "id": "33059c5f",
    "metadata": {
     "scrolled": true,
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-5a0f77d99160fc1c_*_of_00004.arrow\n",
+      "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-003d85e2eebe3231_*_of_00004.arrow\n"
+     ]
+    }
+   ],
    "source": [
     "from utils import tokenize  # local module in the repository\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "id": "3100e195",
    "metadata": {
     "scrolled": true,
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-01936c1905752293_*_of_00004.arrow\n",
+      "Loading cached processed dataset at /home/crscardellino/.cache/huggingface/datasets/text/default-623d9572e8f69157/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-af8dcd60a546c28d_*_of_00004.arrow\n"
+     ]
+    }
+   ],
    "source": [
     "from functools import partial\n",
     "from utils import group_texts  # local module in the repository\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "id": "b9d33b7b",
    "metadata": {
     "slideshow": {
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "128\n",
+      "[50, 1368, 6505, 282, 324, 24275, 526, 23, 208, 208]\n"
+     ]
+    }
+   ],
    "source": [
     "print(len(lm_datasets['train'][0]['input_ids']))\n",
     "print(lm_datasets['train'][0]['input_ids'][:10])"
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "id": "7dfb316d",
    "metadata": {
     "scrolled": false,
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "I - Cantor y Gaucho.\n",
+      "\n",
+      "1\n",
+      "Aquí me pongo a cantar\n",
+      "Al compás de la vigüela,\n",
+      "Que el hombre que lo desvela\n",
+      "Una pena estraordinaria\n",
+      "Como la ave solitaria\n",
+      "Con el cantar se consuela.\n",
+      "\n",
+      "2\n",
+      "Pido a los Santos del Cielo\n",
+      "Que ayuden mi pensamiento;\n",
+      "Les pido en este momento\n",
+      "Que voy a cantar mi historia\n",
+      "Me refresquen la memoria\n",
+      "Y aclaren mi entendimiento.\n",
+      "\n",
+      "3\n",
+      "Vengan Santos milagrosos,\n",
+      "Vengan todos en mi ayuda,\n",
+      "Que la lengua se me añuda\n",
+      "Y se me turba\n"
+     ]
+    }
+   ],
    "source": [
     "print(tokenizer.decode(lm_datasets[\"train\"][0][\"input_ids\"]))"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "id": "a8b90ba2",
    "metadata": {
+    "scrolled": true,
     "slideshow": {
      "slide_type": "fragment"
     }
    },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token is valid.\n",
+      "Your token has been saved to /home/crscardellino/.cache/huggingface/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
    "source": [
     "from huggingface_hub import notebook_login\n",
     "\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
+   "id": "3b121d21",
    "metadata": {
     "slideshow": {
      "slide_type": "subslide"
     }
    },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/crscardellino/Projects/research/flisol/flisol-cba-martin-fierro/venv/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='180' max='180' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [180/180 11:44, Epoch 10/10]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>4.386400</td>\n",
+       "      <td>4.202457</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>3.948000</td>\n",
+       "      <td>4.043974</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>3.796200</td>\n",
+       "      <td>3.980350</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>3.610500</td>\n",
+       "      <td>3.945783</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>5</td>\n",
+       "      <td>3.444400</td>\n",
+       "      <td>3.927984</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>6</td>\n",
+       "      <td>3.385500</td>\n",
+       "      <td>3.919229</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>7</td>\n",
+       "      <td>3.314200</td>\n",
+       "      <td>3.909090</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>8</td>\n",
+       "      <td>3.219200</td>\n",
+       "      <td>3.907399</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>9</td>\n",
+       "      <td>3.161500</td>\n",
+       "      <td>3.906959</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>10</td>\n",
+       "      <td>3.163700</td>\n",
+       "      <td>3.906726</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "TrainOutput(global_step=180, training_loss=3.5808190133836533, metrics={'train_runtime': 707.4357, 'train_samples_per_second': 1.951, 'train_steps_per_second': 0.254, 'total_flos': 90145751040000.0, 'train_loss': 3.5808190133836533, 'epoch': 10.0})"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "from transformers import Trainer, TrainingArguments\n",
     "\n",
     "training_args = TrainingArguments(\n",
+    "    \"flisol-cba-martin-fierro\",\n",
     "    evaluation_strategy=\"epoch\",\n",
+    "    num_train_epochs=10,\n",
     "    learning_rate=2e-5,\n",
     "    weight_decay=0.01,\n",
     "    logging_steps=5\n",
     "    eval_dataset=lm_datasets[\"validation\"]\n",
     ")\n",
     "\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d43c5555",
+   "metadata": {
+    "slideshow": {
+     "slide_type": "-"
+    }
+   },
+   "outputs": [],
+   "source": [
     "trainer.push_to_hub()  # This pushes the trained model to Hugging Face model repository"
    ]
   },