diff --git "a/trainer.ipynb" "b/trainer.ipynb"
new file mode 100644--- /dev/null
+++ "b/trainer.ipynb"
@@ -0,0 +1,1766 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6b7d6a9b-db7e-46b2-8ab8-d6914e18f1e1",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:00:39.926249Z",
+     "iopub.status.busy": "2023-07-09T08:00:39.925711Z",
+     "iopub.status.idle": "2023-07-09T08:00:50.779648Z",
+     "shell.execute_reply": "2023-07-09T08:00:50.779080Z",
+     "shell.execute_reply.started": "2023-07-09T08:00:39.926227Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install datasets transformers accelerate wandb -U -q"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a2a349ee-2749-4490-9807-cdf18f428181",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:00:50.780814Z",
+     "iopub.status.busy": "2023-07-09T08:00:50.780645Z",
+     "iopub.status.idle": "2023-07-09T08:00:55.743251Z",
+     "shell.execute_reply": "2023-07-09T08:00:55.742619Z",
+     "shell.execute_reply.started": "2023-07-09T08:00:50.780797Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "  ········································\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import wandb\n",
+    "wandb.login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "bce905bd-4fc3-4a6a-bd76-48bba9ebc1d9",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:00:55.744653Z",
+     "iopub.status.busy": "2023-07-09T08:00:55.744235Z",
+     "iopub.status.idle": "2023-07-09T08:00:55.748644Z",
+     "shell.execute_reply": "2023-07-09T08:00:55.747937Z",
+     "shell.execute_reply.started": "2023-07-09T08:00:55.744631Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "env: WANDB_PROJECT=sinhala_bert_v1\n"
+     ]
+    }
+   ],
+   "source": [
+    "%env WANDB_PROJECT=sinhala_bert_v1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "2551a71d-2804-48ed-bf85-6e0aa94d47d8",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:00:55.750683Z",
+     "iopub.status.busy": "2023-07-09T08:00:55.750064Z",
+     "iopub.status.idle": "2023-07-09T08:00:55.753691Z",
+     "shell.execute_reply": "2023-07-09T08:00:55.753148Z",
+     "shell.execute_reply.started": "2023-07-09T08:00:55.750659Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "model_checkpoint = \"9wimu9/sinhala-bert-1\"\n",
+    "tokenizer_checkpoint= \"9wimu9/sinhala-bert-1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "137a18d4-6fc5-4bfe-bf3c-54f4bdef5f4b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:00:55.755128Z",
+     "iopub.status.busy": "2023-07-09T08:00:55.754589Z",
+     "iopub.status.idle": "2023-07-09T08:01:58.423431Z",
+     "shell.execute_reply": "2023-07-09T08:01:58.422900Z",
+     "shell.execute_reply.started": "2023-07-09T08:00:55.755103Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "85ae51907f1b4e6188ef42ab98704c4b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading readme:   0%|          | 0.00/608 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/9wimu9___parquet/9wimu9--sinhala_30m_tokenized-4ef7deb3027f7158/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "966039dee2924e6aa05f404873c1e2fa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "364b82753a8447f8a2ad6e501189b977",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "76b9a1f5c0e94507a2f3f321cfa386aa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8081b304d8a0448e80ff9cf89c902582",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c8db001876e143899f4ef1321f743bf2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "7dd48c0f689e4820b57f187c3310c22e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c4ceb381f0d14490a353ad74bb95cabb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1ef7da57254f4a238a40433607fbe12b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6bb90155121c4b7781470136b93cfa5a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dfc1782e94cd49eb97064e6e2a8d77a6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "101570c8e1ac493daca375ece7b4aed7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "c3c8b273607849fea232715cb38001f4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "eaa25870072148f4bf798c90483bd528",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "581b913a46894f41a3132aa4f0e37f71",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "04c531e4343d469688e9f7afa86c72e2",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "783e1f99d95d481193774df5325e1538",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bd64cfa3c64c4ab5954e305cca2dc6ff",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fb17c0f08e2b4b1199c9d30eef394410",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "20f140c9baa243f597481bd3ed9fa332",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9e59d0b93a1747289c0d28f25bb2a86c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b8e641746b504ea3887d352fd8bcb47c",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2dc1651ca6d04cb5918087925c7d834f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "94fa72253477473e9899799cbfd8f338",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2ec0b745e9b549a591262c5b3cbe428f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/115M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "00a6194acf244cfc8d1b59c815396a8e",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "56ba2cce8c454a0a923ddceaf628f64d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.2M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f6d0d6e5e0a54051b308da39a0147500",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.1M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "083958a1ed2e4075a2f726d00e05ef1a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading data:   0%|          | 0.00/73.1M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ce53083f64344e45940dbdf95db9a54d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "a18013b2d704414d99d8f53b8d0b6329",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating train split:   0%|          | 0/7310725 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d36e1f71e7304604bb5b3954c78db0ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating test split:   0%|          | 0/406414 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "d7fe5c2877504e2ea1333bfdafa1e302",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generating valid split:   0%|          | 0/405841 [00:00<?, ? examples/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/9wimu9___parquet/9wimu9--sinhala_30m_tokenized-4ef7deb3027f7158/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "1518bf006c3b4f0d981151804fd2cba9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "  0%|          | 0/3 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
+       "        num_rows: 7310725\n",
+       "    })\n",
+       "    test: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
+       "        num_rows: 406414\n",
+       "    })\n",
+       "    valid: Dataset({\n",
+       "        features: ['input_ids', 'attention_mask', 'special_tokens_mask'],\n",
+       "        num_rows: 405841\n",
+       "    })\n",
+       "})"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "lm_datasets = load_dataset('9wimu9/sinhala_30m_tokenized')\n",
+    "lm_datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e81c4c2a-d6e2-4a41-bcef-218c205d9544",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:01:58.425381Z",
+     "iopub.status.busy": "2023-07-09T08:01:58.424648Z",
+     "iopub.status.idle": "2023-07-09T08:02:03.528411Z",
+     "shell.execute_reply": "2023-07-09T08:02:03.527792Z",
+     "shell.execute_reply.started": "2023-07-09T08:01:58.425361Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RobertaConfig {\n",
+       "  \"_name_or_path\": \"/notebooks/roberta-large-pretrained-si\",\n",
+       "  \"architectures\": [\n",
+       "    \"RobertaForMaskedLM\"\n",
+       "  ],\n",
+       "  \"attention_probs_dropout_prob\": 0.1,\n",
+       "  \"bos_token_id\": 0,\n",
+       "  \"classifier_dropout\": null,\n",
+       "  \"eos_token_id\": 2,\n",
+       "  \"hidden_act\": \"gelu\",\n",
+       "  \"hidden_dropout_prob\": 0.1,\n",
+       "  \"hidden_size\": 1024,\n",
+       "  \"initializer_range\": 0.02,\n",
+       "  \"intermediate_size\": 4096,\n",
+       "  \"layer_norm_eps\": 1e-05,\n",
+       "  \"max_position_embeddings\": 514,\n",
+       "  \"model_type\": \"roberta\",\n",
+       "  \"num_attention_heads\": 16,\n",
+       "  \"num_hidden_layers\": 24,\n",
+       "  \"pad_token_id\": 1,\n",
+       "  \"position_embedding_type\": \"absolute\",\n",
+       "  \"transformers_version\": \"4.30.2\",\n",
+       "  \"type_vocab_size\": 1,\n",
+       "  \"use_cache\": true,\n",
+       "  \"vocab_size\": 12500\n",
+       "}"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoConfig, AutoModelForMaskedLM\n",
+    "config = AutoConfig.from_pretrained('/notebooks/roberta-large-pretrained-si')\n",
+    "model = AutoModelForMaskedLM.from_config(config)\n",
+    "config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "065f3958-2b05-4a5a-8f05-b049c14fb5f0",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:03.529970Z",
+     "iopub.status.busy": "2023-07-09T08:02:03.529240Z",
+     "iopub.status.idle": "2023-07-09T08:02:03.532771Z",
+     "shell.execute_reply": "2023-07-09T08:02:03.532232Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:03.529948Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# max_token_size=128\n",
+    "# use model architecture -> BERT large\n",
+    "# 24 layers, 1,024 dimensions, 16 heads, 4,096 hidden dimensions in the feed-forward layer, with pre-layer normalization\n",
+    "\n",
+    "\n",
+    "# We follow the optimization of RoBERTa (Liu et al., 2019) and use \n",
+    "# AdamW (Loshchilov and Hutter, 2019) with \n",
+    "# β1 = 0.9, β2 = 0.98, ε = 1e-6, \n",
+    "# weight decay of 0.01, dropout 0.1, and \n",
+    "# attention dropout 0.1.\n",
+    "\n",
+    "\n",
+    "# Hyperparameters\n",
+    "\n",
+    "# batch size -> 4k, 8k, and 16k (via gradient accumilation)\n",
+    "\n",
+    "# Warmup Proportion (wu) We determine the number of warmup steps as a proportion of the total number of steps. \n",
+    "# Specifically, we try 0%, 2%, 4%, and 6%, which all reflect significantly fewer warmup steps than in BERT.\n",
+    "\n",
+    "# Peak Learning Rate (lr) Our linear learning rate scheduler, \n",
+    "# which starts at 0, warms up to the peak learning rate, and then decays back to 0. We try 5e-4, 1e-3, and 2e-3\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "858cd60b-32c4-4c0f-859e-10a1ee3bf68e",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:03.534149Z",
+     "iopub.status.busy": "2023-07-09T08:02:03.533455Z",
+     "iopub.status.idle": "2023-07-09T08:02:03.565628Z",
+     "shell.execute_reply": "2023-07-09T08:02:03.565017Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:03.534132Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "tokenizer = AutoTokenizer.from_pretrained('/notebooks/roberta-large-pretrained-si',model_max_length=256)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "5812f8da-3434-4ec8-a2e6-a6bdc30ecf72",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:03.566886Z",
+     "iopub.status.busy": "2023-07-09T08:02:03.566690Z",
+     "iopub.status.idle": "2023-07-09T08:02:03.570597Z",
+     "shell.execute_reply": "2023-07-09T08:02:03.569974Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:03.566870Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RobertaTokenizerFast(name_or_path='/notebooks/roberta-large-pretrained-si', vocab_size=1868, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken(\"<mask>\", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0905ef8c-9faa-49d6-ad0a-06753ce856fa",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:03.572556Z",
+     "iopub.status.busy": "2023-07-09T08:02:03.572022Z",
+     "iopub.status.idle": "2023-07-09T08:02:03.575094Z",
+     "shell.execute_reply": "2023-07-09T08:02:03.574563Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:03.572539Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "per_device_train_batch_size=250\n",
+    "gradient_accumulation_steps=8\n",
+    "num_train_epochs=1\n",
+    "warmup_rate=0.01"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "6056f333-46f9-4bea-a93d-423f3a1a959e",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:03.576023Z",
+     "iopub.status.busy": "2023-07-09T08:02:03.575858Z",
+     "iopub.status.idle": "2023-07-09T08:02:05.698372Z",
+     "shell.execute_reply": "2023-07-09T08:02:05.697548Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:03.576009Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import TrainingArguments\n",
+    "training_args = TrainingArguments(\n",
+    "    model_checkpoint,\n",
+    "    evaluation_strategy = \"epoch\",\n",
+    "    # push_to_hub=True,\n",
+    "    hub_model_id=\"sinhala-bert-v.1\",\n",
+    "    per_device_train_batch_size=per_device_train_batch_size, # 4000,8000,16000\n",
+    "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
+    "    gradient_checkpointing=True,\n",
+    "    fp16=True,\n",
+    "    report_to=\"wandb\", \n",
+    "    num_train_epochs=num_train_epochs,\n",
+    "    no_cuda=False,\n",
+    "    logging_steps=1,\n",
+    "    save_steps=100,\n",
+    "    save_total_limit=3,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "7f6078f0-ba64-4509-ac8f-39dd0cd7fe04",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:05.699733Z",
+     "iopub.status.busy": "2023-07-09T08:02:05.699536Z",
+     "iopub.status.idle": "2023-07-09T08:02:05.708689Z",
+     "shell.execute_reply": "2023-07-09T08:02:05.708136Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:05.699715Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(7310725, 3656, 36, 2000)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import get_polynomial_decay_schedule_with_warmup,AdamW,get_linear_schedule_with_warmup\n",
+    "import math,torch\n",
+    "\n",
+    "params = filter(lambda x: x.requires_grad, model.parameters())\n",
+    "\n",
+    "optimizer = torch.optim.AdamW(params,lr=1e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.01)\n",
+    "\n",
+    "batch_size = per_device_train_batch_size*gradient_accumulation_steps\n",
+    "\n",
+    "num_warmup_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * warmup_rate*num_train_epochs\n",
+    "num_warmup_steps = int(num_warmup_steps)\n",
+    "num_training_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * num_train_epochs\n",
+    "\n",
+    "\n",
+    "scheduler = get_linear_schedule_with_warmup(optimizer,\n",
+    "    num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)\n",
+    "\n",
+    "lm_datasets[\"train\"].num_rows,num_training_steps,num_warmup_steps,batch_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "ebf14d20-e630-4961-a6d4-d9c8fa90e941",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:53:11.116082Z",
+     "iopub.status.busy": "2023-07-09T12:53:11.115500Z",
+     "iopub.status.idle": "2023-07-09T12:53:16.314031Z",
+     "shell.execute_reply": "2023-07-09T12:53:16.309782Z",
+     "shell.execute_reply.started": "2023-07-09T12:53:11.116055Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading package lists... Done\n",
+      "Building dependency tree       \n",
+      "Reading state information... Done\n",
+      "The following NEW packages will be installed:\n",
+      "  git-lfs\n",
+      "0 upgraded, 1 newly installed, 0 to remove and 3 not upgraded.\n",
+      "Need to get 3316 kB of archives.\n",
+      "After this operation, 11.1 MB of additional disk space will be used.\n",
+      "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]\n",
+      "Fetched 3316 kB in 1s (3308 kB/s) \u001b[0m[33m\u001b[33m\n",
+      "\n",
+      "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n",
+      "(Reading database ... 69943 files and directories currently installed.)\n",
+      "Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...\n",
+      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [  0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (2.9.2-1) ...\n",
+      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (2.9.2-1) ...\n",
+      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8Processing triggers for man-db (2.9.1-1) ...\n",
+      "\n",
+      "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
+     ]
+    }
+   ],
+   "source": [
+    "!sudo apt install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "969484c6-4035-4234-8ac7-209ab4a014bc",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:16:17.950069Z",
+     "iopub.status.busy": "2023-07-09T12:16:17.949152Z",
+     "iopub.status.idle": "2023-07-09T12:16:17.961269Z",
+     "shell.execute_reply": "2023-07-09T12:16:17.960724Z",
+     "shell.execute_reply.started": "2023-07-09T12:16:17.950036Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from transformers import DataCollatorForLanguageModeling,Trainer\n",
+    "\n",
+    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=lm_datasets[\"train\"],\n",
+    "    eval_dataset=lm_datasets[\"valid\"],\n",
+    "    data_collator=data_collator,\n",
+    "    optimizers=[optimizer, scheduler]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "4c2f4490-b3bc-4ec6-bef1-2bd71933369a",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:53:22.338679Z",
+     "iopub.status.busy": "2023-07-09T12:53:22.337791Z",
+     "iopub.status.idle": "2023-07-09T12:53:34.948476Z",
+     "shell.execute_reply": "2023-07-09T12:53:34.947890Z",
+     "shell.execute_reply.started": "2023-07-09T12:53:22.338679Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Waiting for W&B process to finish... <strong style=\"color:green\">(success).</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style>\n",
+       "    table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
+       "    .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
+       "    .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
+       "    </style>\n",
+       "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>▁</td></tr><tr><td>eval/runtime</td><td>▁</td></tr><tr><td>eval/samples_per_second</td><td>▁</td></tr><tr><td>eval/steps_per_second</td><td>▁</td></tr><tr><td>train/epoch</td><td>▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████</td></tr><tr><td>train/global_step</td><td>▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████████▁</td></tr><tr><td>train/learning_rate</td><td>███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁</td></tr><tr><td>train/loss</td><td>▆█▇▇▃▂▇█▇▄▄▅▆▂▁▂▁▃██▁▃▃▅▆▄▄█▆▅▅▅▅▄▆▄▄▆▃▃</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>4.36183</td></tr><tr><td>eval/runtime</td><td>1852.7969</td></tr><tr><td>eval/samples_per_second</td><td>219.042</td></tr><tr><td>eval/steps_per_second</td><td>27.381</td></tr><tr><td>train/epoch</td><td>0.96</td></tr><tr><td>train/global_step</td><td>0</td></tr><tr><td>train/learning_rate</td><td>0.0</td></tr><tr><td>train/loss</td><td>4.3709</td></tr></table><br/></div></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run <strong style=\"color:#cdcd00\">valiant-durian-58</strong> at: <a href='https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait</a><br/>Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Find logs at: <code>./wandb/run-20230709_080222-4506wait/logs</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "wandb.finish()\n",
+    "# wandb.init()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "17979cc2-2e66-4055-aabb-29d9ee90112d",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-08T07:31:19.523715Z",
+     "iopub.status.busy": "2023-07-08T07:31:19.523529Z",
+     "iopub.status.idle": "2023-07-08T07:31:20.383711Z",
+     "shell.execute_reply": "2023-07-08T07:31:20.382696Z",
+     "shell.execute_reply.started": "2023-07-08T07:31:19.523696Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# !rm -rf /notebooks/9wimu9/sinhala-bert-1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "b8bd0ab4-6412-4c0c-a215-a0c5cd5d4626",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T08:02:07.964267Z",
+     "iopub.status.busy": "2023-07-09T08:02:07.963688Z",
+     "iopub.status.idle": "2023-07-09T12:16:04.192592Z",
+     "shell.execute_reply": "2023-07-09T12:16:04.191499Z",
+     "shell.execute_reply.started": "2023-07-09T08:02:07.964251Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to <a href='https://wandb.me/wandb-init' target=\"_blank\">the W&B docs</a>."
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m9wimu9\u001b[0m (\u001b[33msinquad\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Tracking run with wandb version 0.15.5"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Run data is saved locally in <code>/notebooks/wandb/run-20230709_080222-4506wait</code>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Syncing run <strong><a href='https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait' target=\"_blank\">valiant-durian-58</a></strong> to <a href='https://wandb.ai/sinquad/sinhala_bert_v1' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View project at <a href='https://wandb.ai/sinquad/sinhala_bert_v1' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       " View run at <a href='https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait' target=\"_blank\">https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait</a>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='3504' max='3655' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [3504/3655 4:12:37 < 1:15:59, 0.03 it/s, Epoch 0.96/1]\n",
+       "    </div>\n",
+       "    <table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       " <tr style=\"text-align: left;\">\n",
+       "      <th>Epoch</th>\n",
+       "      <th>Training Loss</th>\n",
+       "      <th>Validation Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">&lt;module&gt;</span>                                                                                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1 trainer.train(resume_from_checkpoint = <span style=\"color: #0000ff; text-decoration-color: #0000ff\">True</span>)                                                 <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2 </span>wandb.finish()                                                                               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3 </span>                                                                                             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/transformers/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">trainer.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1645</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">train</span>                     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1642 │   │   </span>inner_training_loop = find_executable_batch_size(                                 <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1643 │   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>._inner_training_loop, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>._train_batch_size, args.auto_find_batch_size  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1644 │   │   </span>)                                                                                 <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1645 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> inner_training_loop(                                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1646 │   │   │   </span>args=args,                                                                    <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1647 │   │   │   </span>resume_from_checkpoint=resume_from_checkpoint,                                <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1648 │   │   │   </span>trial=trial,                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/transformers/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">trainer.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1938</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">_inner_training_loop</span>      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1935 │   │   │   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.control = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.callback_handler.on_step_begin(args, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.state,  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1936 │   │   │   │   </span>                                                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1937 │   │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">with</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.accelerator.accumulate(model):                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1938 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   │   │   </span>tr_loss_step = <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.training_step(model, inputs)                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1939 │   │   │   │   </span>                                                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1940 │   │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> (                                                                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1941 │   │   │   │   │   </span>args.logging_nan_inf_filter                                           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/transformers/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">trainer.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">2770</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">training_step</span>             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2767 │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">with</span> amp.scale_loss(loss, <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.optimizer) <span style=\"color: #0000ff; text-decoration-color: #0000ff\">as</span> scaled_loss:                     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2768 │   │   │   │   </span>scaled_loss.backward()                                                    <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2769 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>:                                                                             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>2770 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.accelerator.backward(loss)                                               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2771 │   │   </span>                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2772 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> loss.detach() / <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.args.gradient_accumulation_steps                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2773 </span>                                                                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/accelerate/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">accelerator.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">1819</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">backward</span>                <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1816 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">elif</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.distributed_type == DistributedType.MEGATRON_LM:                        <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1817 │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span>                                                                        <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1818 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">elif</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.scaler <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">is</span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">not</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">None</span>:                                                     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1819 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.scaler.scale(loss).backward(**kwargs)                                    <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1820 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>:                                                                             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1821 │   │   │   </span>loss.backward(**kwargs)                                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">1822 </span>                                                                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/torch/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">_tensor.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">396</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">backward</span>                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 393 │   │   │   │   </span>retain_graph=retain_graph,                                                <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 394 │   │   │   │   </span>create_graph=create_graph,                                                <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 395 │   │   │   │   </span>inputs=inputs)                                                            <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span> 396 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   </span>torch.autograd.backward(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>, gradient, retain_graph, create_graph, inputs=input  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 397 │   </span>                                                                                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 398 │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">def</span> <span style=\"color: #00ff00; text-decoration-color: #00ff00\">register_hook</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>, hook):                                                        <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 399 </span><span style=\"color: #bfbfbf; text-decoration-color: #bfbfbf\">│   │   </span><span style=\"color: #808000; text-decoration-color: #808000\">r\"\"\"Registers a backward hook.</span>                                                    <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/torch/autograd/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">__init__.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">173</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">backward</span>                <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">170 │   # The reason we repeat same the comment below is that</span>                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">171 │   # some Python versions print out the first line of a multi-line function</span>               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">172 │   # calls in the traceback and some print out the last line</span>                              <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>173 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   </span>Variable._execution_engine.run_backward(  <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># Calls into the C++ engine to run the bac</span>   <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">174 │   │   </span>tensors, grad_tensors_, retain_graph, create_graph, inputs,                        <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">175 │   │   </span>allow_unreachable=<span style=\"color: #0000ff; text-decoration-color: #0000ff\">True</span>, accumulate_grad=<span style=\"color: #0000ff; text-decoration-color: #0000ff\">True</span>)  <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># Calls into the C++ engine to ru</span>   <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">176 </span>                                                                                           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
+       "<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">KeyboardInterrupt</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+       "\u001b[31m│\u001b[0m in \u001b[92m<module>\u001b[0m                                                                                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 trainer.train(resume_from_checkpoint = \u001b[94mTrue\u001b[0m)                                                 \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2 \u001b[0mwandb.finish()                                                                               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3 \u001b[0m                                                                                             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m1645\u001b[0m in \u001b[92mtrain\u001b[0m                     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1642 \u001b[0m\u001b[2m│   │   \u001b[0minner_training_loop = find_executable_batch_size(                                 \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1643 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m._inner_training_loop, \u001b[96mself\u001b[0m._train_batch_size, args.auto_find_batch_size  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1644 \u001b[0m\u001b[2m│   │   \u001b[0m)                                                                                 \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1645 \u001b[2m│   │   \u001b[0m\u001b[94mreturn\u001b[0m inner_training_loop(                                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1646 \u001b[0m\u001b[2m│   │   │   \u001b[0margs=args,                                                                    \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1647 \u001b[0m\u001b[2m│   │   │   \u001b[0mresume_from_checkpoint=resume_from_checkpoint,                                \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1648 \u001b[0m\u001b[2m│   │   │   \u001b[0mtrial=trial,                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m1938\u001b[0m in \u001b[92m_inner_training_loop\u001b[0m      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1935 \u001b[0m\u001b[2m│   │   │   │   │   \u001b[0m\u001b[96mself\u001b[0m.control = \u001b[96mself\u001b[0m.callback_handler.on_step_begin(args, \u001b[96mself\u001b[0m.state,  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1936 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m                                                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1937 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mself\u001b[0m.accelerator.accumulate(model):                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1938 \u001b[2m│   │   │   │   │   \u001b[0mtr_loss_step = \u001b[96mself\u001b[0m.training_step(model, inputs)                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1939 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m                                                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1940 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m\u001b[94mif\u001b[0m (                                                                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1941 \u001b[0m\u001b[2m│   │   │   │   │   \u001b[0margs.logging_nan_inf_filter                                           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m2770\u001b[0m in \u001b[92mtraining_step\u001b[0m             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2767 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[94mwith\u001b[0m amp.scale_loss(loss, \u001b[96mself\u001b[0m.optimizer) \u001b[94mas\u001b[0m scaled_loss:                     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2768 \u001b[0m\u001b[2m│   │   │   │   \u001b[0mscaled_loss.backward()                                                    \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2769 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94melse\u001b[0m:                                                                             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m2770 \u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m.accelerator.backward(loss)                                               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2771 \u001b[0m\u001b[2m│   │   \u001b[0m                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2772 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mreturn\u001b[0m loss.detach() / \u001b[96mself\u001b[0m.args.gradient_accumulation_steps                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2773 \u001b[0m                                                                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/accelerate/\u001b[0m\u001b[1;33maccelerator.py\u001b[0m:\u001b[94m1819\u001b[0m in \u001b[92mbackward\u001b[0m                \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1816 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94melif\u001b[0m \u001b[96mself\u001b[0m.distributed_type == DistributedType.MEGATRON_LM:                        \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1817 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[94mreturn\u001b[0m                                                                        \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1818 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94melif\u001b[0m \u001b[96mself\u001b[0m.scaler \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m:                                                     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1819 \u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m.scaler.scale(loss).backward(**kwargs)                                    \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1820 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94melse\u001b[0m:                                                                             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1821 \u001b[0m\u001b[2m│   │   │   \u001b[0mloss.backward(**kwargs)                                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m1822 \u001b[0m                                                                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/torch/\u001b[0m\u001b[1;33m_tensor.py\u001b[0m:\u001b[94m396\u001b[0m in \u001b[92mbackward\u001b[0m                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 393 \u001b[0m\u001b[2m│   │   │   │   \u001b[0mretain_graph=retain_graph,                                                \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 394 \u001b[0m\u001b[2m│   │   │   │   \u001b[0mcreate_graph=create_graph,                                                \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 395 \u001b[0m\u001b[2m│   │   │   │   \u001b[0minputs=inputs)                                                            \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 396 \u001b[2m│   │   \u001b[0mtorch.autograd.backward(\u001b[96mself\u001b[0m, gradient, retain_graph, create_graph, inputs=input  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 397 \u001b[0m\u001b[2m│   \u001b[0m                                                                                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 398 \u001b[0m\u001b[2m│   \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mregister_hook\u001b[0m(\u001b[96mself\u001b[0m, hook):                                                        \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 399 \u001b[0m\u001b[2;90m│   │   \u001b[0m\u001b[33mr\u001b[0m\u001b[33m\"\"\"Registers a backward hook.\u001b[0m                                                    \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/torch/autograd/\u001b[0m\u001b[1;33m__init__.py\u001b[0m:\u001b[94m173\u001b[0m in \u001b[92mbackward\u001b[0m                \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m170 \u001b[0m\u001b[2m│   \u001b[0m\u001b[2m# The reason we repeat same the comment below is that\u001b[0m                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m171 \u001b[0m\u001b[2m│   \u001b[0m\u001b[2m# some Python versions print out the first line of a multi-line function\u001b[0m               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m172 \u001b[0m\u001b[2m│   \u001b[0m\u001b[2m# calls in the traceback and some print out the last line\u001b[0m                              \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m173 \u001b[2m│   \u001b[0mVariable._execution_engine.run_backward(  \u001b[2m# Calls into the C++ engine to run the bac\u001b[0m   \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m174 \u001b[0m\u001b[2m│   │   \u001b[0mtensors, grad_tensors_, retain_graph, create_graph, inputs,                        \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m175 \u001b[0m\u001b[2m│   │   \u001b[0mallow_unreachable=\u001b[94mTrue\u001b[0m, accumulate_grad=\u001b[94mTrue\u001b[0m)  \u001b[2m# Calls into the C++ engine to ru\u001b[0m   \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m176 \u001b[0m                                                                                           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+       "\u001b[1;91mKeyboardInterrupt\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "trainer.train(resume_from_checkpoint = True)\n",
+    "wandb.finish()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "399d27d2-13b6-45e4-bf68-9ff30ad0ec1d",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-07T08:43:09.377158Z",
+     "iopub.status.busy": "2023-07-07T08:43:09.376340Z",
+     "iopub.status.idle": "2023-07-07T08:43:09.380265Z",
+     "shell.execute_reply": "2023-07-07T08:43:09.379781Z",
+     "shell.execute_reply.started": "2023-07-07T08:43:09.377127Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Perplexity: 236.31\n"
+     ]
+    }
+   ],
+   "source": [
+    "import math\n",
+    "print(f\"Perplexity: {math.exp(5.465127):.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "bbf22bea-7026-42c9-a643-ba65ab8cdbff",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:16:26.337814Z",
+     "iopub.status.busy": "2023-07-09T12:16:26.337328Z",
+     "iopub.status.idle": "2023-07-09T12:47:19.149806Z",
+     "shell.execute_reply": "2023-07-09T12:47:19.149298Z",
+     "shell.execute_reply.started": "2023-07-09T12:16:26.337789Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      \n",
+       "      <progress value='50731' max='50731' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [50731/50731 30:52]\n",
+       "    </div>\n",
+       "    "
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Perplexity: 78.40\n"
+     ]
+    }
+   ],
+   "source": [
+    "eval_results = trainer.evaluate()\n",
+    "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "7fd81e73-7c9c-401e-8b8d-8e2a843bb7c7",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:53:37.509234Z",
+     "iopub.status.busy": "2023-07-09T12:53:37.508417Z",
+     "iopub.status.idle": "2023-07-09T12:53:38.733085Z",
+     "shell.execute_reply": "2023-07-09T12:53:38.732326Z",
+     "shell.execute_reply.started": "2023-07-09T12:53:37.509205Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800000; text-decoration-color: #800000\">╭─────────────────────────────── </span><span style=\"color: #800000; text-decoration-color: #800000; font-weight: bold\">Traceback </span><span style=\"color: #bf7f7f; text-decoration-color: #bf7f7f; font-weight: bold\">(most recent call last)</span><span style=\"color: #800000; text-decoration-color: #800000\"> ────────────────────────────────╮</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">&lt;module&gt;</span>                                                                                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>1 trainer.push_to_hub()                                                                        <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">2 </span>                                                                                             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/transformers/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">trainer.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">3716</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">push_to_hub</span>               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3713 │   │   # If a user calls manually `push_to_hub` with `self.args.push_to_hub = False`, w</span>  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3714 │   │   # it might fail.</span>                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3715 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">not</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">hasattr</span>(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>, <span style=\"color: #808000; text-decoration-color: #808000\">\"repo\"</span>):                                                     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>3716 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.init_git_repo()                                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3717 │   │   </span>                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3718 │   │   </span>model_name = kwargs.pop(<span style=\"color: #808000; text-decoration-color: #808000\">\"model_name\"</span>, <span style=\"color: #0000ff; text-decoration-color: #0000ff\">None</span>)                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3719 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> model_name <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">is</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">None</span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">and</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.args.should_save:                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/transformers/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">trainer.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">3571</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">init_git_repo</span>             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3568 │   │   # Make sure the repo exists.</span>                                                      <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3569 │   │   </span>create_repo(repo_name, token=<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.args.hub_token, private=<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.args.hub_private_  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3570 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">try</span>:                                                                              <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>3571 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.repo = Repository(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.args.output_dir, clone_from=repo_name, token=<span style=\"color: #00ffff; text-decoration-color: #00ffff\">sel</span>  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3572 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">except</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">EnvironmentError</span>:                                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3573 │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.args.overwrite_output_dir <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">and</span> at_init:                                <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">3574 │   │   │   │   # Try again after wiping output_dir</span>                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">_validators.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">118</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">_inner_fn</span>     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">115 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> check_use_auth_token:                                                           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">116 │   │   │   </span>kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.<span style=\"color: #ff0000; text-decoration-color: #ff0000\">__name__</span>, has_token=ha   <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">117 │   │   </span>                                                                                   <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>118 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> fn(*args, **kwargs)                                                         <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">119 │   </span>                                                                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">120 │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> _inner_fn  <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># type: ignore</span>                                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">121 </span>                                                                                           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/huggingface_hub/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">repository.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">516</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">__init__</span>             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 513 │   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.huggingface_token = HfFolder.get_token()                                 <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 514 │   │   </span>                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 515 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> clone_from <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">is</span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">not</span> <span style=\"color: #0000ff; text-decoration-color: #0000ff\">None</span>:                                                        <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span> 516 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   </span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.clone_from(repo_url=clone_from)                                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 517 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>:                                                                             <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 518 │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> is_git_repo(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.local_dir):                                               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 519 │   │   │   │   </span>logger.debug(<span style=\"color: #808000; text-decoration-color: #808000\">\"[Repository] is a valid git repo\"</span>)                          <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">_validators.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">118</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">_inner_fn</span>     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">115 │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> check_use_auth_token:                                                           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">116 │   │   │   </span>kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.<span style=\"color: #ff0000; text-decoration-color: #ff0000\">__name__</span>, has_token=ha   <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">117 │   │   </span>                                                                                   <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span>118 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> fn(*args, **kwargs)                                                         <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">119 │   </span>                                                                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">120 │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">return</span> _inner_fn  <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"># type: ignore</span>                                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">121 </span>                                                                                           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #bfbf7f; text-decoration-color: #bfbf7f\">/usr/local/lib/python3.9/dist-packages/huggingface_hub/</span><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">repository.py</span>:<span style=\"color: #0000ff; text-decoration-color: #0000ff\">680</span> in <span style=\"color: #00ff00; text-decoration-color: #00ff00\">clone_from</span>           <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>                                                                                                  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 677 │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">else</span>:                                                                         <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 678 │   │   │   │   # Check if the folder is the root of a git repository</span>                     <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 679 │   │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">if</span> <span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">not</span> is_git_repo(<span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.local_dir):                                       <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span> <span style=\"color: #800000; text-decoration-color: #800000\">❱ </span> 680 <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\">│   │   │   │   │   </span><span style=\"color: #0000ff; text-decoration-color: #0000ff\">raise</span> <span style=\"color: #00ffff; text-decoration-color: #00ffff\">EnvironmentError</span>(                                               <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 681 │   │   │   │   │   │   </span><span style=\"color: #808000; text-decoration-color: #808000\">\"Tried to clone a repository in a non-empty folder that isn't\"</span>    <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 682 │   │   │   │   │   │   </span><span style=\"color: #808000; text-decoration-color: #808000\">f\" a git repository ('{</span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.local_dir<span style=\"color: #808000; text-decoration-color: #808000\">}'). If you really want to\"</span>  <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">│</span>   <span style=\"color: #7f7f7f; text-decoration-color: #7f7f7f\"> 683 │   │   │   │   │   │   </span><span style=\"color: #808000; text-decoration-color: #808000\">f\" do this, do it manually:\\n cd {</span><span style=\"color: #00ffff; text-decoration-color: #00ffff\">self</span>.local_dir<span style=\"color: #808000; text-decoration-color: #808000\">} &amp;&amp; git init\"</span>    <span style=\"color: #800000; text-decoration-color: #800000\">│</span>\n",
+       "<span style=\"color: #800000; text-decoration-color: #800000\">╰──────────────────────────────────────────────────────────────────────────────────────────────────╯</span>\n",
+       "<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-weight: bold\">OSError: </span>Tried to clone a repository in a non-empty folder that isn't a git repository \n",
+       "<span style=\"font-weight: bold\">(</span><span style=\"color: #008000; text-decoration-color: #008000\">'/notebooks/9wimu9/sinhala-bert-1'</span><span style=\"font-weight: bold\">)</span>. If you really want to do this, do it manually:\n",
+       " cd <span style=\"color: #800080; text-decoration-color: #800080\">/notebooks/9wimu9/</span><span style=\"color: #ff00ff; text-decoration-color: #ff00ff\">sinhala-bert-1</span> &amp;&amp; git init &amp;&amp; git remote add origin &amp;&amp; git pull origin main\n",
+       " or clone repo to a new folder and move your existing files there afterwards.\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
+       "\u001b[31m│\u001b[0m in \u001b[92m<module>\u001b[0m                                                                                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 trainer.push_to_hub()                                                                        \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m2 \u001b[0m                                                                                             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m3716\u001b[0m in \u001b[92mpush_to_hub\u001b[0m               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3713 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[2m# If a user calls manually `push_to_hub` with `self.args.push_to_hub = False`, w\u001b[0m  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3714 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[2m# it might fail.\u001b[0m                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3715 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m \u001b[96mhasattr\u001b[0m(\u001b[96mself\u001b[0m, \u001b[33m\"\u001b[0m\u001b[33mrepo\u001b[0m\u001b[33m\"\u001b[0m):                                                     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m3716 \u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m.init_git_repo()                                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3717 \u001b[0m\u001b[2m│   │   \u001b[0m                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3718 \u001b[0m\u001b[2m│   │   \u001b[0mmodel_name = kwargs.pop(\u001b[33m\"\u001b[0m\u001b[33mmodel_name\u001b[0m\u001b[33m\"\u001b[0m, \u001b[94mNone\u001b[0m)                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3719 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mif\u001b[0m model_name \u001b[95mis\u001b[0m \u001b[94mNone\u001b[0m \u001b[95mand\u001b[0m \u001b[96mself\u001b[0m.args.should_save:                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m3571\u001b[0m in \u001b[92minit_git_repo\u001b[0m             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3568 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[2m# Make sure the repo exists.\u001b[0m                                                      \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3569 \u001b[0m\u001b[2m│   │   \u001b[0mcreate_repo(repo_name, token=\u001b[96mself\u001b[0m.args.hub_token, private=\u001b[96mself\u001b[0m.args.hub_private_  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3570 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mtry\u001b[0m:                                                                              \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m3571 \u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m.repo = Repository(\u001b[96mself\u001b[0m.args.output_dir, clone_from=repo_name, token=\u001b[96msel\u001b[0m  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3572 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mEnvironmentError\u001b[0m:                                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3573 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.args.overwrite_output_dir \u001b[95mand\u001b[0m at_init:                                \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m3574 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m\u001b[2m# Try again after wiping output_dir\u001b[0m                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/\u001b[0m\u001b[1;33m_validators.py\u001b[0m:\u001b[94m118\u001b[0m in \u001b[92m_inner_fn\u001b[0m     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m115 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mif\u001b[0m check_use_auth_token:                                                           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m116 \u001b[0m\u001b[2m│   │   │   \u001b[0mkwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[91m__name__\u001b[0m, has_token=ha   \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m117 \u001b[0m\u001b[2m│   │   \u001b[0m                                                                                   \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m118 \u001b[2m│   │   \u001b[0m\u001b[94mreturn\u001b[0m fn(*args, **kwargs)                                                         \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m119 \u001b[0m\u001b[2m│   \u001b[0m                                                                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m120 \u001b[0m\u001b[2m│   \u001b[0m\u001b[94mreturn\u001b[0m _inner_fn  \u001b[2m# type: ignore\u001b[0m                                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m121 \u001b[0m                                                                                           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/\u001b[0m\u001b[1;33mrepository.py\u001b[0m:\u001b[94m516\u001b[0m in \u001b[92m__init__\u001b[0m             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 513 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m.huggingface_token = HfFolder.get_token()                                 \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 514 \u001b[0m\u001b[2m│   │   \u001b[0m                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 515 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mif\u001b[0m clone_from \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m:                                                        \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 516 \u001b[2m│   │   │   \u001b[0m\u001b[96mself\u001b[0m.clone_from(repo_url=clone_from)                                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 517 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94melse\u001b[0m:                                                                             \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 518 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[94mif\u001b[0m is_git_repo(\u001b[96mself\u001b[0m.local_dir):                                               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 519 \u001b[0m\u001b[2m│   │   │   │   \u001b[0mlogger.debug(\u001b[33m\"\u001b[0m\u001b[33m[Repository] is a valid git repo\u001b[0m\u001b[33m\"\u001b[0m)                          \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/\u001b[0m\u001b[1;33m_validators.py\u001b[0m:\u001b[94m118\u001b[0m in \u001b[92m_inner_fn\u001b[0m     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m115 \u001b[0m\u001b[2m│   │   \u001b[0m\u001b[94mif\u001b[0m check_use_auth_token:                                                           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m116 \u001b[0m\u001b[2m│   │   │   \u001b[0mkwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[91m__name__\u001b[0m, has_token=ha   \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m117 \u001b[0m\u001b[2m│   │   \u001b[0m                                                                                   \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m118 \u001b[2m│   │   \u001b[0m\u001b[94mreturn\u001b[0m fn(*args, **kwargs)                                                         \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m119 \u001b[0m\u001b[2m│   \u001b[0m                                                                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m120 \u001b[0m\u001b[2m│   \u001b[0m\u001b[94mreturn\u001b[0m _inner_fn  \u001b[2m# type: ignore\u001b[0m                                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m121 \u001b[0m                                                                                           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/\u001b[0m\u001b[1;33mrepository.py\u001b[0m:\u001b[94m680\u001b[0m in \u001b[92mclone_from\u001b[0m           \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m                                                                                                  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 677 \u001b[0m\u001b[2m│   │   │   \u001b[0m\u001b[94melse\u001b[0m:                                                                         \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 678 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m\u001b[2m# Check if the folder is the root of a git repository\u001b[0m                     \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 679 \u001b[0m\u001b[2m│   │   │   │   \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m is_git_repo(\u001b[96mself\u001b[0m.local_dir):                                       \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 680 \u001b[2m│   │   │   │   │   \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mEnvironmentError\u001b[0m(                                               \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 681 \u001b[0m\u001b[2m│   │   │   │   │   │   \u001b[0m\u001b[33m\"\u001b[0m\u001b[33mTried to clone a repository in a non-empty folder that isn\u001b[0m\u001b[33m'\u001b[0m\u001b[33mt\u001b[0m\u001b[33m\"\u001b[0m    \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 682 \u001b[0m\u001b[2m│   │   │   │   │   │   \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m a git repository (\u001b[0m\u001b[33m'\u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.local_dir\u001b[33m}\u001b[0m\u001b[33m'\u001b[0m\u001b[33m). If you really want to\u001b[0m\u001b[33m\"\u001b[0m  \u001b[31m│\u001b[0m\n",
+       "\u001b[31m│\u001b[0m   \u001b[2m 683 \u001b[0m\u001b[2m│   │   │   │   │   │   \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m do this, do it manually:\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33m cd \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.local_dir\u001b[33m}\u001b[0m\u001b[33m && git init\u001b[0m\u001b[33m\"\u001b[0m    \u001b[31m│\u001b[0m\n",
+       "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n",
+       "\u001b[1;91mOSError: \u001b[0mTried to clone a repository in a non-empty folder that isn't a git repository \n",
+       "\u001b[1m(\u001b[0m\u001b[32m'/notebooks/9wimu9/sinhala-bert-1'\u001b[0m\u001b[1m)\u001b[0m. If you really want to do this, do it manually:\n",
+       " cd \u001b[35m/notebooks/9wimu9/\u001b[0m\u001b[95msinhala-bert-1\u001b[0m && git init && git remote add origin && git pull origin main\n",
+       " or clone repo to a new folder and move your existing files there afterwards.\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "trainer.push_to_hub()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "d3417a50-f0a7-4cd7-bc3b-14106660be58",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:55:51.765511Z",
+     "iopub.status.busy": "2023-07-09T12:55:51.764785Z",
+     "iopub.status.idle": "2023-07-09T12:55:53.074194Z",
+     "shell.execute_reply": "2023-07-09T12:55:53.073512Z",
+     "shell.execute_reply.started": "2023-07-09T12:55:51.765481Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "trainer.save_model(\"path_to_save\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "6a3b42de-552c-41fc-a454-afe8a0bf567d",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T12:58:09.073605Z",
+     "iopub.status.busy": "2023-07-09T12:58:09.073328Z",
+     "iopub.status.idle": "2023-07-09T12:58:13.289346Z",
+     "shell.execute_reply": "2023-07-09T12:58:13.288684Z",
+     "shell.execute_reply.started": "2023-07-09T12:58:09.073583Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at /notebooks/path_to_save were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']\n",
+      "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+      "Some weights of RobertaModel were not initialized from the model checkpoint at /notebooks/path_to_save and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoModel \n",
+    "model = AutoModel.from_pretrained('/notebooks/path_to_save',local_files_only=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "b6f2c49a-9a09-4949-b67f-29df6d0aa895",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-09T13:02:12.261157Z",
+     "iopub.status.busy": "2023-07-09T13:02:12.260199Z",
+     "iopub.status.idle": "2023-07-09T13:03:41.483513Z",
+     "shell.execute_reply": "2023-07-09T13:03:41.483062Z",
+     "shell.execute_reply.started": "2023-07-09T13:02:12.261124Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "19ea017fbdf04010b52469760f205626",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CommitInfo(commit_url='https://huggingface.co/9wimu9/sinhala-bert-1.1/commit/97e53a1d1bfd88984d639b643028a19ae2700b75', commit_message='Upload model', commit_description='', oid='97e53a1d1bfd88984d639b643028a19ae2700b75', pr_url=None, pr_revision=None, pr_num=None)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.push_to_hub('9wimu9/sinhala-bert-1.1')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4553ec7-1e38-4b44-8c5f-e46786cd3cfc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import HfApi\n",
+    "api = HfApi()\n",
+    "files = ['tokenizer.json','training_args.bin']\n",
+    "for file in files:\n",
+    "    api.upload_file(\n",
+    "        path_or_fileobj=\"/notebooks/path_to_save/\"+file,\n",
+    "        path_in_repo=file,\n",
+    "        repo_id=\"9wimu9/sinhala-bert-1.1\",\n",
+    "        repo_type=\"model\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1614503-df5d-454f-a81d-d96bb1899443",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "learning rate scheduler details can be find here\n",
+    "https://dev.classmethod.jp/articles/huggingface-usage-scheluder-type/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cd184295-1c0b-4625-a516-da417beb814f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bert hyper params\n",
+    "======================\n",
+    "β1 = 0.9,\n",
+    "β2 = 0.999, \n",
+    "ǫ = 1e-6\n",
+    "L2 weight decay = 0.01\n",
+    "learning rate = warmed up first 10k to a peak of 1e-4 then linearly decayed\n",
+    "drop out 0.1\n",
+    "batch size = 256\n",
+    "step size = 1m\n",
+    "max_token_length = 512\n",
+    "\n",
+    "roberta\n",
+    "============\n",
+    "β2 = 0.98 for lareg batch sizs\n",
+    "max_token_length = 512\n",
+    "batch size = 2k\n",
+    "lr = 7e-4\n",
+    "\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}