diff --git "a/trainer.ipynb" "b/trainer.ipynb" new file mode 100644--- /dev/null +++ "b/trainer.ipynb" @@ -0,0 +1,1766 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6b7d6a9b-db7e-46b2-8ab8-d6914e18f1e1", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:00:39.926249Z", + "iopub.status.busy": "2023-07-09T08:00:39.925711Z", + "iopub.status.idle": "2023-07-09T08:00:50.779648Z", + "shell.execute_reply": "2023-07-09T08:00:50.779080Z", + "shell.execute_reply.started": "2023-07-09T08:00:39.926227Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install datasets transformers accelerate wandb -U -q" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a2a349ee-2749-4490-9807-cdf18f428181", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:00:50.780814Z", + "iopub.status.busy": "2023-07-09T08:00:50.780645Z", + "iopub.status.idle": "2023-07-09T08:00:55.743251Z", + "shell.execute_reply": "2023-07-09T08:00:55.742619Z", + "shell.execute_reply.started": "2023-07-09T08:00:50.780797Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········································\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import wandb\n", + "wandb.login()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bce905bd-4fc3-4a6a-bd76-48bba9ebc1d9", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:00:55.744653Z", + "iopub.status.busy": "2023-07-09T08:00:55.744235Z", + "iopub.status.idle": "2023-07-09T08:00:55.748644Z", + "shell.execute_reply": "2023-07-09T08:00:55.747937Z", + "shell.execute_reply.started": "2023-07-09T08:00:55.744631Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: WANDB_PROJECT=sinhala_bert_v1\n" + ] + } + ], + "source": [ + "%env WANDB_PROJECT=sinhala_bert_v1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2551a71d-2804-48ed-bf85-6e0aa94d47d8", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:00:55.750683Z", + "iopub.status.busy": "2023-07-09T08:00:55.750064Z", + "iopub.status.idle": "2023-07-09T08:00:55.753691Z", + "shell.execute_reply": "2023-07-09T08:00:55.753148Z", + "shell.execute_reply.started": "2023-07-09T08:00:55.750659Z" + } + }, + "outputs": [], + "source": [ + "model_checkpoint = \"9wimu9/sinhala-bert-1\"\n", + "tokenizer_checkpoint= 
\"9wimu9/sinhala-bert-1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "137a18d4-6fc5-4bfe-bf3c-54f4bdef5f4b", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:00:55.755128Z", + "iopub.status.busy": "2023-07-09T08:00:55.754589Z", + "iopub.status.idle": "2023-07-09T08:01:58.423431Z", + "shell.execute_reply": "2023-07-09T08:01:58.422900Z", + "shell.execute_reply.started": "2023-07-09T08:00:55.755103Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "85ae51907f1b4e6188ef42ab98704c4b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading readme: 0%| | 0.00/608 [00:00 BERT large\n", + "# 24 layers, 1,024 dimensions, 16 heads, 4,096 hidden dimensions in the feed-forward layer, with pre-layer normalization\n", + "\n", + "\n", + "# We follow the optimization of RoBERTa (Liu et al., 2019) and use \n", + "# AdamW (Loshchilov and Hutter, 2019) with \n", + "# β1 = 0.9, β2 = 0.98, ε = 1e-6, \n", + "# weight decay of 0.01, dropout 0.1, and \n", + "# attention dropout 0.1.\n", + "\n", + "\n", + "# Hyperparameters\n", + "\n", + "# batch size -> 4k, 8k, and 16k (via gradient accumilation)\n", + "\n", + "# Warmup Proportion (wu) We determine the number of warmup steps as a proportion of the total number of steps. \n", + "# Specifically, we try 0%, 2%, 4%, and 6%, which all reflect significantly fewer warmup steps than in BERT.\n", + "\n", + "# Peak Learning Rate (lr) Our linear learning rate scheduler, \n", + "# which starts at 0, warms up to the peak learning rate, and then decays back to 0. We try 5e-4, 1e-3, and 2e-3\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "858cd60b-32c4-4c0f-859e-10a1ee3bf68e", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:02:03.534149Z", + "iopub.status.busy": "2023-07-09T08:02:03.533455Z", + "iopub.status.idle": "2023-07-09T08:02:03.565628Z", + "shell.execute_reply": "2023-07-09T08:02:03.565017Z", + "shell.execute_reply.started": "2023-07-09T08:02:03.534132Z" + } + }, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained('/notebooks/roberta-large-pretrained-si',model_max_length=256)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5812f8da-3434-4ec8-a2e6-a6bdc30ecf72", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:02:03.566886Z", + "iopub.status.busy": "2023-07-09T08:02:03.566690Z", + "iopub.status.idle": "2023-07-09T08:02:03.570597Z", + "shell.execute_reply": "2023-07-09T08:02:03.569974Z", + "shell.execute_reply.started": "2023-07-09T08:02:03.566870Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "RobertaTokenizerFast(name_or_path='/notebooks/roberta-large-pretrained-si', vocab_size=1868, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'sep_token': '', 'pad_token': '', 'cls_token': '', 'mask_token': AddedToken(\"\", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0905ef8c-9faa-49d6-ad0a-06753ce856fa", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:02:03.572556Z", + "iopub.status.busy": 
"2023-07-09T08:02:03.572022Z", + "iopub.status.idle": "2023-07-09T08:02:03.575094Z", + "shell.execute_reply": "2023-07-09T08:02:03.574563Z", + "shell.execute_reply.started": "2023-07-09T08:02:03.572539Z" + } + }, + "outputs": [], + "source": [ + "per_device_train_batch_size=250\n", + "gradient_accumulation_steps=8\n", + "num_train_epochs=1\n", + "warmup_rate=0.01" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6056f333-46f9-4bea-a93d-423f3a1a959e", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:02:03.576023Z", + "iopub.status.busy": "2023-07-09T08:02:03.575858Z", + "iopub.status.idle": "2023-07-09T08:02:05.698372Z", + "shell.execute_reply": "2023-07-09T08:02:05.697548Z", + "shell.execute_reply.started": "2023-07-09T08:02:03.576009Z" + } + }, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "training_args = TrainingArguments(\n", + " model_checkpoint,\n", + " evaluation_strategy = \"epoch\",\n", + " # push_to_hub=True,\n", + " hub_model_id=\"sinhala-bert-v.1\",\n", + " per_device_train_batch_size=per_device_train_batch_size, # 4000,8000,16000\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " gradient_checkpointing=True,\n", + " fp16=True,\n", + " report_to=\"wandb\", \n", + " num_train_epochs=num_train_epochs,\n", + " no_cuda=False,\n", + " logging_steps=1,\n", + " save_steps=100,\n", + " save_total_limit=3,\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7f6078f0-ba64-4509-ac8f-39dd0cd7fe04", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:02:05.699733Z", + "iopub.status.busy": "2023-07-09T08:02:05.699536Z", + "iopub.status.idle": "2023-07-09T08:02:05.708689Z", + "shell.execute_reply": "2023-07-09T08:02:05.708136Z", + "shell.execute_reply.started": "2023-07-09T08:02:05.699715Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(7310725, 3656, 36, 2000)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import get_polynomial_decay_schedule_with_warmup,AdamW,get_linear_schedule_with_warmup\n", + "import math,torch\n", + "\n", + "params = filter(lambda x: x.requires_grad, model.parameters())\n", + "\n", + "optimizer = torch.optim.AdamW(params,lr=1e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.01)\n", + "\n", + "batch_size = per_device_train_batch_size*gradient_accumulation_steps\n", + "\n", + "num_warmup_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * warmup_rate*num_train_epochs\n", + "num_warmup_steps = int(num_warmup_steps)\n", + "num_training_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * num_train_epochs\n", + "\n", + "\n", + "scheduler = get_linear_schedule_with_warmup(optimizer,\n", + " num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)\n", + "\n", + "lm_datasets[\"train\"].num_rows,num_training_steps,num_warmup_steps,batch_size" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ebf14d20-e630-4961-a6d4-d9c8fa90e941", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:53:11.116082Z", + "iopub.status.busy": "2023-07-09T12:53:11.115500Z", + "iopub.status.idle": "2023-07-09T12:53:16.314031Z", + "shell.execute_reply": "2023-07-09T12:53:16.309782Z", + "shell.execute_reply.started": "2023-07-09T12:53:11.116055Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading package lists... 
Done\n", + "Building dependency tree \n", + "Reading state information... Done\n", + "The following NEW packages will be installed:\n", + " git-lfs\n", + "0 upgraded, 1 newly installed, 0 to remove and 3 not upgraded.\n", + "Need to get 3316 kB of archives.\n", + "After this operation, 11.1 MB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]\n", + "Fetched 3316 kB in 1s (3308 kB/s) \u001b[0m[33m\u001b[33m\n", + "\n", + "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n", + "(Reading database ... 69943 files and directories currently installed.)\n", + "Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...\n", + "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (2.9.2-1) ...\n", + "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (2.9.2-1) ...\n", + "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8Processing triggers for man-db (2.9.1-1) ...\n", + "\n", + "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J" + ] + } + ], + "source": [ + "!sudo apt install git-lfs" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "969484c6-4035-4234-8ac7-209ab4a014bc", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:16:17.950069Z", + "iopub.status.busy": "2023-07-09T12:16:17.949152Z", + "iopub.status.idle": "2023-07-09T12:16:17.961269Z", + "shell.execute_reply": "2023-07-09T12:16:17.960724Z", + "shell.execute_reply.started": "2023-07-09T12:16:17.950036Z" + } + }, + "outputs": [], + "source": [ + "from transformers import DataCollatorForLanguageModeling,Trainer\n", + "\n", + "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)\n", + "\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=lm_datasets[\"train\"],\n", + " eval_dataset=lm_datasets[\"valid\"],\n", + " data_collator=data_collator,\n", + " optimizers=[optimizer, scheduler]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4c2f4490-b3bc-4ec6-bef1-2bd71933369a", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:53:22.338679Z", + "iopub.status.busy": "2023-07-09T12:53:22.337791Z", + "iopub.status.idle": "2023-07-09T12:53:34.948476Z", + "shell.execute_reply": "2023-07-09T12:53:34.947890Z", + "shell.execute_reply.started": "2023-07-09T12:53:22.338679Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "Waiting for W&B process to finish... (success)." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "

Run history:


eval/loss
eval/runtime
eval/samples_per_second
eval/steps_per_second
train/epoch ▁▁▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████████▁
train/learning_rate ███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss ▆█▇▇▃▂▇█▇▄▄▅▆▂▁▂▁▃██▁▃▃▅▆▄▄█▆▅▅▅▅▄▆▄▄▆▃▃

Run summary:


eval/loss 4.36183
eval/runtime 1852.7969
eval/samples_per_second 219.042
eval/steps_per_second 27.381
train/epoch 0.96
train/global_step 0
train/learning_rate 0.0
train/loss 4.3709

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run valiant-durian-58 at: https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait
Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Find logs at: ./wandb/run-20230709_080222-4506wait/logs" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wandb.finish()\n", + "# wandb.init()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "17979cc2-2e66-4055-aabb-29d9ee90112d", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-08T07:31:19.523715Z", + "iopub.status.busy": "2023-07-08T07:31:19.523529Z", + "iopub.status.idle": "2023-07-08T07:31:20.383711Z", + "shell.execute_reply": "2023-07-08T07:31:20.382696Z", + "shell.execute_reply.started": "2023-07-08T07:31:19.523696Z" + } + }, + "outputs": [], + "source": [ + "# !rm -rf /notebooks/9wimu9/sinhala-bert-1" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b8bd0ab4-6412-4c0c-a215-a0c5cd5d4626", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T08:02:07.964267Z", + "iopub.status.busy": "2023-07-09T08:02:07.963688Z", + "iopub.status.idle": "2023-07-09T12:16:04.192592Z", + "shell.execute_reply": "2023-07-09T12:16:04.191499Z", + "shell.execute_reply.started": "2023-07-09T08:02:07.964251Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to the W&B docs." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m9wimu9\u001b[0m (\u001b[33msinquad\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.15.5" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /notebooks/wandb/run-20230709_080222-4506wait" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run valiant-durian-58 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/sinquad/sinhala_bert_v1" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/sinquad/sinhala_bert_v1/runs/4506wait" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [3504/3655 4:12:37 < 1:15:59, 0.03 it/s, Epoch 0.96/1]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Epoch  Training Loss  Validation Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>                                                                                      \n",
+       "                                                                                                  \n",
+       " 1 trainer.train(resume_from_checkpoint = True)                                                 \n",
+       "   2 wandb.finish()                                                                               \n",
+       "   3                                                                                              \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:1645 in train                     \n",
+       "                                                                                                  \n",
+       "   1642 │   │   inner_training_loop = find_executable_batch_size(                                 \n",
+       "   1643 │   │   │   self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size  \n",
+       "   1644 │   │   )                                                                                 \n",
+       " 1645 │   │   return inner_training_loop(                                                       \n",
+       "   1646 │   │   │   args=args,                                                                    \n",
+       "   1647 │   │   │   resume_from_checkpoint=resume_from_checkpoint,                                \n",
+       "   1648 │   │   │   trial=trial,                                                                  \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:1938 in _inner_training_loop      \n",
+       "                                                                                                  \n",
+       "   1935 │   │   │   │   │   self.control = self.callback_handler.on_step_begin(args, self.state,  \n",
+       "   1936 │   │   │   │                                                                             \n",
+       "   1937 │   │   │   │   with self.accelerator.accumulate(model):                                  \n",
+       " 1938 │   │   │   │   │   tr_loss_step = self.training_step(model, inputs)                      \n",
+       "   1939 │   │   │   │                                                                             \n",
+       "   1940 │   │   │   │   if (                                                                      \n",
+       "   1941 │   │   │   │   │   args.logging_nan_inf_filter                                           \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:2770 in training_step             \n",
+       "                                                                                                  \n",
+       "   2767 │   │   │   with amp.scale_loss(loss, self.optimizer) as scaled_loss:                     \n",
+       "   2768 │   │   │   │   scaled_loss.backward()                                                    \n",
+       "   2769 │   │   else:                                                                             \n",
+       " 2770 │   │   │   self.accelerator.backward(loss)                                               \n",
+       "   2771 │   │                                                                                     \n",
+       "   2772 │   │   return loss.detach() / self.args.gradient_accumulation_steps                      \n",
+       "   2773                                                                                           \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/accelerate/accelerator.py:1819 in backward                \n",
+       "                                                                                                  \n",
+       "   1816 │   │   elif self.distributed_type == DistributedType.MEGATRON_LM:                        \n",
+       "   1817 │   │   │   return                                                                        \n",
+       "   1818 │   │   elif self.scaler is not None:                                                     \n",
+       " 1819 │   │   │   self.scaler.scale(loss).backward(**kwargs)                                    \n",
+       "   1820 │   │   else:                                                                             \n",
+       "   1821 │   │   │   loss.backward(**kwargs)                                                       \n",
+       "   1822                                                                                           \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/torch/_tensor.py:396 in backward                          \n",
+       "                                                                                                  \n",
+       "    393 │   │   │   │   retain_graph=retain_graph,                                                \n",
+       "    394 │   │   │   │   create_graph=create_graph,                                                \n",
+       "    395 │   │   │   │   inputs=inputs)                                                            \n",
+       "  396 │   │   torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=input  \n",
+       "    397 │                                                                                         \n",
+       "    398 │   def register_hook(self, hook):                                                        \n",
+       "    399 │   │   r\"\"\"Registers a backward hook.                                                    \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/torch/autograd/__init__.py:173 in backward                \n",
+       "                                                                                                  \n",
+       "   170 │   # The reason we repeat same the comment below is that                                  \n",
+       "   171 │   # some Python versions print out the first line of a multi-line function               \n",
+       "   172 │   # calls in the traceback and some print out the last line                              \n",
+       " 173 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the bac   \n",
+       "   174 │   │   tensors, grad_tensors_, retain_graph, create_graph, inputs,                        \n",
+       "   175 │   │   allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to ru   \n",
+       "   176                                                                                            \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "KeyboardInterrupt\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 trainer.train(resume_from_checkpoint = \u001b[94mTrue\u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0mwandb.finish() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m1645\u001b[0m in \u001b[92mtrain\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1642 \u001b[0m\u001b[2m│ │ \u001b[0minner_training_loop = find_executable_batch_size( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1643 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._inner_training_loop, \u001b[96mself\u001b[0m._train_batch_size, args.auto_find_batch_size \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1644 \u001b[0m\u001b[2m│ │ \u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1645 \u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m inner_training_loop( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1646 \u001b[0m\u001b[2m│ │ │ \u001b[0margs=args, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1647 \u001b[0m\u001b[2m│ │ │ \u001b[0mresume_from_checkpoint=resume_from_checkpoint, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1648 \u001b[0m\u001b[2m│ │ │ \u001b[0mtrial=trial, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m1938\u001b[0m in \u001b[92m_inner_training_loop\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1935 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[96mself\u001b[0m.control = \u001b[96mself\u001b[0m.callback_handler.on_step_begin(args, \u001b[96mself\u001b[0m.state, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1936 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1937 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mwith\u001b[0m \u001b[96mself\u001b[0m.accelerator.accumulate(model): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1938 \u001b[2m│ │ │ │ │ \u001b[0mtr_loss_step = \u001b[96mself\u001b[0m.training_step(model, inputs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1939 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1940 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m ( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1941 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0margs.logging_nan_inf_filter \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m2770\u001b[0m in \u001b[92mtraining_step\u001b[0m 
\u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2767 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mwith\u001b[0m amp.scale_loss(loss, \u001b[96mself\u001b[0m.optimizer) \u001b[94mas\u001b[0m scaled_loss: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2768 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mscaled_loss.backward() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2769 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m2770 \u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.accelerator.backward(loss) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2771 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2772 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m loss.detach() / \u001b[96mself\u001b[0m.args.gradient_accumulation_steps \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2773 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/accelerate/\u001b[0m\u001b[1;33maccelerator.py\u001b[0m:\u001b[94m1819\u001b[0m in \u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1816 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melif\u001b[0m \u001b[96mself\u001b[0m.distributed_type == DistributedType.MEGATRON_LM: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1817 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1818 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melif\u001b[0m \u001b[96mself\u001b[0m.scaler \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1819 \u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.scaler.scale(loss).backward(**kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1820 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1821 \u001b[0m\u001b[2m│ │ │ \u001b[0mloss.backward(**kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m1822 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/torch/\u001b[0m\u001b[1;33m_tensor.py\u001b[0m:\u001b[94m396\u001b[0m in \u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 393 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mretain_graph=retain_graph, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 394 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mcreate_graph=create_graph, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 395 \u001b[0m\u001b[2m│ │ │ │ \u001b[0minputs=inputs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 396 \u001b[2m│ │ \u001b[0mtorch.autograd.backward(\u001b[96mself\u001b[0m, gradient, retain_graph, create_graph, inputs=input \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 397 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 398 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92mregister_hook\u001b[0m(\u001b[96mself\u001b[0m, hook): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 399 \u001b[0m\u001b[2;90m│ │ 
\u001b[0m\u001b[33mr\u001b[0m\u001b[33m\"\"\"Registers a backward hook.\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/torch/autograd/\u001b[0m\u001b[1;33m__init__.py\u001b[0m:\u001b[94m173\u001b[0m in \u001b[92mbackward\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m170 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# The reason we repeat same the comment below is that\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m171 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# some Python versions print out the first line of a multi-line function\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m172 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# calls in the traceback and some print out the last line\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m173 \u001b[2m│ \u001b[0mVariable._execution_engine.run_backward( \u001b[2m# Calls into the C++ engine to run the bac\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m174 \u001b[0m\u001b[2m│ │ \u001b[0mtensors, grad_tensors_, retain_graph, create_graph, inputs, \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m175 \u001b[0m\u001b[2m│ │ \u001b[0mallow_unreachable=\u001b[94mTrue\u001b[0m, accumulate_grad=\u001b[94mTrue\u001b[0m) \u001b[2m# Calls into the C++ engine to ru\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m176 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mKeyboardInterrupt\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "trainer.train(resume_from_checkpoint = True)\n", + "wandb.finish()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "399d27d2-13b6-45e4-bf68-9ff30ad0ec1d", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-07T08:43:09.377158Z", + "iopub.status.busy": "2023-07-07T08:43:09.376340Z", + "iopub.status.idle": "2023-07-07T08:43:09.380265Z", + "shell.execute_reply": "2023-07-07T08:43:09.379781Z", + "shell.execute_reply.started": "2023-07-07T08:43:09.377127Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perplexity: 236.31\n" + ] + } + ], + "source": [ + "import math\n", + "print(f\"Perplexity: {math.exp(5.465127):.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bbf22bea-7026-42c9-a643-ba65ab8cdbff", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:16:26.337814Z", + "iopub.status.busy": "2023-07-09T12:16:26.337328Z", + "iopub.status.idle": "2023-07-09T12:47:19.149806Z", + "shell.execute_reply": "2023-07-09T12:47:19.149298Z", + "shell.execute_reply.started": "2023-07-09T12:16:26.337789Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [50731/50731 30:52]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Perplexity: 78.40\n" + ] + } + ], + "source": [ + "eval_results = trainer.evaluate()\n", + "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7fd81e73-7c9c-401e-8b8d-8e2a843bb7c7", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:53:37.509234Z", + "iopub.status.busy": "2023-07-09T12:53:37.508417Z", + "iopub.status.idle": "2023-07-09T12:53:38.733085Z", + "shell.execute_reply": "2023-07-09T12:53:38.732326Z", + "shell.execute_reply.started": "2023-07-09T12:53:37.509205Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>                                                                                      \n",
+       "                                                                                                  \n",
+       " 1 trainer.push_to_hub()                                                                        \n",
+       "   2                                                                                              \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:3716 in push_to_hub               \n",
+       "                                                                                                  \n",
+       "   3713 │   │   # If a user calls manually `push_to_hub` with `self.args.push_to_hub = False`, w  \n",
+       "   3714 │   │   # it might fail.                                                                  \n",
+       "   3715 │   │   if not hasattr(self, \"repo\"):                                                     \n",
+       " 3716 │   │   │   self.init_git_repo()                                                          \n",
+       "   3717 │   │                                                                                     \n",
+       "   3718 │   │   model_name = kwargs.pop(\"model_name\", None)                                       \n",
+       "   3719 │   │   if model_name is None and self.args.should_save:                                  \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/transformers/trainer.py:3571 in init_git_repo             \n",
+       "                                                                                                  \n",
+       "   3568 │   │   # Make sure the repo exists.                                                      \n",
+       "   3569 │   │   create_repo(repo_name, token=self.args.hub_token, private=self.args.hub_private_  \n",
+       "   3570 │   │   try:                                                                              \n",
+       " 3571 │   │   │   self.repo = Repository(self.args.output_dir, clone_from=repo_name, token=sel  \n",
+       "   3572 │   │   except EnvironmentError:                                                          \n",
+       "   3573 │   │   │   if self.args.overwrite_output_dir and at_init:                                \n",
+       "   3574 │   │   │   │   # Try again after wiping output_dir                                       \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/_validators.py:118 in _inner_fn     \n",
+       "                                                                                                  \n",
+       "   115 │   │   if check_use_auth_token:                                                           \n",
+       "   116 │   │   │   kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=ha   \n",
+       "   117 │   │                                                                                      \n",
+       " 118 │   │   return fn(*args, **kwargs)                                                         \n",
+       "   119 │                                                                                          \n",
+       "   120 │   return _inner_fn  # type: ignore                                                       \n",
+       "   121                                                                                            \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/huggingface_hub/repository.py:516 in __init__             \n",
+       "                                                                                                  \n",
+       "    513 │   │   │   self.huggingface_token = HfFolder.get_token()                                 \n",
+       "    514 │   │                                                                                     \n",
+       "    515 │   │   if clone_from is not None:                                                        \n",
+       "  516 │   │   │   self.clone_from(repo_url=clone_from)                                          \n",
+       "    517 │   │   else:                                                                             \n",
+       "    518 │   │   │   if is_git_repo(self.local_dir):                                               \n",
+       "    519 │   │   │   │   logger.debug(\"[Repository] is a valid git repo\")                          \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/_validators.py:118 in _inner_fn     \n",
+       "                                                                                                  \n",
+       "   115 │   │   if check_use_auth_token:                                                           \n",
+       "   116 │   │   │   kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=ha   \n",
+       "   117 │   │                                                                                      \n",
+       " 118 │   │   return fn(*args, **kwargs)                                                         \n",
+       "   119 │                                                                                          \n",
+       "   120 │   return _inner_fn  # type: ignore                                                       \n",
+       "   121                                                                                            \n",
+       "                                                                                                  \n",
+       " /usr/local/lib/python3.9/dist-packages/huggingface_hub/repository.py:680 in clone_from           \n",
+       "                                                                                                  \n",
+       "    677 │   │   │   else:                                                                         \n",
+       "    678 │   │   │   │   # Check if the folder is the root of a git repository                     \n",
+       "    679 │   │   │   │   if not is_git_repo(self.local_dir):                                       \n",
+       "  680 │   │   │   │   │   raise EnvironmentError(                                               \n",
+       "    681 │   │   │   │   │   │   \"Tried to clone a repository in a non-empty folder that isn't\"    \n",
+       "    682 │   │   │   │   │   │   f\" a git repository ('{self.local_dir}'). If you really want to\"  \n",
+       "    683 │   │   │   │   │   │   f\" do this, do it manually:\\n cd {self.local_dir} && git init\"    \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "OSError: Tried to clone a repository in a non-empty folder that isn't a git repository \n",
+       "('/notebooks/9wimu9/sinhala-bert-1'). If you really want to do this, do it manually:\n",
+       " cd /notebooks/9wimu9/sinhala-bert-1 && git init && git remote add origin && git pull origin main\n",
+       " or clone repo to a new folder and move your existing files there afterwards.\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", + "\u001b[31m│\u001b[0m in \u001b[92m\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m1 trainer.push_to_hub() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m3716\u001b[0m in \u001b[92mpush_to_hub\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3713 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# If a user calls manually `push_to_hub` with `self.args.push_to_hub = False`, w\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3714 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# it might fail.\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3715 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m \u001b[96mhasattr\u001b[0m(\u001b[96mself\u001b[0m, \u001b[33m\"\u001b[0m\u001b[33mrepo\u001b[0m\u001b[33m\"\u001b[0m): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m3716 \u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.init_git_repo() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3717 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3718 \u001b[0m\u001b[2m│ │ \u001b[0mmodel_name = kwargs.pop(\u001b[33m\"\u001b[0m\u001b[33mmodel_name\u001b[0m\u001b[33m\"\u001b[0m, \u001b[94mNone\u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3719 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m model_name \u001b[95mis\u001b[0m \u001b[94mNone\u001b[0m \u001b[95mand\u001b[0m \u001b[96mself\u001b[0m.args.should_save: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/transformers/\u001b[0m\u001b[1;33mtrainer.py\u001b[0m:\u001b[94m3571\u001b[0m in \u001b[92minit_git_repo\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3568 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# Make sure the repo exists.\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3569 \u001b[0m\u001b[2m│ │ \u001b[0mcreate_repo(repo_name, token=\u001b[96mself\u001b[0m.args.hub_token, private=\u001b[96mself\u001b[0m.args.hub_private_ \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3570 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m3571 \u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.repo = Repository(\u001b[96mself\u001b[0m.args.output_dir, clone_from=repo_name, token=\u001b[96msel\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3572 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mEnvironmentError\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m3573 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.args.overwrite_output_dir \u001b[95mand\u001b[0m at_init: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m 
\u001b[2m3574 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[2m# Try again after wiping output_dir\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/\u001b[0m\u001b[1;33m_validators.py\u001b[0m:\u001b[94m118\u001b[0m in \u001b[92m_inner_fn\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m115 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m check_use_auth_token: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m116 \u001b[0m\u001b[2m│ │ │ \u001b[0mkwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[91m__name__\u001b[0m, has_token=ha \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m117 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m118 \u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m fn(*args, **kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m119 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m120 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m _inner_fn \u001b[2m# type: ignore\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m121 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/\u001b[0m\u001b[1;33mrepository.py\u001b[0m:\u001b[94m516\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 513 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.huggingface_token = HfFolder.get_token() \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 514 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 515 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m clone_from \u001b[95mis\u001b[0m \u001b[95mnot\u001b[0m \u001b[94mNone\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 516 \u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m.clone_from(repo_url=clone_from) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 517 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 518 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m is_git_repo(\u001b[96mself\u001b[0m.local_dir): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 519 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.debug(\u001b[33m\"\u001b[0m\u001b[33m[Repository] is a valid git repo\u001b[0m\u001b[33m\"\u001b[0m) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/utils/\u001b[0m\u001b[1;33m_validators.py\u001b[0m:\u001b[94m118\u001b[0m in \u001b[92m_inner_fn\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m115 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m check_use_auth_token: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m116 \u001b[0m\u001b[2m│ │ │ \u001b[0mkwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[91m__name__\u001b[0m, has_token=ha \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m117 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m118 \u001b[2m│ │ 
\u001b[0m\u001b[94mreturn\u001b[0m fn(*args, **kwargs) \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m119 \u001b[0m\u001b[2m│ \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m120 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mreturn\u001b[0m _inner_fn \u001b[2m# type: ignore\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m121 \u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2;33m/usr/local/lib/python3.9/dist-packages/huggingface_hub/\u001b[0m\u001b[1;33mrepository.py\u001b[0m:\u001b[94m680\u001b[0m in \u001b[92mclone_from\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 677 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 678 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[2m# Check if the folder is the root of a git repository\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 679 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m is_git_repo(\u001b[96mself\u001b[0m.local_dir): \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[31m❱ \u001b[0m 680 \u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mEnvironmentError\u001b[0m( \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 681 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33mTried to clone a repository in a non-empty folder that isn\u001b[0m\u001b[33m'\u001b[0m\u001b[33mt\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 682 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m a git repository (\u001b[0m\u001b[33m'\u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.local_dir\u001b[33m}\u001b[0m\u001b[33m'\u001b[0m\u001b[33m). If you really want to\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m│\u001b[0m \u001b[2m 683 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m do this, do it manually:\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33m cd \u001b[0m\u001b[33m{\u001b[0m\u001b[96mself\u001b[0m.local_dir\u001b[33m}\u001b[0m\u001b[33m && git init\u001b[0m\u001b[33m\"\u001b[0m \u001b[31m│\u001b[0m\n", + "\u001b[31m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mOSError: \u001b[0mTried to clone a repository in a non-empty folder that isn't a git repository \n", + "\u001b[1m(\u001b[0m\u001b[32m'/notebooks/9wimu9/sinhala-bert-1'\u001b[0m\u001b[1m)\u001b[0m. 
If you really want to do this, do it manually:\n", + " cd \u001b[35m/notebooks/9wimu9/\u001b[0m\u001b[95msinhala-bert-1\u001b[0m && git init && git remote add origin && git pull origin main\n", + " or clone repo to a new folder and move your existing files there afterwards.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "trainer.push_to_hub()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d3417a50-f0a7-4cd7-bc3b-14106660be58", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:55:51.765511Z", + "iopub.status.busy": "2023-07-09T12:55:51.764785Z", + "iopub.status.idle": "2023-07-09T12:55:53.074194Z", + "shell.execute_reply": "2023-07-09T12:55:53.073512Z", + "shell.execute_reply.started": "2023-07-09T12:55:51.765481Z" + } + }, + "outputs": [], + "source": [ + "trainer.save_model(\"path_to_save\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "6a3b42de-552c-41fc-a454-afe8a0bf567d", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T12:58:09.073605Z", + "iopub.status.busy": "2023-07-09T12:58:09.073328Z", + "iopub.status.idle": "2023-07-09T12:58:13.289346Z", + "shell.execute_reply": "2023-07-09T12:58:13.288684Z", + "shell.execute_reply.started": "2023-07-09T12:58:09.073583Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at /notebooks/path_to_save were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']\n", + "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of RobertaModel were not initialized from the model checkpoint at /notebooks/path_to_save and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "from transformers import AutoModel \n", + "model = AutoModel.from_pretrained('/notebooks/path_to_save',local_files_only=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "b6f2c49a-9a09-4949-b67f-29df6d0aa895", + "metadata": { + "execution": { + "iopub.execute_input": "2023-07-09T13:02:12.261157Z", + "iopub.status.busy": "2023-07-09T13:02:12.260199Z", + "iopub.status.idle": "2023-07-09T13:03:41.483513Z", + "shell.execute_reply": "2023-07-09T13:03:41.483062Z", + "shell.execute_reply.started": "2023-07-09T13:02:12.261124Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "19ea017fbdf04010b52469760f205626", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "pytorch_model.bin: 0%| | 0.00/1.27G [00:00
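The `trainer.push_to_hub()` call above fails because the output directory already holds checkpoints but is not a git clone, which the git-based push flow requires; the notebook then falls back to `trainer.save_model` and pushing the model directly. A hedged alternative sketch that bypasses git entirely via `huggingface_hub`; the repo id below is an assumption pieced together from `hub_model_id` in the TrainingArguments and the account name in the logs:

```python
# Hedged sketch, not the author's exact fix: upload the saved files directly
# instead of relying on Trainer's git-based push_to_hub.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="path_to_save",         # directory written by trainer.save_model above
    repo_id="9wimu9/sinhala-bert-v.1",  # assumption: hub_model_id under the 9wimu9 account
    repo_type="model",
)
```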