{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "6b7d6a9b-db7e-46b2-8ab8-d6914e18f1e1", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:34:42.903289Z", "iopub.status.busy": "2023-07-10T17:34:42.902734Z", "iopub.status.idle": "2023-07-10T17:34:53.910158Z", "shell.execute_reply": "2023-07-10T17:34:53.909356Z", "shell.execute_reply.started": "2023-07-10T17:34:42.903264Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install datasets transformers accelerate wandb -U -q" ] }, { "cell_type": "code", "execution_count": 2, "id": "a2a349ee-2749-4490-9807-cdf18f428181", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:34:53.914661Z", "iopub.status.busy": "2023-07-10T17:34:53.914431Z", "iopub.status.idle": "2023-07-10T17:35:00.111736Z", "shell.execute_reply": "2023-07-10T17:35:00.111000Z", "shell.execute_reply.started": "2023-07-10T17:34:53.914639Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:" ] }, { "name": "stdin", "output_type": "stream", "text": [ " ········································\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import wandb\n", "wandb.login()" ] }, { "cell_type": "code", "execution_count": 3, "id": "bce905bd-4fc3-4a6a-bd76-48bba9ebc1d9", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:35:00.113303Z", "iopub.status.busy": "2023-07-10T17:35:00.112906Z", "iopub.status.idle": "2023-07-10T17:35:00.118570Z", "shell.execute_reply": "2023-07-10T17:35:00.117737Z", "shell.execute_reply.started": "2023-07-10T17:35:00.113271Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: WANDB_PROJECT=sinhala_bert_v1.2\n" ] } ], "source": [ "%env WANDB_PROJECT=sinhala_bert_v1.2" ] }, { "cell_type": "code", "execution_count": 4, "id": "2551a71d-2804-48ed-bf85-6e0aa94d47d8", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:35:00.120776Z", "iopub.status.busy": "2023-07-10T17:35:00.120489Z", "iopub.status.idle": "2023-07-10T17:35:00.124426Z", "shell.execute_reply": "2023-07-10T17:35:00.123689Z", "shell.execute_reply.started": "2023-07-10T17:35:00.120749Z" } }, "outputs": [], "source": [ "model_checkpoint = \"9wimu9/sinhala-bert-1\"\n", "tokenizer_checkpoint= \"9wimu9/sinhala-bert-1\"" ] }, { "cell_type": "code", "execution_count": 5, "id": "137a18d4-6fc5-4bfe-bf3c-54f4bdef5f4b", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:35:00.126040Z", "iopub.status.busy": "2023-07-10T17:35:00.125473Z", "iopub.status.idle": "2023-07-10T17:36:06.352581Z", "shell.execute_reply": "2023-07-10T17:36:06.351983Z", 
"shell.execute_reply.started": "2023-07-10T17:35:00.126013Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d7d3d422d87f467f899071b2c4ed86b4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/608 [00:00 BERT large\n", "# 24 layers, 1,024 dimensions, 16 heads, 4,096 hidden dimensions in the feed-forward layer, with pre-layer normalization\n", "\n", "\n", "# We follow the optimization of RoBERTa (Liu et al., 2019) and use \n", "# AdamW (Loshchilov and Hutter, 2019) with \n", "# β1 = 0.9, β2 = 0.98, ε = 1e-6, \n", "# weight decay of 0.01, dropout 0.1, and \n", "# attention dropout 0.1.\n", "\n", "\n", "# Hyperparameters\n", "\n", "# batch size -> 4k, 8k, and 16k (via gradient accumilation)\n", "\n", "# Warmup Proportion (wu) We determine the number of warmup steps as a proportion of the total number of steps. \n", "# Specifically, we try 0%, 2%, 4%, and 6%, which all reflect significantly fewer warmup steps than in BERT.\n", "\n", "# Peak Learning Rate (lr) Our linear learning rate scheduler, \n", "# which starts at 0, warms up to the peak learning rate, and then decays back to 0. We try 5e-4, 1e-3, and 2e-3\n", "\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "858cd60b-32c4-4c0f-859e-10a1ee3bf68e", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:37:48.572108Z", "iopub.status.busy": "2023-07-10T17:37:48.571665Z", "iopub.status.idle": "2023-07-10T17:37:48.610050Z", "shell.execute_reply": "2023-07-10T17:37:48.609409Z", "shell.execute_reply.started": "2023-07-10T17:37:48.572101Z" } }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "# tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint,model_max_length=256)\n", "tokenizer = AutoTokenizer.from_pretrained('/notebooks/roberta-large-pretrained-si',model_max_length=256)" ] }, { "cell_type": "code", "execution_count": 13, "id": "5812f8da-3434-4ec8-a2e6-a6bdc30ecf72", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:38:51.772892Z", "iopub.status.busy": "2023-07-10T17:38:51.772628Z", "iopub.status.idle": "2023-07-10T17:38:51.777952Z", "shell.execute_reply": "2023-07-10T17:38:51.777265Z", "shell.execute_reply.started": "2023-07-10T17:38:51.772871Z" } }, "outputs": [ { "data": { "text/plain": [ "RobertaTokenizerFast(name_or_path='/notebooks/roberta-large-pretrained-si', vocab_size=1868, model_max_length=256, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'sep_token': '', 'pad_token': '', 'cls_token': '', 'mask_token': AddedToken(\"\", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer" ] }, { "cell_type": "code", "execution_count": 9, "id": "0905ef8c-9faa-49d6-ad0a-06753ce856fa", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:37:49.993189Z", "iopub.status.busy": "2023-07-10T17:37:49.992541Z", "iopub.status.idle": "2023-07-10T17:37:49.996729Z", "shell.execute_reply": "2023-07-10T17:37:49.996008Z", "shell.execute_reply.started": "2023-07-10T17:37:49.993157Z" } }, "outputs": [], "source": [ "per_device_train_batch_size=400\n", "gradient_accumulation_steps=10\n", "num_train_epochs=1\n", "warmup_rate=0.01" ] }, { "cell_type": "code", "execution_count": 10, "id": "6056f333-46f9-4bea-a93d-423f3a1a959e", "metadata": { "execution": { 
"iopub.execute_input": "2023-07-10T17:37:55.793688Z", "iopub.status.busy": "2023-07-10T17:37:55.792933Z", "iopub.status.idle": "2023-07-10T17:37:58.921474Z", "shell.execute_reply": "2023-07-10T17:37:58.920666Z", "shell.execute_reply.started": "2023-07-10T17:37:55.793660Z" } }, "outputs": [], "source": [ "from transformers import TrainingArguments\n", "training_args = TrainingArguments(\n", " model_checkpoint,\n", " evaluation_strategy = \"epoch\",\n", " # push_to_hub=True,\n", " # hub_model_id=\"sinhala-bert-v.1\",\n", " per_device_train_batch_size=per_device_train_batch_size, # 4000,8000,16000\n", " gradient_accumulation_steps=gradient_accumulation_steps,\n", " gradient_checkpointing=True,\n", " fp16=True,\n", " report_to=\"wandb\", \n", " num_train_epochs=num_train_epochs,\n", " no_cuda=False,\n", " logging_steps=1,\n", " save_steps=25,\n", " save_total_limit=3,\n", " # load_best_model_at_end=True, # whether to load the best model (in terms of loss) at the end of training\n", ")\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "7f6078f0-ba64-4509-ac8f-39dd0cd7fe04", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:38:00.867885Z", "iopub.status.busy": "2023-07-10T17:38:00.867375Z", "iopub.status.idle": "2023-07-10T17:38:00.876595Z", "shell.execute_reply": "2023-07-10T17:38:00.875989Z", "shell.execute_reply.started": "2023-07-10T17:38:00.867857Z" } }, "outputs": [ { "data": { "text/plain": [ "(7310725, 1828, 18, 4000)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import get_polynomial_decay_schedule_with_warmup,AdamW,get_linear_schedule_with_warmup\n", "import math,torch\n", "\n", "params = filter(lambda x: x.requires_grad, model.parameters())\n", "\n", "optimizer = torch.optim.AdamW(params,lr=2e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.01)\n", "\n", "batch_size = per_device_train_batch_size*gradient_accumulation_steps\n", "\n", "num_warmup_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * warmup_rate*num_train_epochs\n", "num_warmup_steps = int(num_warmup_steps)\n", "num_training_steps = math.ceil(lm_datasets[\"train\"].num_rows / batch_size) * num_train_epochs\n", "\n", "\n", "scheduler = get_linear_schedule_with_warmup(optimizer,\n", " num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)\n", "\n", "lm_datasets[\"train\"].num_rows,num_training_steps,num_warmup_steps,batch_size" ] }, { "cell_type": "code", "execution_count": 12, "id": "ebf14d20-e630-4961-a6d4-d9c8fa90e941", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T17:38:05.602802Z", "iopub.status.busy": "2023-07-10T17:38:05.602191Z", "iopub.status.idle": "2023-07-10T17:38:11.030425Z", "shell.execute_reply": "2023-07-10T17:38:11.029681Z", "shell.execute_reply.started": "2023-07-10T17:38:05.602778Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading package lists... Done\n", "Building dependency tree \n", "Reading state information... Done\n", "The following NEW packages will be installed:\n", " git-lfs\n", "0 upgraded, 1 newly installed, 0 to remove and 3 not upgraded.\n", "Need to get 3316 kB of archives.\n", "After this operation, 11.1 MB of additional disk space will be used.\n", "Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]\n", "Fetched 3316 kB in 1s (3375 kB/s) \u001b[0m33m\u001b[33m\n", "\n", "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n", "(Reading database ... 
{ "cell_type": "code", "execution_count": 19, "id": "632113ee-cdcf-45a9-a325-60eaaa1b5f1c", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T18:25:28.092991Z", "iopub.status.busy": "2023-07-10T18:25:28.092179Z", "iopub.status.idle": "2023-07-10T18:25:35.965867Z", "shell.execute_reply": "2023-07-10T18:25:35.965309Z", "shell.execute_reply.started": "2023-07-10T18:25:28.092953Z" } }, "outputs": [], "source": [ "# from transformers import RobertaForMaskedLM\n", "# model = RobertaForMaskedLM.from_pretrained(\"/notebooks/9wimu9/sinhala-bert-1/checkpoint-1625\")" ] }, { "cell_type": "code", "execution_count": 21, "id": "969484c6-4035-4234-8ac7-209ab4a014bc", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T18:25:50.083080Z", "iopub.status.busy": "2023-07-10T18:25:50.082571Z", "iopub.status.idle": "2023-07-10T18:25:50.269795Z", "shell.execute_reply": "2023-07-10T18:25:50.269277Z", "shell.execute_reply.started": "2023-07-10T18:25:50.083058Z" } }, "outputs": [], "source": [ "from transformers import DataCollatorForLanguageModeling, Trainer\n", "\n", "# defaults to mlm=True, mlm_probability=0.15 (see the illustration cell below)\n", "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)\n", "\n", "trainer = Trainer(\n", "    model=model,\n", "    args=training_args,\n", "    train_dataset=lm_datasets[\"train\"],\n", "    eval_dataset=lm_datasets[\"valid\"],\n", "    data_collator=data_collator,\n", "    optimizers=(optimizer, scheduler),  # Trainer expects an (optimizer, lr_scheduler) tuple\n", ")" ] }, { "cell_type": "code", "execution_count": 14, "id": "4c2f4490-b3bc-4ec6-bef1-2bd71933369a", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T15:10:13.622770Z", "iopub.status.busy": "2023-07-10T15:10:13.622142Z", "iopub.status.idle": "2023-07-10T15:10:13.625595Z", "shell.execute_reply": "2023-07-10T15:10:13.625073Z", "shell.execute_reply.started": "2023-07-10T15:10:13.622747Z" } }, "outputs": [], "source": [ "wandb.finish()\n", "# wandb.init()" ] }, { "cell_type": "code", "execution_count": 15, "id": "17979cc2-2e66-4055-aabb-29d9ee90112d", "metadata": { "execution": { "iopub.execute_input": "2023-07-08T07:31:19.523715Z", "iopub.status.busy": "2023-07-08T07:31:19.523529Z", "iopub.status.idle": "2023-07-08T07:31:20.383711Z", "shell.execute_reply": "2023-07-08T07:31:20.382696Z", "shell.execute_reply.started": "2023-07-08T07:31:19.523696Z" } }, "outputs": [], "source": [ "# !rm -rf /notebooks/9wimu9/sinhala-bert-1" ] },
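{ "cell_type": "code", "execution_count": null, "id": "f1e2d3c4-0001-4a0b-9c0d-000000000003", "metadata": {}, "outputs": [], "source": [ "# A hedged illustration, not part of the original run: what the default MLM collator\n", "# produces. With mlm=True and mlm_probability=0.15, about 15% of tokens are selected\n", "# for masking, and labels are -100 everywhere except at the selected positions.\n", "sample = tokenizer(\"සිංහල වචන\", truncation=True)  # any short text works here\n", "batch = data_collator([sample])\n", "print(batch[\"input_ids\"][0])  # some ids become tokenizer.mask_token_id\n", "print(batch[\"labels\"][0])     # -100 except at the masked positions" ] }, { "cell_type": "code", "execution_count": null, "id": "b8bd0ab4-6412-4c0c-a215-a0c5cd5d4626", "metadata": { "execution": {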
"iopub.execute_input": "2023-07-10T15:10:17.837648Z", "iopub.status.busy": "2023-07-10T15:10:17.837138Z" } }, "outputs": [ { "data": { "text/html": [ "Changes to your `wandb` environment variables will be ignored because your `wandb` session has already started. For more information on how to modify your settings with `wandb.init()` arguments, please refer to the W&B docs." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33m9wimu9\u001b[0m (\u001b[33msinquad\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" ] }, { "data": { "text/html": [ "Tracking run with wandb version 0.15.5" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in /notebooks/wandb/run-20230710_151033-wsjuqghz" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run classic-eon-6 to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View project at https://wandb.ai/sinquad/sinhala_bert_v1.2" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run at https://wandb.ai/sinquad/sinhala_bert_v1.2/runs/wsjuqghz" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [1638/1827 1:55:38 < 3:16:54, 0.02 it/s, Epoch 0.90/1]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation Loss

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "trainer.train(resume_from_checkpoint = True)\n", "wandb.finish()" ] }, { "cell_type": "code", "execution_count": 22, "id": "bbf22bea-7026-42c9-a643-ba65ab8cdbff", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T18:26:14.038132Z", "iopub.status.busy": "2023-07-10T18:26:14.037456Z", "iopub.status.idle": "2023-07-10T18:57:49.712287Z", "shell.execute_reply": "2023-07-10T18:57:49.711640Z", "shell.execute_reply.started": "2023-07-10T18:26:14.038103Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
[50731/50731 31:35]
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Perplexity: 78.33\n" ] } ], "source": [ "eval_results = trainer.evaluate()\n", "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "f04eaadd-a13d-4651-ad14-91bcc01f92e1", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T18:58:07.077477Z", "iopub.status.busy": "2023-07-10T18:58:07.076979Z", "iopub.status.idle": "2023-07-10T18:58:07.082203Z", "shell.execute_reply": "2023-07-10T18:58:07.081426Z", "shell.execute_reply.started": "2023-07-10T18:58:07.077454Z" } }, "outputs": [ { "data": { "text/plain": [ "{'eval_loss': 4.360935211181641,\n", " 'eval_runtime': 1895.6573,\n", " 'eval_samples_per_second': 214.09,\n", " 'eval_steps_per_second': 26.762}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eval_results" ] }, { "cell_type": "code", "execution_count": 25, "id": "d3417a50-f0a7-4cd7-bc3b-14106660be58", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T18:58:52.507374Z", "iopub.status.busy": "2023-07-10T18:58:52.506748Z", "iopub.status.idle": "2023-07-10T18:58:53.770508Z", "shell.execute_reply": "2023-07-10T18:58:53.769992Z", "shell.execute_reply.started": "2023-07-10T18:58:52.507341Z" } }, "outputs": [], "source": [ "trainer.save_model(\"path_to_save\")" ] }, { "cell_type": "code", "execution_count": 26, "id": "6a3b42de-552c-41fc-a454-afe8a0bf567d", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T18:59:46.871782Z", "iopub.status.busy": "2023-07-10T18:59:46.871272Z", "iopub.status.idle": "2023-07-10T18:59:49.794057Z", "shell.execute_reply": "2023-07-10T18:59:49.793583Z", "shell.execute_reply.started": "2023-07-10T18:59:46.871761Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at /notebooks/path_to_save were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']\n", "- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "Some weights of RobertaModel were not initialized from the model checkpoint at /notebooks/path_to_save and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "from transformers import AutoModel \n", "model = AutoModel.from_pretrained('/notebooks/path_to_save',local_files_only=True)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b6f2c49a-9a09-4949-b67f-29df6d0aa895", "metadata": { "execution": { "iopub.execute_input": "2023-07-10T19:01:49.192299Z", "iopub.status.busy": "2023-07-10T19:01:49.191549Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "77a76086b50b43a2a0bb1cc869ef8e26", "version_major": 2, "version_minor": 0 }, "text/plain": [ "pytorch_model.bin: 0%| | 0.00/1.27G [00:00