{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "585f9800-984f-40fe-9b06-35cd40229d90",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "503f9c3c-e348-478f-b743-cff3ce5f4465",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['sinhala_0.txt',\n",
" 'sinhala_1.txt',\n",
" 'sinhala_10.txt',\n",
" 'sinhala_11.txt',\n",
" 'sinhala_12.txt']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paths = [str(x) for x in Path('./').glob('*.txt')]\n",
"\n",
"paths[:5]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5e9baa0f-6c33-45b8-8487-202627067436",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tokenizers in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (0.15.1)\n",
"Requirement already satisfied: huggingface_hub<1.0,>=0.16.4 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tokenizers) (0.20.3)\n",
"Requirement already satisfied: requests in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2.31.0)\n",
"Requirement already satisfied: filelock in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
"Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.66.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.10.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
"Requirement already satisfied: packaging>=20.9 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (23.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
"Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.42.1->huggingface_hub<1.0,>=0.16.4->tokenizers) (0.4.6)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.6)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.3.2)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2.1.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [
"!pip install tokenizers"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "225c9c9a-0182-4bfe-92fc-2234e3515560",
"metadata": {},
"outputs": [],
"source": [
"from tokenizers import ByteLevelBPETokenizer"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "501098a0-5df1-448b-99e6-52143cb6751f",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = ByteLevelBPETokenizer()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "24ef02fe-5703-4b87-a92a-e6e936f7fd96",
"metadata": {},
"outputs": [],
"source": [
"tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,\n",
" special_tokens=['', '', '', ''\n",
" ])"
]
},
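  {
   "cell_type": "markdown",
   "id": "md-special-token-ids",
   "metadata": {},
   "source": [
    "A quick sanity check (a sketch, not part of the original run): with the special tokens registered in the order above, `<mask>` should come out with id 3, which is the id the `mlm()` helper further below writes into masked positions.\n",
    "\n",
    "```python\n",
    "# sketch: confirm the ids the trained tokenizer assigned to the special tokens\n",
    "for tok in ['<s>', '<pad>', '</s>', '<mask>']:\n",
    "    print(tok, tokenizer.token_to_id(tok))  # expected: 0, 1, 2, 3\n",
    "```"
   ]
  },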
{
"cell_type": "code",
"execution_count": 8,
"id": "da5bfba4-6c97-4256-b669-f75026b93e09",
"metadata": {},
"outputs": [
{
"ename": "FileExistsError",
"evalue": "[WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileExistsError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[8], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msinhalaMLM\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[1;31mFileExistsError\u001b[0m: [WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'"
]
}
],
"source": [
"import os\n",
"os.mkdir('sinhalaMLM')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e25beeaa-e969-4c33-98e2-65523d827d95",
"metadata": {},
"outputs": [],
"source": [
"tokenizer.save_model('sinhalaMLM')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "222a9edf-15ed-44a8-aaba-2afee76b3cbf",
"metadata": {},
"outputs": [],
"source": [
"!pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "864266b5-77d5-451e-9c01-096588ff62e4",
"metadata": {},
"outputs": [],
"source": [
"pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cpu"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "67d2fc2e-9cb0-4306-9769-0c34a2111c37",
"metadata": {},
"outputs": [],
"source": [
"from transformers import RobertaTokenizerFast"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "474f1e8c-e499-4205-96cc-44f4a4c9e4e3",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = RobertaTokenizerFast.from_pretrained('sinhalaMLM')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3ea07602-9cd6-4c12-a860-9302e4db7607",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': [0, 4689, 267, 300, 275, 469, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer('ළමයා ගෙදර')"
]
},
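  {
   "cell_type": "markdown",
   "id": "md-encoding-check",
   "metadata": {},
   "source": [
    "The encoding wraps the sentence with the start token (id 0) and end token (id 2); the ids in between are the byte-level BPE pieces of the Sinhala text. A small check, sketched with the same tokenizer:\n",
    "\n",
    "```python\n",
    "enc = tokenizer('ළමයා ගෙදර')\n",
    "print(tokenizer.convert_ids_to_tokens(enc.input_ids))  # first token '<s>', last token '</s>'\n",
    "```"
   ]
  },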
{
"cell_type": "code",
"execution_count": null,
"id": "0a9a1acf-fd7d-4d4b-9222-f2f25a39efff",
"metadata": {},
"outputs": [],
"source": [
"lables == input_ids\n",
"\n",
"input_ids -> MLM"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "90cec429-ee77-4c34-814b-3fec3f5b035e",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"def mlm(tensor):\n",
" rand = torch.rand(tensor.shape) \n",
" mask_arr = (rand < 0.15) * (tensor > 2)\n",
" for i in range( tensor.shape[0]):\n",
" selection = torch.flatten(mask_arr[i].nonzero()).tolist() #[[2,5,8]]\n",
" tensor[i, selection] = 3\n",
" return tensor"
]
},
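  {
   "cell_type": "markdown",
   "id": "md-mlm-example",
   "metadata": {},
   "source": [
    "As a minimal illustration (hypothetical ids, not taken from the corpus), `mlm()` leaves the special tokens 0, 1 and 2 untouched and overwrites roughly 15% of the remaining positions with the mask id 3; which positions are picked varies from run to run:\n",
    "\n",
    "```python\n",
    "example = torch.tensor([[0, 412, 913, 264, 771, 2, 1, 1]])  # <s> ... </s> <pad> <pad>\n",
    "print(mlm(example.clone()))\n",
    "# e.g. tensor([[  0, 412,   3, 264, 771,   2,   1,   1]])\n",
    "```"
   ]
  },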
{
"cell_type": "code",
"execution_count": 19,
"id": "2c22dc9d-cf4f-4cf0-a7b4-364464c525d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['sinhala_0.txt',\n",
" 'sinhala_1.txt',\n",
" 'sinhala_10.txt',\n",
" 'sinhala_11.txt',\n",
" 'sinhala_12.txt']"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pathlib import Path\n",
"\n",
"paths = [str(x) for x in Path('./').glob('*.txt')]\n",
"paths[:5]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "d19907e4-e54d-4596-a8a8-099e6965bcef",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9bb6ce3b55a644b4805cca4d442ada93",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/13 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from tqdm.auto import tqdm\n",
"\n",
"input_ids = []\n",
"mask = []\n",
"labels = []\n",
"\n",
"for path in tqdm(paths[:50]):\n",
" with open(path, 'r', encoding='utf-8') as f:\n",
" lines = f.read().split('\\n')\n",
" sample = tokenizer(lines, max_length=512, padding='max_length',\n",
"truncation=True, return_tensors='pt')\n",
" labels.append(sample.input_ids)\n",
" mask.append(sample.attention_mask)\n",
" input_ids.append(mlm(sample.input_ids.detach().clone()))"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "ef40f10e-ca0b-416a-8481-6ff6e343372e",
"metadata": {},
"outputs": [],
"source": [
"#input_ids = torch.cat(input_ids)\n",
"#mask = torch.cat(mask)\n",
"#labels = torch.cat(labels)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "3b173e1a-96d0-40e6-9883-4f91772c47a1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293,\n",
" 271, 1920, 20312, 263, 281, 264, 269, 271, 282, 263])"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_ids[0][:20]"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "3915fa7d-ae56-4d86-b10d-4431a4739c0e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293])"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels[0][:10]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "9202a316-001b-4602-9b8f-9ea93aa5083d",
"metadata": {},
"outputs": [],
"source": [
"encodings = {\n",
" 'input_ids': input_ids,\n",
" 'attention_mask': mask,\n",
" 'labels': labels\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "1c5a01c5-14e2-4bdb-8bfa-12becbe02de6",
"metadata": {},
"outputs": [],
"source": [
"#from torch.utils.data import Dataset\n",
"\n",
"class Dataset(torch.utils.data.Dataset):\n",
" def __init__(self, encodings):\n",
" self.encodings = encodings\n",
" def __len__(self):\n",
" return self.encodings['input_ids'].shape[0]\n",
" def __getitem__(self, i):\n",
" return {key: tensor[i] for key, tensor in self.encodings.items()}"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "e113560d-b382-49d1-afc0-9ad03da9b212",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"120684"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_ids.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "ebc01aa1-6a6e-4e20-b88a-9d91deca0c0a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_items([('input_ids', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
" [ 0, 277, 560, ..., 1, 1, 1],\n",
" [ 0, 278, 264, ..., 296, 269, 2],\n",
" ...,\n",
" [ 0, 272, 276, ..., 1, 1, 1],\n",
" [ 0, 292, 296, ..., 620, 271, 2],\n",
" [ 0, 307, 295, ..., 269, 281, 2]])), ('attention_mask', tensor([[1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 0, 0, 0],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" ...,\n",
" [1, 1, 1, ..., 0, 0, 0],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1]])), ('labels', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
" [ 0, 277, 560, ..., 1, 1, 1],\n",
" [ 0, 278, 264, ..., 296, 269, 2],\n",
" ...,\n",
" [ 0, 272, 276, ..., 1, 1, 1],\n",
" [ 0, 292, 296, ..., 620, 271, 2],\n",
" [ 0, 307, 295, ..., 269, 281, 2]]))])"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encodings.items()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "9a329bc6-2286-4d9a-89f4-041ba2f78951",
"metadata": {},
"outputs": [],
"source": [
"dataset = Dataset(encodings)"
]
},
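  {
   "cell_type": "markdown",
   "id": "md-dataset-check",
   "metadata": {},
   "source": [
    "A quick shape check (sketch): each item is a dict with one 512-token row per key, which is what the DataLoader below will batch.\n",
    "\n",
    "```python\n",
    "sample = dataset[0]\n",
    "print({k: v.shape for k, v in sample.items()})\n",
    "# {'input_ids': torch.Size([512]), 'attention_mask': torch.Size([512]), 'labels': torch.Size([512])}\n",
    "```"
   ]
  },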
{
"cell_type": "code",
"execution_count": 118,
"id": "1ad3e3f5-bb45-4d51-8ccb-62ab73a1788f",
"metadata": {},
"outputs": [],
"source": [
"dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"id": "900571cd-5061-41f8-b361-e7889ba534cf",
"metadata": {},
"outputs": [],
"source": [
"from transformers import RobertaConfig"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "d35e11be-3ee6-4e7c-b49c-6900434553cf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30522"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.vocab_size\n"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "69d081bc-a4a5-4970-a361-05b426d5afa2",
"metadata": {},
"outputs": [],
"source": [
"config = RobertaConfig(\n",
" vocab_size = tokenizer.vocab_size,\n",
" max_position_embeddings=514,\n",
" hidden_size=768,\n",
" num_attention_heads=12,\n",
" num_hidden_layers=6,\n",
" type_vocab_size=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 138,
"id": "4386905e-2647-4082-aa50-e884cded82b4",
"metadata": {},
"outputs": [],
"source": [
"from transformers import RobertaForMaskedLM"
]
},
{
"cell_type": "code",
"execution_count": 139,
"id": "8a6fec9d-31da-4657-906d-a5b356a32328",
"metadata": {},
"outputs": [],
"source": [
"model = RobertaForMaskedLM(config)"
]
},
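  {
   "cell_type": "markdown",
   "id": "md-config-note",
   "metadata": {},
   "source": [
    "RoBERTa offsets position ids past the padding id, so `max_position_embeddings` has to be the 512-token `max_length` plus 2, i.e. 514. With only 6 hidden layers the model is considerably smaller than `roberta-base`; a quick size check (sketch):\n",
    "\n",
    "```python\n",
    "print(f'{model.num_parameters():,} parameters')\n",
    "```"
   ]
  },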
{
"cell_type": "code",
"execution_count": 140,
"id": "3ae60e59-6351-4cee-a780-8d753ba55c0a",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device('cuda') if torch.cuda.is_available() else torch.device ('cpu')"
]
},
{
"cell_type": "code",
"execution_count": 141,
"id": "3558fcc7-64c5-46f3-be9a-3199aae9f59b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RobertaForMaskedLM(\n",
" (roberta): RobertaModel(\n",
" (embeddings): RobertaEmbeddings(\n",
" (word_embeddings): Embedding(30522, 768, padding_idx=1)\n",
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
" (token_type_embeddings): Embedding(1, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): RobertaEncoder(\n",
" (layer): ModuleList(\n",
" (0-5): 6 x RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (lm_head): RobertaLMHead(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (decoder): Linear(in_features=768, out_features=30522, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "c9bfc830-d445-4203-97b9-a67f23f4b7f1",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AdamW"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "651b53ff-dbdb-4665-9ee8-4c15e96e4ba9",
"metadata": {},
"outputs": [],
"source": [
"optim = AdamW(model.parameters(), lr=1e-4)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"id": "81d298ed-4bff-443b-97b7-8dd7cff5f2aa",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 152,
"id": "7ec88289-b38d-4317-8797-b2b2192ff407",
"metadata": {},
"outputs": [],
"source": [
"epochs = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fe6ccda-fc31-4785-b9f0-ec70e8bfb09a",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "42dd8d1f843544e28d56e5d8b574c0a6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/120684 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"loop = tqdm(dataloader, leave=True)\n",
"for batch in loop:\n",
" optim.zero_grad()\n",
" input_ids = batch['input_ids'].to(device)\n",
" mask = batch['attention_mask'].to(device)\n",
" labels = batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=mask, \n",
" labels=labels)\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optim.step()\n",
"\n",
" loop.set_description(f'Epoch: {epochs}')\n",
" loop.set_postfix(loss=loss.item())"
]
},
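  {
   "cell_type": "markdown",
   "id": "md-fill-mask-check",
   "metadata": {},
   "source": [
    "Once the weights are written next to the tokenizer files by `model.save_pretrained('./sinhalaMLM')` in the next cell, a `fill-mask` pipeline gives a quick spot check of what the MLM has learned. This is a sketch of a typical follow-up rather than part of the original run, and the example sentence is illustrative only:\n",
    "\n",
    "```python\n",
    "from transformers import pipeline\n",
    "\n",
    "fill = pipeline('fill-mask', model='./sinhalaMLM', tokenizer='./sinhalaMLM')\n",
    "fill('ළමයා <mask>')  # top vocabulary candidates for the masked position\n",
    "```"
   ]
  },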
{
"cell_type": "code",
"execution_count": null,
"id": "e9c49ba7-a98d-47d3-8c7f-8807c591ca03",
"metadata": {},
"outputs": [],
"source": [
"model.save_pretrained('./sinhalaMLM') # and don't forget to save sinhalaMLM"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}