{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "585f9800-984f-40fe-9b06-35cd40229d90",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "503f9c3c-e348-478f-b743-cff3ce5f4465",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['sinhala_0.txt',\n",
" 'sinhala_1.txt',\n",
" 'sinhala_10.txt',\n",
" 'sinhala_11.txt',\n",
" 'sinhala_12.txt']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paths = [str(x) for x in Path('./').glob('*.txt')]\n",
"\n",
"paths[:5]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5e9baa0f-6c33-45b8-8487-202627067436",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tokenizers in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (0.15.1)\n",
"Requirement already satisfied: huggingface_hub<1.0,>=0.16.4 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tokenizers) (0.20.3)\n",
"Requirement already satisfied: requests in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2.31.0)\n",
"Requirement already satisfied: filelock in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
"Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.66.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.10.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
"Requirement already satisfied: packaging>=20.9 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (23.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from huggingface_hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
"Requirement already satisfied: colorama in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.42.1->huggingface_hub<1.0,>=0.16.4->tokenizers) (0.4.6)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.6)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (3.3.2)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dell\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from requests->huggingface_hub<1.0,>=0.16.4->tokenizers) (2.1.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"[notice] A new release of pip is available: 23.0.1 -> 23.3.2\n",
"[notice] To update, run: python.exe -m pip install --upgrade pip\n"
]
}
],
"source": [
"!pip install tokenizers"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "225c9c9a-0182-4bfe-92fc-2234e3515560",
"metadata": {},
"outputs": [],
"source": [
"from tokenizers import ByteLevelBPETokenizer"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "501098a0-5df1-448b-99e6-52143cb6751f",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = ByteLevelBPETokenizer()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "24ef02fe-5703-4b87-a92a-e6e936f7fd96",
"metadata": {},
"outputs": [],
"source": [
"tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,\n",
" special_tokens=['', '', '', ''\n",
" ])"
]
},
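  {
   "cell_type": "markdown",
   "id": "md-special-token-ids",
   "metadata": {},
   "source": [
    "A quick sanity check (a sketch, not part of the original run): with the special tokens registered in the order above, `<mask>` should come out with id 3, which is the id the `mlm()` helper further below writes into masked positions.\n",
    "\n",
    "```python\n",
    "# sketch: confirm the ids the trained tokenizer assigned to the special tokens\n",
    "for tok in ['<s>', '<pad>', '</s>', '<mask>']:\n",
    "    print(tok, tokenizer.token_to_id(tok))  # expected: 0, 1, 2, 3\n",
    "```"
   ]
  },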
{
"cell_type": "code",
"execution_count": 8,
"id": "da5bfba4-6c97-4256-b669-f75026b93e09",
"metadata": {},
"outputs": [
{
"ename": "FileExistsError",
"evalue": "[WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileExistsError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[8], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmkdir\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msinhalaMLM\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"\u001b[1;31mFileExistsError\u001b[0m: [WinError 183] Cannot create a file when that file already exists: 'sinhalaMLM'"
]
}
],
"source": [
"import os\n",
"os.mkdir('sinhalaMLM')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e25beeaa-e969-4c33-98e2-65523d827d95",
"metadata": {},
"outputs": [],
"source": [
"tokenizer.save_model('sinhalaMLM')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "222a9edf-15ed-44a8-aaba-2afee76b3cbf",
"metadata": {},
"outputs": [],
"source": [
"!pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "864266b5-77d5-451e-9c01-096588ff62e4",
"metadata": {},
"outputs": [],
"source": [
"pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cpu"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "67d2fc2e-9cb0-4306-9769-0c34a2111c37",
"metadata": {},
"outputs": [],
"source": [
"from transformers import RobertaTokenizerFast"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "474f1e8c-e499-4205-96cc-44f4a4c9e4e3",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = RobertaTokenizerFast.from_pretrained('sinhalaMLM')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3ea07602-9cd6-4c12-a860-9302e4db7607",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': [0, 4689, 267, 300, 275, 469, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer('ළමයා ගෙදර')"
]
},
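  {
   "cell_type": "markdown",
   "id": "md-encoding-check",
   "metadata": {},
   "source": [
    "The encoding wraps the sentence with the start token (id 0) and end token (id 2); the ids in between are the byte-level BPE pieces of the Sinhala text. A small check, sketched with the same tokenizer:\n",
    "\n",
    "```python\n",
    "enc = tokenizer('ළමයා ගෙදර')\n",
    "print(tokenizer.convert_ids_to_tokens(enc.input_ids))  # first token '<s>', last token '</s>'\n",
    "```"
   ]
  },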
{
"cell_type": "code",
"execution_count": null,
"id": "0a9a1acf-fd7d-4d4b-9222-f2f25a39efff",
"metadata": {},
"outputs": [],
"source": [
"lables == input_ids\n",
"\n",
"input_ids -> MLM"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "90cec429-ee77-4c34-814b-3fec3f5b035e",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"def mlm(tensor):\n",
" rand = torch.rand(tensor.shape) \n",
" mask_arr = (rand < 0.15) * (tensor > 2)\n",
" for i in range( tensor.shape[0]):\n",
" selection = torch.flatten(mask_arr[i].nonzero()).tolist() #[[2,5,8]]\n",
" tensor[i, selection] = 3\n",
" return tensor"
]
},
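  {
   "cell_type": "markdown",
   "id": "md-mlm-example",
   "metadata": {},
   "source": [
    "As a minimal illustration (hypothetical ids, not taken from the corpus), `mlm()` leaves the special tokens 0, 1 and 2 untouched and overwrites roughly 15% of the remaining positions with the mask id 3; which positions are picked varies from run to run:\n",
    "\n",
    "```python\n",
    "example = torch.tensor([[0, 412, 913, 264, 771, 2, 1, 1]])  # <s> ... </s> <pad> <pad>\n",
    "print(mlm(example.clone()))\n",
    "# e.g. tensor([[  0, 412,   3, 264, 771,   2,   1,   1]])\n",
    "```"
   ]
  },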
{
"cell_type": "code",
"execution_count": 19,
"id": "2c22dc9d-cf4f-4cf0-a7b4-364464c525d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['sinhala_0.txt',\n",
" 'sinhala_1.txt',\n",
" 'sinhala_10.txt',\n",
" 'sinhala_11.txt',\n",
" 'sinhala_12.txt']"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pathlib import Path\n",
"\n",
"paths = [str(x) for x in Path('./').glob('*.txt')]\n",
"paths[:5]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "d19907e4-e54d-4596-a8a8-099e6965bcef",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9bb6ce3b55a644b4805cca4d442ada93",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/13 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from tqdm.auto import tqdm\n",
"\n",
"input_ids = []\n",
"mask = []\n",
"labels = []\n",
"\n",
"for path in tqdm(paths[:50]):\n",
" with open(path, 'r', encoding='utf-8') as f:\n",
" lines = f.read().split('\\n')\n",
" sample = tokenizer(lines, max_length=512, padding='max_length',\n",
"truncation=True, return_tensors='pt')\n",
" labels.append(sample.input_ids)\n",
" mask.append(sample.attention_mask)\n",
" input_ids.append(mlm(sample.input_ids.detach().clone()))"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "ef40f10e-ca0b-416a-8481-6ff6e343372e",
"metadata": {},
"outputs": [],
"source": [
"#input_ids = torch.cat(input_ids)\n",
"#mask = torch.cat(mask)\n",
"#labels = torch.cat(labels)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "3b173e1a-96d0-40e6-9883-4f91772c47a1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293,\n",
" 271, 1920, 20312, 263, 281, 264, 269, 271, 282, 263])"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_ids[0][:20]"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "3915fa7d-ae56-4d86-b10d-4431a4739c0e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([ 0, 528, 267, 312, 264, 353, 264, 470, 267, 293])"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"labels[0][:10]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "9202a316-001b-4602-9b8f-9ea93aa5083d",
"metadata": {},
"outputs": [],
"source": [
"encodings = {\n",
" 'input_ids': input_ids,\n",
" 'attention_mask': mask,\n",
" 'labels': labels\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "1c5a01c5-14e2-4bdb-8bfa-12becbe02de6",
"metadata": {},
"outputs": [],
"source": [
"#from torch.utils.data import Dataset\n",
"\n",
"class Dataset(torch.utils.data.Dataset):\n",
" def __init__(self, encodings):\n",
" self.encodings = encodings\n",
" def __len__(self):\n",
" return self.encodings['input_ids'].shape[0]\n",
" def __getitem__(self, i):\n",
" return {key: tensor[i] for key, tensor in self.encodings.items()}"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "e113560d-b382-49d1-afc0-9ad03da9b212",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"120684"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"input_ids.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "ebc01aa1-6a6e-4e20-b88a-9d91deca0c0a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_items([('input_ids', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
" [ 0, 277, 560, ..., 1, 1, 1],\n",
" [ 0, 278, 264, ..., 296, 269, 2],\n",
" ...,\n",
" [ 0, 272, 276, ..., 1, 1, 1],\n",
" [ 0, 292, 296, ..., 620, 271, 2],\n",
" [ 0, 307, 295, ..., 269, 281, 2]])), ('attention_mask', tensor([[1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 0, 0, 0],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" ...,\n",
" [1, 1, 1, ..., 0, 0, 0],\n",
" [1, 1, 1, ..., 1, 1, 1],\n",
" [1, 1, 1, ..., 1, 1, 1]])), ('labels', tensor([[ 0, 528, 267, ..., 283, 302, 2],\n",
" [ 0, 277, 560, ..., 1, 1, 1],\n",
" [ 0, 278, 264, ..., 296, 269, 2],\n",
" ...,\n",
" [ 0, 272, 276, ..., 1, 1, 1],\n",
" [ 0, 292, 296, ..., 620, 271, 2],\n",
" [ 0, 307, 295, ..., 269, 281, 2]]))])"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encodings.items()"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "9a329bc6-2286-4d9a-89f4-041ba2f78951",
"metadata": {},
"outputs": [],
"source": [
"dataset = Dataset(encodings)"
]
},
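  {
   "cell_type": "markdown",
   "id": "md-dataset-check",
   "metadata": {},
   "source": [
    "A quick shape check (sketch): each item is a dict with one 512-token row per key, which is what the DataLoader below will batch.\n",
    "\n",
    "```python\n",
    "sample = dataset[0]\n",
    "print({k: v.shape for k, v in sample.items()})\n",
    "# {'input_ids': torch.Size([512]), 'attention_mask': torch.Size([512]), 'labels': torch.Size([512])}\n",
    "```"
   ]
  },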
{
"cell_type": "code",
"execution_count": 118,
"id": "1ad3e3f5-bb45-4d51-8ccb-62ab73a1788f",
"metadata": {},
"outputs": [],
"source": [
"dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"id": "900571cd-5061-41f8-b361-e7889ba534cf",
"metadata": {},
"outputs": [],
"source": [
"from transformers import RobertaConfig"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "d35e11be-3ee6-4e7c-b49c-6900434553cf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30522"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.vocab_size\n"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "69d081bc-a4a5-4970-a361-05b426d5afa2",
"metadata": {},
"outputs": [],
"source": [
"config = RobertaConfig(\n",
" vocab_size = tokenizer.vocab_size,\n",
" max_position_embeddings=514,\n",
" hidden_size=768,\n",
" num_attention_heads=12,\n",
" num_hidden_layers=6,\n",
" type_vocab_size=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 138,
"id": "4386905e-2647-4082-aa50-e884cded82b4",
"metadata": {},
"outputs": [],
"source": [
"from transformers import RobertaForMaskedLM"
]
},
{
"cell_type": "code",
"execution_count": 139,
"id": "8a6fec9d-31da-4657-906d-a5b356a32328",
"metadata": {},
"outputs": [],
"source": [
"model = RobertaForMaskedLM(config)"
]
},
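  {
   "cell_type": "markdown",
   "id": "md-config-note",
   "metadata": {},
   "source": [
    "RoBERTa offsets position ids past the padding id, so `max_position_embeddings` has to be the 512-token `max_length` plus 2, i.e. 514. With only 6 hidden layers the model is considerably smaller than `roberta-base`; a quick size check (sketch):\n",
    "\n",
    "```python\n",
    "print(f'{model.num_parameters():,} parameters')\n",
    "```"
   ]
  },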
{
"cell_type": "code",
"execution_count": 140,
"id": "3ae60e59-6351-4cee-a780-8d753ba55c0a",
"metadata": {},
"outputs": [],
"source": [
"device = torch.device('cuda') if torch.cuda.is_available() else torch.device ('cpu')"
]
},
{
"cell_type": "code",
"execution_count": 141,
"id": "3558fcc7-64c5-46f3-be9a-3199aae9f59b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RobertaForMaskedLM(\n",
" (roberta): RobertaModel(\n",
" (embeddings): RobertaEmbeddings(\n",
" (word_embeddings): Embedding(30522, 768, padding_idx=1)\n",
" (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
" (token_type_embeddings): Embedding(1, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (encoder): RobertaEncoder(\n",
" (layer): ModuleList(\n",
" (0-5): 6 x RobertaLayer(\n",
" (attention): RobertaAttention(\n",
" (self): RobertaSelfAttention(\n",
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (output): RobertaSelfOutput(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" (intermediate): RobertaIntermediate(\n",
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
" (intermediate_act_fn): GELUActivation()\n",
" )\n",
" (output): RobertaOutput(\n",
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (lm_head): RobertaLMHead(\n",
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
" (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (decoder): Linear(in_features=768, out_features=30522, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "c9bfc830-d445-4203-97b9-a67f23f4b7f1",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AdamW"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "651b53ff-dbdb-4665-9ee8-4c15e96e4ba9",
"metadata": {},
"outputs": [],
"source": [
"optim = AdamW(model.parameters(), lr=1e-4)"
]
},
{
"cell_type": "code",
"execution_count": 151,
"id": "81d298ed-4bff-443b-97b7-8dd7cff5f2aa",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 152,
"id": "7ec88289-b38d-4317-8797-b2b2192ff407",
"metadata": {},
"outputs": [],
"source": [
"epochs = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fe6ccda-fc31-4785-b9f0-ec70e8bfb09a",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "42dd8d1f843544e28d56e5d8b574c0a6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/120684 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"loop = tqdm(dataloader, leave=True)\n",
"for batch in loop:\n",
" optim.zero_grad()\n",
" input_ids = batch['input_ids'].to(device)\n",
" mask = batch['attention_mask'].to(device)\n",
" labels = batch['labels'].to(device)\n",
" outputs = model(input_ids, attention_mask=mask, \n",
" labels=labels)\n",
" loss = outputs.loss\n",
" loss.backward()\n",
" optim.step()\n",
"\n",
" loop.set_description(f'Epoch: {epochs}')\n",
" loop.set_postfix(loss=loss.item())"
]
},
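  {
   "cell_type": "markdown",
   "id": "md-fill-mask-check",
   "metadata": {},
   "source": [
    "Once the weights are written next to the tokenizer files by `model.save_pretrained('./sinhalaMLM')` in the next cell, a `fill-mask` pipeline gives a quick spot check of what the MLM has learned. This is a sketch of a typical follow-up rather than part of the original run, and the example sentence is illustrative only:\n",
    "\n",
    "```python\n",
    "from transformers import pipeline\n",
    "\n",
    "fill = pipeline('fill-mask', model='./sinhalaMLM', tokenizer='./sinhalaMLM')\n",
    "fill('ළමයා <mask>')  # top vocabulary candidates for the masked position\n",
    "```"
   ]
  },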
{
"cell_type": "code",
"execution_count": null,
"id": "e9c49ba7-a98d-47d3-8c7f-8807c591ca03",
"metadata": {},
"outputs": [],
"source": [
"model.save_pretrained('./sinhalaMLM') # and don't forget to save sinhalaMLM"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}