{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "585f9800-984f-40fe-9b06-35cd40229d90",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "503f9c3c-e348-478f-b743-cff3ce5f4465",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['sinhala_0.txt',\n",
       " 'sinhala_1.txt',\n",
       " 'sinhala_10.txt',\n",
       " 'sinhala_11.txt',\n",
       " 'sinhala_12.txt']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every .txt file in the working directory is used as tokenizer training data.\n",
    "# Path.glob yields entries in arbitrary order, so sort for a reproducible file list.\n",
    "paths = sorted(str(x) for x in Path('./').glob('*.txt'))\n",
    "\n",
    "paths[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e9baa0f-6c33-45b8-8487-202627067436",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the running kernel's environment (!pip can target a different Python).\n",
    "# Pinned to the version this notebook was last run with.\n",
    "%pip install tokenizers==0.15.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "225c9c9a-0182-4bfe-92fc-2234e3515560",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import ByteLevelBPETokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "501098a0-5df1-448b-99e6-52143cb6751f",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = ByteLevelBPETokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "24ef02fe-5703-4b87-a92a-e6e936f7fd96",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The list order fixes the special-token ids: <s>=0, <pad>=1, </s>=2, <mask>=3.\n",
    "# mlm() further down depends on them: it never masks ids <= 2 and writes 3 as the mask id.\n",
    "# NOTE(review): the four token strings were blank in the saved notebook and are\n",
    "# reconstructed from the ids used elsewhere in this notebook - confirm before retraining.\n",
    "tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2,\n",
    "                special_tokens=['<s>', '<pad>', '</s>', '<mask>'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da5bfba4-6c97-4256-b669-f75026b93e09",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError\n",
    "# whenever the output directory was already present.\n",
    "os.makedirs('sinhalaMLM', exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e25beeaa-e969-4c33-98e2-65523d827d95",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save_model('sinhalaMLM')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "222a9edf-15ed-44a8-aaba-2afee76b3cbf",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "864266b5-77d5-451e-9c01-096588ff62e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install torch==2.1.1 torchvision==0.16.1 
torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "67d2fc2e-9cb0-4306-9769-0c34a2111c37",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import RobertaTokenizerFast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "474f1e8c-e499-4205-96cc-44f4a4c9e4e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = RobertaTokenizerFast.from_pretrained('sinhalaMLM')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "3ea07602-9cd6-4c12-a860-9302e4db7607",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [0, 4689, 267, 300, 275, 469, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer('ළමයා ගෙදර')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a9a1acf-fd7d-4d4b-9222-f2f25a39efff",
   "metadata": {},
   "source": [
    "**Training pairs**\n",
    "\n",
    "- `labels == input_ids` (the original, unmasked token ids)\n",
    "- `input_ids -> MLM` (the same ids after `mlm()` has masked some of them)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "90cec429-ee77-4c34-814b-3fec3f5b035e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "\n",
    "def mlm(tensor, mask_prob=0.15, mask_token_id=3, max_special_id=2):\n",
    "    \"\"\"Randomly overwrite token ids with the mask id for masked-language-model training.\n",
    "\n",
    "    The tensor is modified in place and the same object is returned.\n",
    "\n",
    "    Args:\n",
    "        tensor: integer tensor of token ids, of any shape.\n",
    "        mask_prob: chance that each eligible position is masked.\n",
    "        mask_token_id: id written into the chosen positions. The default 3 is the value\n",
    "            this notebook has always used; it must match tokenizer.mask_token_id.\n",
    "        max_special_id: ids less than or equal to this are never masked. The default 2\n",
    "            protects ids 0-2 (0 and 2 wrap the encoded sentence shown above).\n",
    "\n",
    "    Returns:\n",
    "        The input tensor, with the chosen positions set to mask_token_id.\n",
    "    \"\"\"\n",
    "    # Sample on the tensor's own device so CUDA inputs do not hit a device mismatch.\n",
    "    rand = torch.rand(tensor.shape, device=tensor.device)\n",
    "    mask_arr = (rand < mask_prob) & (tensor > max_special_id)\n",
    "    # Boolean-mask assignment handles every row at once and works for 1-D input too.\n",
    "    tensor[mask_arr] = mask_token_id\n",
    "    return tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "2c22dc9d-cf4f-4cf0-a7b4-364464c525d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['sinhala_0.txt',\n",
       " 'sinhala_1.txt',\n",
       " 'sinhala_10.txt',\n",
       " 'sinhala_11.txt',\n",
       " 'sinhala_12.txt']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "# Path.glob yields entries in arbitrary order; sort for a reproducible file list.\n",
    "paths = sorted(str(x) for x in Path('./').glob('*.txt'))\n",
    "paths[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id":
"d19907e4-e54d-4596-a8a8-099e6965bcef", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9bb6ce3b55a644b4805cca4d442ada93", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/13 [00:00