{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "90ff6fd7-900e-4620-b369-9deb69d97f69", "metadata": {}, "outputs": [], "source": [ "import torch\n", "from torch import Tensor, nn\n", "from torch.nn import TransformerEncoder, TransformerEncoderLayer\n", "from torchtext.vocab import build_vocab_from_iterator\n", "from torch.utils.data import dataset\n", "from torch.utils.tensorboard import SummaryWriter\n", "\n", "import regex as re\n", "import os\n", "import time\n", "from tqdm import tqdm\n", "import copy\n", "import math\n", "\n", "from model import TransformerModel\n", "from utils import preProcessText, getTokenizer\n", "from config import getConfig" ] }, { "cell_type": "code", "execution_count": 2, "id": "e80ad5ff-c38b-4c9c-8326-58593e0dbce8", "metadata": {}, "outputs": [], "source": [ "def get_model(model_config, ntokens):\n", " emsize = model_config[\"emsize\"]\n", " d_hid = model_config[\"d_hid\"]\n", " nlayers = model_config[\"nlayers\"]\n", " nhead = model_config[\"nhead\"]\n", " dropout = model_config[\"dropout\"]\n", " model = TransformerModel(ntokens, emsize,nhead, d_hid, nlayers, dropout)\n", " return model\n", "\n", "def loadModel(best_model_path):\n", " if os.path.exists(best_model_path):\n", " print(f\"Preloading model {best_model_path}\")\n", " if torch.cuda.is_available():\n", " state = torch.load(best_model_path)\n", " else:\n", " state = torch.load(best_model_path, map_location=torch.device('cpu'))\n", " model.load_state_dict(state['model_state_dict'])\n", " return model\n", " else:\n", " raise Exception(\"Model Not Found\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "15c00f79-716d-41fc-b027-74418c542ae6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'emsize': 300, 'd_hid': 1024, 'nlayers': 6, 'nhead': 6, 'dropout': 0.2, 'bptt': 64}\n", "{'logs': 'tensorboard_logs', 'epochs': 25}\n", "cpu\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/nirajan/miniconda3/envs/cslr/lib/python3.8/site-packages/torch/nn/modules/transformer.py:282: UserWarning: enable_nested_tensor is True, but self.use_nested_tensor is False because encoder_layer.self_attn.batch_first was not True(use batch_first for better inference performance)\n", " warnings.warn(f\"enable_nested_tensor is True, but self.use_nested_tensor is False because {why_not_sparsity_fast_path}\")\n" ] } ], "source": [ "model_config, app_config = getConfig()\n", "print(model_config)\n", "print(app_config)\n", "\n", "bptt=model_config[\"bptt\"]\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "print(device)\n", "\n", "softmax = nn.Softmax(dim=2)\n", "\n", "tokenizer, vocab = getTokenizer()\n", "ntokens = len(vocab)\n", "model = get_model(model_config, ntokens).to(device)" ] }, { "cell_type": "code", "execution_count": 4, "id": "bb36cce0-0ff7-4b8b-a128-2fb116ee6d66", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Preloading model models/best_model.pt\n" ] } ], "source": [ "best_model_path = 'models/best_model.pt'\n", "loaded_model = loadModel(best_model_path)" ] }, { "cell_type": "code", "execution_count": 5, "id": "188ea08e-141e-4ec5-b04d-d1c790712f8c", "metadata": {}, "outputs": [], "source": [ "def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:\n", " \"\"\"Converts raw text into a flat Tensor.\"\"\"\n", " # obtain the data in tensor format for each line\n", " data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long)\n", " for item in raw_text_iter]\n", " # concatenate all the lines\n", " return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))\n", "\n", "def batchify(data: Tensor, batch_size: int) -> Tensor:\n", " \"\"\"Divides the data into batch_size separate sequences, removing extra elements\n", " that wouldn't cleanly fit.\n", " Args:\n", " data: Tensor, shape [N]\n", " batch_size: int, batch size\n", " Returns:\n", " Tensor of shape [N // bsz, bsz]\n", " \"\"\"\n", " seq_len = data.size(0) // batch_size\n", " data = data[:seq_len * batch_size]\n", " data = data.view(batch_size, seq_len).t().contiguous()\n", " return data.to(device)\n", "\n", "def generate_square_subsequent_mask(sz: int) -> Tensor:\n", " \"\"\"Generates an upper-triangular matrix of -inf, with zeros on diag.\"\"\"\n", " return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)" ] }, { "cell_type": "code", "execution_count": 6, "id": "8fe07873-a781-4f46-ba42-54725ff9681a", "metadata": {}, "outputs": [], "source": [ "def nonnaive_generator(model: nn.Module, gen_data: Tensor, no_words=5, k=50):\n", " model.eval()\n", " src_mask = generate_square_subsequent_mask(bptt).to(device)\n", " pred_text = []\n", " for i in range(no_words):\n", " batch_size = gen_data.size(0)\n", " if batch_size != bptt:\n", " src_mask_ = src_mask[:batch_size, :batch_size]\n", " \n", " # generate the probability of the next word\n", " output_softmax = model(gen_data, src_mask_)\n", " output_softmax_permuted = output_softmax.permute(1, 0, 2)\n", " \n", " # obtain the \"k\" top probable words index\n", " # both indices and values are of size (no. of words, k=50)\n", " indices = torch.topk(output_softmax_permuted, k, dim=2).indices.squeeze(0)\n", " # obtain the top \"k\" probability of the probable words\n", " values = torch.topk(softmax(output_softmax_permuted), k, dim=2).values\n", " values = values/torch.sum(values, dim=2, keepdims=True)\n", " values = values.squeeze(0) \n", " \n", " # create categorical distribution and take sample from values\n", " # categorical distribution take 1 sample from k=50 samples of each dimension\n", " for _ in range(10): \n", " ind_sampled = torch.distributions.Categorical(values).sample()\n", " next_index = indices[-1][ind_sampled[-1]]\n", " # if the obtained token is not , then no need to sample again\n", " if vocab.lookup_token(next_index) != '':\n", " break\n", " \n", " pred_text.append([vocab.lookup_token(next_index)][0])\n", " if(batch_size < 15):\n", " gen_data = torch.cat((gen_data[:, :], next_index.unsqueeze(0).unsqueeze(0)), 0)\n", " batch_size = gen_data.size(0)\n", " else:\n", " gen_data = torch.cat((gen_data[1:, :], next_index.unsqueeze(0).unsqueeze(0)), 0)\n", " batch_size = gen_data.size(0)\n", "\n", " return pred_text" ] }, { "cell_type": "markdown", "id": "8da59be0", "metadata": {}, "source": [ "## Gradio application" ] }, { "cell_type": "code", "execution_count": 7, "id": "3d6ded4e", "metadata": {}, "outputs": [], "source": [ "def predText(text : str, num_words : int):\n", " text = [text]\n", " num_words = int(num_words)\n", " sample_data = data_process(text)\n", " sample_data = batchify(sample_data, 1)\n", " pred_text = nonnaive_generator(loaded_model, sample_data[:,-1].unsqueeze(1), no_words=num_words, k=50)\n", " whole_text = text[0] + ' ' + ' '.join(pred_text)\n", " return whole_text" ] }, { "cell_type": "code", "execution_count": 8, "id": "d04f6707", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'म घर नजिकैको घरमा एक्लै छु । घर नजिकै रहेको सानो जग्गामा भाडामा लिएर काम गरिरहेका स्थानीय चन्द्रबहादुर थापाले यो व्यवसाय शुरु गरेका हुन् । उनलाई नेपाल सरकारले हजार रुपैयाँ जरिवाना लगाएको'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = \"म घर नजिकैको\"\n", "num_words = 30\n", "predText(text, num_words)" ] }, { "cell_type": "code", "execution_count": 9, "id": "a5383b85", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/nirajan/miniconda3/envs/cslr/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import gradio as gr" ] }, { "cell_type": "code", "execution_count": 10, "id": "35f38739", "metadata": {}, "outputs": [], "source": [ "# Set up the Gradio Interface with custom HTML and JavaScript\n", "input_text_box = gr.Textbox(label=\"Text\", value=\"म घर\", lines=5)\n", "\n", "def predText(text : str, num_words : int):\n", " global input_text_box\n", " text = [text]\n", " num_words = int(num_words)\n", " sample_data = data_process(text)\n", " sample_data = batchify(sample_data, 1)\n", " pred_text = nonnaive_generator(loaded_model, sample_data[:,-1].unsqueeze(1), no_words=num_words, k=50)\n", " whole_text = text[0] + ' ' + ' '.join(pred_text)\n", " return whole_text" ] }, { "cell_type": "code", "execution_count": 11, "id": "0afd6784", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7860\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "examples = [[\"म घर\", 10], [\"मलाई\", 40], [\"आज\", 70]]\n", "with gr.Blocks() as interface:\n", " interface.title = \"Nepali Text Generation Model\"\n", " gr.Markdown(\"# Nepali Text Generation\")\n", " gr.Markdown(\"Start typing nepali below and then click **Submit** to generate text.\")\n", " gr.Markdown(\"You can select examples from the table below and then click **Submit** to generate text.\")\n", " \n", " input_text_box = gr.Textbox(label=\"Text\", value=\"म घर\", lines=5, placeholder=\"Enter Nepali Text\")\n", " input_num_words = gr.Number(label=\"Number of word to generate\", value=5)\n", "\n", " btn = gr.Button(value=\"Submit\")\n", " \n", " btn.click(predText, inputs=[input_text_box, input_num_words], outputs=[input_text_box])\n", " \n", " gr.Examples(examples=examples, inputs=[input_text_box, input_num_words], outputs=[input_text_box])\n", " gr.flagging = True\n", " \n", "interface.launch()" ] }, { "cell_type": "code", "execution_count": 24, "id": "7daf0c5b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7869\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "title = \"Nepali Language Model For Text Generation\"\n", "description = \"\"\"\n", "Click **Submit** to generate text \n", "\n", "**Flag** as **(Bad, Fine or Good)** as per the generated text\n", "\n", "You can also choose **examples** given below. Click them and then submit\n", "\"\"\"\n", "examples = [[\"म घर\", 10], [\"मलाई\", 40], [\"आज\", 70]]\n", "\n", "\n", "# inpt and output for the interface\n", "input_text_box = gr.Textbox(label=\"Input Text\", value=\"म घर\", lines=5, placeholder=\"Enter Nepali Text\")\n", "input_num_words = gr.Number(label=\"Number of word to generate\", value=5)\n", "output_text_box = gr.Textbox(label=\"Generated Text\", lines=5, placeholder=\"Generated Text Appears here\")\n", "\n", "flagging_options = [\"Bad\", \"Fine\", \"Good\"]\n", "\n", "\n", "interface = gr.Interface(fn=predText, inputs=[input_text_box, input_num_words], outputs=output_text_box, flagging_options=flagging_options, title=title, description=description, examples=examples)\n", "interface.launch()" ] }, { "cell_type": "code", "execution_count": 25, "id": "a18b3274", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TextNumber of word to generateText.1flagusernametimestamp
0म घर5म घर पुग्दा मलाई निकै गाह्रो पर्नेNaNNaN2023-11-27 17:00:15.272263
1म घर5म घर नै लिएर आएको हुँ ।BadNaN2023-11-27 17:06:19.426503
2म घर5म घर बसेको थिएँ । तर ,GoodNaN2023-11-27 17:06:27.778048
3म घर5म घर बनाउँछु र घरमा काम गर्छुGoodNaN2023-11-27 17:06:34.104749
4म घर5म घर गएँ । त्यस क्षेत्रका मान्छेहरुFineNaN2023-11-27 17:06:44.150964
5आज70म घर छोडेर गएका थिए । तरGoodNaN2023-11-27 17:13:07.793558
6आज70आज पनि यस विषयमा छलफल भएको हो । नेपाल कम्युनिष...FineNaN2023-11-27 17:13:32.043888
7आज70आज नै निर्वाचन आयोगमा उजुरी दिन गएका छन् । आयो...GoodNaN2023-11-27 17:13:53.533122
\n", "
" ], "text/plain": [ " Text Number of word to generate \\\n", "0 म घर 5 \n", "1 म घर 5 \n", "2 म घर 5 \n", "3 म घर 5 \n", "4 म घर 5 \n", "5 आज 70 \n", "6 आज 70 \n", "7 आज 70 \n", "\n", " Text.1 flag username \\\n", "0 म घर पुग्दा मलाई निकै गाह्रो पर्ने NaN NaN \n", "1 म घर नै लिएर आएको हुँ । Bad NaN \n", "2 म घर बसेको थिएँ । तर , Good NaN \n", "3 म घर बनाउँछु र घरमा काम गर्छु Good NaN \n", "4 म घर गएँ । त्यस क्षेत्रका मान्छेहरु Fine NaN \n", "5 म घर छोडेर गएका थिए । तर Good NaN \n", "6 आज पनि यस विषयमा छलफल भएको हो । नेपाल कम्युनिष... Fine NaN \n", "7 आज नै निर्वाचन आयोगमा उजुरी दिन गएका छन् । आयो... Good NaN \n", "\n", " timestamp \n", "0 2023-11-27 17:00:15.272263 \n", "1 2023-11-27 17:06:19.426503 \n", "2 2023-11-27 17:06:27.778048 \n", "3 2023-11-27 17:06:34.104749 \n", "4 2023-11-27 17:06:44.150964 \n", "5 2023-11-27 17:13:07.793558 \n", "6 2023-11-27 17:13:32.043888 \n", "7 2023-11-27 17:13:53.533122 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# read flagged log file\n", "import pandas as pd\n", "flagged_log = pd.read_csv(\"flagged/log.csv\")\n", "flagged_log" ] }, { "cell_type": "markdown", "id": "12cfb4ec", "metadata": {}, "source": [ "## No. of parameters" ] }, { "cell_type": "code", "execution_count": 12, "id": "392832a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "TransformerModel(\n", " (embedding): Embedding(60507, 300)\n", " (pos_encoder): PositionalEncoding(\n", " (dropout): Dropout(p=0.2, inplace=False)\n", " )\n", " (transformer_encoder): TransformerEncoder(\n", " (layers): ModuleList(\n", " (0-5): 6 x TransformerEncoderLayer(\n", " (self_attn): MultiheadAttention(\n", " (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)\n", " )\n", " (linear1): Linear(in_features=300, out_features=1024, bias=True)\n", " (dropout): Dropout(p=0.2, inplace=False)\n", " (linear2): Linear(in_features=1024, out_features=300, bias=True)\n", " (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)\n", " (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)\n", " (dropout1): Dropout(p=0.2, inplace=False)\n", " (dropout2): Dropout(p=0.2, inplace=False)\n", " )\n", " )\n", " )\n", " (decoder): Linear(in_features=300, out_features=60507, bias=True)\n", ")" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loaded_model" ] }, { "cell_type": "code", "execution_count": 13, "id": "6cd24c9f", "metadata": {}, "outputs": [], "source": [ "def count_parameters(model):\n", " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", "\n", "num_parameters = count_parameters(loaded_model)" ] }, { "cell_type": "code", "execution_count": 14, "id": "9793078e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'42,233,451'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "formatted_num = \"{:,}\".format(num_parameters)\n", "formatted_num" ] }, { "cell_type": "code", "execution_count": null, "id": "68b44769", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "cslr", "language": "python", "name": "cslr" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.18" } }, "nbformat": 4, "nbformat_minor": 5 }