{ "cells": [ { "cell_type": "raw", "metadata": {}, "source": [ "---\n", "title: 08 Bag of Words Text Classifier\n", "description: Build a simple bag of words text classifier.\n", "---" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\"Colab\"" ] }, { "cell_type": "markdown", "metadata": { "id": "OP_uXHGK0Q9d" }, "source": [ "# Bag of Words Text Classifier\n", "\n", "The code below implements a simple bag of words text classifier.\n", "- We tokenize the text, create a vocabulary and encode each piece of text in the dataset\n", "- The lookup allows for extracting embeddings for each tokenized inputs\n", "- The embedding vectors are added together with a bias vector\n", "- The resulting vector is referred to as the scores\n", "- The score are applied a softmax to generate probabilities which are used for the classification task\n", "\n", "The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). \n", "\n", "![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/bow.png?raw=true)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "rYJ7PiaO2R6Q" }, "outputs": [], "source": [ "import torch\n", "import random\n", "import torch.nn as nn" ] }, { "cell_type": "markdown", "metadata": { "id": "M3eH6PyS1Ykz" }, "source": [ "### Download the Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "F_lDByee1ddU" }, "outputs": [], "source": [ "%%capture\n", "\n", "# download the files\n", "!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt\n", "!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt\n", "!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt\n", "\n", "# create the data folders\n", "!mkdir data data/classes\n", "!cp dev.txt data/classes\n", "!cp test.txt data/classes\n", "!cp train.txt data/classes" ] }, { "cell_type": "markdown", "metadata": { "id": "G9gihHeo0dK6" }, "source": [ "### Read the Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YOYzmcLdzD8i" }, "outputs": [], "source": [ "# function to read in data, process each line and split columns by \" ||| \"\n", "def read_data(filename):\n", " data = []\n", " with open(filename, 'r') as f:\n", " for line in f:\n", " line = line.lower().strip()\n", " line = line.split(' ||| ')\n", " data.append(line)\n", " return data\n", "\n", "train_data = read_data('data/classes/train.txt')\n", "test_data = read_data('data/classes/test.txt')" ] }, { "cell_type": "markdown", "metadata": { "id": "WEIAf06u2kZz" }, "source": [ "### Contruct the Vocab and Datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9MJHDqjT2qDu" }, "outputs": [], "source": [ "# creating the word and tag indices\n", "word_to_index = {}\n", "word_to_index[\"\"] = len(word_to_index) # adds to dictionary\n", "tag_to_index = {}\n", "\n", "# create word to index dictionary and tag to index dictionary from data\n", "def create_dict(data, check_unk=False):\n", " for line in data:\n", " for word in line[1].split(\" \"):\n", " if check_unk == False:\n", " if word not in word_to_index:\n", " word_to_index[word] = len(word_to_index)\n", " else:\n", " if word not in word_to_index:\n", " word_to_index[word] = word_to_index[\"\"]\n", "\n", " if 
{ "cell_type": "markdown", "metadata": { "id": "n-4FU9Ab2McP" }, "source": [ "### Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Zt76PIzP0jWn" }, "outputs": [], "source": [ "# cpu or gpu\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "# create a simple neural network with embedding layer, bias, and xavier initialization\n", "class BoW(torch.nn.Module):\n", "    def __init__(self, nwords, ntags):\n", "        super(BoW, self).__init__()\n", "        self.embedding = nn.Embedding(nwords, ntags)\n", "        nn.init.xavier_uniform_(self.embedding.weight)\n", "\n", "        # register the bias as a parameter so it is trained by the optimizer\n", "        # and moved to the right device together with the model\n", "        self.bias = nn.Parameter(torch.zeros(ntags))\n", "\n", "    def forward(self, x):\n", "        emb = self.embedding(x)                  # seq_len x ntags (for each seq)\n", "        out = torch.sum(emb, dim=0) + self.bias  # ntags\n", "        out = out.view(1, -1)                    # reshape to (1, ntags)\n", "        return out" ] }, { "cell_type": "markdown", "metadata": { "id": "Mi4FNOy02Z1t" }, "source": [ "### Pretest the Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "pn_LCZJv2Osz", "outputId": "2c83bb22-a7e8-40af-cb1b-c04f3de6bd38" }, "outputs": [ { "data": { "text/plain": [ "tensor([[-0.0108, -0.0067, -0.0260, -0.0255,  0.0119]], device='cuda:0',\n", "       grad_fn=<ViewBackward0>)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# function to convert sentence into tensor using word_to_index dictionary\n", "def sentence_to_tensor(sentence):\n", "    return torch.LongTensor([word_to_index[word] for word in sentence.split(\" \")])\n", "\n", "# test the sentence_to_tensor function\n", "type = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor\n", "out = sentence_to_tensor(\"i love dogs\").type(type)\n", "test_model = BoW(number_of_words, number_of_tags).to(device)\n", "test_model(out)" ] },
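{ "cell_type": "markdown", "metadata": {}, "source": [ "The raw output above is a vector of scores, one per tag. As described in the introduction, a softmax turns these scores into probabilities. The cell below is a minimal sketch: it assumes `test_model`, `out`, and `tag_to_index` from the cells above, and builds a small reverse mapping (`index_to_tag`, introduced here only for illustration) to read off the predicted tag." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# turn the raw scores into probabilities with a softmax and read off the predicted tag\n", "with torch.no_grad():\n", "    scores = test_model(out)              # shape (1, ntags)\n", "    probs = torch.softmax(scores, dim=1)  # probabilities sum to 1\n", "\n", "# reverse mapping from tag index back to the tag label (illustrative helper)\n", "index_to_tag = {index: tag for tag, index in tag_to_index.items()}\n", "predicted = torch.argmax(probs, dim=1).item()\n", "print(\"probabilities:\", probs.squeeze().tolist())\n", "print(\"predicted tag:\", index_to_tag[predicted])" ] },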
{ "cell_type": "markdown", "metadata": { "id": "SH5r2Xzs21zB" }, "source": [ "### Train the Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f86xjDAi2bt8", "outputId": "c329b5b2-6d09-405c-bca9-6066e3415c18" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ITER: 1 | train loss/sent: 1.4746 | train accuracy: 0.3661 | test accuracy: 0.3977\n", "ITER: 2 | train loss/sent: 1.1221 | train accuracy: 0.6023 | test accuracy: 0.4149\n", "ITER: 3 | train loss/sent: 0.9114 | train accuracy: 0.7124 | test accuracy: 0.4072\n", "ITER: 4 | train loss/sent: 0.7681 | train accuracy: 0.7684 | test accuracy: 0.4063\n", "ITER: 5 | train loss/sent: 0.6629 | train accuracy: 0.8069 | test accuracy: 0.4081\n", "ITER: 6 | train loss/sent: 0.5802 | train accuracy: 0.8331 | test accuracy: 0.4023\n", "ITER: 7 | train loss/sent: 0.5167 | train accuracy: 0.8549 | test accuracy: 0.4100\n", "ITER: 8 | train loss/sent: 0.4632 | train accuracy: 0.8683 | test accuracy: 0.4072\n", "ITER: 9 | train loss/sent: 0.4187 | train accuracy: 0.8838 | test accuracy: 0.3986\n", "ITER: 10 | train loss/sent: 0.3802 | train accuracy: 0.8954 | test accuracy: 0.3973\n" ] } ], "source": [ "# train and test the BoW model\n", "model = BoW(number_of_words, number_of_tags).to(device)\n", "criterion = nn.CrossEntropyLoss()\n", "optimizer = torch.optim.Adam(model.parameters())\n", "type = torch.LongTensor\n", "\n", "if torch.cuda.is_available():\n", "    model.to(device)\n", "    type = torch.cuda.LongTensor\n", "\n", "# perform training of the BoW model\n", "def train_bow(model, optimizer, criterion, train_data):\n", "    for ITER in range(10):\n", "        # perform training\n", "        model.train()\n", "        random.shuffle(train_data)\n", "        total_loss = 0.0\n", "        train_correct = 0\n", "        for sentence, tag in train_data:\n", "            sentence = torch.tensor(sentence).type(type)\n", "            tag = torch.tensor([tag]).type(type)\n", "            output = model(sentence)\n", "            predicted = torch.argmax(output.data.detach()).item()\n", "\n", "            loss = criterion(output, tag)\n", "            total_loss += loss.item()\n", "\n", "            optimizer.zero_grad()\n", "            loss.backward()\n", "            optimizer.step()\n", "\n", "            if predicted == tag: train_correct += 1\n", "\n", "        # perform testing of the model\n", "        model.eval()\n", "        test_correct = 0\n", "        for sentence, tag in test_data:\n", "            sentence = torch.tensor(sentence).type(type)\n", "            output = model(sentence)\n", "            predicted = torch.argmax(output.data.detach()).item()\n", "            if predicted == tag: test_correct += 1\n", "\n", "        # print model performance results\n", "        log = f'ITER: {ITER+1} | ' \\\n", "              f'train loss/sent: {total_loss/len(train_data):.4f} | ' \\\n", "              f'train accuracy: {train_correct/len(train_data):.4f} | ' \\\n", "              f'test accuracy: {test_correct/len(test_data):.4f}'\n", "        print(log)\n", "\n", "# call the train_bow function\n", "train_bow(model, optimizer, criterion, train_data)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "name": "bow.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 1 }