{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"---\n",
"title: 08 Bag of Words Text Classifier\n",
"description: Build a simple bag of words text classifier.\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OP_uXHGK0Q9d"
},
"source": [
"# Bag of Words Text Classifier\n",
"\n",
"The code below implements a simple bag of words text classifier.\n",
"- We tokenize the text, create a vocabulary and encode each piece of text in the dataset\n",
"- The lookup allows for extracting embeddings for each tokenized inputs\n",
"- The embedding vectors are added together with a bias vector\n",
"- The resulting vector is referred to as the scores\n",
"- The score are applied a softmax to generate probabilities which are used for the classification task\n",
"\n",
"The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). \n",
"\n",
"![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/bow.png?raw=true)\n"
]
},
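{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before building the full model, here is a tiny worked example of the idea, using a made-up vocabulary of three words and two classes (the numbers are illustrative, not from the dataset): each word index selects one row of the embedding table, the rows are summed with a bias to produce the scores, and a softmax turns the scores into class probabilities.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"# toy example: 3-word vocabulary, 2 classes (illustrative values)\n",
"embedding = torch.tensor([[ 0.5, -0.2],  # per-class scores for word 0\n",
"                          [-0.1,  0.4],  # word 1\n",
"                          [ 0.3,  0.3]]) # word 2\n",
"bias = torch.tensor([0.1, -0.1])\n",
"\n",
"tokens = torch.tensor([0, 2, 2])              # an encoded \"sentence\"\n",
"scores = embedding[tokens].sum(dim=0) + bias  # sum the rows, add bias\n",
"probs = torch.softmax(scores, dim=0)          # probabilities over classes\n",
"print(scores)\n",
"print(probs)"
]
},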
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rYJ7PiaO2R6Q"
},
"outputs": [],
"source": [
"import torch\n",
"import random\n",
"import torch.nn as nn"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "M3eH6PyS1Ykz"
},
"source": [
"### Download the Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "F_lDByee1ddU"
},
"outputs": [],
"source": [
"%%capture\n",
"\n",
"# download the files\n",
"!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt\n",
"!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt\n",
"!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt\n",
"\n",
"# create the data folders\n",
"!mkdir data data/classes\n",
"!cp dev.txt data/classes\n",
"!cp test.txt data/classes\n",
"!cp train.txt data/classes"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "G9gihHeo0dK6"
},
"source": [
"### Read the Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "YOYzmcLdzD8i"
},
"outputs": [],
"source": [
"# function to read in data, process each line and split columns by \" ||| \"\n",
"def read_data(filename):\n",
" data = []\n",
" with open(filename, 'r') as f:\n",
" for line in f:\n",
" line = line.lower().strip()\n",
" line = line.split(' ||| ')\n",
" data.append(line)\n",
" return data\n",
"\n",
"train_data = read_data('data/classes/train.txt')\n",
"test_data = read_data('data/classes/test.txt')"
]
},
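{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each line in the files holds a class label and a sentence separated by ` ||| `, so every parsed entry is a `[tag, text]` pair. A quick sanity check on the parsed data (assuming the download above succeeded):\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# peek at the parsed data: each entry is a [tag, text] pair\n",
"print(len(train_data), \"training examples\")\n",
"print(train_data[0])"
]
},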
{
"cell_type": "markdown",
"metadata": {
"id": "WEIAf06u2kZz"
},
"source": [
"### Contruct the Vocab and Datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9MJHDqjT2qDu"
},
"outputs": [],
"source": [
"# creating the word and tag indices\n",
"word_to_index = {}\n",
"word_to_index[\"\"] = len(word_to_index) # adds to dictionary\n",
"tag_to_index = {}\n",
"\n",
"# create word to index dictionary and tag to index dictionary from data\n",
"def create_dict(data, check_unk=False):\n",
" for line in data:\n",
" for word in line[1].split(\" \"):\n",
" if check_unk == False:\n",
" if word not in word_to_index:\n",
" word_to_index[word] = len(word_to_index)\n",
" else:\n",
" if word not in word_to_index:\n",
" word_to_index[word] = word_to_index[\"\"]\n",
"\n",
" if line[0] not in tag_to_index:\n",
" tag_to_index[line[0]] = len(tag_to_index)\n",
"\n",
"create_dict(train_data)\n",
"create_dict(test_data, check_unk=True)\n",
"\n",
"# create word and tag tensors from data\n",
"def create_tensor(data):\n",
" for line in data:\n",
" yield([word_to_index[word] for word in line[1].split(\" \")], tag_to_index[line[0]])\n",
"\n",
"train_data = list(create_tensor(train_data))\n",
"test_data = list(create_tensor(test_data))\n",
"\n",
"number_of_words = len(word_to_index)\n",
"number_of_tags = len(tag_to_index)"
]
},
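{
"cell_type": "markdown",
"metadata": {},
"source": [
"After this step every example is a pair of (list of word indices, tag index). A quick look at the resulting sizes and one encoded example:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# inspect the vocabulary sizes and one encoded example\n",
"print(\"vocabulary size:\", number_of_words)\n",
"print(\"number of tags:\", number_of_tags)\n",
"print(\"first encoded example:\", train_data[0])"
]
},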
{
"cell_type": "markdown",
"metadata": {
"id": "n-4FU9Ab2McP"
},
"source": [
"### Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Zt76PIzP0jWn"
},
"outputs": [],
"source": [
"# cpu or gpu\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# create a simple neural network with embedding layer, bias, and xavier initialization\n",
"class BoW(torch.nn.Module):\n",
" def __init__(self, nwords, ntags):\n",
" super(BoW, self).__init__()\n",
" self.embedding = nn.Embedding(nwords, ntags)\n",
" nn.init.xavier_uniform_(self.embedding.weight)\n",
"\n",
" type = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor\n",
" self.bias = torch.zeros(ntags, requires_grad=True).type(type)\n",
"\n",
" def forward(self, x):\n",
" emb = self.embedding(x) # seq_len x ntags (for each seq) \n",
" out = torch.sum(emb, dim=0) + self.bias # ntags\n",
" out = out.view(1, -1) # reshape to (1, ntags)\n",
" return out"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Mi4FNOy02Z1t"
},
"source": [
"### Pretest the Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pn_LCZJv2Osz",
"outputId": "2c83bb22-a7e8-40af-cb1b-c04f3de6bd38"
},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[-0.0108, -0.0067, -0.0260, -0.0255, 0.0119]], device='cuda:0',\n",
" grad_fn=)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# function to convert sentence into tensor using word_to_index dictionary\n",
"def sentence_to_tensor(sentence):\n",
" return torch.LongTensor([word_to_index[word] for word in sentence.split(\" \")])\n",
"\n",
"# test the sentence_to_tensor function\n",
"type = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor\n",
"out = sentence_to_tensor(\"i love dogs\").type(type)\n",
"test_model = BoW(number_of_words, number_of_tags).to(device)\n",
"test_model(out)"
]
},
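{
"cell_type": "markdown",
"metadata": {},
"source": [
"The output above is a vector of raw, unnormalized scores (logits). During training, `nn.CrossEntropyLoss` applies the softmax internally, but we can apply it explicitly here to see the class probabilities the untrained model assigns:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# turn the raw scores of the untrained model into probabilities\n",
"with torch.no_grad():\n",
"    probs = torch.softmax(test_model(out), dim=1)\n",
"print(probs)  # the probabilities sum to 1"
]
},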
{
"cell_type": "markdown",
"metadata": {
"id": "SH5r2Xzs21zB"
},
"source": [
"### Train the Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "f86xjDAi2bt8",
"outputId": "c329b5b2-6d09-405c-bca9-6066e3415c18"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ITER: 1 | train loss/sent: 1.4746 | train accuracy: 0.3661 | test accuracy: 0.3977\n",
"ITER: 2 | train loss/sent: 1.1221 | train accuracy: 0.6023 | test accuracy: 0.4149\n",
"ITER: 3 | train loss/sent: 0.9114 | train accuracy: 0.7124 | test accuracy: 0.4072\n",
"ITER: 4 | train loss/sent: 0.7681 | train accuracy: 0.7684 | test accuracy: 0.4063\n",
"ITER: 5 | train loss/sent: 0.6629 | train accuracy: 0.8069 | test accuracy: 0.4081\n",
"ITER: 6 | train loss/sent: 0.5802 | train accuracy: 0.8331 | test accuracy: 0.4023\n",
"ITER: 7 | train loss/sent: 0.5167 | train accuracy: 0.8549 | test accuracy: 0.4100\n",
"ITER: 8 | train loss/sent: 0.4632 | train accuracy: 0.8683 | test accuracy: 0.4072\n",
"ITER: 9 | train loss/sent: 0.4187 | train accuracy: 0.8838 | test accuracy: 0.3986\n",
"ITER: 10 | train loss/sent: 0.3802 | train accuracy: 0.8954 | test accuracy: 0.3973\n"
]
}
],
"source": [
"# train and test the BoW model\n",
"model = BoW(number_of_words, number_of_tags).to(device)\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"type = torch.LongTensor\n",
"\n",
"if torch.cuda.is_available():\n",
" model.to(device)\n",
" type = torch.cuda.LongTensor\n",
"\n",
"# perform training of the Bow model\n",
"def train_bow(model, optimizer, criterion, train_data):\n",
" for ITER in range(10):\n",
" # perform training\n",
" model.train()\n",
" random.shuffle(train_data)\n",
" total_loss = 0.0\n",
" train_correct = 0\n",
" for sentence, tag in train_data:\n",
" sentence = torch.tensor(sentence).type(type)\n",
" tag = torch.tensor([tag]).type(type)\n",
" output = model(sentence)\n",
" predicted = torch.argmax(output.data.detach()).item()\n",
" \n",
" loss = criterion(output, tag)\n",
" total_loss += loss.item()\n",
"\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" if predicted == tag: train_correct+=1\n",
"\n",
" # perform testing of the model\n",
" model.eval()\n",
" test_correct = 0\n",
" for sentence, tag in test_data:\n",
" sentence = torch.tensor(sentence).type(type)\n",
" output = model(sentence)\n",
" predicted = torch.argmax(output.data.detach()).item()\n",
" if predicted == tag: test_correct += 1\n",
" \n",
" # print model performance results\n",
" log = f'ITER: {ITER+1} | ' \\\n",
" f'train loss/sent: {total_loss/len(train_data):.4f} | ' \\\n",
" f'train accuracy: {train_correct/len(train_data):.4f} | ' \\\n",
" f'test accuracy: {test_correct/len(test_data):.4f}'\n",
" print(log)\n",
"\n",
"# call the train_bow function\n",
"train_bow(model, optimizer, criterion, train_data)"
]
}
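,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Classify New Sentences\n",
"\n",
"With the model trained, we can classify new sentences. The small helper below is an addition for illustration (not part of the original notebook): it encodes a sentence, mapping words unseen during training to the `<unk>` index, and maps the predicted tag index back to its label via an inverted `tag_to_index`.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# classify a new sentence with the trained model\n",
"index_to_tag = {index: tag for tag, index in tag_to_index.items()}\n",
"\n",
"def classify(model, sentence):\n",
"    # encode the sentence, mapping unknown words to the <unk> index\n",
"    indices = [word_to_index.get(word, word_to_index[\"<unk>\"]) for word in sentence.lower().split(\" \")]\n",
"    tensor = torch.tensor(indices).type(type)\n",
"    with torch.no_grad():\n",
"        output = model(tensor)\n",
"    return index_to_tag[torch.argmax(output).item()]\n",
"\n",
"classify(model, \"i love this movie\")"
]
}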
],
"metadata": {
"accelerator": "GPU",
"colab": {
"name": "bow.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 1
}