"Colab"

# Bag of Words Text Classifier

The code below implements a simple bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- An embedding lookup retrieves one embedding vector for each token of the input
- The embedding vectors are added together with a bias vector
- The resulting vector is referred to as the scores
- A softmax is applied to the scores to generate probabilities, which are used for the classification task

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/bow.png?raw=true)


In [None]:
import torch
import random
import torch.nn as nn

### Download the Data

In [None]:
%%capture

# download the files
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes

### Read the Data

In [None]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    """Read a labelled text file into a list of column lists.

    Every line is lower-cased, stripped of surrounding whitespace and split
    on the " ||| " separator, so "3 ||| Some Text" becomes ['3', 'some text'].

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        columns found on that line.
    """
    data = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            # Skip blank lines (for example a trailing empty line): they would
            # otherwise produce [''] and break code that reads column 1.
            if not line:
                continue
            data.append(line.split(' ||| '))
    return data

# Parse the training and test splits into lists of [tag, text] records.
train_data, test_data = (
    read_data('data/classes/train.txt'),
    read_data('data/classes/test.txt'),
)

### Construct the Vocab and Datasets

In [None]:
# Lookup tables shared by the functions below.
# The "" key is registered first, so it owns index 0; create_dict points
# words it has not seen at this entry when called with check_unk=True.
word_to_index = {"": 0}
# tag (class label) -> integer id
tag_to_index = {}

# create word to index dictionary and tag to index dictionary from data
def create_dict(data, check_unk=False):
    """Register the words and tags of `data` in the module-level lookup tables.

    Args:
        data: iterable of records where record[0] is the tag and record[1] is
            the space-separated text.
        check_unk: when equal to False (the default) every new word receives
            the next free index. Otherwise a new word is stored as an extra
            key that shares the index of the "" entry.

    Side effects:
        Mutates the globals word_to_index and tag_to_index. New tags always
        receive the next free tag index, whatever check_unk is. After a call
        with check_unk set, word_to_index can hold more keys than distinct
        indices.
    """
    # Same equality test as before, evaluated once instead of per word.
    grow_vocabulary = (check_unk == False)  # noqa: E712

    for record in data:
        for token in record[1].split(" "):
            if token in word_to_index:
                continue
            if grow_vocabulary:
                word_to_index[token] = len(word_to_index)
            else:
                # alias the unseen word to the "" entry
                word_to_index[token] = word_to_index[""]

        label = record[0]
        # len() is evaluated before the insertion, so a new tag gets the next id
        tag_to_index.setdefault(label, len(tag_to_index))

# The training split goes first: it is the only call that hands out new word indices.
create_dict(data=train_data)
# The test split is registered afterwards; its unseen words share the index of the "" entry.
create_dict(data=test_data, check_unk=True)

# create word and tag tensors from data
def create_tensor(data):
    """Lazily encode records as (word indices, tag index) pairs.

    Args:
        data: iterable of records where record[0] is the tag and record[1] is
            the space-separated text.

    Yields:
        A tuple (list of word indices, tag index) per record. Despite the
        function name these are plain Python lists and ints, not tensors.

    Raises:
        KeyError: if a word or tag is missing from word_to_index / tag_to_index.
    """
    for record in data:
        tokens = record[1].split(" ")
        word_ids = list(map(word_to_index.__getitem__, tokens))
        yield word_ids, tag_to_index[record[0]]

# Replace the raw [tag, text] records with (word indices, tag index) pairs.
# NOTE: this rebinds train_data / test_data, so the cell cannot simply be run a
# second time; re-run the "Read the Data" cell first.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# Size the embedding table by the largest index in use rather than by the
# number of dictionary keys: create_dict(..., check_unk=True) stores words that
# occur only in the test split as extra keys sharing index 0, so
# len(word_to_index) would count embedding rows that are never looked up.
number_of_words = max(word_to_index.values()) + 1
number_of_tags = len(tag_to_index)

### Model

In [None]:
# cpu or gpu
device = "cuda" if torch.cuda.is_available() else "cpu"

# create a simple neural network with embedding layer, bias, and xavier initialization
class BoW(torch.nn.Module):
    """Bag-of-words classifier.

    Every word owns an embedding vector with one entry per tag. The score
    vector of a sentence is the sum of its word embeddings plus a bias vector.

    Args:
        nwords: number of rows of the embedding table (vocabulary size).
        ntags: number of classes; also the width of each embedding vector.
    """

    def __init__(self, nwords, ntags):
        super(BoW, self).__init__()
        self.embedding = nn.Embedding(nwords, ntags)
        nn.init.xavier_uniform_(self.embedding.weight)

        # Register the bias as a Parameter so that it is returned by
        # model.parameters() (and therefore updated by the optimizer), moved by
        # model.to(device) and stored in state_dict(). A plain tensor attribute
        # gets none of that and would stay at zero for the whole training run.
        self.bias = nn.Parameter(torch.zeros(ntags))

    def forward(self, x):
        """Compute the class scores of one sentence.

        Args:
            x: 1-D tensor holding the word indices of the sentence.

        Returns:
            Tensor of shape (1, ntags) with the unnormalised scores.
        """
        emb = self.embedding(x)  # seq_len x ntags
        out = torch.sum(emb, dim=0) + self.bias  # ntags
        out = out.view(1, -1)  # reshape to (1, ntags)
        return out

### Pretest the Model

In [None]:
# function to convert sentence into tensor using word_to_index dictionary
def sentence_to_tensor(sentence):
    """Encode a raw sentence as a 1-D LongTensor of word indices.

    The sentence is lower-cased and stripped, the same normalisation
    read_data applies to the dataset, and then split on single spaces.
    Words missing from word_to_index map to the index of the "" entry
    instead of raising KeyError.

    Args:
        sentence: text to encode.

    Returns:
        A CPU tensor of dtype torch.long with one index per token.
    """
    unk_index = word_to_index[""]
    tokens = sentence.lower().strip().split(" ")
    return torch.tensor(
        [word_to_index.get(token, unk_index) for token in tokens],
        dtype=torch.long,
    )

# test the sentence_to_tensor function
# .to(device) puts the indices on the same device as the model. It replaces the
# legacy torch.cuda.LongTensor cast, which also rebound the builtin name `type`.
out = sentence_to_tensor("i love dogs").to(device)
test_model = BoW(number_of_words, number_of_tags).to(device)
# scores of an untrained model, shown as the cell output
test_model(out)

tensor([[-0.0108, -0.0067, -0.0260, -0.0255, 0.0119]], device='cuda:0',
 grad_fn=)

### Train the Model

In [None]:
# train and test the BoW model
# The model is already placed on `device` here, so no second .to(device) call is needed.
model = BoW(number_of_words, number_of_tags).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Tensor class used by train_bow to build index tensors on the right device.
# NOTE: this rebinds the builtin name `type` at notebook scope.
type = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

# perform training of the Bow model
def train_bow(model, optimizer, criterion, train_data, num_epochs=10):
    """Train `model` one sentence at a time and report accuracy after each epoch.

    Args:
        model: module mapping a 1-D LongTensor of word indices to a
            (1, ntags) score tensor.
        optimizer: optimizer built over the model's parameters.
        criterion: loss called as criterion(scores, target).
        train_data: list of (word indices, tag index) pairs. The list passed
            in is left untouched; a private copy is shuffled.
        num_epochs: number of passes over the training data (default 10).

    Evaluation reads the notebook-level `test_data` list. One summary line is
    printed per epoch; nothing is returned.
    """
    # Build the inputs on whatever device the model's weights live on instead
    # of relying on a global tensor-type variable.
    model_device = next(model.parameters()).device
    # Shuffle a copy so the caller's list keeps its order.
    train_data = list(train_data)

    for ITER in range(num_epochs):
        # perform training
        model.train()
        random.shuffle(train_data)
        total_loss = 0.0
        train_correct = 0
        for sentence, tag in train_data:
            sentence = torch.tensor(sentence, dtype=torch.long, device=model_device)
            # Keep `tag` as a plain int so the accuracy check below compares
            # int with int rather than int with a tensor.
            target = torch.tensor([tag], dtype=torch.long, device=model_device)
            output = model(sentence)
            predicted = torch.argmax(output.detach()).item()

            loss = criterion(output, target)
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if predicted == tag:
                train_correct += 1

        # perform testing of the model
        model.eval()
        test_correct = 0
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for sentence, tag in test_data:
                sentence = torch.tensor(sentence, dtype=torch.long, device=model_device)
                output = model(sentence)
                predicted = torch.argmax(output).item()
                if predicted == tag:
                    test_correct += 1

        # print model performance results
        log = f'ITER: {ITER+1} | ' \
              f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
              f'train accuracy: {train_correct/len(train_data):.4f} | ' \
              f'test accuracy: {test_correct/len(test_data):.4f}'
        print(log)

# Run the training loop; one summary line is printed per epoch.
train_bow(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_data=train_data,
)

ITER: 1 | train loss/sent: 1.4746 | train accuracy: 0.3661 | test accuracy: 0.3977
ITER: 2 | train loss/sent: 1.1221 | train accuracy: 0.6023 | test accuracy: 0.4149
ITER: 3 | train loss/sent: 0.9114 | train accuracy: 0.7124 | test accuracy: 0.4072
ITER: 4 | train loss/sent: 0.7681 | train accuracy: 0.7684 | test accuracy: 0.4063
ITER: 5 | train loss/sent: 0.6629 | train accuracy: 0.8069 | test accuracy: 0.4081
ITER: 6 | train loss/sent: 0.5802 | train accuracy: 0.8331 | test accuracy: 0.4023
ITER: 7 | train loss/sent: 0.5167 | train accuracy: 0.8549 | test accuracy: 0.4100
ITER: 8 | train loss/sent: 0.4632 | train accuracy: 0.8683 | test accuracy: 0.4072
ITER: 9 | train loss/sent: 0.4187 | train accuracy: 0.8838 | test accuracy: 0.3986
ITER: 10 | train loss/sent: 0.3802 | train accuracy: 0.8954 | test accuracy: 0.3973
