In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the cleaned tweet table from the CSV file in the working directory.
full_df = pd.read_csv(filepath_or_buffer='cleanedTweetData.csv')

To fine tune a GPT-2 model, we only need the tweets

In [4]:
# Keep only the 'tweet' column; nothing else is used for fine-tuning.
tweets = full_df.loc[:, 'tweet']

# Pre-processing the text

For the model to be able to handle raw data, we first need to preprocess it. Keep in mind that we must preprocess it the same way the original data was preprocessed when the model was trained.

To make sure we get the correct tokenizer, we can use the transformers library and import the model and its corresponding tokenizer


Add three new special tokens to the pre-trained GPT-2 tokenizer: `<|sos|>` (start of sentence), `<|eos|>` (end of sentence) and `<|pad|>` (padding token).

In [5]:
from transformers import GPT2Tokenizer

# Load the pre-trained tokenizer and register the start / end / padding
# special tokens used throughout the rest of the notebook.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium',
    bos_token='<|sos|>',
    eos_token='<|eos|>',
    pad_token='<|pad|>',
)

# Example: token ids produced for a short sample string
tokenizer.encode('Hello World!')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[15496, 2159, 0]

We want to find the length of our longest tweet so that we know how to set our maximum token length.

In [6]:
# Token count of the longest tweet (special tokens not included).
max_tweet = max(len(tokenizer.encode(text)) for text in tweets)

print(f'The longest tweet is {max_tweet} tokens long.')

The longest tweet is 158 tokens long.


# Training GPT-2

**Training**

**What is GPT-2 and why do we use it?**

GPT-2 is a transformer-based neural network that was trained on a massive amount of unlabeled raw text in a self-supervised fashion to predict the next word in a given sentence. Attempts at using it in a transfer-learning manner have been very successful so far.

You can use it yourself to create models that do anything from answering questions and generating stories to mimicking someone on Twitter, which is what we're going to do here.
Next we create a custom dataset for our tweets using torch's `Dataset` class. Each entry in the dataset will be two tensors: one holding the token encoding of the string and one holding its attention mask.

In [7]:
batch_size = 32

import torch
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """Tokenized tweets exposed as a torch ``Dataset``.

    Every tweet is wrapped in ``<|sos|>`` / ``<|eos|>``, tokenized with
    truncation, and padded to ``max_length``. The resulting token ids and
    attention masks are kept in two parallel lists of tensors.

    Args:
        tweets: iterable of tweet strings.
        tokenizer: callable tokenizer returning a mapping with
            ``'input_ids'`` and ``'attention_mask'``.
        gpt2_type: accepted for compatibility; not used by this class.
        max_length: length every sequence is truncated / padded to.
    """

    def __init__(self, tweets, tokenizer, gpt2_type="gpt2-medium", max_length=max_tweet):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_masks = []

        for text in tweets:
            wrapped = '<|sos|>' + text + '<|eos|>'
            encoded = tokenizer(
                wrapped,
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )

            ids = torch.tensor(encoded['input_ids'])
            self.input_ids.append(ids)
            mask = torch.tensor(encoded['attention_mask'])
            self.attention_masks.append(mask)

    def __len__(self):
        """Number of tokenized tweets held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors stored at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attention_masks[idx]
        return ids, mask

In [8]:
from torch.utils.data import random_split

# max_tweet (computed a few cells ago) counts the tokens of the bare tweet only.
# TweetDataset wraps every tweet in <|sos|> ... <|eos|>, so we allow two extra
# tokens; otherwise the longest tweets are truncated and lose their <|eos|>.
dataset = TweetDataset(tweets,tokenizer,max_length=max_tweet + 2)

In [9]:
#Split our data into training and validation set

train_size = int(0.9 * len(dataset)) #90% train, 10% validation
val_size = len(dataset)-train_size

# A dedicated, seeded generator makes the split identical on every run
# (the global seeds are only set later in the notebook).
train,val = random_split(dataset,[train_size,val_size],
                         generator=torch.Generator().manual_seed(42))
print(f'No of train samples = {train_size} and Number of validation samples = {val_size}')

No of train samples = 14274 and Number of validation samples = 1586


In [10]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Wrap both splits in DataLoaders: training batches are drawn in random order,
# validation batches in their stored order.
train_dataloader = DataLoader(
    dataset=train,
    batch_size=batch_size,
    sampler=RandomSampler(train),
)

val_dataloader = DataLoader(
    dataset=val,
    batch_size=batch_size,
    sampler=SequentialSampler(val),
)

Now that our data is preprocessed, we load our model from the transformers library.

In [11]:
import random
from transformers import GPT2LMHeadModel, GPT2Config

In [12]:
# Seed every RNG *before* the model is created: resizing the embedding matrix
# below adds rows for the new special tokens, and seeding afterwards would not
# make that step (or anything else done during model construction) repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)  # safe to call even when CUDA is unavailable

configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
# The tokenizer gained <|sos|>, <|eos|> and <|pad|>; grow the embeddings to match.
model.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is present, otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

The training time depends on the number of samples divided by the batch size, multiplied by the number of epochs. I encourage people to experiment with hyperparameters such as batch size, epochs, learning rate, LR scheduling and optimizers, and to compare the results.

I recommend doing at least 5-6 epochs if you want good results.

In [13]:
# Warm-up: during the first `warmup_steps` optimizer steps the linear scheduler
# raises the learning rate from 0 up to its configured value (the steps are NOT
# skipped); afterwards the rate decays linearly for the rest of training.
# Every `sample_every` steps we sample the model to inspect its output.

epochs = 5
warmup_steps = 100  # a step count, so an int rather than the float 1e2
sample_every = 100

In [14]:
# transformers' own AdamW is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(),
                  lr = 5e-4,
                  eps = 1e-8,
                  # torch defaults to weight_decay=0.01; 0.0 keeps the behaviour of
                  # the transformers optimizer that was used here before.
                  weight_decay = 0.0
                )



In [15]:
from transformers import get_linear_schedule_with_warmup

# One scheduler step is taken per training batch, for every epoch.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [16]:
import random
import time
import datetime

def format_time(elapsed):
    """Format a duration in seconds as the string form of a ``timedelta``.

    The value is rounded to the nearest whole second first, e.g. ``3661.2``
    becomes ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Fine-tuning loop: for every epoch, train on all training batches, then
# measure the loss on the validation set and record both in training_stats.
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    print(f'Beginning epoch {epoch_i + 1} of {epochs}')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)

        # Labels are the inputs themselves, except at padded positions: -100 is
        # the index the loss ignores, so padding no longer contributes to (and
        # dominates) the loss.
        b_labels = b_input_ids.clone()
        b_labels[b_masks == 0] = -100

        model.zero_grad()

        outputs = model(  b_input_ids,
                          labels=b_labels,
                          attention_mask = b_masks
                        )

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step()

        # Drop the references to the logits / graph before any sampling so their
        # GPU memory can be reused; generating while they were still alive
        # needlessly raised peak memory.
        del outputs, loss

        # Get sample every `sample_every` batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print(f'Batch {step} of {len(train_dataloader)}. Loss:{batch_loss}. Time:{elapsed}')

            model.eval()

            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),  # random start token
                                    do_sample=True,
                                    top_k=50,
                                    max_length = 200,
                                    top_p=0.95,
                                    num_return_sequences=1,
                                    # Stop at our own end-of-tweet token and pad with our pad token.
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id
                                )
            for sample_output in sample_outputs:
                print(f'Example output: {tokenizer.decode(sample_output, skip_special_tokens=True)}')

            model.train()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print(f'Average Training Loss: {avg_train_loss}. Epoch time: {training_time}')

    t0 = time.time()

    model.eval()

    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in val_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)

        # Same padding-aware labels as in training so both losses are comparable.
        b_labels = b_input_ids.clone()
        b_labels[b_masks == 0] = -100

        # No gradients are needed for validation.
        with torch.no_grad():

            outputs  = model(b_input_ids,
                             attention_mask = b_masks,
                             labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(val_dataloader)

    validation_time = format_time(time.time() - t0)

    print(f'Validation loss: {avg_val_loss}. Validation Time: {validation_time}')

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print(f'Total training took {format_time(time.time()-total_t0)}')

Beginning epoch 1 of 5


OutOfMemoryError: ignored