# Abstractive Text Summarization with T5

This implementation uses HuggingFace, especially utilizing `AutoModelForSeq2SeqLM` and `AutoTokenizer`. 

## Importing libraries

In [1]:
# Installs
!pip install -q evaluate py7zr rouge_score absl-py

# Imports here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

import torch
import torch.nn as nn

import datasets
import transformers
from transformers import (
        AutoModelForSeq2SeqLM,
        Seq2SeqTrainingArguments,
        Seq2SeqTrainer,
        AutoTokenizer
)
import evaluate

# Quality of life fixes
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint

import os
os.environ["WANDB_DISABLED"] = "true"

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Datasets version: {datasets.__version__}")
print(f"Evaluate version: {evaluate.__version__}")

# Get the samsum dataset
samsum = datasets.load_dataset('samsum')
clear_output()
print("Setup done!")

Setup done!


In [2]:
# Verify transformers version
transformers.__version__

'4.27.4'

## Playing around with the dataset

In [3]:
# The samsum dataset shape
samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [4]:
rand_idx = np.random.randint(0, len(samsum['train']))

print(f"Dialogue:\n{samsum['train'][rand_idx]['dialogue']}")
print('\n', '-'*50, '\n')
print(f"Summary:\n{samsum['train'][rand_idx]['summary']}")

Dialogue:
Ollie: How is your Hebrew?
Gabi: Not great. 
Ollie: Could you translate a letter?
Gabi: From Hebrew to English maybe, the opposite I don’t think so
Gabi: My writing sucks
Ollie: Please help me. I don’t have anyone else to ask
Gabi: Send it to me. I’ll try. 

 -------------------------------------------------- 

Summary:
Gabi knows a bit of Hebrew, though her writing isn't great. She will try to help Ollie translate a letter.


## Preprocessing data

 I'm using the T5 Transformers model (Text-to-Text Transfer Transformer)

In [5]:
model_ckpt = 't5-small'

# TODO: Create the Tokenizer AutoTokenizer pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained('t5-small')

In [6]:
from datasets import concatenate_datasets
# Find the max lengths of the source and target samples
# The maximum total input sequence length after tokenization. 
# Sequences that are longer than this will be truncated, sequences shorter are be padded.
tokenized_inputs = concatenate_datasets([samsum["train"], samsum["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences that are longer than this will be truncated, sequences shorter are be padded.
tokenized_targets = concatenate_datasets([samsum["train"], samsum["test"]]).map(lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Loading cached processed dataset at C:\Users\QXLVR\.cache\huggingface\datasets\samsum\samsum\0.0.0\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\cache-78c13bd5dd6a016a.arrow


Max source length: 512


Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Max target length: 95


In [7]:
def preprocess_function(
    sample, 
    padding="max_length", 
    max_source_length=max_source_length,
    max_target_length=max_target_length
):
    '''
    A preprocessing function that will be applied across the dataset.
    The inputs and targets will be tokenized and padded/truncated to the max lengths.

    Args:
        sample: A dictionary containing the source and target texts (keys are "dialogue" and "summary") in a list.
        padding: Whether to pad the inputs and targets to the max lengths.
        max_source_length: The maximum length of the source text.
        max_target_length: The maximum length of the target text.
    '''
    # Add prefix to the input for t5
    inputs = ['summarize: ' + s for s in sample['dialogue']]
   
    # Tokenize inputs, specifying the padding, truncation and max_length
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample['summary'], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore padding in the loss
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    # Format and return
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map this preprocessing function to our datasets using .map on the samsum variable
tokenized_dataset = samsum.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Loading cached processed dataset at C:\Users\QXLVR\.cache\huggingface\datasets\samsum\samsum\0.0.0\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\cache-073bbcc8f496f07c.arrow


Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Loading cached processed dataset at C:\Users\QXLVR\.cache\huggingface\datasets\samsum\samsum\0.0.0\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\cache-a43b31cabc78c9c3.arrow


Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [9]:
metric = evaluate.load("rouge")
clear_output()

In [10]:
def postprocess_text(preds, labels):
    '''
    A simple post-processing function to clean up the predictions and labels

    Args:
        preds: List[str] of predictions
        labels: List[str] of labels
    '''
    
    # strip whitespace on all sentences in preds and labels
    preds = [p.strip(' ') for p in preds]
    labels = [l.strip(' ') for l in preds]
    
    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    
    # Fetch the predictions and labels
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    # Decode the predictions back to text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing for ROUGE
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Compute ROUGE on the decoded predictions and the decoder labels
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

## Creating the model

In [11]:
# the AutoModelForSeq2SeqLM class and use the model_ckpt variable)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

clear_output()

In [12]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100

# Data Collator, specifying the tokenizer, model, and label_pad_token_id
# pad_to_multiple_of=8 to speed up training
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [13]:
import logging
logging.getLogger("transformers").setLevel(logging.WARNING)


# Define training hyperparameters in Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_samsum", # the output directory
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    predict_with_generate=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_steps=50,
    logging_first_step=False,
    fp16=False
)

# index into the tokenized_dataset variable to get the training and validation data
training_data = tokenized_dataset['train']
eval_data = tokenized_dataset['validation']

# Create the Trainer for the model
trainer = Seq2SeqTrainer(
    model=model,    # the model to be trained
    args=training_args, # training arguments
    train_dataset=training_data, # the training dataset
    eval_dataset=eval_data, # the validation dataset
    tokenizer=tokenizer, # the tokenizer we used to tokenize our data
    compute_metrics=compute_metrics, # the function we defined above to compute metrics
    data_collator=data_collator # the data collator we defined above
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
# Train the model (this will take a while!)
results = trainer.train()
clear_output()
pprint(results)

TrainOutput(global_step=9210, training_loss=1.9861197174436753, metrics={'train_runtime': 3551.1547, 'train_samples_per_second': 20.743, 'train_steps_per_second': 2.594, 'total_flos': 9969277096427520.0, 'train_loss': 1.9861197174436753, 'epoch': 5.0})


## Evaluating the model

In [15]:
res = trainer.evaluate()
clear_output()

In [18]:
cols  = ["eval_loss", "eval_rouge1", "eval_rouge2", "eval_rougeL", "eval_rougeLsum"]
filtered_scores = dict((x , res[x]) for x in cols)
pd.DataFrame([filtered_scores], index=[model_ckpt])

Unnamed: 0,eval_loss,eval_rouge1,eval_rouge2,eval_rougeL,eval_rougeLsum
t5-small,1.764253,100.0,100.0,100.0,100.0


In [20]:
from transformers import pipeline

summarizer_pipeline = pipeline("summarization",
                              model=model,
                              tokenizer=tokenizer,
                              device=0)

In [22]:
rand_idx = np.random.randint(low=0, high=len(samsum["test"]))
sample = samsum["test"][rand_idx]

dialog = sample["dialogue"]
true_summary = sample["summary"]

model_summary = summarizer_pipeline(dialog)
clear_output()

print(f"Dialogue: {dialog}")
print("-"*25)
print(f"True Summary: {true_summary}")
print("-"*25)
print(f"Model Summary: {model_summary[0]['summary_text']}")
print("-"*25)

Dialogue: Adelina: Hi handsome. Where you you come from?
Cyprien: What do you mean?
Adelina: What do you mean, "what do you mean"? It's a simple question, where do you come from?
Cyprien: Well I was born in Jarrow, live in London now, so you could say I came from either of those places
Cyprien: I was educated in Loughborouogh, so in a sense I came from there.
Adelina: OK. 
Cyprien: In another sense I come from my mother's vagina, but I dare say everyone can say that.
Adelina: Are you all right?
Cyprien: IN another sense I come from the atoms in the air that I breath or the food I eat, which comes to me from many places, so all I can say is "I come from Planet Earth".
Adelina: OK, bye. If you're gonna be a dick...
Cyprien: Wait, what you got against earthlings?
-------------------------
True Summary: Cyprien irritates Adelina by giving too many responses.
-------------------------
Model Summary: Cyprien came from Jarrow, live in London. She came from Loughborouogh, and came fr

In [24]:
def create_summary(input_text, model_pipeline=summarizer_pipeline):
    summary = model_pipeline(input_text)
    return summary

text = '''
Andy: I need you to come in to work on the weekend.
David: Why boss? I have plans to go on a concert I might not be able to come on the weekend.
Andy: It's important we need to get our paperwork all sorted out for this year. Corporate needs it.
David: But I already made plans and this is news to me on very short notice.
Andy: Be there or you'r fired
'''

print(f"Original Text:\n{text}")
print('\n', '-'*50, '\n')

summary = create_summary(text)

print(f"Generated Summary: \n{summary}")

Your max_length is set to 200, but you input_length is only 94. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=47)


Original Text:

Andy: I need you to come in to work on the weekend.
David: Why boss? I have plans to go on a concert I might not be able to come on the weekend.
Andy: It's important we need to get our paperwork all sorted out for this year. Corporate needs it.
David: But I already made plans and this is news to me on very short notice.
Andy: Be there or you'r fired


 -------------------------------------------------- 

Generated Summary: 
[{'summary_text': 'David has plans to go on a concert. Andy needs to get his paperwork all sorted out for this year. David already made plans.'}]
