In [None]:
from IPython.display import HTML, display

def set_css():
 """Display an HTML snippet in the output area of the current cell.

 NOTE(review): the HTML payload below contains only whitespace, so nothing
 visible is injected -- presumably a <style> block was lost; confirm.
 """
 display(HTML('''
 
 '''))
# Register set_css on IPython's 'pre_run_cell' event so it is invoked before
# each cell runs.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
!pip install --upgrade pip
!pip install transformers
!pip install datasets
!pip install sentencepiece

# 📂 Dataset

### Loading the dataset
---

In [None]:
from datasets import load_dataset

!wget 'https://raw.githubusercontent.com/jamesesguerra/dataset_repo/main/kami-3000.csv'

dataset = load_dataset('csv', data_files='kami-3000.csv')

print(dataset)
print()
print(dataset['train'].features)

In [None]:
# USE THIS CODE BLOCK FOR LOCAL INITIALIZATION
#
# Reads the CSV from a fixed location on disk. The load is guarded so that
# "Run All" on a machine without this file keeps whatever `dataset` was loaded
# earlier instead of aborting the notebook here.
import os

from datasets import load_dataset

LOCAL_CSV_PATH = 'C:/Users/Public/Documents/hazielle/kami-3000.csv'

if os.path.isfile(LOCAL_CSV_PATH):
    dataset = load_dataset('csv', data_files=LOCAL_CSV_PATH)

    print(dataset)
    print()
    print(dataset['train'].features)
else:
    print(f"{LOCAL_CSV_PATH} not found - skipping local initialization")

### Filtering rows
---

**Removing rows with a missing article text or a missing summary**

In [None]:
# Keep a row only when both text fields are present (not None).
dataset = dataset.filter(
    lambda row: row['article_text'] is not None and row['summary'] is not None
)

print(dataset['train'])

**Removing rows whose article text has 25 or fewer words, or whose summary has 10 or fewer words**
(based on [this paper](http://www.diva-portal.org/smash/get/diva2:1563580/FULLTEXT01.pdf))

In [None]:
# Lengths are whitespace-separated word counts. A row survives only when the
# article has more than 25 words and the summary has more than 10.
dataset = dataset.filter(
    lambda row: len(row['article_text'].split()) > 25
    and len(row['summary'].split()) > 10
)

print(dataset['train'])

### Cleaning
---

**Unescaping HTML character codes**

In [None]:
import html

# Batched map: each call receives a list of articles and returns that list
# with HTML character references decoded by html.unescape.
dataset = dataset.map(
    lambda batch: {'article_text': list(map(html.unescape, batch['article_text']))},
    batched=True,
)

**Replacing Unicode hard (non-breaking) spaces with regular spaces**

In [None]:
from unicodedata import normalize

# Applied row by row (not batched); only 'article_text' is rewritten.
# NOTE(review): NFKD also splits accented letters into base letter + combining
# mark; NFKC would fold hard spaces without that - confirm which is intended.
dataset = dataset.map(
    lambda row: {'article_text': normalize('NFKD', row['article_text'])}
)

## Dataset splits
---

In [None]:
# 80% of the rows go to 'train'; the fixed seed keeps the split reproducible.
# NOTE: `dataset` is rebound to the split result, so re-running this cell
# splits the already reduced 'train' part again.
dataset = dataset['train'].train_test_split(train_size=0.8, seed=42)

# Expose the held-out part under 'validation' instead of 'test'.
dataset['validation'] = dataset.pop('test')

print(dataset)

# 🪙 Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint id; reused later to load the model and to derive the name of the
# training output directory.
checkpoint = "patrickvonplaten/bert2bert-cnn_dailymail-fp16"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

**Define preprocess function**

In [None]:
# set upper limit on how long the articles and their summaries can be
max_input_length = 512
max_target_length = 128


def preprocess_function(rows):
    """Tokenize a batch of articles and their summaries for seq2seq training.

    Args:
        rows: Batch of examples (as passed by `Dataset.map(..., batched=True)`);
            the 'article_text' and 'summary' columns are read.

    Returns:
        The tokenizer output for the articles, with the token ids of the
        summaries added under 'labels'. Articles are truncated to
        `max_input_length` tokens and summaries to `max_target_length`.
    """
    model_inputs = tokenizer(
        rows['article_text'], max_length=max_input_length, truncation=True
    )

    # `text_target=` replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager for encoding
    # the target side.
    labels = tokenizer(
        text_target=rows['summary'], max_length=max_target_length, truncation=True
    )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

**Tokenize the dataset**

In [None]:
# Run preprocess_function over the dataset in batches; its returned keys
# (tokenizer outputs plus 'labels') become part of the mapped dataset.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# 📊 Evaluation Metrics

## ROUGE
---

**installing `rouge_score` and loading the metric**

In [None]:
!pip install rouge_score

In [None]:
# NOTE(review): `load_metric` is not available in datasets 3.x; this cell needs
# a datasets 2.x install (or a port to the separate `evaluate` package) - confirm
# against the installed version.
from datasets import load_metric
rouge_score = load_metric('rouge')

## Creating a lead-3 baseline
---

**import and download dependencies**

In [None]:
!pip install nltk
import nltk

nltk.download("punkt")

**define fn to extract the first 3 sentences in an article**

In [None]:
from nltk.tokenize import sent_tokenize

def extract_sentences(text, num_sentences=3):
    """Return the leading sentences of `text`, one per line.

    Args:
        text: Document to split into sentences with NLTK's `sent_tokenize`.
        num_sentences: How many leading sentences to keep. The default of 3
            gives the lead-3 baseline.

    Returns:
        The kept sentences joined with newline characters. A text with fewer
        sentences than requested is returned in full.
    """
    return "\n".join(sent_tokenize(text)[:num_sentences])


# Quick look at the baseline summary of one training article.
print(extract_sentences(dataset["train"][4]["article_text"]))

**define fn to extract summaries from the data and compute ROUGE scores for the baseline**

In [None]:
def evaluate_baseline(dataset, metric):
    """Score the leading-sentences baseline against the reference summaries.

    Args:
        dataset: Split exposing 'article_text' and 'summary' columns.
        metric: Object with a `compute(predictions=..., references=...)` method.

    Returns:
        Whatever `metric.compute` returns for the baseline summaries.
    """
    predictions = list(map(extract_sentences, dataset["article_text"]))
    references = dataset["summary"]
    return metric.compute(predictions=predictions, references=references)

**use fn to compute ROUGE scores over the validation set**

In [None]:
import pandas as pd

score = evaluate_baseline(dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Report the mid F-measure of each ROUGE variant as a percentage, rounded to
# two decimals.
rouge_dict = {
    name: round(score[name].mid.fmeasure * 100, 2) for name in rouge_names
}
print(rouge_dict)

# 🔩 Fine-tuning

**Loading the model**

In [None]:
from transformers import EncoderDecoderModel

# Take the pad id from the tokenizer instead of hard-coding 0, so the model
# config always agrees with the id compute_metrics uses when it restores
# padding before decoding.
model = EncoderDecoderModel.from_pretrained(
    checkpoint, pad_token_id=tokenizer.pad_token_id
)


**Logging in to the Hugging Face Hub**

In [None]:
from huggingface_hub import notebook_login
# Log in before training: the training arguments set push_to_hub=True and the
# final cell calls trainer.push_to_hub().
notebook_login()

**set up hyperparameters for training**

In [None]:
from transformers import Seq2SeqTrainingArguments

batch_size = 4
num_train_epochs = 2
# Number of training rows divided by the batch size. max(1, ...) keeps the
# value valid when the training split has fewer rows than one batch, where the
# integer division alone would give 0.
logging_steps = max(1, len(tokenized_dataset['train']) // batch_size)
# Last path component of the checkpoint id, used to name the output directory.
model_name = checkpoint.split('/')[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-1.0.0",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Evaluation generates summaries, which compute_metrics then decodes.
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

**define fn to evaluate model during training**

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE F-measures for a batch of generated summaries.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays.

    Returns:
        Dict mapping each key returned by the ROUGE metric to its mid
        F-measure, as a percentage rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # -100 marks ignored positions and is not a real token id, so it has to be
    # mapped to the pad token before decoding. Predictions get the same
    # treatment as labels because gathered generations can carry -100 padding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Put one sentence per line before scoring, the same layout
    # extract_sentences produces for the baseline.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

**define data collator for dynamic padding**

In [None]:
from transformers import DataCollatorForSeq2Seq

# Dynamic padding: batches are padded when they are collated, since
# preprocess_function only truncates and never pads.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

**instantiate trainer with arguments**

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, hyperparameters, tokenized splits, collator and metric
# function into a single trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

**launch training run**

In [None]:
# Fine-tune with the arguments configured earlier (evaluation_strategy="epoch").
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the trainer (the validation split).
trainer.evaluate()

In [None]:
# Push the trainer's results to the Hugging Face Hub (requires the earlier login).
trainer.push_to_hub()