In [1]:
from datasets import load_dataset

# Load the ThaiSum dataset by name; the result is displayed as the cell output
# so the available splits and columns can be inspected.
ds = load_dataset(path="thaisum")
ds

 from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
 train: Dataset({
 features: ['title', 'body', 'summary', 'type', 'tags', 'url'],
 num_rows: 358868
 })
 validation: Dataset({
 features: ['title', 'body', 'summary', 'type', 'tags', 'url'],
 num_rows: 11000
 })
 test: Dataset({
 features: ['title', 'body', 'summary', 'type', 'tags', 'url'],
 num_rows: 11000
 })
})

In [None]:
from datasets import load_dataset
from datasets import DatasetDict 

# Alternative data source: build train/valid/test splits from a local CSV.
SPLIT_SEED = 42  # shared by both splits so the partition is reproducible

dataset = load_dataset("csv", data_files="thaisum.csv")

# First carve off 5% of the rows, then cut that held-out part in half:
# one half becomes the validation split, the other the test split.
ds_train_devtest = dataset["train"].train_test_split(seed=SPLIT_SEED, test_size=0.05)
ds_devtest = ds_train_devtest["test"].train_test_split(seed=SPLIT_SEED, test_size=0.5)

ds_thai_news = DatasetDict(
    {
        "train": ds_train_devtest["train"],
        "valid": ds_devtest["train"],
        "test": ds_devtest["test"],
    }
)
ds_thai_news

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
import torch

# Local checkpoint directory; every artifact below is loaded from it offline.
MODEL_DIR = "../mt5-base-thaisum-text-summarization"

# Generation settings intended for this model. They are kept in one place so
# the config object and the loaded model cannot drift apart.
GENERATION_SETTINGS = {
    "max_length": 140,
    "min_length": 40,
    "length_penalty": 1.2,
    "no_repeat_ngram_size": 2,
    "num_beams": 15,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Kept for backward compatibility with any cell that reads `mt5_config`.
mt5_config = AutoConfig.from_pretrained(
    MODEL_DIR,
    local_files_only=True,
    **GENERATION_SETTINGS,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR, local_files_only=True).to(device)

# Previously the settings above lived only in `mt5_config`, which was never
# handed to the model, so generation silently ignored them. Apply them to the
# model's generation config so `generate()` actually uses them.
unused_generation_settings = model.generation_config.update(**GENERATION_SETTINGS)
assert not unused_generation_settings, (
    f"Unrecognized generation settings: {sorted(unused_generation_settings)}"
)

 _torch_pytree._register_pytree_node(
 _torch_pytree._register_pytree_node(
 _torch_pytree._register_pytree_node(


In [3]:
from transformers import DataCollatorForSeq2Seq
# Collator used by the trainer to assemble batches; it is given both the
# tokenizer and the model and returns PyTorch tensors ("pt").
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt")

def tokenize_data(data):
    """Tokenize article bodies and their reference summaries.

    Args:
        data: Mapping with a "body" and a "summary" entry (presumably lists of
            strings when called through a batched ``map`` - confirm with caller).

    Returns:
        dict with "input_ids" and "attention_mask" from the tokenized body
        (truncated to 512 tokens) and "labels" holding the token ids of the
        summary (truncated to 140 tokens).
    """
    encoded_body = tokenizer(data["body"], max_length=512, truncation=True)
    encoded_summary = tokenizer(data["summary"], max_length=140, truncation=True)

    features = {key: encoded_body[key] for key in ("input_ids", "attention_mask")}
    features["labels"] = encoded_summary["input_ids"]
    return features

# Raw text columns are dropped after tokenization so that only the tokenized
# features returned by `tokenize_data` remain in the dataset.
raw_columns = ['title', 'body', 'summary', 'type', 'tags', 'url']

token_ds_thai_news = ds.map(
    function=tokenize_data,
    batched=True,
    batch_size=64,
    remove_columns=raw_columns,
)

Map: 100%|██████████| 11000/11000 [00:17<00:00, 622.18 examples/s]


In [4]:
import evaluate
import numpy as np
def tokenize_sentence(arg):
    """Split text into the model tokenizer's token strings.

    The text is encoded with the notebook-level ``tokenizer`` and the resulting
    ids are mapped back to their token strings.
    """
    token_ids = tokenizer(arg)["input_ids"]
    return tokenizer.convert_ids_to_tokens(token_ids)

# Loaded at module level so `metrics_func` can reuse it on every evaluation.
rouge_metric = evaluate.load("rouge")


def metrics_func(eval_arg):
    """Compute ROUGE between generated and reference summaries.

    Args:
        eval_arg: Pair ``(preds, labels)`` of token-id arrays as handed to
            ``compute_metrics`` by the trainer. ``preds`` may also be a tuple
            whose first element holds the token ids.

    Returns:
        The result of ``rouge_metric.compute`` for the decoded texts, using
        ``tokenize_sentence`` as the ROUGE tokenizer.
    """
    preds, labels = eval_arg
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be replaced before decoding. This applies to the predictions as
    # well as the labels: generated sequences of different lengths are padded
    # with -100 when evaluation batches are concatenated, and decoding them
    # unchanged makes `batch_decode` fail.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    text_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    text_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return rouge_metric.compute(
        predictions=text_preds,
        references=text_labels,
        tokenizer=tokenize_sentence,
    )

In [5]:
from transformers import Seq2SeqTrainingArguments

# Training configuration, grouped by concern. Values are unchanged.
training_args = Seq2SeqTrainingArguments(
    # Output and logging
    output_dir="..",
    log_level="error",
    logging_steps=10,
    save_steps=3000,
    push_to_hub=False,
    # Optimization schedule
    num_train_epochs=6,
    learning_rate=5e-4,
    warmup_steps=5000,
    weight_decay=0.01,
    # Batching: 8 examples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    # Evaluation: run every 100 steps, scoring generated text
    evaluation_strategy="steps",
    eval_steps=100,
    predict_with_generate=True,
    generation_max_length=140,
    # Keep dataset columns as they are instead of letting the trainer drop them
    remove_unused_columns=False,
)


In [None]:
from transformers import Seq2SeqTrainer
# The tokenized dataset may come from the hub version (validation split named
# "validation") or from the local CSV split (named "valid"). Indexing "valid"
# unconditionally raises KeyError for the hub version, so resolve the name.
eval_split_name = "valid" if "valid" in token_ds_thai_news else "validation"
eval_split = token_ds_thai_news[eval_split_name]

# Evaluate on a small fixed subset (at most 30 examples) to keep the frequent
# evaluation steps cheap; never request more rows than the split contains.
eval_subset = eval_split.select(range(min(30, len(eval_split))))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=metrics_func,
    train_dataset=token_ds_thai_news["train"],
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
)

In [None]:
import os
from transformers import AutoModelForSeq2SeqLM

# Directory that receives the model weights and config.
save_dir = "./trained_for_summarization"
os.makedirs(save_dir, exist_ok=True)

# If the trainer's model is a wrapper exposing the real model as `.module`,
# save the inner model; otherwise save the model itself.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained(save_dir)