# -*- coding: utf-8 -*-
"""AI_t5_model2.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1cLG3m6CnABOLIGgwQuZUJfRZjsMHk6y7
"""
!pip install transformers[torch] accelerate
# Uninstall conflicting packages
!pip uninstall -y requests google-colab
# Reinstall google-colab which will bring the compatible requests version
!pip install google-colab
!pip install requests==2.31.0
!pip install rouge_score
!pip install evaluate
# !pip install datasets
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments, DataCollatorForSeq2Seq,
                          get_scheduler)
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
import warnings
warnings.simplefilter(action='ignore')
data = pd.read_csv('news_summary.csv', encoding='cp437')
data = data.dropna()
data.info()
# headlines - column containing headlines which will be used as reference summarizations
# ctext - column containing full texts of news articles
# taking a look at the average lengths of both
def length(text):
    return len(text.split())
print('Mean headline length (words):', data['headlines'].apply(length).mean())
print('Mean text length (words):', data['ctext'].apply(length).mean())
# splitting the data into train, val, and test, and converting it into Dataset format
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size
train_data = data[:train_size]
val_data = data[train_size:train_size+val_size]
test_data = data[train_size+val_size:]
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)
dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})
dataset
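# note: the slice-based split above keeps the CSV row order; if the file is
# not pre-shuffled, a randomized split may be preferable. A minimal sketch
# (unused below; random_state=42 is an arbitrary choice):
shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
print('First headline after shuffling:', shuffled['headlines'].iloc[0])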
# loading the model tokenizer
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# creating tokenization function with length limits for headlines and texts
max_input_length = 512
max_target_length = 30
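# quick sanity check (illustrative): fraction of headlines that exceed
# max_target_length tokens and will therefore be truncated
headline_token_lens = np.array([len(tokenizer(h).input_ids) for h in data['headlines']])
print('Fraction of headlines over', max_target_length, 'tokens:',
      (headline_token_lens > max_target_length).mean())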
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["ctext"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["headlines"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# tokenizing the datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)
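# sanity check (illustrative): token counts of one tokenized example should
# respect the truncation limits set above
sample = tokenized_datasets["train"][0]
print('input_ids length:', len(sample["input_ids"]))  # <= max_input_length
print('labels length:', len(sample["labels"]))        # <= max_target_length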
# loading ROUGE metric
rouge_score = evaluate.load("rouge")
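# small illustration of the metric's output format: `evaluate`'s ROUGE returns
# a plain float in [0, 1] per variant (toy strings, the scores are meaningless)
print(rouge_score.compute(
    predictions=["the cat sat on the mat"],
    references=["the cat lay on the mat"],
))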
# downloading the Punkt sentence tokenizer models used by sent_tokenize
nltk.download('punkt')
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])
print(three_sentence_summary(dataset["train"][1]["ctext"]))
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["ctext"]]
    return metric.compute(predictions=summaries, references=dataset["headlines"])
# getting baseline metrics
score = evaluate_baseline(dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: round(score[rn] * 100, 2) for rn in rouge_names}
rouge_dict
# logging in to Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()
# loading the pre-trained Seq2Seq model and the data collator
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
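# small demo (illustrative) of what the collator does: it dynamically pads
# input_ids with the pad token and labels with -100, which the loss ignores;
# compute_metrics below swaps the -100s back before decoding
demo_features = [
    {k: tokenized_datasets["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
    for i in range(2)
]
print(data_collator(demo_features)["labels"])  # padding positions show as -100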
# setting arguments
batch_size = 10
num_train_epochs = 12
# Show the training loss with every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
output_dir = "mt5-small-finetuned-news-summary-kaggle"
args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.005,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,  # generate summaries during evaluation so ROUGE can be computed
    logging_steps=logging_steps,
    push_to_hub=True,
)
# function for computing ROUGE metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # replace the -100 label padding with the pad token id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE-Lsum expects sentences separated by newlines
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}
# removing the raw string columns so the collator only sees model inputs
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)
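# confirming that only model-ready fields remain (illustrative check)
print(tokenized_datasets["train"].column_names)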
# defining Trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# training the model
trainer.train()
# evaluating the model
trainer.evaluate()
trainer.args.output_dir = "mt5-small-finetuned-news-summary-model-2"
# pushing to Hugging Face Hub
trainer.push_to_hub(commit_message="Training complete", tags="summarization")
from transformers import pipeline
hub_model_id = "shivraj221/mt5-small-finetuned-news-summary-kaggle"
summarizer = pipeline("summarization", model=hub_model_id)
# function to get a summary of an article with index idx
def print_summary(idx):
    review = dataset["test"][idx]["ctext"]
    title = dataset["test"][idx]["headlines"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Article: {review}'")
    print(f"\n'>>> Headline: {title}'")
    print(f"\n'>>> Summary: {summary}'")
print_summary(20)
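# the pipeline uses the model's default generation settings; they can be
# overridden per call (num_beams=4 and the max_length below are illustrative
# choices, not values taken from training)
sample_text = dataset["test"][20]["ctext"]
print(summarizer(sample_text, max_length=max_target_length, num_beams=4)[0]["summary_text"])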