# -*- coding: utf-8 -*-
"""AI_t5_model2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1cLG3m6CnABOLIGgwQuZUJfRZjsMHk6y7
"""

!pip install transformers[torch] accelerate

# Uninstall conflicting packages
!pip uninstall -y requests google-colab

# Reinstall google-colab, which will bring in a compatible requests version
!pip install google-colab
!pip install requests==2.31.0

!pip install rouge_score
!pip install evaluate
# !pip install datasets

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, \
    Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, get_scheduler
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
import warnings

warnings.simplefilter(action='ignore')

data = pd.read_csv('news_summary.csv', encoding='cp437')
data = data.dropna()
data.info()

# headlines - column containing the headlines used as reference summaries
# ctext - column containing the full texts of the news articles

# taking a look at the average lengths of both
def length(text):
    return len(text.split())

print('Mean headline length (words):', data['headlines'].apply(length).mean())
print('Mean text length (words):', data['ctext'].apply(length).mean())

# splitting the data into train, val, and test, and converting it into Dataset format
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

dataset

# loading the model tokenizer
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# creating a tokenization function with length limits for texts and headlines
max_input_length = 512
max_target_length = 30

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["ctext"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["headlines"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# tokenizing the datasets
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# loading the ROUGE metric
rouge_score = evaluate.load("rouge")

nltk.download('punkt')

# baseline: use the first three sentences of an article as its summary
def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])

print(three_sentence_summary(dataset["train"][1]["ctext"]))

def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["ctext"]]
    return metric.compute(predictions=summaries, references=dataset["headlines"])

# getting baseline metrics
score = evaluate_baseline(dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: round(score[rn] * 100, 2) for rn in rouge_names}
rouge_dict

# logging in to the Hugging Face Hub
from huggingface_hub import notebook_login

notebook_login()

# loading the pre-trained Seq2Seq model and the data collator
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# setting training arguments
batch_size = 10
num_train_epochs = 12

# log the training loss (and run evaluation) roughly once per epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
output_dir = "mt5-small-finetuned-news-summary-kaggle"

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",  # renamed to eval_strategy in newer transformers releases
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.005,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,  # generate summaries during evaluation so ROUGE can be computed
    logging_steps=logging_steps,
    push_to_hub=True,
)

# function for computing ROUGE metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # replace -100 in the labels, since it cannot be decoded
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

# removing the columns containing strings
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)

# defining the Trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# training the model
trainer.train()

# evaluating the model
trainer.evaluate()

trainer.args.output_dir = "mt5-small-finetuned-news-summary-model-2"

# pushing to the Hugging Face Hub
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

from transformers import pipeline

hub_model_id = "shivraj221/mt5-small-finetuned-news-summary-kaggle"
summarizer = pipeline("summarization", model=hub_model_id)

# function to print the article, the reference headline, and the generated summary for index idx
def print_summary(idx):
    review = dataset["test"][idx]["ctext"]
    title = dataset["test"][idx]["headlines"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Article: {review}'")
    print(f"\n'>>> Headline: {title}'")
    print(f"\n'>>> Summary: {summary}'")

print_summary(20)
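
# A minimal sketch of tuning generation at inference time (the helper name and
# parameter values here are illustrative): the summarization pipeline forwards
# generation kwargs such as max_length, min_length, and num_beams to
# model.generate(), so the output can be kept close to the 30-token target
# length used during fine-tuning.
def print_summary_tuned(idx, max_length=30, num_beams=4):
    review = dataset["test"][idx]["ctext"]
    title = dataset["test"][idx]["headlines"]
    summary = summarizer(
        review,
        max_length=max_length,  # cap the generated headline length (in tokens)
        min_length=5,           # avoid degenerate one-token outputs
        num_beams=num_beams,    # beam search instead of greedy decoding
    )[0]["summary_text"]
    print(f"'>>> Article: {review}'")
    print(f"\n'>>> Headline: {title}'")
    print(f"\n'>>> Summary: {summary}'")

print_summary_tuned(20)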