shivraj221 committed on
Commit
3104516
1 Parent(s): 6d475df

Upload 2 files

Files changed (2)
  1. AI_t5_model2.ipynb +0 -0
  2. ai_t5_model2.py +215 -0
AI_t5_model2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
ai_t5_model2.py ADDED
@@ -0,0 +1,215 @@
+ # -*- coding: utf-8 -*-
+ """AI_t5_model2.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+     https://colab.research.google.com/drive/1cLG3m6CnABOLIGgwQuZUJfRZjsMHk6y7
+ """
+
+ !pip install transformers[torch] accelerate
+
+ # Uninstall conflicting packages
+ !pip uninstall -y requests google-colab
+
+ # Reinstall google-colab, which will bring in the compatible requests version
+ !pip install google-colab
+
+ !pip install requests==2.31.0
+
+ !pip install rouge_score
+ !pip install evaluate
+ # !pip install datasets
+
+ import numpy as np
+ import pandas as pd
+ from datasets import Dataset, DatasetDict
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, \
+     Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, get_scheduler
+ import evaluate
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ import warnings
+ warnings.simplefilter(action='ignore')
+
+ data = pd.read_csv('news_summary.csv', encoding='cp437')
+ data = data.dropna()
+ data.info()
+
+ # headlines - column containing headlines which will be used as reference summaries
+ # ctext - column containing full texts of news articles
+ # taking a look at the average lengths of both
+
+ def length(text):
+     return len(text.split())
+
+ print('Mean headline length (words):', data['headlines'].apply(length).mean())
+ print('Mean text length (words):', data['ctext'].apply(length).mean())
+
+ # splitting the data into train, val, and test, and converting it into Dataset format
+
+ train_size = int(0.8 * len(data))
+ val_size = int(0.1 * len(data))
+ test_size = len(data) - train_size - val_size
+
+ train_data = data[:train_size]
+ val_data = data[train_size:train_size+val_size]
+ test_data = data[train_size+val_size:]
+
+ train_dataset = Dataset.from_pandas(train_data)
+ val_dataset = Dataset.from_pandas(val_data)
+ test_dataset = Dataset.from_pandas(test_data)
+
+ dataset = DatasetDict({
+     "train": train_dataset,
+     "validation": val_dataset,
+     "test": test_dataset
+ })
+
+ dataset
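+
+ # Optional sanity check (illustrative addition, not in the original notebook):
+ # the three slices should cover the whole dataframe exactly.
+ assert len(train_data) + len(val_data) + len(test_data) == len(data)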
+
+ # loading the model tokenizer
+
+ model_checkpoint = "google/mt5-small"
+ tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+ # creating tokenization function with length limits for headlines and texts
+
+ max_input_length = 512
+ max_target_length = 30
+
+ def preprocess_function(examples):
+     model_inputs = tokenizer(
+         examples["ctext"],
+         max_length=max_input_length,
+         truncation=True,
+     )
+     labels = tokenizer(
+         examples["headlines"], max_length=max_target_length, truncation=True
+     )
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ # tokenizing the datasets
+
+ tokenized_datasets = dataset.map(preprocess_function, batched=True)
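+
+ # Illustrative check (added for clarity): each mapped record now also carries
+ # input_ids, attention_mask and labels, capped by the length limits above.
+ print({k: len(v) for k, v in tokenized_datasets["train"][0].items()
+        if k in ("input_ids", "attention_mask", "labels")})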
+
+ # loading ROUGE metric
+
+ rouge_score = evaluate.load("rouge")
+
+ nltk.download('punkt')
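+ # 'punkt' provides the pretrained sentence tokenizer that sent_tokenize relies on.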
+
+ def three_sentence_summary(text):
+     return "\n".join(sent_tokenize(text)[:3])
+
+
+ print(three_sentence_summary(dataset["train"][1]["ctext"]))
+
+ def evaluate_baseline(dataset, metric):
+     summaries = [three_sentence_summary(text) for text in dataset["ctext"]]
+     return metric.compute(predictions=summaries, references=dataset["headlines"])
+
+ # getting baseline metrics
+
+ score = evaluate_baseline(dataset["validation"], rouge_score)
+ rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
+ rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
+ rouge_dict
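+ # These lead-3 scores serve as the baseline the fine-tuned model should beat.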
+
+ # logging in to Hugging Face Hub
+
+ from huggingface_hub import notebook_login
+
+ notebook_login()
+
+ # loading the pre-trained Seq2Seq model and the data collator
+
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+
+ data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
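+ # The collator pads inputs and labels dynamically per batch and fills label padding
+ # with -100 so those positions are ignored by the loss (they are mapped back to the
+ # pad token before decoding in compute_metrics below).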
+
+ # setting arguments
+
+ batch_size = 8
+ num_train_epochs = 8
+ # Show the training loss with every epoch
+ logging_steps = len(tokenized_datasets["train"]) // batch_size
+ output_dir = "news-summary-t5-model-2"
+
+ args = Seq2SeqTrainingArguments(
+     output_dir=output_dir,
+     evaluation_strategy="epoch",
+     learning_rate=5.6e-5,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     weight_decay=0.01,
+     save_total_limit=3,
+     num_train_epochs=num_train_epochs,
+     predict_with_generate=True,  # generate summaries during evaluation so ROUGE can be computed every epoch
+     logging_steps=logging_steps,
+     push_to_hub=True,
+ )
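+ # push_to_hub=True relies on the notebook_login() credentials above; checkpoints are
+ # pushed to a Hub repo named after output_dir.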
+
+ # function for computing ROUGE metrics
+
+ def compute_metrics(eval_pred):
+     predictions, labels = eval_pred
+     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+     decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
+     decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
+     result = rouge_score.compute(
+         predictions=decoded_preds, references=decoded_labels, use_stemmer=True
+     )
+     result = {key: value * 100 for key, value in result.items()}
+     return {k: round(v, 4) for k, v in result.items()}
+
+ # removing columns containing strings
+
+ tokenized_datasets = tokenized_datasets.remove_columns(
+     dataset["train"].column_names
+ )
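+ # The raw text columns must be dropped because the data collator can only pad the
+ # tokenized (numeric) features.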
+
+ # defining Trainer
+
+ trainer = Seq2SeqTrainer(
+     model,
+     args,
+     train_dataset=tokenized_datasets["train"],
+     eval_dataset=tokenized_datasets["validation"],
+     data_collator=data_collator,
+     tokenizer=tokenizer,
+     compute_metrics=compute_metrics,
+ )
+
+ # training the model
+
+ trainer.train()
+
+ # evaluating the model
+
+ trainer.evaluate()
+
+ # pushing to Hugging Face Hub
+
+ trainer.push_to_hub(commit_message="Training complete", tags="summarization")
+
+ from transformers import pipeline
+
+ hub_model_id = "shivraj221/news-summary-t5-model-2"
+ summarizer = pipeline("summarization", model=hub_model_id)
+
+ # function to get a summary of an article with index idx
+
+ def print_summary(idx):
+     review = dataset["test"][idx]["ctext"]
+     title = dataset["test"][idx]["headlines"]
+     summary = summarizer(dataset["test"][idx]["ctext"])[0]["summary_text"]
+     print(f"'>>> Article: {review}'")
+     print(f"\n'>>> Headline: {title}'")
+     print(f"\n'>>> Summary: {summary}'")
+
+ print_summary(20)
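+
+ # Illustrative usage (hypothetical text, added for clarity): the pushed pipeline can
+ # summarize any raw article string directly.
+ # print(summarizer("Replace this with the full text of a news article ...")[0]["summary_text"])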