shivraj221 committed on
Commit
3cc3172
1 Parent(s): 58864d6

Delete ai_t5.py

Files changed (1)
  1. ai_t5.py +0 -215
ai_t5.py DELETED
@@ -1,215 +0,0 @@
- # -*- coding: utf-8 -*-
- """AI_t5.ipynb
-
- Automatically generated by Colab.
-
- Original file is located at
- https://colab.research.google.com/drive/1wUhv0CziUL-fB4pEUCQW8fOnJDIGLgtn
- """
-
- !pip install transformers[torch] accelerate
-
- # Uninstall conflicting packages
- !pip uninstall -y requests google-colab
-
- # Reinstall google-colab, which brings in a compatible requests version
- !pip install google-colab
-
- !pip install requests==2.31.0
-
- !pip install rouge_score
- !pip install evaluate
- # !pip install datasets
-
- import numpy as np
- import pandas as pd
- from datasets import Dataset, DatasetDict
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, \
-     Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, get_scheduler
- import evaluate
- import nltk
- from nltk.tokenize import sent_tokenize
- import warnings
- warnings.simplefilter(action='ignore')
-
- data = pd.read_csv('news_summary.csv', encoding='cp437')
- data = data.dropna()
- data.info()
-
- # headlines - column containing headlines, used as reference summaries
- # ctext - column containing the full texts of news articles
- # taking a look at the average lengths of both
-
- def length(text):
-     return len(text.split())
-
- print('Mean headline length (words):', data['headlines'].apply(length).mean())
- print('Mean text length (words):', data['ctext'].apply(length).mean())
-
- # splitting the data into train, val, and test, and converting it into Dataset format
-
- train_size = int(0.8 * len(data))
- val_size = int(0.1 * len(data))
- test_size = len(data) - train_size - val_size
-
- train_data = data[:train_size]
- val_data = data[train_size:train_size+val_size]
- test_data = data[train_size+val_size:]
-
- train_dataset = Dataset.from_pandas(train_data)
- val_dataset = Dataset.from_pandas(val_data)
- test_dataset = Dataset.from_pandas(test_data)
-
- dataset = DatasetDict({
-     "train": train_dataset,
-     "validation": val_dataset,
-     "test": test_dataset
- })
-
- dataset
-
- # loading the model tokenizer
-
- model_checkpoint = "google/mt5-small"
- tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
-
- # creating tokenization function with length limits for headlines and texts
-
- max_input_length = 512
- max_target_length = 30
-
- def preprocess_function(examples):
-     model_inputs = tokenizer(
-         examples["ctext"],
-         max_length=max_input_length,
-         truncation=True,
-     )
-     labels = tokenizer(
-         examples["headlines"], max_length=max_target_length, truncation=True
-     )
-     model_inputs["labels"] = labels["input_ids"]
-     return model_inputs
-
- # tokenizing the datasets
-
- tokenized_datasets = dataset.map(preprocess_function, batched=True)
-
- # loading ROUGE metric
-
- rouge_score = evaluate.load("rouge")
-
- import nltk
- nltk.download('punkt')
-
- def three_sentence_summary(text):
-     return "\n".join(sent_tokenize(text)[:3])
-
-
- print(three_sentence_summary(dataset["train"][1]["ctext"]))
-
- def evaluate_baseline(dataset, metric):
-     summaries = [three_sentence_summary(text) for text in dataset["ctext"]]
-     return metric.compute(predictions=summaries, references=dataset["headlines"])
-
- # getting baseline metrics
-
- score = evaluate_baseline(dataset["validation"], rouge_score)
- rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
- rouge_dict = dict((rn, round(score[rn] * 100, 2)) for rn in rouge_names)
- rouge_dict
-
- # logging in to Hugging Face Hub
-
- from huggingface_hub import notebook_login
-
- notebook_login()
-
- # loading the pre-trained Seq2Seq model and the data collator
-
- model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
-
- data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
-
- # setting arguments
-
- batch_size = 8
- num_train_epochs = 8
- # Show the training loss with every epoch
- logging_steps = len(tokenized_datasets["train"]) // batch_size
- output_dir = "mt5-small-finetuned-news-summary-kaggle"
-
- args = Seq2SeqTrainingArguments(
-     output_dir=output_dir,
-     evaluation_strategy="epoch",
-     learning_rate=5.6e-5,
-     per_device_train_batch_size=batch_size,
-     per_device_eval_batch_size=batch_size,
-     weight_decay=0.01,
-     save_total_limit=3,
-     num_train_epochs=num_train_epochs,
-     predict_with_generate=True,  # calculate ROUGE for every epoch
-     logging_steps=logging_steps,
-     push_to_hub=True,
- )
-
- # function for computing ROUGE metrics
-
- def compute_metrics(eval_pred):
-     predictions, labels = eval_pred
-     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
-     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
-     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-     decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
-     decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
-     result = rouge_score.compute(
-         predictions=decoded_preds, references=decoded_labels, use_stemmer=True
-     )
-     result = {key: value * 100 for key, value in result.items()}
-     return {k: round(v, 4) for k, v in result.items()}
-
- # removing columns containing strings
-
- tokenized_datasets = tokenized_datasets.remove_columns(
-     dataset["train"].column_names
- )
-
- # defining Trainer
-
- trainer = Seq2SeqTrainer(
-     model,
-     args,
-     train_dataset=tokenized_datasets["train"],
-     eval_dataset=tokenized_datasets["validation"],
-     data_collator=data_collator,
-     tokenizer=tokenizer,
-     compute_metrics=compute_metrics,
- )
-
- # training the model
-
- trainer.train()
-
- # evaluating the model
-
- trainer.evaluate()
-
- # pushing to Hugging Face Hub
-
- trainer.push_to_hub(commit_message="Training complete", tags="summarization")
-
- from transformers import pipeline
-
- hub_model_id = "shivraj221/mt5-small-finetuned-news-summary-kaggle"
- summarizer = pipeline("summarization", model=hub_model_id)
-
- # function to get a summary of an article with index idx
-
- def print_summary(idx):
-     review = dataset["test"][idx]["ctext"]
-     title = dataset["test"][idx]["headlines"]
-     summary = summarizer(dataset["test"][idx]["ctext"])[0]["summary_text"]
-     print(f"'>>> Article: {review}'")
-     print(f"\n'>>> Headline: {title}'")
-     print(f"\n'>>> Summary: {summary}'")
-
- print_summary(20)
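
Since the deleted script pushed its fine-tuned checkpoint to the Hub before removal, a minimal standalone sketch of loading that checkpoint for inference follows. It assumes the repo shivraj221/mt5-small-finetuned-news-summary-kaggle (referenced in the deleted code) is still available and that transformers is installed; the article string is a placeholder, not data from the original dataset.

from transformers import pipeline

# Load the checkpoint pushed by the deleted script (assumption: repo still exists on the Hub)
summarizer = pipeline(
    "summarization",
    model="shivraj221/mt5-small-finetuned-news-summary-kaggle",
)

# Placeholder article text for illustration only
article = "Example news article text to be condensed into a headline."

# max_length mirrors the 30-token target length used during fine-tuning;
# truncation=True guards against inputs longer than the encoder limit
print(summarizer(article, max_length=30, truncation=True)[0]["summary_text"])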