RobPruzan committed
Commit c6711bf
Parent: a4e5c29

Uploading training.py

Files changed (1): training.py +309 -0
training.py ADDED
@@ -0,0 +1,309 @@
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import DistilBertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup

nltk.download('punkt')

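# Pipeline overview:
#   1. Load text excerpts and float difficulty targets from train.csv.
#   2. Tokenize with the DistilBERT tokenizer and pack everything into a TensorDataset.
#   3. Fine-tune DistilBertForSequenceClassification with num_labels=1, i.e. a
#      single-output regression head trained with an MSE loss.
#   4. Save/reload the model, then score 20-word sliding windows of two sample
#      passages and plot the predicted difficulty over the window sequence.
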
# %matplotlib inline

df = pd.read_csv('/content/train.csv')

print(f'Number of training samples: {df.shape[0]}')

# Peek at 100 random rows (notebook-style inspection; the return value is unused).
df.sample(100)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

excerpts = df.excerpt.values
targets = df.target.values.astype('float32')

# Distribution of the target difficulty scores.
plt.hist(df['target'])
plt.show()

# Find the longest tokenized excerpt (used to choose the padding length below).
max_len = 0

for i in excerpts:
    input_ids = tokenizer.encode(i, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print(max_len)

input_ids = []
attention_masks = []

for i in excerpts:
    encoded_text = tokenizer.encode_plus(
        i,
        add_special_tokens=True,
        max_length=315,
        padding='max_length',  # pad every excerpt to a fixed length of 315 tokens
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(encoded_text['input_ids'])
    attention_masks.append(encoded_text['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(targets)

labels = labels.float()

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create an 80-20 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

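# Note: random_split draws from torch's default RNG, and the seeds below are set
# after this point, so the split is not reproducible across runs unless a seeded
# generator is passed explicitly, e.g.
# random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42)).
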
batch_size = 8

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

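# RandomSampler reshuffles the training set every epoch, while SequentialSampler
# keeps the validation set in a fixed order so evaluation is deterministic.
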
# num_labels=1 gives DistilBERT a single-output regression head; for this
# configuration the model's built-in loss is mean squared error.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False
)
torch.cuda.empty_cache()
model.cuda()

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

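# The AdamW imported from transformers is deprecated in recent releases; if it
# warns or is unavailable, the torch optimizer is a drop-in replacement here
# (a sketch, not part of the original script):
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
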
EPOCHS = 4

total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

training_stats = []

for epoch in range(EPOCHS):
    # ---- Training ----
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        result = model(b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        loss.backward()

        # Clip gradients to a max norm of 1.0 to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        if step % 40 == 0:
            print(f'epoch: {epoch + 1} / {EPOCHS}, step {step + 1} / {len(train_dataloader)}, loss = {loss.item():.4f}')

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Average training loss (MSE): {avg_train_loss:.2f}')
    print("Running Validation...")

    # ---- Validation ----
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            result = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels,
                return_dict=True,
            )

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f'Validation Loss {avg_val_loss:.2f}')
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'MSE': avg_val_loss,
        }
    )

print("")
print("Training complete!")

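# Optional: view the per-epoch stats collected above as a table (a small sketch
# using the already-imported pandas; not required by the rest of the script).
stats_df = pd.DataFrame(training_stats).set_index('epoch')
print(stats_df)
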
# Save the fine-tuned model, then reload it for inference.
PATH = '/content/pytorchBERTmodel'
torch.save(model, PATH)

model = torch.load(PATH)
model.eval()
model.to(device)


def predict(text, tokenizer):
    """Return the model's raw output (logit) for a piece of text, used as its difficulty score."""
    model.eval()
    model.to(device)

    def prepare_data(text, tokenizer):
        input_ids = []
        attention_masks = []

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=315,
            padding=True,
            truncation=True,  # guard against inputs longer than the training length
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        return {'input_ids': input_ids, 'attention_masks': attention_masks}

    tokenized_example_text = prepare_data(text, tokenizer)
    with torch.no_grad():
        result = model(
            tokenized_example_text['input_ids'].to(device),
            attention_mask=tokenized_example_text['attention_masks'].to(device),
            return_dict=True
        ).logits

    return result

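# Example usage (hypothetical snippet, not from the original script):
# score = predict("The quick brown fox jumps over the lazy dog.", tokenizer).item()
# print(score)
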
sen = """
Recent JWST observations suggest an excess of 𝑧 & 10 galaxy candidates above most theoretical models. Here, we explore how
the interplay between halo formation timescales, star formation efficiency and dust attenuation affects the properties and number
densities of galaxies we can detect in the early universe. We calculate the theoretical upper limit on the UV luminosity function,
assuming star formation is 100% efficient and all gas in halos is converted into stars, and that galaxies are at the peak age for
UV emission (∼ 10 Myr). This upper limit is ∼ 4 orders of magnitude greater than current observations, implying these are
fully consistent with star formation in ΛCDM cosmology. One day, a woman was walking her two dogs. One was a big, friendly labrador
and the other was a little yappy dog. As they walked, the little dog started to bark at a cat. The cat hissed and ran away. The
labrador just stood there wagging his tail. The woman scolded the little dog, "You're supposed to be my protector! Why didn't you
chase that cat away?" The labrador just looked at her and said, "I'm sorry, but I just don't see the point.
"""
sen_2 = """
Interstellar chemistry is important for galaxy formation, as it determines the rate at which gas can cool, and enables
us to make predictions for observable spectroscopic lines from ions and molecules. We explore two central aspects
of modelling the chemistry of the interstellar medium (ISM): (1) the effects of local stellar radiation, which ionises
and heats the gas, and (2) the depletion of metals onto dust grains, which reduces the abundance of metals in the
gas phase. We run high-resolution (400 M per baryonic particle) simulations of isolated disc galaxies, from dwarfs
to Milky Way-mass, using the fire galaxy formation models together with the chimes non-equilibrium chemistry
and cooling module. In our fiducial model, we couple the chemistry to the stellar fluxes calculated from star particles
using an approximate radiative transfer scheme, and we implement an empirical density-dependent prescription for
metal depletion. For comparison, we also run simulations with a spatially uniform radiation field, and without metal
depletion. Our fiducial model broadly reproduces observed trends in Hi and H2 mass with stellar mass, and in line
luminosity versus star formation rate for [Cii]158µm, [Oi]63µm, [Oiii]88µm, [Nii]122µm and Hα6563˚A. Our simulations
"""
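# Note: sen switches from a dense astrophysics abstract to a simple narrative partway
# through, while sen_2 stays technical throughout; the sliding-window scores below are
# plotted so the two passages can be compared.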
# Score sen_2 with a sliding window: every 20-word window (stride 1) gets its own prediction.
windows_2 = []
words = word_tokenize(sen_2)
for idx, text in enumerate(words):
    if idx <= len(words) - 21:
        x = ' '.join(words[idx: idx + 20])
        windows_2.append(x)

win_preds_2 = []
for text in windows_2:
    win_preds_2.append(predict(text, tokenizer).item())

windows = []
words = word_tokenize(sen)
for idx, text in enumerate(words):
    if idx <= len(words) - 21:
        x = ' '.join(words[idx: idx + 20])
        windows.append(x)

win_preds = []
for text in windows:
    win_preds.append(predict(text, tokenizer).item())

plt.style.use('seaborn-notebook')
# Data
x = list(range(len(win_preds)))
y = win_preds
x2 = list(range(len(win_preds_2)))
y2 = win_preds_2
# Plot
plt.plot(x, y, color='#ff0000')
plt.plot(x2, y2, color='blue')
plt.grid(color='#cccccc', linestyle='--', linewidth=1)
plt.xlabel('Window Sequence')
plt.ylabel('Difficulty Score')
plt.suptitle('Difficulty Score Over Time', fontsize=14, fontweight='bold')
plt.show()