Uploading training.py
training.py +309 -0
training.py
ADDED
@@ -0,0 +1,309 @@
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import DistilBertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup

nltk.download('punkt')

# %matplotlib inline
df = pd.read_csv('/content/train.csv')

print(f'Number of training samples: {df.shape[0]}')

# Peek at some rows (this only renders in a notebook; it is a no-op in a script).
df.sample(100)
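
# Sanity check (added): the CSV is assumed to contain 'excerpt' and 'target'
# columns, since both are used below.
assert {'excerpt', 'target'} <= set(df.columns)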

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

excerpts = df.excerpt.values
targets = df.target.values.astype('float32')

plt.hist(df['target'])
plt.show()

# Find the longest excerpt, in tokens, to pick a sensible padding length.
max_len = 0

for i in excerpts:
    input_ids = tokenizer.encode(i, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print(max_len)

input_ids = []
attention_masks = []

for i in excerpts:
    encoded_text = tokenizer.encode_plus(
        i,
        add_special_tokens=True,
        max_length=315,
        padding='max_length',  # pad_to_max_length is deprecated
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids.append(encoded_text['input_ids'])
    attention_masks.append(encoded_text['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(targets)

labels = labels.float()
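
# Sanity check (added): after padding, the first two tensors are assumed to
# have shape (num_excerpts, 315) and labels shape (num_excerpts,).
print(input_ids.shape, attention_masks.shape, labels.shape)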

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create an 80-20 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

batch_size = 8

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False
)

# Define the device first so the script also runs on CPU-only machines;
# torch.cuda.is_available must be called, not referenced.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.cuda.empty_cache()
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

EPOCHS = 4

total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

training_stats = []
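
# Note (added): with num_labels=1 and float labels, the Hugging Face
# classification head treats the task as regression and computes a
# mean-squared-error loss internally, so the loop below defines no
# explicit loss function.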

for epoch in range(EPOCHS):
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        result = model(b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True
                       )

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        loss.backward()

        # Clip gradients to stabilise training.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        if step % 40 == 0:
            print(f'epoch: {epoch + 1} / {EPOCHS}, step {step + 1} / {len(train_dataloader)}, loss = {loss.item():.4f}')

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'MSE {avg_train_loss:.2f}')
    print("Running Validation...")

    model.eval()
    total_eval_loss = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            result = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels,
                return_dict=True,
            )

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        # Move predictions and labels to CPU for any further analysis.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f'Validation Loss {avg_val_loss:.2f}')
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'MSE': avg_val_loss,
        }
    )

print("")
print("Training complete!")
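
# Added for convenience: summarise the per-epoch stats collected above.
print(pd.DataFrame(training_stats).set_index('epoch'))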

PATH = '/content/pytorchBERTmodel'
torch.save(model, PATH)

model = torch.load(PATH)
model.eval()
model.to(device)

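# Note (added): torch.save(model, PATH) pickles the entire model object.
# A sketch of the more portable state_dict pattern, under the same PATH:
#   torch.save(model.state_dict(), PATH + '_state')
#   model.load_state_dict(torch.load(PATH + '_state'))
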
def predict(text, tokenizer):
    model.eval()
    model.to(device)

    def prepare_data(text, tokenizer):
        input_ids = []
        attention_masks = []

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=315,
            padding=True,  # a single sequence needs no padding
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        return {'input_ids': input_ids, 'attention_masks': attention_masks}

    tokenized_example_text = prepare_data(text, tokenizer)
    with torch.no_grad():
        result = model(
            tokenized_example_text['input_ids'].to(device),
            attention_mask=tokenized_example_text['attention_masks'].to(device),
            return_dict=True
        ).logits

    return result

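# Example usage (added; the sentence is arbitrary): predict() returns a
# 1x1 tensor of logits, so .item() extracts the score as a float.
print(predict('The cat sat on the mat.', tokenizer).item())
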
sen = """
Recent JWST observations suggest an excess of z ≳ 10 galaxy candidates above most theoretical models. Here, we explore how
the interplay between halo formation timescales, star formation efficiency and dust attenuation affects the properties and number
densities of galaxies we can detect in the early universe. We calculate the theoretical upper limit on the UV luminosity function,
assuming star formation is 100% efficient and all gas in halos is converted into stars, and that galaxies are at the peak age for
UV emission (∼ 10 Myr). This upper limit is ∼ 4 orders of magnitude greater than current observations, implying these are
fully consistent with star formation in ΛCDM cosmology. One day, a woman was walking her two dogs. One was a big, friendly labrador
and the other was a little yappy dog. As they walked, the little dog started to bark at a cat. The cat hissed and ran away. The
labrador just stood there wagging his tail. The woman scolded the little dog, "You're supposed to be my protector! Why didn't you
chase that cat away?" The labrador just looked at her and said, "I'm sorry, but I just don't see the point."
"""
sen_2 = """
Interstellar chemistry is important for galaxy formation, as it determines the rate at which gas can cool, and enables
us to make predictions for observable spectroscopic lines from ions and molecules. We explore two central aspects
of modelling the chemistry of the interstellar medium (ISM): (1) the effects of local stellar radiation, which ionises
and heats the gas, and (2) the depletion of metals onto dust grains, which reduces the abundance of metals in the
gas phase. We run high-resolution (400 M⊙ per baryonic particle) simulations of isolated disc galaxies, from dwarfs
to Milky Way-mass, using the FIRE galaxy formation models together with the CHIMES non-equilibrium chemistry
and cooling module. In our fiducial model, we couple the chemistry to the stellar fluxes calculated from star particles
using an approximate radiative transfer scheme, and we implement an empirical density-dependent prescription for
metal depletion. For comparison, we also run simulations with a spatially uniform radiation field, and without metal
depletion. Our fiducial model broadly reproduces observed trends in HI and H2 mass with stellar mass, and in line
luminosity versus star formation rate for [CII] 158µm, [OI] 63µm, [OIII] 88µm, [NII] 122µm and Hα 6563Å. Our simulations
"""
# Slide a 20-token window across each passage and score every window.
windows_2 = []
words = word_tokenize(sen_2)
for idx, text in enumerate(words):
    if idx <= len(words) - 20:
        x = ' '.join(words[idx: idx + 20])
        windows_2.append(x)

win_preds_2 = []
for text in windows_2:
    win_preds_2.append(predict(text, tokenizer).item())

windows = []
words = word_tokenize(sen)
for idx, text in enumerate(words):
    if idx <= len(words) - 20:
        x = ' '.join(words[idx: idx + 20])

        windows.append(x)

win_preds = []
for text in windows:
    win_preds.append(predict(text, tokenizer).item())

plt.style.use('seaborn-notebook')
# Data
x = list(range(len(win_preds)))
y = win_preds
x2 = list(range(len(win_preds_2)))
y2 = win_preds_2
# Plot
plt.plot(x, y, color='#ff0000', label='sen')
plt.plot(x2, y2, color='blue', label='sen_2')
plt.grid(color='#cccccc', linestyle='--', linewidth=1)
plt.legend()
plt.xlabel('Window Sequence')
plt.ylabel('Difficulty Score')
plt.suptitle('Difficulty Score Over Time', fontsize=14, fontweight='bold')
plt.show()