# Fine-tune DistilBERT as a single-output regressor on text difficulty scores.
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn
from nltk.tokenize import word_tokenize
# transformers.AdamW is deprecated in recent releases; torch.optim.AdamW is the drop-in replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizer
from transformers import get_linear_schedule_with_warmup

nltk.download('punkt')
# %matplotlib inline
# Load the training data: one text excerpt per row with a continuous difficulty target.
df = pd.read_csv('/content/train.csv')
print(f'Number of training samples: {df.shape[0]}')
df.sample(100)  # peek at random rows (only displays in a notebook)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
excerpts = df.excerpt.values
targets = df.target.values.astype('float32')

# Distribution of the target scores.
plt.hist(df['target'])
plt.show()
max_len = 0
for i in excerpts:
    input_ids = tokenizer.encode(i, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print(max_len)
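# Optional sanity check (a minimal sketch): look at the token-length distribution, not just the
# maximum, to confirm that the max_length used below (315) is a safe cap. The 95th-percentile
# choice here is an illustrative assumption, not part of the original script.
lengths = [len(tokenizer.encode(e, add_special_tokens=True)) for e in excerpts]
print(f'95th percentile of token lengths: {int(np.percentile(lengths, 95))}')
print(f'longest excerpt: {max(lengths)} tokens')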
# Tokenise every excerpt: add [CLS]/[SEP], pad/truncate to a fixed length, and build attention masks.
input_ids = []
attention_masks = []
for i in excerpts:
    encoded_text = tokenizer.encode_plus(
        i,
        add_special_tokens=True,
        max_length=315,
        padding='max_length',  # pad_to_max_length=True is deprecated
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_text['input_ids'])
    attention_masks.append(encoded_text['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(targets)
labels = labels.float()
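# Quick verification sketch: decode the first encoded excerpt back to text to confirm that the
# special tokens and padding were applied as expected. Purely illustrative; nothing downstream
# depends on it.
print(tokenizer.decode(input_ids[0]))
print('input_ids shape:', input_ids.shape, '| attention_masks shape:', attention_masks.shape)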
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create an 80-20 train-validation split.
# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
batch_size = 8
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)
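# A small sanity check (sketch): pull one batch from the training DataLoader and confirm the
# shapes are (batch_size, max_length) for ids and masks and (batch_size,) for labels. The
# variable names below are local to this check.
b_ids, b_mask, b_y = next(iter(train_dataloader))
print(b_ids.shape, b_mask.shape, b_y.shape)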
# With num_labels=1, DistilBertForSequenceClassification acts as a regressor (MSE loss).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False
)
# Use the GPU when available; fall back to CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.empty_cache()
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

EPOCHS = 4
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

training_stats = []
for epoch in range(EPOCHS):
    # ---- Training ----
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        result = model(b_input_ids,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)
        loss = result.loss
        logits = result.logits
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        if step % 40 == 0:
            print(f'epoch: {epoch + 1} / {EPOCHS}, step {step + 1} / {len(train_dataloader)}, loss = {loss.item():.4f}')
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Average training loss (MSE): {avg_train_loss:.2f}')
print("Running Validation...") | |
model.eval() | |
total_eval_accuracy = 0 | |
total_eval_loss = 0 | |
nb_eval_steps = 0 | |
for batch in validation_dataloader: | |
b_input_ids = batch[0].to(device) | |
b_input_mask = batch[1].to(device) | |
b_labels = batch[2].to(device) | |
with torch.no_grad(): | |
result = model( | |
b_input_ids, | |
attention_mask=b_input_mask, | |
labels=b_labels, | |
return_dict=True, | |
) | |
loss = loss.to(torch.float32) | |
logits = result.logits | |
total_eval_loss += loss.item() | |
logits = logits.detach().cpu().numpy() | |
label_ids = b_labels.to('cpu').numpy() | |
avg_val_loss = total_eval_loss / len(validation_dataloader) | |
print(f'Validation Loss {avg_val_loss:.2f}') | |
training_stats.append( | |
{ | |
'epoch': epoch + 1, | |
'Training Loss': avg_train_loss, | |
'MSE': avg_val_loss, | |
} | |
) | |
print("") | |
print("Training complete!") | |
# Save the full fine-tuned model and reload it for inference.
# (Recent PyTorch versions may require torch.load(PATH, weights_only=False) for a pickled full model.)
PATH = '/content/pytorchBERTmodel'
torch.save(model, PATH)
model = torch.load(PATH)
model.eval()
model.to(device)
def predict(text, tokenizer):
    """Return the model's difficulty score for a single piece of text."""
    model.eval()
    model.to(device)

    def prepare_data(text, tokenizer):
        input_ids = []
        attention_masks = []
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=315,
            padding=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        return {'input_ids': input_ids, 'attention_masks': attention_masks}

    tokenized_example_text = prepare_data(text, tokenizer)
    with torch.no_grad():
        result = model(
            tokenized_example_text['input_ids'].to(device),
            attention_mask=tokenized_example_text['attention_masks'].to(device),
            return_dict=True
        ).logits
    return result
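# Usage example (sketch): score a short piece of text with the fine-tuned regressor.
# The sample sentence is arbitrary; the score is on the same scale as the training targets.
example_score = predict("The cat sat quietly on the warm windowsill.", tokenizer).item()
print(f'predicted difficulty score: {example_score:.3f}')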
sen = """ | |
Recent JWST observations suggest an excess of 𝑧 & 10 galaxy candidates above most theoretical models. Here, we explore how | |
the interplay between halo formation timescales, star formation efficiency and dust attenuation affects the properties and number | |
densities of galaxies we can detect in the early universe. We calculate the theoretical upper limit on the UV luminosity function, | |
assuming star formation is 100% efficient and all gas in halos is converted into stars, and that galaxies are at the peak age for | |
UV emission (∼ 10 Myr). This upper limit is ∼ 4 orders of magnitude greater than current observations, implying these are | |
fully consistent with star formation in ΛCDM cosmology. One day, a woman was walking her two dogs. One was a big, friendly labrador | |
and the other was a little yappy dog. As they walked, the little dog started to bark at a cat. The cat hissed and ran away. The | |
labrador just stood there wagging his tail. The woman scolded the little dog, "You're supposed to be my protector! Why didn't you | |
chase that cat away?" The labrador just looked at her and said, "I'm sorry, but I just don't see the point. | |
""" | |
sen_2 = """ | |
Interstellar chemistry is important for galaxy formation, as it determines the rate at which gas can cool, and enables | |
us to make predictions for observable spectroscopic lines from ions and molecules. We explore two central aspects | |
of modelling the chemistry of the interstellar medium (ISM): (1) the effects of local stellar radiation, which ionises | |
and heats the gas, and (2) the depletion of metals onto dust grains, which reduces the abundance of metals in the | |
gas phase. We run high-resolution (400 M per baryonic particle) simulations of isolated disc galaxies, from dwarfs | |
to Milky Way-mass, using the fire galaxy formation models together with the chimes non-equilibrium chemistry | |
and cooling module. In our fiducial model, we couple the chemistry to the stellar fluxes calculated from star particles | |
using an approximate radiative transfer scheme, and we implement an empirical density-dependent prescription for | |
metal depletion. For comparison, we also run simulations with a spatially uniform radiation field, and without metal | |
depletion. Our fiducial model broadly reproduces observed trends in Hi and H2 mass with stellar mass, and in line | |
luminosity versus star formation rate for [Cii]158µm, [Oi]63µm, [Oiii]88µm, [Nii]122µm and Hα6563˚A. Our simulations | |
""" | |
# Slide a 20-word window over each passage and score every window with the model.
windows_2 = []
words = word_tokenize(sen_2)
for idx, text in enumerate(words):
    if idx <= len(words) - 21:
        x = ' '.join(words[idx: idx + 20])
        windows_2.append(x)

win_preds_2 = []
for text in windows_2:
    win_preds_2.append(predict(text, tokenizer).item())

windows = []
words = word_tokenize(sen)
for idx, text in enumerate(words):
    if idx <= len(words) - 21:
        x = ' '.join(words[idx: idx + 20])
        windows.append(x)

win_preds = []
for text in windows:
    win_preds.append(predict(text, tokenizer).item())
# Note: the 'seaborn-notebook' style was renamed to 'seaborn-v0_8-notebook' in newer matplotlib releases.
plt.style.use('seaborn-notebook')

# Data
x = list(range(len(win_preds)))
y = win_preds
x2 = list(range(len(win_preds_2)))
y2 = win_preds_2

# Plot
plt.plot(x, y, color='#ff0000')
plt.plot(x2, y2, color='blue')
plt.grid(color='#cccccc', linestyle='--', linewidth=1)
plt.xlabel('Window Sequence')
plt.ylabel('Difficulty Score')
plt.suptitle('Difficulty Score Over Time', fontsize=14, fontweight='bold')
plt.show()
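# Optional follow-up (a sketch): the per-window scores are noisy, so one might smooth them with a
# simple moving average before plotting. The window size k=5 is an arbitrary assumption.
def moving_average(scores, k=5):
    # Convolve with a uniform kernel; 'valid' drops the partially covered edges.
    return np.convolve(scores, np.ones(k) / k, mode='valid')

plt.plot(moving_average(win_preds), color='#ff0000', label='sen (smoothed)')
plt.plot(moving_average(win_preds_2), color='blue', label='sen_2 (smoothed)')
plt.xlabel('Window Sequence')
plt.ylabel('Difficulty Score')
plt.legend()
plt.show()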