# Processing data

In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import get_scheduler, TrainingArguments, Trainer, DataCollatorWithPadding, AdamW, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import gc
import numpy as np
from datasets import load_metric
import random
import os
from tqdm.auto import tqdm

In [5]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [6]:
# reset GPU memory
gc.collect()
torch.cuda.empty_cache()

In [3]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

NameError: name 'AutoTokenizer' is not defined

In [5]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
sequences = [
 "I've been waiting for a HuggingFace course my whole life.",
 "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [4]:
raw_datasets = load_dataset("glue","mrpc")
raw_train_dataset = raw_datasets['train']
# print(raw_train_dataset.features)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# # WHY CANT WE PASS THE DIFFERENT SENTENCES TOGETHER
# tokenized_sentences_1 = tokenizer(raw_train_dataset[15]['sentence1'])
# tokenized_sentences_2 = tokenizer(raw_train_dataset[15]['sentence2'])
# print(tokenizer.decode(tokenized_sentences_1.input_ids), tokenizer.decode(tokenized_sentences_2.input_ids))
# inputs = tokenizer(raw_train_dataset[15]['sentence1'], raw_train_dataset[15]['sentence2'])
# print(tokenizer.decode(inputs.input_ids))
inputs = tokenizer(raw_train_dataset['sentence1'], raw_train_dataset['sentence2'], padding=True, truncation=True)

# tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)
# print(tokenized_datasets['train'].features)

Reusing dataset glue (C:\Users\1seba\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [5]:
list(inputs.keys())

['input_ids', 'token_type_ids', 'attention_mask']

In [6]:
def tokenize_function(example):
 tokenized = tokenizer(example['sentence1'], example['sentence2'], truncation=True)
# tokenized['input_ids'] = ['CHANGED!' for item in tokenized['input_ids']]
 return tokenized
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

100%|██████████| 4/4 [00:01<00:00, 3.69ba/s]
100%|██████████| 1/1 [00:00<00:00, 16.42ba/s]
100%|██████████| 2/2 [00:00<00:00, 6.22ba/s]


In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [37]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([8, 67]),
 'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'labels': torch.Size([8])}

## Challenge 1

In [15]:
from torch.utils.data import DataLoader

In [12]:
samples = tokenized_datasets['test'][:8]
samples = {k: samples[k] for k in list(samples.keys()) if k not in ["idx", "sentence1", "sentence2"]}

In [13]:
padded_samples = data_collator(samples)

In [21]:

train_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=True, collate_fn=data_collator)
for batch in train_dataloader:
 print(batch['input_ids'].shape())

## Challenge 2

In [5]:
raw_dataset_sst2 = load_dataset("glue","sst2")

Reusing dataset glue (C:\Users\1seba\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [6]:
dataset_to_tokenize = raw_dataset_sst2
def tokenize_dynamic(example):
 dynamic_sentence_list = [x for x in list(example.keys()) if x not in ['label', 'idx']]
 if len(dynamic_sentence_list) == 1:
 return tokenizer(example[dynamic_sentence_list[0]], truncation=True)
 else:
 return tokenizer(example[dynamic_sentence_list[0]], example[dynamic_sentence_list[1]], truncation=True)
tokenized_datasets = dataset_to_tokenize.map(tokenize_dynamic, batched=True)

100%|██████████| 68/68 [00:03<00:00, 18.46ba/s]
100%|██████████| 1/1 [00:00<00:00, 16.67ba/s]
100%|██████████| 2/2 [00:00<00:00, 16.67ba/s]


In [7]:
samples = tokenized_datasets['train'][:8]
samples = {k: samples[k] for k in list(samples.keys()) if k not in ["idx", "sentence", "sentence1", "sentence2"]}

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [74]:
padded_data = data_collator(samples)

# Fine-tuning a model with Trainer API

In [33]:
# set up so far
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
 return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Reusing dataset glue (C:\Users\1seba\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 4/4 [00:00<00:00, 5.85ba/s]
100%|██████████| 1/1 [00:00<00:00, 14.49ba/s]
100%|██████████| 2/2 [00:00<00:00, 6.37ba/s]


In [9]:
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification

In [34]:
training_args = TrainingArguments("test-trainer")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [37]:
train_dataset = tokenized_datasets["train"].filter(percentageOfItems)
validation_dataset = tokenized_datasets["validation"].filter(percentageOfItems)

100%|██████████| 4/4 [00:00<00:00, 4.14ba/s]
100%|██████████| 1/1 [00:00<00:00, 9.71ba/s]


In [42]:
trainer = Trainer(
 model,
 training_args,
 train_dataset=train_dataset,
 eval_dataset=validation_dataset,
 data_collator=data_collator,
 tokenizer=tokenizer,
)

In [None]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

 0%| | 0/132 [01:31) torch.Size([8, 2])


In [63]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [64]:
from transformers import get_scheduler
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
 'linear',
 optimizer,
 num_warmup_steps=0,
 num_training_steps=num_training_steps,
)
print(num_training_steps)


93


In [65]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
device

device(type='cuda')

In [71]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
 for batch in train_dataloader:
 batch = {k: v.to(device) for k, v in batch.items()}
 outputs = model(**batch)
 loss = outputs.loss
 loss.backward()
 optimizer.step()
 optimizer.zero_grad()
 progress_bar.update(1)
 
 # metric = load_metric('glue', 'mrpc')
 # model.eval()
 # for batch in eval_dataloader:
 # batch = {k: v.to(device) for k, v in batch.items()}
 # with torch.no_grad():
 # outputs = model(**batch)
 # logits = outputs.logits
 # predictions = torch.argmax(logits, dim=-1)
 # metric.add_batch(predictions=predictions, references=batch['labels'])
 # print(metric.compute())

100%|██████████| 93/93 [08:50<00:00, 5.70s/it]
100%|██████████| 93/93 [00:28<00:00, 3.21it/s]

In [109]:
from datasets import load_metric
metric = load_metric('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
 batch = {k: v.to(device) for k, v in batch.items()}
 with torch.no_grad():
 outputs = model(**batch)
 logits = outputs.logits
 predictions = torch.argmax(logits, dim=-1)
 metric.add_batch(predictions=predictions, references=batch['labels'])
metric.compute()

{'accuracy': 0.6463414634146342, 'f1': 0.7851851851851851}

## Challenge 1

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

sst2_datasets = load_dataset("glue", "sst2")
def tokenize_function (example):
 return tokenizer(example['sentence'], truncation=True)
tokenized_datasets = sst2_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence"])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataset = DataLoader(
 tokenized_datasets['train'].shard(num_shards=180, index=0), shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataset = DataLoader(
 tokenized_datasets['validation'].shard(num_shards=4, index=0), batch_size=8, collate_fn=data_collator
)

Reusing dataset glue (C:\Users\1seba\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 68/68 [00:03<00:00, 20.33ba/s]
100%|██████████| 1/1 [00:00<00:00, 17.24ba/s]
100%|██████████| 2/2 [00:00<00:00, 16.53ba/s]


In [31]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model.to(device)
optimizer= AdamW(model.parameters(), 5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataset)
lr_scheduler = get_scheduler(
 'linear',
 optimizer=optimizer,
 num_warmup_steps=0,
 num_training_steps=num_training_steps,
)

metrics = []

progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
 for batch in train_dataset:
 batch = {k: v.to(device) for k, v in batch.items()}
 outputs = model(**batch)
 loss = outputs.loss
 loss.backward()
 optimizer.step()
 lr_scheduler.step()
 optimizer.zero_grad()
 progress_bar.update(1)

 metric= load_metric("glue", "sst2")
 model.eval()
 for batch in eval_dataset:
 batch = {k: v.to(device) for k, v in batch.items()}
 with torch.no_grad():
 outputs = model(**batch)
 logits = outputs.logits
 predictions = torch.argmax(logits, dim=-1)
 metric.add_batch(predictions=predictions, references=batch["labels"])
 metrics.append(metric.compute())

print(metrics)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[{'accuracy': 0.7568807339449541}, {'accuracy': 0.8256880733944955}, {'accuracy': 0.8623853211009175}]


## (end)

In [8]:
from accelerate import Accelerator
accelerator = Accelerator()

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer= AdamW(model.parameters(), 5e-5)
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
 train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
 'linear',
 optimizer=optimizer,
 num_warmup_steps=0,
 num_training_steps=num_training_steps,
)

metrics = []

progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
 for batch in train_dataloader:
 outputs = model(**batch)
 loss = outputs.loss
 accelerator.backward(loss)
 optimizer.step()
 lr_scheduler.step()
 optimizer.zero_grad()
 progress_bar.update(1)

 metric= load_metric("glue", "sst2")
 model.eval()
 for batch in eval_dataloader:
 with torch.no_grad():
 outputs = model(**batch)
 logits = outputs.logits
 predictions = torch.argmax(logits, dim=-1)
 metric.add_batch(predictions=predictions, references=batch["labels"])
 metrics.append(metric.compute())

print(metrics)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[{'accuracy': 0.6707317073170732}, {'accuracy': 0.7073170731707317}, {'accuracy': 0.7560975609756098}]
