Model gives weird predictions

#1
by salma-elshafey - opened

Hello! Thank you so much for sharing your code :)
I have a question, though. I followed code similar to yours for a machine translation task, using AraBERT (a BERT model pre-trained on Arabic) as the encoder and GPT2 as the decoder, and fine-tuned it on a small dataset just to see what the results would look like. However, the model predicts a bunch of exclamation marks "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!". Do you know what might be causing this and how to solve it?
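For reference, "!" corresponds to token id 0 in the stock GPT2 vocabulary, so the model seems to be collapsing onto a single low-id token. A quick way to check this (just a sketch, assuming the standard "gpt2" tokenizer):

from transformers import GPT2Tokenizer

# Check which token id the repeated "!" output decodes from (standard gpt2 vocab).
tok = GPT2Tokenizer.from_pretrained("gpt2")
print(tok.encode("!"))       # -> [0]
print(tok.decode([0] * 5))   # -> "!!!!!"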

Here is my full code:

import numpy as np
import logging
import torch
from transformers import BertTokenizer, GPT2Tokenizer, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, load_metric, DatasetDict, Dataset
logging.basicConfig(level=logging.INFO)
#arabert_model = AutoModel.from_pretrained("bert-base-arabertv02/")
#gpt2_model = AutoModel.from_pretrained("gpt2/")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-arabertv02/", "gpt2/")

# cache is currently not supported by the EncoderDecoder framework
model.decoder.config.use_cache = False
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-arabertv02/")

# CLS token will work as BOS token
bert_tokenizer.bos_token = bert_tokenizer.cls_token

# SEP token will work as EOS token
bert_tokenizer.eos_token = bert_tokenizer.sep_token

# make sure GPT2 adds BOS at the beginning and EOS at the end
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs

GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2/")

# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token

# set decoding params
model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
model.config.eos_token_id = gpt2_tokenizer.eos_token_id
model.config.pad_token_id = bert_tokenizer.pad_token_id
model.config.max_length = 142
model.config.min_length = 1
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 5

# load train and validation data

dataset = load_dataset("ted_talks_iwslt", language_pair=("ar", "en"), year="2014")
metric = load_metric("sacrebleu")
dataset = dataset['train']
train_test = dataset.train_test_split(0.2)
train_test_dataset = DatasetDict({
    'train': train_test['train'],
    'test': train_test['test']})
encoder_length = 128
decoder_length = 128
batch_size = 1

# map data correctly
def map_to_encoder_decoder_inputs(batch):  # Tokenizer will automatically set [BOS]
    # NOTE: the tokenization calls were missing from the pasted code; presumably something like:
    inputs = bert_tokenizer([ex["ar"] for ex in batch["translation"]], padding="max_length", truncation=True, max_length=encoder_length)
    outputs = gpt2_tokenizer([ex["en"] for ex in batch["translation"]], padding="max_length", truncation=True, max_length=decoder_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["labels"] = outputs.input_ids.copy()
    batch["decoder_attention_mask"] = outputs.attention_mask

    # complicated list comprehension here because pad_token_id alone is not good enough to know whether label should be excluded or not
    batch["labels"] = [
        [-100 if mask == 0 else token for mask, token in mask_and_tokens]
        for mask_and_tokens in [zip(masks, labels) for masks, labels in zip(batch["decoder_attention_mask"], batch["labels"])]
    ]

    assert all([len(x) == encoder_length for x in inputs.input_ids])
    assert all([len(x) == decoder_length for x in outputs.input_ids])
    return batch

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    torch.cuda.empty_cache()
    with torch.no_grad():
        label_ids = eval_preds.label_ids
        print("LABEL IDS: ", label_ids)

        pred_ids = eval_preds.predictions
        #preds, labels = eval_preds
        if isinstance(pred_ids, tuple):
            pred_ids = pred_ids[0]
        print("PREDICTION IDS: ", pred_ids)

        decoded_preds = gpt2_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

        # Replace -100 in the labels as we can't decode them.
        label_ids = np.where(label_ids != -100, label_ids, bert_tokenizer.pad_token_id)
        label_ids[label_ids == -100] = gpt2_tokenizer.eos_token_id
        decoded_labels = gpt2_tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        file = open("reference sentences iwslt test", "a")
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
        for label in decoded_labels:
            print("LABEL: ", label)
        file2 = open("predicted sentences iwslt test", "a")
        for pred in decoded_preds:
            print("PRED: ", pred)

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        result = {"bleu": result["score"]}

        prediction_lens = [np.count_nonzero(pred != gpt2_tokenizer.pad_token_id) for pred in pred_ids]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

# make train dataset ready
train_dataset = train_test_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["translation"],
)
del(train_test_dataset)
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# same for validation dataset
val_dataset = val_dataset.map(
    map_to_encoder_decoder_inputs, batched=True, batch_size=batch_size, remove_columns=["ar", "en"],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

training_args = Seq2SeqTrainingArguments(
    f"arabert2bert-finetuned-ar-to-en-on-iwslt",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    eval_accumulation_steps=20,
    save_total_limit=3,
    num_train_epochs=1,
    logging_steps=150,
    fp16=True,
    no_cuda=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset['train'],
    eval_dataset=train_dataset['test'],
)

trainer.train()

Thanks in advance :)

Hey @salma-elshafey ,

Thanks for opening the discussion here! Could you maybe put this code in a google colab where I can just run it to see the problem? :-)
Thanks!

BTW you'll soon be able to edit comments =)

Hey @patrickvonplaten, sorry to bother you, but did you find where the problem lies? 😅

Hey @salma-elshafey ,

I sadly don't have the time to fully debug your script, but the problem here seems to be that the model predicts the EOS token id (50256) all the time while getting a very low loss.
Could it be that your labels only consist of 50256 tokens? Could you maybe make sure that the label ids you train the model on are correct?
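For example, something along these lines (just a rough sketch, reusing the tokenizer and dataset names from your script) would show what the labels actually decode to:

import numpy as np

# Decode the labels of one training example to see whether they are sensible text.
example = train_dataset["train"][0]
label_ids = np.array(example["labels"])
# -100 positions are ignored by the loss and can't be decoded, so map them to the pad id first.
label_ids = np.where(label_ids != -100, label_ids, gpt2_tokenizer.pad_token_id)
print(gpt2_tokenizer.decode(label_ids.tolist(), skip_special_tokens=True))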

Hello @patrickvonplaten! I used an older version of the transformers library (specifically v4.2.1) and the problem was solved :)
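For anyone hitting the same issue, a quick way to confirm which version is active in the environment:

import transformers

# Print the installed transformers version (the problem went away for me on 4.2.1).
print(transformers.__version__)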

Awesome, very glad that you got it to work! Do you know by any chance what the problem was? Maybe there is a bug in the newer transformers version
