Got random accuracy
#2
by
chord
- opened
I load the model and evaluate it on mnli data from GLUE, however, it seems that the labels2index is different from that in origin GLUE dataset.
GLUE: entailment->0, neutral->1, contradiction->2
textattack: entailment->2, neutral->1, contradiction->0
Am I right?
Here is my code:
dataset_to_keys = {
"mnli": ("premise", "hypothesis"),
...
}
model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-MNLI")
model.to(device)
model.eval()
dataset = load_dataset('glue', 'mnli')
sentence1_key, sentence2_key = dataset_to_keys['mnli']
def preprocess_function(examples):
tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-MNLI", use_fast=True)
if sentence2_key is None:
return tokenizer(examples[sentence1_key], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
encoded_dataset = dataset.map(preprocess_function, batched=True, num_proc=8, load_from_cache_file=False)
encoded_dataset.set_format(type='torch')
dataloader = DataLoader(encoded_dataset['validation_matched'], batch_size=256)
acc = 0
for batch in tqdm(dataloader):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
label = batch['label'].to(device)
output = model(input_ids=input_ids,
attention_mask=attention_mask,
labels=label)
logits = output.logits
pred = logits.argmax(dim=1)
acc += (pred==label).int().sum().item()
acc /= len(encoded_dataset['validation_matched'])
print("Eval Acc:", acc)
(If I change
acc += (pred==label).int().sum().item()
to
acc += (pred==(2-label)).int().sum().item()
the accuracy becomes ~80%.
)