Got random accuracy

#2 · opened by chord

I load the model and evaluate it on the MNLI data from GLUE; however, it seems that the label-to-index mapping is different from the one in the original GLUE dataset.
GLUE: entailment->0, neutral->1, contradiction->2
textattack: entailment->2, neutral->1, contradiction->0
Am I right?
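
One way to sanity-check this is to look at the label mapping stored in the checkpoint's config (a minimal sketch; note that some TextAttack checkpoints only expose generic names like LABEL_0/LABEL_1/LABEL_2, in which case the ordering still has to be confirmed empirically):

from transformers import AutoConfig

# Inspect the label mapping shipped with the checkpoint.
# If it only shows LABEL_0 / LABEL_1 / LABEL_2, the config does not
# reveal which index is entailment, and this check is inconclusive.
config = AutoConfig.from_pretrained("textattack/roberta-base-MNLI")
print(config.id2label)
print(config.label2id)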

Here is my code:

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_to_keys = {
    "mnli": ("premise", "hypothesis"),
    ...
}

model = AutoModelForSequenceClassification.from_pretrained("textattack/roberta-base-MNLI")
model.to(device)
model.eval()

# load the tokenizer once instead of re-creating it inside every map() call
tokenizer = AutoTokenizer.from_pretrained("textattack/roberta-base-MNLI", use_fast=True)
dataset = load_dataset('glue', 'mnli')
sentence1_key, sentence2_key = dataset_to_keys['mnli']

def preprocess_function(examples):
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True, padding='max_length', max_length=512)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True, padding='max_length', max_length=512)

# set_format(type='torch') below converts the columns to tensors,
# so return_tensors='pt' is not needed inside map()
encoded_dataset = dataset.map(preprocess_function, batched=True, num_proc=8, load_from_cache_file=False)
encoded_dataset.set_format(type='torch')

dataloader = DataLoader(encoded_dataset['validation_matched'], batch_size=256)

acc = 0
for batch in tqdm(dataloader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    label = batch['label'].to(device)

    # no gradients are needed for evaluation
    with torch.no_grad():
        output = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       labels=label)
                
    logits = output.logits
    pred = logits.argmax(dim=1)
    acc += (pred==label).int().sum().item()

# normalize once, after the loop has seen every batch
acc /= len(encoded_dataset['validation_matched'])
print("Eval Acc:", acc)

(If I change

acc += (pred==label).int().sum().item()

to

acc += (pred==(2-label)).int().sum().item()

the accuracy becomes ~80%.)
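
For reference, an equivalent and perhaps clearer fix would be to remap the GLUE labels to the checkpoint's ordering once during preprocessing, instead of flipping them inside the eval loop. This is just a sketch, assuming the mapping above (entailment->2, neutral->1, contradiction->0) is correct:

# GLUE id -> model id, assuming the checkpoint uses
# contradiction=0, neutral=1, entailment=2
glue_to_model = {0: 2, 1: 1, 2: 0}

def remap_labels(examples):
    examples["label"] = [glue_to_model[l] for l in examples["label"]]
    return examples

# apply before set_format(type='torch') so the labels are still plain ints
encoded_dataset = encoded_dataset.map(remap_labels, batched=True)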
