Quantized model inference

#1
by Amanda95 - opened

Hi,
I am trying to evaluate the quantized INT8 model on the MRPC task but failed to reproduce the reported F1 score. The following code shows how I tokenize, load, and evaluate the model:

self.tokenizer = AutoTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc-int8-qat")
int8_model = AutoModelForSequenceClassification.from_pretrained("Intel/bert-base-uncased-mrpc-int8-qat")
logits = int8_model(input_ids=batch_data["input_ids"],
                    token_type_ids=batch_data["token_type_ids"],
                    attention_mask=batch_data["attention_mask"]).logits

Could you help check whether something is wrong? The weirdest thing is that the F1 score is always zero with eval() on, but it becomes 0.81 when eval() is off.

Intel org

Please follow the model card to load the INT8 model:

from neural_compressor.utils.load_huggingface import OptimizedModel
int8_model = OptimizedModel.from_pretrained(
    'Intel/bert-base-uncased-mrpc-int8-qat',
)

or

from optimum.intel.neural_compressor.quantization import IncQuantizedModelForSequenceClassification
int8_model = IncQuantizedModelForSequenceClassification.from_pretrained(
    'Intel/bert-base-uncased-mrpc-int8-qat',
)
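
For a quick sanity check after loading, something like the snippet below should produce sensible logits. This is a minimal sketch: the sentence pair is made up, and the label convention (1 = paraphrase, 0 = not) follows the MRPC task definition.

import torch
from transformers import AutoTokenizer
from neural_compressor.utils.load_huggingface import OptimizedModel

tokenizer = AutoTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc-int8-qat")
int8_model = OptimizedModel.from_pretrained("Intel/bert-base-uncased-mrpc-int8-qat")
int8_model.eval()

# An illustrative MRPC-style sentence pair (not from the actual dataset).
inputs = tokenizer(
    "The company said its revenue rose 5 percent.",
    "Revenue increased 5 percent, the company said.",
    return_tensors="pt",
)
with torch.no_grad():
    logits = int8_model(**inputs).logits
print(logits.argmax(dim=-1))  # 1 = paraphrase, 0 = not a paraphrase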

Thank you for your reply. I used the approach you suggested from the very beginning but still could not reproduce the reported F1 score (I got 0.51; the reported one is 0.91). I am attaching all the code I use to evaluate the model. Could you check whether something is wrong?

import torch
import json
import numpy as np
import pandas as pd
import os
import re

from tqdm import tqdm
from torch.utils import data
from sklearn.metrics import f1_score
from ipdb import set_trace
from transformers import AutoTokenizer
from neural_compressor.utils.load_huggingface import OptimizedModel
from optimum.intel.neural_compressor.quantization import IncQuantizedModelForSequenceClassification

class MRPCset(data.Dataset):
    def __init__(self, path1, path2):
        self.convert_tsv_to_json(path1, path2)
        self.read_json_data(path2)
        self.tokenizer = AutoTokenizer.from_pretrained("Intel/bert-base-uncased-mrpc-int8-qat")

    def read_data(self, path):
        # Helper for reading the raw TSV (not used in the current flow).
        self.data = pd.read_csv(path, sep='delimiter', header=None)

    def read_json_data(self, path):
        with open(path) as f:
            self.data = json.load(f)["0"]
        self.data_dict = []
        for key, value in self.data.items():
            if key == '0':  # skip the header row of dev.tsv
                continue
            label, id_1, id_2, sent_1, sent_2 = re.split(r"\t+", value)
            self.data_dict.append({
                "label": label,
                "id_1": id_1,
                "id_2": id_2,
                "sent_1": sent_1,
                "sent_2": sent_2,
            })

    def convert_tsv_to_json(self, tsv_path, json_path):
        if os.path.exists(json_path):
            return
        # The multi-character sep never matches, so pandas keeps each line
        # as a single field; the fields are split on tabs later.
        data = pd.read_csv(tsv_path, sep='delimiter', header=None)
        data.to_json(json_path)

    def preprocess_text(self, sent1, sent2):
        return self.tokenizer(sent1, sent2, return_tensors="pt")

    def __len__(self):
        return len(self.data_dict)

    def __getitem__(self, index):
        current_data = self.data_dict[index]
        data_dict = self.preprocess_text(current_data["sent_1"], current_data["sent_2"])
        label = torch.tensor(int(current_data["label"]))

        # batch_size is 1, so squeezing the batch dimension here is safe
        batch = {"input_ids": data_dict["input_ids"].squeeze(0),
                 "token_type_ids": data_dict["token_type_ids"].squeeze(0),
                 "attention_mask": data_dict["attention_mask"].squeeze(0),
                 "label": label}
        return batch
           
if __name__ == "__main__":
    batch_size = 1

    def _main():
        path_root = './glue_data/MRPC/'
        json_path = path_root + 'dev_data.json'
        tsv_path = path_root + 'dev.tsv'
        test = MRPCset(tsv_path, json_path)

        data_loader = data.DataLoader(
            test,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            drop_last=False,
            pin_memory=False,
        )

        # fp_model = OptimizedModel.from_pretrained('Intel/bert-base-uncased-mrpc')  # reproduces the reported F1 score of the floating-point model
        int8_model = OptimizedModel.from_pretrained('Intel/bert-base-uncased-mrpc-int8-qat')
        int8_model.eval()

        total_labels = []
        pred_labels = []
        for i, batch_data in tqdm(enumerate(data_loader)):
            print(i)
            label = batch_data.pop("label")
            with torch.no_grad():  # pure inference, no gradients needed
                logits = int8_model(input_ids=batch_data["input_ids"],
                                    token_type_ids=batch_data["token_type_ids"],
                                    attention_mask=batch_data["attention_mask"]).logits
            pred_label = torch.max(logits, dim=1).indices
            total_labels.append(label)
            pred_labels.append(pred_label)

        total_gt_labels = np.array(torch.cat(total_labels))
        total_pred_labels = np.array(torch.cat(pred_labels))
        print("f1 score", f1_score(total_gt_labels, total_pred_labels))
        set_trace()
        print(test.data)

    _main()
Intel org

Hi Amanda95, please check whether your machine supports an INT8 ISA, for example AVX512_VNNI.
I ran your code on Cascade Lake and got F1 = 0.91068. You can also try the scripts in huggingface/optimum-intel to avoid mistakes.
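
On Linux, one quick way to check for the flag is to read /proc/cpuinfo (a minimal sketch; inspecting lscpu output works just as well):

# Check whether the CPU advertises AVX512_VNNI (Linux only;
# supporting CPUs list the flag as "avx512_vnni" in /proc/cpuinfo).
with open("/proc/cpuinfo") as f:
    cpuinfo = f.read()
print("AVX512_VNNI supported:", "avx512_vnni" in cpuinfo)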
