---
license: apache-2.0
---

## Convert the PyTorch model to ONNX format

Export `sentence-transformers/all-roberta-large-v1` to ONNX and apply dynamic INT8 quantization:

```python
import torch
import onnx
import onnxruntime
from onnxruntime import InferenceSession
from transformers import RobertaTokenizer, RobertaModel
from transformers.convert_graph_to_onnx import convert
import numpy as np
from onnxruntime.transformers import optimizer
from pathlib import Path
from onnxruntime.quantization import quantize_dynamic, QuantType
from sentence_transformers import SentenceTransformer, util

# Save the original sentence-transformers model (keeps the pooling config and serves as a reference).
sbert = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
sbert.save('sbert-all-roberta-large-v1')

# Save the underlying Hugging Face tokenizer and model so they can be exported.
tokenizer = RobertaTokenizer.from_pretrained('sentence-transformers/all-roberta-large-v1')
model = RobertaModel.from_pretrained('sentence-transformers/all-roberta-large-v1')
model.save_pretrained('./all-roberta-large-v1/')
tokenizer.save_pretrained('./all-roberta-large-v1/')

# Export to ONNX using the feature-extraction pipeline.
opt_model_path = "onnx-model/sbert-roberta-large.onnx"
convert(
    framework='pt',
    model='./all-roberta-large-v1/',
    output=Path(opt_model_path),
    opset=12,
    use_external_format=False,
    pipeline_name='feature-extraction',
)

# Apply dynamic quantization to shrink the model and speed up CPU inference.
quantize_dynamic(
    model_input='onnx-model/sbert-roberta-large.onnx',
    model_output='onnx-model/sbert-roberta-large-quant.onnx',
    per_channel=True,
    reduce_range=True,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
    optimize_model=False,
    use_external_data_format=False,
)
```

## Copy the pooling layer and tokenizer files to the output directory

Copy the `1_Pooling/` folder from the saved sentence-transformers model and the tokenizer files into the directory that holds the quantized ONNX file (here `sbert-onnx-all-roberta-large-v1/`), so the tokenizer and pooling configuration can be loaded alongside it.

## How to generate embeddings?

```python
from onnxruntime import InferenceSession
import torch
from transformers.modeling_outputs import BaseModelOutput
from transformers import RobertaTokenizerFast
import torch.nn.functional as F
from sentence_transformers.models import Transformer, Pooling, Dense


class RobertaEncoder(torch.nn.Module):
    """Wraps an ONNX Runtime session so it can be called like a Hugging Face encoder."""

    def __init__(self, encoder_sess):
        super().__init__()
        self.encoder = encoder_sess

    def forward(
        self,
        input_ids,
        attention_mask,
        inputs_embeds=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        encoder_hidden_state = torch.from_numpy(
            self.encoder.run(
                None,
                {
                    "input_ids": input_ids.cpu().numpy(),
                    "attention_mask": attention_mask.cpu().numpy(),
                },
            )[0]
        )
        return BaseModelOutput(last_hidden_state=encoder_hidden_state)


def mean_pooling(model_output, attention_mask):
    # First element of model_output contains all token embeddings.
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def sbert_onnx_encode(sentence_input):
    token = roberta_tokenizer(sentence_input, return_tensors='pt')
    encoder_outputs = encoder_layer(input_ids=token['input_ids'], attention_mask=token['attention_mask'])
    sbert_embeddings = mean_pooling(encoder_outputs, token['attention_mask'])
    sbert_embeddings = F.normalize(sbert_embeddings, p=2, dim=1)
    return sbert_embeddings.tolist()[0]


roberta_tokenizer = RobertaTokenizerFast.from_pretrained('sbert-onnx-all-roberta-large-v1')
encoder_sess = InferenceSession('sbert-onnx-all-roberta-large-v1/sbert-roberta-large-quant.onnx')
encoder_layer = RobertaEncoder(encoder_sess)
pooling_layer = Pooling.load('./sbert-onnx-all-roberta-large-v1/1_Pooling/')

# Sanity check: compare the quantized ONNX embedding with the original sentence-transformers model.
m1 = sbert_onnx_encode('That is a happy person')
m2 = sbert.encode('That is a happy person').tolist()
print(util.cos_sim(m1, m2))
# tensor([[0.9925]])
```
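For encoding many sentences, a batched variant of `sbert_onnx_encode` can reuse the tokenizer, `encoder_layer`, and `mean_pooling` defined above. The sketch below is an illustration only, not part of the exported model: the `sbert_onnx_encode_batch` helper, the `batch_size`, and the `max_length` value are assumptions.

```python
# Hypothetical helper (not part of the original card): batched encoding with the
# same quantized ONNX session. batch_size and max_length are assumptions; check
# the model's max_seq_length before relying on them.
import torch
import torch.nn.functional as F


def sbert_onnx_encode_batch(sentences, batch_size=32, max_length=256):
    """Encode a list of sentences with the quantized ONNX encoder defined above."""
    all_embeddings = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        tokens = roberta_tokenizer(
            batch, padding=True, truncation=True,
            max_length=max_length, return_tensors='pt',
        )
        outputs = encoder_layer(
            input_ids=tokens['input_ids'],
            attention_mask=tokens['attention_mask'],
        )
        embeddings = mean_pooling(outputs, tokens['attention_mask'])
        embeddings = F.normalize(embeddings, p=2, dim=1)
        all_embeddings.append(embeddings)
    return torch.cat(all_embeddings, dim=0)


# Example usage: score a query against a small corpus with the same cosine-similarity check as above.
corpus_embeddings = sbert_onnx_encode_batch(
    ['That is a happy person', 'That is a very happy dog', 'Today is a sunny day']
)
query_embedding = sbert_onnx_encode_batch(['A person who is cheerful'])
print(util.cos_sim(query_embedding, corpus_embeddings))
```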