|
---
license: apache-2.0
---
|
|
|
## Convert the PyTorch model to ONNX format
|
|
|
```
import torch
import onnx
import onnxruntime
from onnxruntime import InferenceSession
from transformers import RobertaTokenizer, RobertaModel
from transformers.convert_graph_to_onnx import convert
import numpy as np
from onnxruntime.transformers import optimizer
from pathlib import Path
from onnxruntime.quantization import quantize_dynamic, QuantType
from sentence_transformers import SentenceTransformer, util

# Save the original Sentence-Transformers model (its pooling config is reused later)
sbert = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
sbert.save('sbert-all-roberta-large-v1')

# Save the underlying Hugging Face transformer and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('sentence-transformers/all-roberta-large-v1')
model = RobertaModel.from_pretrained('sentence-transformers/all-roberta-large-v1')
model.save_pretrained('./all-roberta-large-v1/')
tokenizer.save_pretrained('./all-roberta-large-v1/')

# Export to ONNX with the feature-extraction pipeline (opset 12)
opt_model_path = "onnx-model/sbert-roberta-large.onnx"
convert(framework='pt', model='./all-roberta-large-v1/', output=Path(opt_model_path), opset=12, use_external_format=False, pipeline_name='feature-extraction')

# Dynamic quantization: int8 weights, uint8 activations
quantize_dynamic(
    model_input='onnx-model/sbert-roberta-large.onnx',
    model_output='onnx-model/sbert-roberta-large-quant.onnx',
    per_channel=True,
    reduce_range=True,
    activation_type=QuantType.QUInt8,
    weight_type=QuantType.QInt8,
    optimize_model=False,
    use_external_data_format=False
)
```
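
You can optionally sanity-check the export before moving on. The sketch below is not part of the original conversion script; it validates the float export with `onnx.checker` and runs one forward pass through the quantized graph, using the file names from the step above:

```
# Hypothetical sanity check -- not part of the original conversion script
import onnx
import numpy as np
from onnxruntime import InferenceSession
from transformers import RobertaTokenizer

# Structural validation of the exported float graph
onnx.checker.check_model(onnx.load('onnx-model/sbert-roberta-large.onnx'))

# One forward pass through the quantized graph with ONNX Runtime
tokenizer = RobertaTokenizer.from_pretrained('./all-roberta-large-v1/')
session = InferenceSession('onnx-model/sbert-roberta-large-quant.onnx')
inputs = tokenizer('That is a happy person', return_tensors='np')
outputs = session.run(None, {
    'input_ids': inputs['input_ids'].astype(np.int64),
    'attention_mask': inputs['attention_mask'].astype(np.int64),
})
print(outputs[0].shape)  # (1, sequence_length, 1024) for roberta-large
```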
|
## Copy pooling layer and tokenizer files to the output directory
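
The exact commands for this step are not included here; the sketch below is one way to do it, assuming the directory names used in the other snippets on this page ('sbert-all-roberta-large-v1' from `sbert.save()`, './all-roberta-large-v1/' from `tokenizer.save_pretrained()`, and 'sbert-onnx-all-roberta-large-v1' as the directory read in the next section):

```
# Hypothetical copy step -- all paths are assumptions based on the other snippets here
import shutil
from pathlib import Path

out_dir = Path('sbert-onnx-all-roberta-large-v1')
out_dir.mkdir(exist_ok=True)

# Quantized ONNX graph from the conversion step
shutil.copy('onnx-model/sbert-roberta-large-quant.onnx', out_dir)

# Pooling config written by SentenceTransformer.save()
shutil.copytree('sbert-all-roberta-large-v1/1_Pooling', out_dir / '1_Pooling', dirs_exist_ok=True)

# Tokenizer files written by tokenizer.save_pretrained()
for f in Path('./all-roberta-large-v1/').glob('*'):
    if f.suffix in {'.json', '.txt'}:
        shutil.copy(f, out_dir)
```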
|
|
|
|
|
## How to generate embeddings? |
|
```
from onnxruntime import InferenceSession
import torch
from transformers.modeling_outputs import BaseModelOutput
from transformers import RobertaTokenizerFast
import torch.nn.functional as F
from sentence_transformers.models import Transformer, Pooling, Dense


class RobertaEncoder(torch.nn.Module):
    """Wraps the ONNX Runtime session so it can be called like a Hugging Face encoder."""

    def __init__(self, encoder_sess):
        super().__init__()
        self.encoder = encoder_sess

    def forward(
        self,
        input_ids,
        attention_mask,
        inputs_embeds=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        encoder_hidden_state = torch.from_numpy(
            self.encoder.run(
                None,
                {
                    "input_ids": input_ids.cpu().numpy(),
                    "attention_mask": attention_mask.cpu().numpy(),
                },
            )[0]
        )
        return BaseModelOutput(encoder_hidden_state)


def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


def sbert_onnx_encode(sentence_input):
    token = roberta_tokenizer(sentence_input, return_tensors='pt')
    encoder_outputs = encoder_layer(input_ids=token['input_ids'], attention_mask=token['attention_mask'])
    sbert_embeddings = mean_pooling(encoder_outputs, token['attention_mask'])
    sbert_embeddings = F.normalize(sbert_embeddings, p=2, dim=1)
    return sbert_embeddings.tolist()[0]


roberta_tokenizer = RobertaTokenizerFast.from_pretrained('sbert-onnx-all-roberta-large-v1')
encoder_sess = InferenceSession('sbert-onnx-all-roberta-large-v1/sbert-roberta-large-quant.onnx')
encoder_layer = RobertaEncoder(encoder_sess)
# Pooling config copied in the previous step; mean_pooling() above applies the same mean pooling manually
pooling_layer = Pooling.load('./sbert-onnx-all-roberta-large-v1/1_Pooling/')

# Compare against the original Sentence-Transformers model
# (`sbert` and `util` come from the conversion snippet above)
m1 = sbert_onnx_encode('That is a happy person')
m2 = sbert.encode('That is a happy person').tolist()
print(util.cos_sim(m1, m2))
## tensor([[0.9925]])
```