# earnings-transcript-summary / dont-use-handler.py
import os
import sys

from optimum.onnxruntime import ORTModelForSeq2SeqLM, ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig
from transformers import AutoTokenizer, pipeline

# Directory the optimized and quantized ONNX files are written to
save_dir = "."


class EndpointHandler():
    def __init__(self, path=""):
        # Load the PyTorch model and export it to the ONNX format
        model = ORTModelForSeq2SeqLM.from_pretrained(path, from_transformers=True)

        # Create the optimizer
        optimizer = ORTOptimizer.from_pretrained(model)

        # Define the optimization strategy by creating the appropriate configuration
        optimization_config = OptimizationConfig(
            optimization_level=2,
            optimize_with_onnxruntime_only=False,
            optimize_for_gpu=False,
        )

        # Optimize the model; this writes *_optimized.onnx files into save_dir
        optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)

        # Create one quantizer per optimized ONNX file: the encoder, the
        # decoder, and the decoder with past key values
        encoder_quantizer = ORTQuantizer.from_pretrained(save_dir, file_name="encoder_model_optimized.onnx")
        decoder_quantizer = ORTQuantizer.from_pretrained(save_dir, file_name="decoder_model_optimized.onnx")
        decoder_wp_quantizer = ORTQuantizer.from_pretrained(save_dir, file_name="decoder_with_past_model_optimized.onnx")
        quantizers = [encoder_quantizer, decoder_quantizer, decoder_wp_quantizer]

        # Define the quantization strategy by creating the appropriate configuration
        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

        # Quantize each model
        for quantizer in quantizers:
            quantizer.quantize(save_dir=save_dir, quantization_config=dqconfig)
        # OPTIONAL: print the files to the log to understand the naming
        # convention of the exported models (one file name per line)
        for file_name in os.listdir(save_dir):
            sys.stderr.write(file_name + "\n")

        # Load the resulting optimized/quantized model
        optimized_model = ORTModelForSeq2SeqLM.from_pretrained(
            save_dir,
            encoder_file_name="encoder_model_optimized_quantized.onnx",
            decoder_file_name="decoder_model_optimized_quantized.onnx",
            decoder_with_past_file_name="decoder_with_past_model_optimized_quantized.onnx",
        )

        # Load the tokenizer from the original model repository
        tokenizer = AutoTokenizer.from_pretrained(path)

        # Build the summarization pipeline around the quantized model
        self.pipeline = pipeline("summarization", model=optimized_model, tokenizer=tokenizer)

    def __call__(self, data):
        # Expected payload shape: {"inputs": "<text>", "parameters": {...}}
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # Run the summarization pipeline, forwarding any generation parameters
        if parameters is not None:
            summary = self.pipeline(inputs, **parameters)
        else:
            summary = self.pipeline(inputs)
        return summary
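

# A minimal usage sketch, not part of the handler contract: on Hugging Face
# Inference Endpoints the toolkit instantiates EndpointHandler with the path
# to the model repository and calls it with the deserialized request body.
# The path "." and the sample payload below are illustrative assumptions.
if __name__ == "__main__":
    handler = EndpointHandler(path=".")
    payload = {
        "inputs": "The company reported record quarterly revenue and raised "
                  "full-year guidance on strong demand for its cloud segment.",
        "parameters": {"max_length": 60, "min_length": 10},
    }
    # Returns the pipeline output, e.g. [{"summary_text": "..."}]
    print(handler(payload))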