# Custom endpoint handler: exports a seq2seq checkpoint to ONNX, applies graph
# optimization and dynamic quantization with Optimum / ONNX Runtime, and serves
# the result through a Transformers summarization pipeline.
from optimum.onnxruntime import ORTOptimizer, ORTQuantizer, ORTModelForSeq2SeqLM
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig
from transformers import AutoTokenizer, pipeline
import sys
import os

# Directory where the exported, optimized, and quantized ONNX files are written.
save_dir = "."

class EndpointHandler():
    def __init__(self, path=""):
        # Export the Transformers checkpoint at `path` to ONNX.
        model = ORTModelForSeq2SeqLM.from_pretrained(path, from_transformers=True)

        # Apply ONNX Runtime graph optimizations to the exported model.
        optimizer = ORTOptimizer.from_pretrained(model)
        optimization_config = OptimizationConfig(
            optimization_level=2,
            optimize_with_onnxruntime_only=False,
            optimize_for_gpu=False,
        )
        optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)

        # Quantize each graph produced by the optimizer (encoder, decoder, and
        # decoder-with-past) using dynamic int8 quantization.
        encoder_quantizer = ORTQuantizer.from_pretrained(save_dir, file_name="encoder_model_optimized.onnx")
        decoder_quantizer = ORTQuantizer.from_pretrained(save_dir, file_name="decoder_model_optimized.onnx")
        decoder_wp_quantizer = ORTQuantizer.from_pretrained(save_dir, file_name="decoder_with_past_model_optimized.onnx")
        quantizers = [encoder_quantizer, decoder_quantizer, decoder_wp_quantizer]

        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
        for quantizer in quantizers:
            quantizer.quantize(save_dir=save_dir, quantization_config=dqconfig)

        # Log the files produced so far; useful for debugging in the endpoint logs.
        for file_name in os.listdir(save_dir):
            sys.stderr.write(file_name + "\n")

        # Reload the optimized and quantized ONNX files as a single seq2seq model.
        optimized_model = ORTModelForSeq2SeqLM.from_pretrained(
            save_dir,
            encoder_file_name="encoder_model_optimized_quantized.onnx",
            decoder_file_name="decoder_model_optimized_quantized.onnx",
            decoder_with_past_file_name="decoder_with_past_model_optimized_quantized.onnx",
        )

        tokenizer = AutoTokenizer.from_pretrained(path)
        self.pipeline = pipeline("summarization", model=optimized_model, tokenizer=tokenizer)

    def __call__(self, data):
        # Expected payload shape: {"inputs": <text or list of texts>, "parameters": {...}}.
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)
        if parameters is not None:
            summary = self.pipeline(inputs, **parameters)
        else:
            summary = self.pipeline(inputs)
        return summary
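
# Minimal local smoke test (an illustrative sketch, not part of the handler
# contract): the checkpoint name "t5-small" and the generation parameters below
# are assumptions chosen for demonstration only.
if __name__ == "__main__":
    handler = EndpointHandler(path="t5-small")
    payload = {
        "inputs": "ONNX Runtime graph optimization and dynamic quantization can "
                  "speed up transformer inference on CPU while keeping accuracy "
                  "close to the original model.",
        "parameters": {"max_length": 30, "min_length": 5},
    }
    print(handler(payload))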