Joshua Perk
committed on
Commit
•
2e0d57d
1
Parent(s):
c38dc50
attempt to convert onnx in cloud
Browse files- export_model.py +0 -22
- handler.py +62 -0
- requirements.txt +1 -24
export_model.py
DELETED
@@ -1,22 +0,0 @@
|
|
1 |
-
"""
A script to convert this normal/deepset model into an ONNX model for further
optimization.

Use the local .env and install the requirements.
Make sure the model checkpoint name is correct (or convert to do this locally).
Make sure the auth token has access to the model.
"""

import os

from optimum.onnxruntime import ORTModelForSeq2SeqLM
from transformers import AutoTokenizer

model_checkpoint = "getvector/deepset-earnings-transcript-summary"
save_directory = "tmp/onnx/"
# SECURITY: an auth token was previously hard-coded here. Any token committed
# to version control is compromised and must be revoked/rotated immediately.
# Read it from the environment (e.g. the local .env) instead; raises KeyError
# with a clear name if it is missing.
huggingface_auth_token = os.environ["HUGGINGFACE_AUTH_TOKEN"]

# Load the model from transformers and export it to ONNX.
ort_model = ORTModelForSeq2SeqLM.from_pretrained(
    model_checkpoint, from_transformers=True, use_auth_token=huggingface_auth_token
)
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint, use_auth_token=huggingface_auth_token
)

# Save the ONNX model and tokenizer side by side so they load as one artifact.
ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
handler.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from optimum.onnxruntime import ORTOptimizer, ORTQuantizer, ORTModelForSeq2SeqLM
|
2 |
+
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig
|
3 |
+
from transformers import AutoTokenizer, pipeline
|
4 |
+
|
5 |
+
save_dir = "/" # NOTE(review): saving to filesystem root looks unintended — probably "." or a tmp dir; confirm write permissions in the deployment container before changing
|
6 |
+
|
7 |
+
class EndpointHandler():
    """Custom inference-endpoint handler.

    On startup it exports the checkpoint found at ``path`` to ONNX,
    graph-optimizes and dynamically quantizes it with ONNX Runtime, then
    serves requests through a ``summarization`` pipeline.
    """

    def __init__(self, path=""):
        # Export the PyTorch checkpoint at `path` to the ONNX format.
        model = ORTModelForSeq2SeqLM.from_pretrained(path, from_transformers=True)

        # Graph-level optimization (level 2, CPU-targeted).
        optimizer = ORTOptimizer.from_pretrained(model)
        optimization_config = OptimizationConfig(
            optimization_level=2,
            optimize_with_onnxruntime_only=False,
            optimize_for_gpu=False,
        )
        optimizer.optimize(save_dir=save_dir, optimization_config=optimization_config)

        # One quantizer per exported sub-model (encoder, decoder,
        # decoder-with-past).
        quantizers = [
            ORTQuantizer.from_pretrained(path, file_name="encoder_model.onnx"),
            ORTQuantizer.from_pretrained(path, file_name="decoder_model.onnx"),
            ORTQuantizer.from_pretrained(path, file_name="decoder_with_past_model.onnx"),
        ]

        # Dynamic (weight-only) quantization tuned for AVX-512 VNNI CPUs.
        dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

        # Plain loop for side effects instead of the original
        # `[print(q.quantize(...)) for q in quantizer]` comprehension.
        for quantizer in quantizers:
            quantizer.quantize(save_dir=save_dir, quantization_config=dqconfig)

        # NOTE(review): this reloads from `path`, so the optimized/quantized
        # artifacts written to `save_dir` above are never actually served —
        # presumably it should load from `save_dir` with the *_quantized
        # file names. Left as-is pending confirmation.
        new_model = ORTModelForSeq2SeqLM.from_pretrained(path)
        tokenizer = AutoTokenizer.from_pretrained(path)

        self.pipeline = pipeline("summarization", model=new_model, tokenizer=tokenizer)

    def __call__(self, data):
        """Handle one request.

        Args:
            data: the request payload — a dict with ``"inputs"`` (the text to
                summarize) and an optional ``"parameters"`` dict of keyword
                arguments forwarded to the pipeline, or the raw inputs
                themselves.

        Returns:
            The pipeline's summarization output.
        """
        # Pop "parameters" BEFORE falling back to the whole payload as the
        # inputs: the original popped "inputs" first, so when that key was
        # absent the later pop("parameters") silently mutated the inputs.
        if isinstance(data, dict):
            parameters = data.pop("parameters", None)
            inputs = data.pop("inputs", data)
        else:
            parameters = None
            inputs = data
        if parameters is not None:
            return self.pipeline(inputs, **parameters)
        return self.pipeline(inputs)
|
requirements.txt
CHANGED
@@ -1,24 +1 @@
|
|
1 |
-
|
2 |
-
charset-normalizer==2.1.1
|
3 |
-
coloredlogs==15.0.1
|
4 |
-
filelock==3.8.0
|
5 |
-
huggingface-hub==0.10.0
|
6 |
-
humanfriendly==10.0
|
7 |
-
idna==3.4
|
8 |
-
mpmath==1.2.1
|
9 |
-
numpy==1.23.3
|
10 |
-
optimum==1.4.0
|
11 |
-
packaging==21.3
|
12 |
-
protobuf==3.20.1
|
13 |
-
pyparsing==3.0.9
|
14 |
-
PyYAML==6.0
|
15 |
-
regex==2022.9.13
|
16 |
-
requests==2.28.1
|
17 |
-
sentencepiece==0.1.97
|
18 |
-
sympy==1.11.1
|
19 |
-
tokenizers==0.12.1
|
20 |
-
torch==1.12.1
|
21 |
-
tqdm==4.64.1
|
22 |
-
transformers==4.22.2
|
23 |
-
typing_extensions==4.3.0
|
24 |
-
urllib3==1.26.12
|
1 |
+
optimum[onnxruntime]==1.4.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|