hsmashiana committed on
Commit 305d23f • 1 Parent(s): d4461bd

first commit
Files changed (4)
  1. README.md +5 -5
  2. app.py +80 -0
  3. quantization.py +83 -0
  4. requirements.txt +9 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: HPML Proj
-emoji: 😻
-colorFrom: green
-colorTo: yellow
+title: Optimized SentenceTransformer Space
+emoji: 📊
+colorFrom: pink
+colorTo: green
 sdk: gradio
-sdk_version: 4.29.0
+sdk_version: 4.28.3
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+import numpy as np
+from time import perf_counter
+from setfit import SetFitModel
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from transformers import AutoTokenizer
+from quantization import OnnxSetFitModel
+
+# Load the models: the plain SetFit baseline and its quantized ONNX variant.
+model1 = SetFitModel.from_pretrained("hsmashiana/base_model_hpml")
+ort_model = ORTModelForFeatureExtraction.from_pretrained(
+    "hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx"
+)
+tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
+model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)
+
+# AG News label ids mapped to human-readable class names.
+decode = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
+
+
+def compare_models(text):
+    # Warm-up phase so one-time initialization costs don't skew the timing.
+    times = []
+    for _ in range(5):
+        model1([text])
+    # Measure the base model's prediction latency over 20 runs.
+    for _ in range(20):
+        start = perf_counter()
+        out1 = model1([text])
+        end = perf_counter()
+        times.append(end - start)
+    # Mean latency in milliseconds.
+    avg_latency_ms_model_1 = np.mean(times) * 1000
+
+    # Same warm-up and timing protocol for the quantized ONNX model.
+    times = []
+    for _ in range(5):
+        model3([text])
+    for _ in range(20):
+        start = perf_counter()
+        out3 = model3([text])
+        end = perf_counter()
+        times.append(end - start)
+    avg_latency_ms_model_3 = np.mean(times) * 1000
+
+    return (
+        {"answer": decode[int(out1.numpy()[0])], "avgtime": avg_latency_ms_model_1},
+        {"answer": decode[int(out3[0])], "avgtime": avg_latency_ms_model_3},
+    )
+
+
+# Create a Gradio interface with one JSON output panel per model.
+iface = gr.Interface(
+    fn=compare_models,
+    inputs="text",
+    outputs=[
+        gr.components.JSON(label="Base miniLM"),
+        gr.components.JSON(label="Quantized Distilled miniLM"),
+    ],
+    title="Compare Sentence Classification Models",
+    description="Enter a sentence to see how each model classifies it.",
+)
+
+# Run the interface
+iface.launch()
quantization.py ADDED
@@ -0,0 +1,83 @@
+from neural_compressor.experimental import Quantization, common
+
+import evaluate
+import onnxruntime
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from setfit.exporters.utils import mean_pooling
+from tqdm import tqdm
+
+accuracy = evaluate.load("accuracy")
+
+
+class OnnxSetFitModel:
+    """A SetFit pipeline whose transformer body runs through ONNX Runtime."""
+
+    def __init__(self, ort_model, tokenizer, model_head):
+        self.ort_model = ort_model
+        self.tokenizer = tokenizer
+        self.model_head = model_head
+
+    def predict(self, inputs):
+        encoded_inputs = self.tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        outputs = self.ort_model(**encoded_inputs)
+        # Mean-pool token embeddings into one sentence embedding per input,
+        # then classify with the (non-exported) SetFit head.
+        embeddings = mean_pooling(
+            outputs["last_hidden_state"], encoded_inputs["attention_mask"]
+        )
+        return self.model_head.predict(embeddings)
+
+    def __call__(self, inputs):
+        return self.predict(inputs)
+
+
+class myquantizer:
+    """Post-training dynamic quantization of the exported ONNX body
+    with Intel Neural Compressor."""
+
+    def __init__(self, onnx_path, model_head, tokenizer, test_dataset):
+        self.onnx_path = onnx_path
+        self.head = model_head
+        self.tokenizer = tokenizer
+        self.test_dataset = test_dataset
+
+    def eval_func(self, model):
+        # Called by Neural Compressor with each candidate ONNX graph; returns
+        # test-set accuracy so tuning can enforce the accuracy criterion below.
+        ort_model = ORTModelForFeatureExtraction.from_pretrained(self.onnx_path)
+        ort_model.model = onnxruntime.InferenceSession(model.SerializeToString(), None)
+        onnx_setfit_model = OnnxSetFitModel(ort_model, self.tokenizer, self.head)
+        preds = []
+        chunk_size = 100
+        for i in tqdm(range(0, len(self.test_dataset["text"]), chunk_size)):
+            preds.extend(
+                onnx_setfit_model.predict(self.test_dataset["text"][i : i + chunk_size])
+            )
+        labels = self.test_dataset["label"]
+        accuracy_calc = accuracy.compute(predictions=preds, references=labels)
+        return accuracy_calc["accuracy"]
+
+    def build_dynamic_quant_yaml(self):
+        # Neural Compressor config: dynamic quantization via ONNX Runtime
+        # integer ops, tolerating at most a 1% relative accuracy drop.
+        yaml = """
+model:
+  name: bert
+  framework: onnxrt_integerops
+
+device: cpu
+
+quantization:
+  approach: post_training_dynamic_quant
+
+tuning:
+  accuracy_criterion:
+    relative: 0.01
+  exit_policy:
+    timeout: 0
+  random_seed: 9527
+"""
+        with open("build.yaml", "w", encoding="utf-8") as f:
+            f.write(yaml)
+
+    def quantizer_model(self):
+        self.build_dynamic_quant_yaml()
+        onnx_output_path = "onnx/model_quantized.onnx"
+        quantizer = Quantization("build.yaml")
+        # self.onnx_path is expected to be a pathlib.Path to the directory
+        # holding the exported model.onnx.
+        model_is_at = str(self.onnx_path / "model.onnx")
+        quantizer.model = common.Model(model_is_at)
+        quantizer.eval_func = self.eval_func
+        quantized_model = quantizer()
+        quantized_model.save(onnx_output_path)
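For reference, a hedged end-to-end driver for this class. The repo id is the one referenced in app.py, and the ag_news test split is inferred from app.py's 0-3 label map (World/Sports/Business/Sci-Tech), so treat both as assumptions rather than part of this commit.

from pathlib import Path

from datasets import load_dataset
from setfit import SetFitModel
from transformers import AutoTokenizer

from quantization import myquantizer

# Assumed inputs: the base SetFit model named in app.py and the AG News
# test split, whose integer labels match app.py's decode map.
base_model = SetFitModel.from_pretrained("hsmashiana/base_model_hpml")
tokenizer = AutoTokenizer.from_pretrained("hsmashiana/base_model_hpml")
test_dataset = load_dataset("ag_news", split="test")

# "onnx" must already contain model.onnx (see the export sketch after app.py).
quantizer = myquantizer(Path("onnx"), base_model.model_head, tokenizer, test_dataset)
quantizer.quantizer_model()  # writes onnx/model_quantized.onnx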
requirements.txt ADDED
@@ -0,0 +1,9 @@
+torch
+transformers
+gradio
+setfit
+evaluate
+neural_compressor
+optimum[onnxruntime]
+onnxruntime_extensions
+wandb