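"""Gradio demo: compare a baseline SetFit classifier against its ONNX-quantized
variant. Both models classify the input sentence; average latency and token
throughput are reported side by side with a bar chart."""
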
import io
from time import perf_counter

import gradio as gr
import matplotlib

matplotlib.use("Agg")  # headless backend so plots render server-side without a display
import matplotlib.pyplot as plt
import numpy as np
from optimum.onnxruntime import ORTModelForFeatureExtraction
from PIL import Image
from setfit import SetFitModel
from transformers import AutoTokenizer

from quantization import OnnxSetFitModel  # local helper pairing an ONNX body with a SetFit head

# Load the baseline SetFit model and the quantized ONNX feature extractor.
model1 = SetFitModel.from_pretrained("hsmashiana/basemodel_hpml")
ort_model = ORTModelForFeatureExtraction.from_pretrained("hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
# Reuse the base model's classification head on top of the quantized body.
model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)


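# Render a two-bar throughput comparison and return it as a PIL image,
# which Gradio's Image component can display directly.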
def plot_throughput_bar_chart(throughput_model1, throughput_model2):
    labels = ['Base model', 'Optimized model']
    throughputs = [throughput_model1, throughput_model2]

    plt.figure(figsize=(8, 6))
    plt.bar(labels, throughputs, color=['blue', 'navy'])
    plt.xlabel('Models')
    plt.ylabel('Throughput (tokens/second)')
    plt.title('Model Throughput Comparison')
    plt.tight_layout()

    # Create a PIL Image from the plot
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img

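# Benchmark both models on the same sentence: 5 warm-up calls each, then 20
# timed calls; report average latency (ms) and throughput (tokens/sec).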
def compare_models(text):
    inputs = tokenizer(text, return_tensors="pt")  # tokenize once to count tokens for throughput
    times = []

    # Warm-up phase to ensure fair timing
    for _ in range(5):
        model1([text])

    # Measure the execution time of model predictions
    for _ in range(20):
        start = perf_counter()
        out1 = model1([text])
        end = perf_counter()
        times.append(end - start)

    avg_latency_ms_model_1 = np.mean(times) * 1000

    times = []

    # Warm-up phase to ensure fair timing
    for _ in range(5):
        model3.predict([text])

    # Measure the execution time of model predictions
    for _ in range(20):
        start = perf_counter()
        out3 = model3.predict([text])
        end = perf_counter()
        times.append(end - start)

    avg_latency_ms_model_3 = np.mean(times) * 1000

    # Throughput = tokens in the input / average seconds per prediction.
    throughput_tokens_per_sec1 = inputs['input_ids'].size(1) / (avg_latency_ms_model_1 / 1000)
    throughput_tokens_per_sec2 = inputs['input_ids'].size(1) / (avg_latency_ms_model_3 / 1000)

    plot_data = plot_throughput_bar_chart(throughput_tokens_per_sec1, throughput_tokens_per_sec2)

    # Cast numpy values to native Python types so the JSON components serialize them reliably.
    result1 = {
        "Base Model": {
            "answer": out1.numpy()[0].item(),
            "average time (ms)": float(avg_latency_ms_model_1),
            "throughput (tokens/sec)": float(throughput_tokens_per_sec1)
        }}
    result2 = {
        "Optimized Model": {
            "answer": out3.numpy()[0].item(),
            "average time (ms)": float(avg_latency_ms_model_3),
            "throughput (tokens/sec)": float(throughput_tokens_per_sec2)
        }}

    return result1, result2, plot_data

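# Wire the comparison into a simple interface: one text box in, two JSON
# panels and a throughput chart out.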
iface = gr.Interface(
    fn=compare_models,
    inputs="text",
    outputs=[
        gr.components.JSON(label="Base Model"),
        gr.components.JSON(label="Optimized Model"),
        gr.components.Image(label="Throughput Comparison")
    ],
    title="Compare Sentence Classification Models",
    description="Enter a sentence to see how each model classifies it and their throughputs.",
    allow_flagging="never"
)

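# launch() blocks and serves the UI; pass share=True for a public link when running locally.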
iface.launch()