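"""Gradio demo comparing a base SetFit text classifier against a quantized ONNX
version of the same model. For a user-supplied sentence, the app reports each
model's predicted category, average latency, and throughput, plus a bar chart
comparing the two throughputs."""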
import gradio as gr
from transformers import AutoTokenizer
from setfit import SetFitModel
from optimum.onnxruntime import ORTModelForFeatureExtraction
from quantization import OnnxSetFitModel
import numpy as np
from time import perf_counter
import matplotlib.pyplot as plt
from PIL import Image
import io
# Load the models
model1 = SetFitModel.from_pretrained("hsmashiana/basemodel_hpml")
ort_model = ORTModelForFeatureExtraction.from_pretrained("hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx")
tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)
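# Map predicted class ids to human-readable category names (these match the AG News label set).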
decode = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
def plot_throughput_bar_chart(throughput_model1, throughput_model2):
    labels = ['Base model', 'Optimized model']
    throughputs = [throughput_model1, throughput_model2]

    plt.figure(figsize=(8, 6))
    plt.bar(labels, throughputs, color=['blue', 'navy'])
    plt.xlabel('Models')
    plt.ylabel('Throughput (tokens/second)')
    plt.title('Model Throughput Comparison')
    plt.tight_layout()

    # Create a PIL Image from the plot
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()

    return img
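# Benchmark both models on the same input: run a short warm-up, time 20 predictions
# for each model, and derive throughput as input tokens per second from the
# average latency.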
def compare_models(text):
    inputs = tokenizer(text, return_tensors="pt")

    times = []
    # Warm-up phase to ensure fair timing
    for _ in range(5):
        model1([text])
    # Measure the execution time of model predictions
    for _ in range(20):
        start = perf_counter()
        out1 = model1([text])
        end = perf_counter()
        times.append(end - start)
    avg_latency_ms_model_1 = np.mean(times) * 1000

    times = []
    # Warm-up phase to ensure fair timing
    for _ in range(5):
        model3.predict([text])
    # Measure the execution time of model predictions
    for _ in range(20):
        start = perf_counter()
        out3 = model3.predict([text])
        end = perf_counter()
        times.append(end - start)
    avg_latency_ms_model_3 = np.mean(times) * 1000

    throughput_tokens_per_sec1 = inputs['input_ids'].size(1) / (avg_latency_ms_model_1 / 1000)
    throughput_tokens_per_sec2 = inputs['input_ids'].size(1) / (avg_latency_ms_model_3 / 1000)

    plot_data = plot_throughput_bar_chart(throughput_tokens_per_sec1, throughput_tokens_per_sec2)
    # Cast NumPy scalars to built-in Python types so the JSON outputs serialize cleanly.
    result1 = {
        "Base Model": {
            "answer": decode[int(out1.numpy()[0])],
            "average time (ms)": float(avg_latency_ms_model_1),
            "throughput (tokens/sec)": float(throughput_tokens_per_sec1)
        }
    }
    result2 = {
        "Optimized Model": {
            "answer": decode[int(out3.numpy()[0])],
            "average time (ms)": float(avg_latency_ms_model_3),
            "throughput (tokens/sec)": float(throughput_tokens_per_sec2)
        }
    }

    return result1, result2, plot_data
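# Build the Gradio UI: one text input, two JSON panels for the per-model results,
# and an image output for the throughput chart.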
iface = gr.Interface(
    fn=compare_models,
    inputs="text",
    outputs=[
        gr.components.JSON(label="Base Model"),
        gr.components.JSON(label="Optimized Model"),
        gr.components.Image(label="Throughput Comparison")
    ],
    title="Compare Sentence Classification Models",
    description="Enter a sentence to see how each model classifies it and their throughputs.",
    allow_flagging="never"
)
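# On Hugging Face Spaces, launch() serves the app automatically; when run locally
# it starts a server on http://127.0.0.1:7860 by default.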
iface.launch()