import gradio as gr
from transformers import AutoTokenizer
from setfit import SetFitModel
from optimum.onnxruntime import ORTModelForFeatureExtraction
from quantization import OnnxSetFitModel
import numpy as np
from time import perf_counter
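# Assumption: the hosting environment has no display, so force matplotlib's
# headless Agg backend before pyplot is imported to avoid GUI-backend errors
# when rendering inside server threads.
import matplotlib
matplotlib.use("Agg")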
import matplotlib.pyplot as plt
from PIL import Image
import io
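
# Gradio demo: classify a sentence with a base SetFit model and with a quantized
# ONNX export of the same model, then compare their latency and throughput.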
# Load the models: the base SetFit model, a quantized ONNX export of its body,
# and a wrapper that pairs the ONNX body with the base model's classification head
model1 = SetFitModel.from_pretrained("hsmashiana/basemodel_hpml")
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    "hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx"
)
tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)


def plot_throughput_bar_chart(throughput_model1, throughput_model2):
    labels = ['Base model', 'Optimized model']
    throughputs = [throughput_model1, throughput_model2]

    plt.figure(figsize=(8, 6))
    plt.bar(labels, throughputs, color=['blue', 'navy'])
    plt.xlabel('Models')
    plt.ylabel('Throughput (tokens/second)')
    plt.title('Model Throughput Comparison')
    plt.tight_layout()

    # Render the figure into a PIL Image so Gradio can display it
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img


def compare_models(text):
    inputs = tokenizer(text, return_tensors="pt")

    # Warm-up phase to ensure fair timing
    times = []
    for _ in range(5):
        model1([text])
    # Measure the execution time of the base model's predictions
    for _ in range(20):
        start = perf_counter()
        out1 = model1([text])
        end = perf_counter()
        times.append(end - start)
    avg_latency_ms_model_1 = np.mean(times) * 1000
    # Warm-up phase to ensure fair timing
    times = []
    for _ in range(5):
        model3.predict([text])
    # Measure the execution time of the optimized model's predictions
    for _ in range(20):
        start = perf_counter()
        out3 = model3.predict([text])
        end = perf_counter()
        times.append(end - start)
    avg_latency_ms_model_3 = np.mean(times) * 1000
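
    # Throughput = tokens in the encoded input / average latency in seconds;
    # both models see the same input, so the two numbers are directly comparable.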
    throughput_tokens_per_sec1 = inputs['input_ids'].size(1) / (avg_latency_ms_model_1 / 1000)
    throughput_tokens_per_sec2 = inputs['input_ids'].size(1) / (avg_latency_ms_model_3 / 1000)

    plot_data = plot_throughput_bar_chart(throughput_tokens_per_sec1, throughput_tokens_per_sec2)
    # Convert to native Python scalars: numpy/torch types are not JSON-serializable,
    # and [0].item() works whether the head returns a torch tensor or a numpy array
    result1 = {
        "Base Model": {
            "answer": out1[0].item(),
            "average time (ms)": float(avg_latency_ms_model_1),
            "throughput (tokens/sec)": float(throughput_tokens_per_sec1),
        }
    }
    result2 = {
        "Optimized Model": {
            "answer": out3[0].item(),
            "average time (ms)": float(avg_latency_ms_model_3),
            "throughput (tokens/sec)": float(throughput_tokens_per_sec2),
        }
    }
    return result1, result2, plot_data


iface = gr.Interface(
    fn=compare_models,
    inputs="text",
    outputs=[
        gr.components.JSON(label="Base Model"),
        gr.components.JSON(label="Optimized Model"),
        gr.components.Image(label="Throughput Comparison"),
    ],
    title="Compare Sentence Classification Models",
    description="Enter a sentence to see how each model classifies it and their throughputs.",
    allow_flagging="never",
)

iface.launch()
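
# Note: on a Hugging Face Space the app is served automatically; when run
# locally, launch() starts a server at http://127.0.0.1:7860 by default.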