# HPML_proj/app.py
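# Gradio demo that compares a baseline SetFit classifier against a quantized
# ONNX version of the same model: it classifies one input sentence with each,
# reports per-sentence latency and token throughput, and plots the two
# throughputs side by side.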
import gradio as gr
from transformers import AutoTokenizer
from setfit import SetFitModel
from optimum.onnxruntime import ORTModelForFeatureExtraction
from quantization import OnnxSetFitModel
import numpy as np
from time import perf_counter
import matplotlib.pyplot as plt
from PIL import Image
import io
# Load the models:
# - model1: the full-precision SetFit baseline
# - ort_model + tokenizer: the quantized ONNX encoder and its tokenizer
# - model3: the quantized encoder paired with the baseline's trained classification head
model1 = SetFitModel.from_pretrained("hsmashiana/basemodel_hpml")
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    "hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx"
)
tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)
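# A minimal sketch of how a file like model_quantized.onnx can be produced with
# optimum's dynamic quantization (an assumption for illustration; the actual
# export script is not part of this app, and the save_dir below is hypothetical):
#
#     from optimum.onnxruntime import ORTQuantizer
#     from optimum.onnxruntime.configuration import AutoQuantizationConfig
#
#     quantizer = ORTQuantizer.from_pretrained(ort_model)
#     dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
#     quantizer.quantize(save_dir="optimized_model_hpml", quantization_config=dqconfig)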
def plot_throughput_bar_chart(throughput_model1, throughput_model2):
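    """Render a bar chart of the two throughputs and return it as a PIL image."""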
    labels = ['Base model', 'Optimized model']
    throughputs = [throughput_model1, throughput_model2]

    plt.figure(figsize=(8, 6))
    plt.bar(labels, throughputs, color=['blue', 'navy'])
    plt.xlabel('Models')
    plt.ylabel('Throughput (tokens/second)')
    plt.title('Model Throughput Comparison')
    plt.tight_layout()

    # Create a PIL Image from the plot via an in-memory buffer
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    plt.close()
    return img
def compare_models(text):
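    """Classify the input sentence with both models and benchmark them.

    Each model gets a 5-run warm-up followed by 20 timed runs; the function
    returns each model's prediction, mean latency, and token throughput,
    plus a bar chart comparing the two throughputs.
    """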
    inputs = tokenizer(text, return_tensors="pt")

    # Warm-up phase to ensure fair timing
    for _ in range(5):
        model1([text])
    # Measure the execution time of model predictions
    times = []
    for _ in range(20):
        start = perf_counter()
        out1 = model1([text])
        end = perf_counter()
        times.append(end - start)
    avg_latency_ms_model_1 = np.mean(times) * 1000

    # Warm-up phase to ensure fair timing
    for _ in range(5):
        model3.predict([text])
    # Measure the execution time of model predictions
    times = []
    for _ in range(20):
        start = perf_counter()
        out3 = model3.predict([text])
        end = perf_counter()
        times.append(end - start)
    avg_latency_ms_model_3 = np.mean(times) * 1000

    # Tokens per second, based on the tokenized length of the input sentence
    throughput_tokens_per_sec1 = inputs['input_ids'].size(1) / (avg_latency_ms_model_1 / 1000)
    throughput_tokens_per_sec2 = inputs['input_ids'].size(1) / (avg_latency_ms_model_3 / 1000)

    plot_data = plot_throughput_bar_chart(throughput_tokens_per_sec1, throughput_tokens_per_sec2)

    result1 = {
        "Base Model": {
            "answer": out1.numpy()[0],
            "average time (ms)": avg_latency_ms_model_1,
            "throughput (tokens/sec)": throughput_tokens_per_sec1,
        }
    }
    result2 = {
        "Optimized Model": {
            "answer": out3.numpy()[0],
            "average time (ms)": avg_latency_ms_model_3,
            "throughput (tokens/sec)": throughput_tokens_per_sec2,
        }
    }
    return result1, result2, plot_data
iface = gr.Interface(
    fn=compare_models,
    inputs="text",
    outputs=[
        gr.components.JSON(label="Base Model"),
        gr.components.JSON(label="Optimized Model"),
        gr.components.Image(label="Throughput Comparison"),
    ],
    title="Compare Sentence Classification Models",
    description="Enter a sentence to see how each model classifies it and their throughputs.",
    allow_flagging="never",
)
iface.launch()
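
# This file is executed directly when hosted as a Hugging Face Space; locally,
# `python app.py` serves the interface (on http://127.0.0.1:7860 by default).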