# HPML_proj / app.py
# (Hugging Face Space entry point — first commit, 305d23f, ~2.8 kB;
#  file-viewer chrome removed so the module is importable.)
import gradio as gr
from transformers import pipeline
import numpy as np
from time import perf_counter
from setfit import SetFitModel
from optimum.onnxruntime import ORTModelForFeatureExtraction
from quantization import OnnxSetFitModel
from transformers import AutoTokenizer
# Load the models
# model1 = SetFitModel.from_pretrained("hsmashiana/base_model_hpml")
# ort_model = ORTModelForFeatureExtraction.from_pretrained("hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx")
# tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
# model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)
# Index -> human-readable label for the four AG News classes.
decode = dict(enumerate(("World", "Sports", "Business", "Sci/Tech")))
def compare_models(text):
    """Classify *text* with both SetFit models and report their latency.

    Args:
        text: Input sentence to classify.

    Returns:
        Two dicts, one per model, each with:
            "answer":  decoded class label (see ``decode``)
            "avgtime": mean prediction latency in milliseconds over 20 runs
    """

    def _timed_predict(predict):
        """Warm up *predict*, then time 20 calls; return (last_output, avg_ms)."""
        # Warm-up so first-call overhead (lazy init, caches) does not skew
        # the measurement — warm-up and timing must hit the SAME call path.
        for _ in range(5):
            predict([text])
        times = []
        out = None
        for _ in range(20):
            start = perf_counter()
            out = predict([text])
            end = perf_counter()
            times.append(end - start)
        return out, np.mean(times) * 1000

    # NOTE(review): model1/model3 are created in the commented-out loading
    # block at the top of the file; uncomment those lines before running,
    # otherwise this raises NameError.
    out1, avg_latency_ms_model_1 = _timed_predict(model1)
    # Bug fix: the original warmed up via model3.predict(...) but timed
    # model3(...) — a different code path. Both now use model3(...), matching
    # the call whose output is actually returned below.
    out3, avg_latency_ms_model_3 = _timed_predict(model3)

    return (
        {"answer": decode[out1.numpy()[0]], "avgtime": avg_latency_ms_model_1},
        {"answer": decode[out3[0]], "avgtime": avg_latency_ms_model_3},
    )
# Wire up the Gradio UI: a single text box in, two JSON panels out
# (one result panel per model being compared).
result_panels = [
    gr.components.JSON(label="Base miniLM"),
    gr.components.JSON(label="Quantized Distilled miniLM"),
]
iface = gr.Interface(
    fn=compare_models,
    inputs="text",
    outputs=result_panels,
    title="Compare Sentence Classification Models",
    description="Enter a sentence to see how each model classifies it.",
)

# Start the web app.
iface.launch()