import gradio as gr
import numpy as np
from time import perf_counter

from setfit import SetFitModel
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

from quantization import OnnxSetFitModel

# Load the models once at startup so every request reuses the same instances.
# (These loads were commented out, which made compare_models raise NameError.)
model1 = SetFitModel.from_pretrained("hsmashiana/base_model_hpml")
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    "hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx"
)
tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)

# Label id -> human-readable class name (AG News-style classes — TODO confirm).
decode = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

WARMUP_RUNS = 5
TIMED_RUNS = 20


def _timed_predict(predict_fn, text):
    """Run ``predict_fn([text])`` repeatedly; return (last output, mean latency in ms).

    Performs WARMUP_RUNS untimed calls first so lazy initialization does not
    skew the measurement, then averages TIMED_RUNS timed calls with
    ``perf_counter``.
    """
    for _ in range(WARMUP_RUNS):
        predict_fn([text])
    times = []
    out = None
    for _ in range(TIMED_RUNS):
        start = perf_counter()
        out = predict_fn([text])
        times.append(perf_counter() - start)
    return out, float(np.mean(times) * 1000)


def compare_models(text):
    """Classify ``text`` with both models and report each model's mean latency.

    Returns two dicts (base model, quantized model), each of the form
    ``{"answer": <class name>, "avgtime": <mean latency in ms>}``.
    """
    out1, avg_latency_ms_model_1 = _timed_predict(model1, text)
    # BUG FIX: the original warmed up with model3.predict(...) but timed
    # model3(...) — warm up and measure the same callable.
    out3, avg_latency_ms_model_3 = _timed_predict(model3, text)
    return (
        {"answer": decode[out1.numpy()[0]], "avgtime": avg_latency_ms_model_1},
        {"answer": decode[out3[0]], "avgtime": avg_latency_ms_model_3},
    )


# Create a Gradio interface
iface = gr.Interface(
    fn=compare_models,
    inputs="text",
    outputs=[
        gr.components.JSON(label="Base miniLM"),
        gr.components.JSON(label="Quantized Distilled miniLM"),
    ],
    title="Compare Sentence Classification Models",
    description="Enter a sentence to see how each model classifies it.",
)

# Run the interface
iface.launch()