import time
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from onnxruntime import InferenceSession
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
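
# Display names shown in the UI mapped to Hugging Face Hub repo ids.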
models = {
"Base model": "bert-large-uncased-whole-word-masking-finetuned-squad",
"Pruned model": "madlag/bert-large-uncased-wwm-squadv2-x2.63-f82.6-d16-hybrid-v1",
"Pruned ONNX Optimized FP16": "tryolabs/bert-large-uncased-wwm-squadv2-optimized-f16",
}
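
# Preload at startup: download the ONNX graph (kept as a local file path)
# and load both PyTorch checkpoints into memory.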
loaded_models = {
"Pruned ONNX Optimized FP16": hf_hub_download(
repo_id=models["Pruned ONNX Optimized FP16"], filename="model.onnx"
),
"Base model": AutoModelForQuestionAnswering.from_pretrained(models["Base model"]),
"Pruned model": AutoModelForQuestionAnswering.from_pretrained(
models["Pruned model"]
),
}
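

# Run a question-answering pass with ONNX Runtime; returns
# (start_logits, end_logits) and the elapsed time in seconds.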
def run_ort_inference(model_name, inputs):
    # Build a CPU ONNX Runtime session from the downloaded model file for this request.
    sess = InferenceSession(
        loaded_models[model_name], providers=["CPUExecutionProvider"]
    )
start_time = time.time()
output = sess.run(None, input_feed=inputs)
end_time = time.time()
return (output[0], output[1]), (end_time - start_time)
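

# Run the preloaded PyTorch model and time the forward pass; the output's
# values unpack to (start_logits, end_logits).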
def run_normal_hf(model_name, inputs):
start_time = time.time()
output = loaded_models[model_name](**inputs).values()
end_time = time.time()
return output, (end_time - start_time)
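

# Gradio callback: tokenize the inputs, dispatch to the right backend,
# and decode the predicted answer span.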
def inference(model_name, context, question):
    # Load the tokenizer for the selected model.
    tokenizer = AutoTokenizer.from_pretrained(models[model_name])
    if model_name == "Pruned ONNX Optimized FP16":
        # ONNX Runtime expects numpy inputs; wrap the returned logits in torch
        # tensors so both branches share the decoding code below.
        inputs = dict(tokenizer(question, context, return_tensors="np"))
        output, inference_time = run_ort_inference(model_name, inputs)
        answer_start_scores = torch.tensor(output[0])
        answer_end_scores = torch.tensor(output[1])
else:
inputs = tokenizer(question, context, return_tensors="pt")
output, inference_time = run_normal_hf(model_name, inputs)
answer_start_scores, answer_end_scores = output
    # Decode the highest-scoring start/end token positions back into a text answer.
    input_ids = inputs["input_ids"].tolist()[0]
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
answer = tokenizer.convert_tokens_to_string(
tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
)
return answer, f"{inference_time:.4f}s"
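

# Gradio input/output components.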
model_field = gr.Dropdown(
choices=["Base model", "Pruned model", "Pruned ONNX Optimized FP16"],
value="Pruned ONNX Optimized FP16",
label="Model",
)
input_text_field = gr.Textbox(placeholder="Enter the text here", label="Text")
input_question_field = gr.Text(placeholder="Enter the question here", label="Question")
output_model = gr.Text(label="Model output")
output_inference_time = gr.Text(label="Inference time in seconds")
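
# Wire the components into a Gradio Interface and launch the demo.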
demo = gr.Interface(
inference,
title="Optimizing Transformers - Question Answering Demo",
inputs=[model_field, input_text_field, input_question_field],
outputs=[output_model, output_inference_time],
)
demo.launch()