import time
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from onnxruntime import InferenceSession
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
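
# Inputs longer than this many tokens are truncated by the tokenizer.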
MAX_SEQUENCE_LENGTH = 512
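
# UI model names mapped to their Hugging Face Hub repository ids.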
models = {
    "Base model": "madlag/bert-large-uncased-whole-word-masking-finetuned-squadv2",
    "Pruned model": "madlag/bert-large-uncased-wwm-squadv2-x2.63-f82.6-d16-hybrid-v1",
    "Pruned ONNX Optimized FP16": "tryolabs/bert-large-uncased-wwm-squadv2-optimized-f16",
}
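
# Load everything once at startup: the ONNX variant is downloaded to a local
# file path, while the PyTorch variants are loaded into memory.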
loaded_models = {
    "Pruned ONNX Optimized FP16": hf_hub_download(
        repo_id=models["Pruned ONNX Optimized FP16"], filename="model.onnx"
    ),
    "Base model": AutoModelForQuestionAnswering.from_pretrained(models["Base model"]),
    "Pruned model": AutoModelForQuestionAnswering.from_pretrained(
        models["Pruned model"]
    ),
}
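
# Run the ONNX model with ONNX Runtime on CPU and time only the forward pass.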
def run_ort_inference(model_name, inputs):
    sess = InferenceSession(
        loaded_models[model_name], providers=["CPUExecutionProvider"]
    )
    start_time = time.time()
    output = sess.run(None, input_feed=inputs)
    end_time = time.time()
    return (output[0], output[1]), (end_time - start_time)
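
# Run a plain Hugging Face (PyTorch) model and time the forward pass.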
def run_normal_hf(model_name, inputs):
    start_time = time.time()
    # Gradients are not needed at inference time, so disable autograd tracking.
    with torch.no_grad():
        output = loaded_models[model_name](**inputs).values()
    end_time = time.time()
    return output, (end_time - start_time)
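
# Tokenize the (question, context) pair, dispatch to the right backend, and
# decode the predicted answer span.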
def inference(model_name, context, question):
    tokenizer = AutoTokenizer.from_pretrained(models[model_name])
    if model_name == "Pruned ONNX Optimized FP16":
        # ONNX Runtime expects NumPy inputs.
        inputs = dict(
            tokenizer(
                question,
                context,
                return_tensors="np",
                max_length=MAX_SEQUENCE_LENGTH,
                truncation=True,
            )
        )
        output, inference_time = run_ort_inference(model_name, inputs)
        answer_start_scores = torch.tensor(output[0])
        answer_end_scores = torch.tensor(output[1])
    else:
        inputs = tokenizer(
            question,
            context,
            return_tensors="pt",
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
        )
        output, inference_time = run_normal_hf(model_name, inputs)
        answer_start_scores, answer_end_scores = output
    input_ids = inputs["input_ids"].tolist()[0]
    # The answer span runs from the highest-scoring start token to the
    # highest-scoring end token (inclusive).
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )
    return answer, f"{inference_time:.4f}s"
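
# Gradio input/output components.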
model_field = gr.Dropdown(
    choices=["Base model", "Pruned model", "Pruned ONNX Optimized FP16"],
    value="Pruned ONNX Optimized FP16",
    label="Model",
)
input_text_field = gr.Textbox(placeholder="Enter the text here", label="Text")
input_question_field = gr.Text(placeholder="Enter the question here", label="Question")
output_model = gr.Text(label="Model output")
output_inference_time = gr.Text(label="Inference time in seconds")
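
# A pre-filled example shown below the interface.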
examples = [
    [
        "Pruned ONNX Optimized FP16",
        "The first little pig was very lazy. He didn't want to work at all and he built his house out of straw. The second little pig worked a little bit harder but he was somewhat lazy too and he built his house out of sticks. Then, they sang and danced and played together the rest of the day.",
        "Who worked a little bit harder?",
    ]
]
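
# Wire the components into a Gradio Interface and launch the demo.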
demo = gr.Interface(
    inference,
    inputs=[model_field, input_text_field, input_question_field],
    outputs=[output_model, output_inference_time],
    examples=examples,
)
demo.launch()