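"""Gradio demo comparing answer quality and inference time for question answering
with a base BERT-large SQuAD2 model, a pruned variant, and a pruned ONNX
FP16-optimized variant run with ONNX Runtime on CPU."""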
import time

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from onnxruntime import InferenceSession
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Maximum number of tokens the BERT models accept per example
MAX_SEQUENCE_LENGTH = 512

# Hugging Face Hub repositories for each model variant exposed in the demo
models = {
    "Base model": "madlag/bert-large-uncased-whole-word-masking-finetuned-squadv2",
    "Pruned model": "madlag/bert-large-uncased-wwm-squadv2-x2.63-f82.6-d16-hybrid-v1",
    "Pruned ONNX Optimized FP16": "tryolabs/bert-large-uncased-wwm-squadv2-optimized-f16",
}

# The ONNX model is fetched as a local file path; the PyTorch models are loaded once at startup
loaded_models = {
    "Pruned ONNX Optimized FP16": hf_hub_download(
        repo_id=models["Pruned ONNX Optimized FP16"], filename="model.onnx"
    ),
    "Base model": AutoModelForQuestionAnswering.from_pretrained(models["Base model"]),
    "Pruned model": AutoModelForQuestionAnswering.from_pretrained(
        models["Pruned model"]
    ),
}


def run_ort_inference(model_name, inputs):
    # Build an ONNX Runtime CPU session from the downloaded model file and time
    # only the forward pass (session creation is excluded from the measurement).
    sess = InferenceSession(
        loaded_models[model_name], providers=["CPUExecutionProvider"]
    )
    start_time = time.time()
    output = sess.run(None, input_feed=inputs)
    end_time = time.time()
    # output[0] and output[1] are the start and end logits
    return (output[0], output[1]), (end_time - start_time)


def run_normal_hf(model_name, inputs):
    # Run the PyTorch model and time the forward pass; .values() unpacks the
    # start and end logits from the model output.
    start_time = time.time()
    output = loaded_models[model_name](**inputs).values()
    end_time = time.time()
    return output, (end_time - start_time)


def inference(model_name, context, question):
    # The tokenizer is reloaded on every request; inputs longer than
    # MAX_SEQUENCE_LENGTH tokens are truncated so they fit the models.
    tokenizer = AutoTokenizer.from_pretrained(models[model_name])
    if model_name == "Pruned ONNX Optimized FP16":
        # ONNX Runtime expects NumPy inputs passed as a plain dict
        inputs = dict(
            tokenizer(
                question, context, return_tensors="np",
                truncation=True, max_length=MAX_SEQUENCE_LENGTH
            )
        )
        output, inference_time = run_ort_inference(model_name, inputs)
        answer_start_scores, answer_end_scores = torch.tensor(output[0]), torch.tensor(
            output[1]
        )
    else:
        inputs = tokenizer(
            question, context, return_tensors="pt",
            truncation=True, max_length=MAX_SEQUENCE_LENGTH
        )
        output, inference_time = run_normal_hf(model_name, inputs)
        answer_start_scores, answer_end_scores = output

    # Pick the most likely start/end token positions and decode them back to text
    input_ids = inputs["input_ids"].tolist()[0]
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    return answer, f"{inference_time:.4f}s"


# Gradio inputs: model selector, context text, and question
model_field = gr.Dropdown(
    choices=["Base model", "Pruned model", "Pruned ONNX Optimized FP16"],
    value="Pruned ONNX Optimized FP16",
    label="Model",
)
input_text_field = gr.Textbox(placeholder="Enter the text here", label="Text")
input_question_field = gr.Text(placeholder="Enter the question here", label="Question")

# Gradio outputs: predicted answer and measured inference time
output_model = gr.Text(label="Model output")
output_inference_time = gr.Text(label="Inference time in seconds")


examples = [
    [
        "Pruned ONNX Optimized FP16",
        "The first little pig was very lazy. He didn't want to work at all and he built his house out of straw. The second little pig worked a little bit harder but he was somewhat lazy too and he built his house out of sticks. Then, they sang and danced and played together the rest of the day.",
        "Who worked a little bit harder?",
    ]
]

demo = gr.Interface(
    inference,
    inputs=[model_field, input_text_field, input_question_field],
    outputs=[output_model, output_inference_time],
    examples=examples,
)

demo.launch()