juanfkurucz committed on
Commit
54ac152
1 Parent(s): d61e332

Add max sequence length to tokenizer inference

Files changed (1)
  1. app.py +10 -2
app.py CHANGED
@@ -6,6 +6,8 @@ from huggingface_hub import hf_hub_download
 from onnxruntime import InferenceSession
 from transformers import AutoModelForQuestionAnswering, AutoTokenizer
 
+MAX_SEQUENCE_LENGTH = 512
+
 models = {
     "Base model": "bert-large-uncased-whole-word-masking-finetuned-squad",
     "Pruned model": "madlag/bert-large-uncased-wwm-squadv2-x2.63-f82.6-d16-hybrid-v1",
@@ -43,13 +45,19 @@ def run_normal_hf(model_name, inputs):
 def inference(model_name, context, question):
     tokenizer = AutoTokenizer.from_pretrained(models[model_name])
     if model_name == "Pruned ONNX Optimized FP16":
-        inputs = dict(tokenizer(question, context, return_tensors="np"))
+        inputs = dict(
+            tokenizer(
+                question, context, return_tensors="np", max_length=MAX_SEQUENCE_LENGTH
+            )
+        )
         output, inference_time = run_ort_inference(model_name, inputs)
         answer_start_scores, answer_end_scores = torch.tensor(output[0]), torch.tensor(
             output[1]
         )
     else:
-        inputs = tokenizer(question, context, return_tensors="pt")
+        inputs = tokenizer(
+            question, context, return_tensors="pt", max_length=MAX_SEQUENCE_LENGTH
+        )
         output, inference_time = run_normal_hf(model_name, inputs)
         answer_start_scores, answer_end_scores = output
 
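
Note on the change above: the commit caps tokenizer output at MAX_SEQUENCE_LENGTH = 512 by passing max_length to both tokenizer calls. Below is a minimal sketch (not part of the commit) of how that call behaves; it adds truncation=True, which the commit itself does not, since in Hugging Face tokenizers max_length only truncates when a truncation strategy is also enabled.

# Illustrative sketch, assuming truncation=True alongside max_length;
# model name and constant are taken from the diff above.
from transformers import AutoTokenizer

MAX_SEQUENCE_LENGTH = 512  # same constant introduced by the commit

tokenizer = AutoTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)

question = "What is pruning?"
context = "Pruning removes weights from a network. " * 200  # deliberately long input

inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    max_length=MAX_SEQUENCE_LENGTH,
    truncation=True,  # assumption: without this, max_length alone does not truncate
)

print(inputs["input_ids"].shape)  # sequence dimension capped at 512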