shingjan committed
Commit a07656a
1 Parent(s): cdf4c06

Add TVM and speedups

Files changed (1): app.py (+12 -7)
app.py CHANGED
@@ -5,8 +5,9 @@ import torch._dynamo as dynamo
 
 
 model = torch.load("GPT2Model.pt")
-optimized_model = dynamo.optimize("inductor")(model)
 tokenizer = torch.load("GPT2Tokenizer.pt")
+inductor_model = dynamo.optimize("inductor")(model)
+tvm_model = dynamo.optimize("tvm")(model)
 
 def timed(fn):
     start = time.time()
@@ -14,18 +15,22 @@ def timed(fn):
     end = time.time() - start
     return result, float("{:.5f}".format(end))
 
+
 def gpt2(prompt):
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
     eager_outputs, eager_time = timed(lambda: model.generate(input_ids, do_sample=False, max_length=30))
-    dynamo_outputs, dynamo_time = timed(lambda: optimized_model.generate(input_ids, do_sample=False, max_length=30))
-    if torch.allclose(eager_outputs, dynamo_outputs):
-        actual_output = tokenizer.batch_decode(dynamo_outputs, skip_special_tokens=True)[0]
+    inductor_outputs, inductor_time = timed(lambda: inductor_model.generate(input_ids, do_sample=False, max_length=30))
+    tvm_outputs, tvm_time = timed(lambda: tvm_model.generate(input_ids, do_sample=False, max_length=30))
+    if torch.allclose(eager_outputs, inductor_outputs) and torch.allclose(eager_outputs, tvm_outputs):
+        actual_output = tokenizer.batch_decode(eager_outputs, skip_special_tokens=True)[0]
     else:
         actual_output = "Result is not correct between dynamo and eager!"
-    expect_output = f"Torch eager takes: {eager_time} \nDynamo takes: {dynamo_time} \nSpeedup: "
-    expect_output += "{:.2f}".format(eager_time/dynamo_time) + f"x \nOutput: {actual_output}"
+    expect_output = f"Torch eager takes: {eager_time} sec\n"
+    expect_output += f"Inductor takes: {inductor_time} sec with " + "{:.2f}x speedup\n".format(eager_time/inductor_time)
+    expect_output += f"TVM takes: {tvm_time} sec with " + "{:.2f}x speedup\n".format(eager_time/tvm_time)
+    expect_output += f"Output: {actual_output}"
     return expect_output
 
 demo = gr.Interface(fn=gpt2, inputs="text", outputs="text")
 
-demo.launch()
+demo.launch()
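
For context, the comparison the updated app.py runs can be reproduced outside Gradio. Below is a minimal sketch, assuming the public "gpt2" checkpoint from the transformers library stands in for the pickled GPT2Model.pt/GPT2Tokenizer.pt files, and adding a warm-up pass because the first call through a TorchDynamo backend includes one-time compilation cost (the app's single-call timings do not separate this out):

import time

import torch
import torch._dynamo as dynamo
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Assumption: use the public "gpt2" checkpoint instead of the Space's
# pickled GPT2Model.pt / GPT2Tokenizer.pt artifacts.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()

# Wrap the same eager model with the two TorchDynamo backends the app compares.
inductor_model = dynamo.optimize("inductor")(model)
tvm_model = dynamo.optimize("tvm")(model)  # requires a working Apache TVM install


def timed(fn):
    # Return (result, wall-clock seconds) for a zero-argument callable.
    start = time.time()
    result = fn()
    return result, time.time() - start


input_ids = tokenizer("Hello, my dog is", return_tensors="pt").input_ids

# Warm-up: the first call through each backend triggers graph capture and
# compilation, so time a later call to measure the steady-state speedup.
for m in (inductor_model, tvm_model):
    m.generate(input_ids, do_sample=False, max_length=30)

_, eager_time = timed(lambda: model.generate(input_ids, do_sample=False, max_length=30))
_, inductor_time = timed(lambda: inductor_model.generate(input_ids, do_sample=False, max_length=30))
_, tvm_time = timed(lambda: tvm_model.generate(input_ids, do_sample=False, max_length=30))

print(f"eager: {eager_time:.3f}s")
print(f"inductor: {inductor_time:.3f}s ({eager_time / inductor_time:.2f}x)")
print(f"tvm: {tvm_time:.3f}s ({eager_time / tvm_time:.2f}x)")

Without the warm-up, the speedup reported for the first prompt is dominated by compilation and can make the compiled backends look slower than eager execution.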