bigmoyan committed
Commit
f8cc4df
1 Parent(s): 78ab63e

Update README.md

Files changed (1)
  1. README.md +7 -9
README.md CHANGED
@@ -41,19 +41,16 @@ from transformers import AutoTokenizer
 from faster_chat_glm import GLM6B, FasterChatGLM
 
 
-tokenizer = AutoTokenizer.from_pretrained(chatglm6b_dir, trust_remote_code=True)
-
-BATCH_SIZE = 8
-MAX_OUT_LEN = 50
-
-# prepare input
-input_str = ["为什么我们需要对深度学习模型加速? ", ] *
+MAX_OUT_LEN = 100
+tokenizer = AutoTokenizer.from_pretrained('./models', trust_remote_code=True)
+input_str = ["为什么我们需要对深度学习模型加速?", ]
 inputs = tokenizer(input_str, return_tensors="pt", padding=True)
 input_ids = inputs.input_ids.to('cuda:0')
 
 
+plan_path = './models/glm6b-bs8.ftm'
 # kernel for chat model.
-kernel = GLM6B(plan_path="./models/glm6b-bs{BATCH_SIZE}.ftm",
+kernel = GLM6B(plan_path=plan_path,
                batch_size=1,
                num_beams=1,
                use_cache=True,
@@ -62,7 +59,8 @@ kernel = GLM6B(plan_path="./models/glm6b-bs{BATCH_SIZE}.ftm",
                decoder_layers=28,
                vocab_size=150528,
                max_seq_len=MAX_OUT_LEN)
-chat = FasterChatGLM(model_dir=chatglm6b_dir, kernel=kernel).half().cuda()
+
+chat = FasterChatGLM(model_dir="./models", kernel=kernel).half().cuda()
 
 # generate
 sample_output = chat.generate(inputs=input_ids, max_length=MAX_OUT_LEN)
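After this commit, the quick-start flow is: tokenize the prompt (the Chinese string means "Why do we need to accelerate deep learning models?"), build the `GLM6B` kernel from the serialized plan file, wrap it in `FasterChatGLM`, and call `generate`, which leaves generated token IDs in `sample_output`. As a minimal usage sketch that is not part of the commit, the IDs can be decoded back to text with the standard `transformers` tokenizer API:

```python
# Decoding sketch (illustrative, not in the commit): `tokenizer` and
# `sample_output` are the objects from the README snippet above.
texts = tokenizer.batch_decode(sample_output, skip_special_tokens=True)
for text in texts:
    print(text)
```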