theodotus committed on
Commit a70311a
1 Parent(s): e72e55e

Use ctranslate2

Files changed (2)
  1. app.py +55 -9
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,7 +1,33 @@
-from ctransformers import AutoModelForCausalLM
+import ctranslate2
+from transformers import AutoTokenizer
 
+import threading
 import gradio as gr
 
+from typing import Optional
+from queue import Queue
+
+
+
+
+class TokenIteratorStreamer:
+    def __init__(self, end_token_id: int, timeout: Optional[float] = None):
+        self.end_token_id = end_token_id
+        self.queue = Queue()
+        self.timeout = timeout
+
+    def put(self, token: int):
+        self.queue.put(token, timeout=self.timeout)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        token = self.queue.get(timeout=self.timeout)
+        if token == self.end_token_id:
+            raise StopIteration()
+        else:
+            return token
 
 
 
@@ -10,18 +36,33 @@ def generate_prompt(history):
     for chain in history[:-1]:
         prompt += f"<human>: {chain[0]}\n<bot>: {chain[1]}{end_token}\n"
     prompt += f"<human>: {history[-1][0]}\n<bot>:"
-    return prompt
+    tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))
+    return tokens
 
-def generate(history):
-    prompt = generate_prompt(history)
+def generate(streamer, history):
+    def stepResultCallback(result):
+        streamer.put(result.token_id)
+        if result.is_last and (result.token_id != end_token_id):
+            streamer.put(end_token_id)
+        print(f"step={result.step}, batch_id={result.batch_id}, token={result.token}")
+
+    tokens = generate_prompt(history)
 
-    streamer = llm(prompt, temperature=0, stream=True)
-    return streamer
+    results = translator.translate_batch(
+        [tokens],
+        beam_size=1,
+        max_decoding_length = 256,
+        repetition_penalty = 1.2,
+        callback = stepResultCallback
+    )
+    return results
 
 
 
-llm = AutoModelForCausalLM.from_pretrained("model/model_q4_0.bin", model_type='llama')
+translator = ctranslate2.Translator("model", intra_threads=2)
+tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", use_fast=False)
 end_token = "</s>"
+end_token_id = tokenizer.encode(end_token)[0]
 
 
 with gr.Blocks() as demo:
@@ -33,11 +74,16 @@ with gr.Blocks() as demo:
         return "", history + [[user_message, ""]]
 
     def bot(history):
-        streamer = generate(history)
+        bot_message_tokens = []
+        streamer = TokenIteratorStreamer(end_token_id = end_token_id)
+        generation_thread = threading.Thread(target=generate, args=(streamer, history))
+        generation_thread.start()
 
         for token in streamer:
-            history[-1][1] += token
+            bot_message_tokens.append(token)
+            history[-1][1] = tokenizer.decode(bot_message_tokens)
             yield history
+        generation_thread.join()
 
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
         bot, chatbot, chatbot
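
A note on the new streaming path (not part of the commit): bot() now consumes tokens from a queue that the CTranslate2 generation callback fills on a background thread. The following is a minimal, self-contained sketch of that producer/consumer pattern; fake_generate and the hard-coded token ids (including END_TOKEN_ID) are illustrative stand-ins for translator.translate_batch and the real tokenizer ids.

import threading
from queue import Queue
from typing import Optional

END_TOKEN_ID = 0  # stand-in for the id of "</s>"; app.py derives the real value from the tokenizer

class TokenIteratorStreamer:
    """Blocking iterator over token ids pushed from another thread."""
    def __init__(self, end_token_id: int, timeout: Optional[float] = None):
        self.end_token_id = end_token_id
        self.queue = Queue()
        self.timeout = timeout

    def put(self, token: int):
        # Called by the producer (in app.py: the translate_batch callback).
        self.queue.put(token, timeout=self.timeout)

    def __iter__(self):
        return self

    def __next__(self):
        # Blocks until the producer pushes the next id; the end id terminates the loop.
        token = self.queue.get(timeout=self.timeout)
        if token == self.end_token_id:
            raise StopIteration()
        return token

def fake_generate(streamer: TokenIteratorStreamer):
    # Stand-in for generate() in app.py: push a few ids, then the end id.
    for token_id in (11, 12, 13):
        streamer.put(token_id)
    streamer.put(END_TOKEN_ID)

streamer = TokenIteratorStreamer(end_token_id=END_TOKEN_ID)
thread = threading.Thread(target=fake_generate, args=(streamer,))
thread.start()
for token_id in streamer:   # same consumption loop shape as bot()
    print("got token id:", token_id)
thread.join()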
requirements.txt CHANGED
@@ -1 +1,3 @@
-ctransformers
+ctranslate2
+transformers
+SentencePiece
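
Dependency note (not part of the commit): ctranslate2 provides the Translator runtime, transformers provides AutoTokenizer, and SentencePiece is required by the slow (use_fast=False) OpenLLaMA tokenizer loaded in app.py. Below is a small sketch of the tokenizer round-trip the app depends on, assuming the packages above are installed and the openlm-research/open_llama_7b checkpoint can be downloaded.

from transformers import AutoTokenizer

# Same tokenizer the app loads; use_fast=False selects the SentencePiece-backed implementation.
tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", use_fast=False)

# Prompts are converted to token strings, the input format translate_batch expects.
prompt = "<human>: hello\n<bot>:"
tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt))
print(tokens)

# Mirrors how app.py derives the id it pushes to stop the streaming loop.
end_token_id = tokenizer.encode("</s>")[0]
print(end_token_id, tokenizer.decode([end_token_id]))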