winglian committed
Commit e0bf185
1 Parent(s): 44eb762

fix stop tokens to match new prompt formatting, stream instruct response, add comments about concurrency to config

Files changed (2)
  1. config.yml +3 -2
  2. tabbed.py +6 -3
config.yml CHANGED
@@ -10,7 +10,8 @@ chat:
   stop:
     - "</s>"
     - "<unk>"
-    - "### User:"
+    - "### USER:"
+    - "USER:"
 queue:
   max_size: 16
-  concurrency_count: 1
+  concurrency_count: 1 # leave this at 1, llama-cpp-python doesn't handle concurrent requests and will crash the entire app
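For context, a minimal sketch of how these values are read back into the app. The loading code itself is not part of this commit, so the file name and key access below are assumptions based only on the structure shown in the diff.

import yaml

# Assumed loading step: tabbed.py presumably parses config.yml into a dict.
with open("config.yml") as f:
    config = yaml.safe_load(f)

# The stop list rides along in config['chat'] (passed as **config['chat'] to llm()
# in tabbed.py), so generation halts when the model emits "### USER:" or "USER:"
# instead of continuing with a fabricated user turn.
stop_sequences = config["chat"]["stop"]

# The queue block presumably feeds Gradio's request queue; concurrency_count stays
# at 1 because llama-cpp-python cannot serve concurrent requests without crashing.
queue_settings = config["queue"]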
tabbed.py CHANGED
@@ -49,7 +49,7 @@ def chat(history, system_message, max_tokens, temperature, top_p, top_k, repeat_
     ):
         answer = output['choices'][0]['text']
         history[-1][1] += answer
-
+        # stream the response
         yield history, history
 
 
@@ -66,8 +66,11 @@ start_message = """
 
 
 def generate_text_instruct(input_text):
-    output = llm(f"### Instruction:\n{input_text}\n\n### Response:\n", echo=False, **config['chat'])
-    return output['choices'][0]['text']
+    response = ""
+    for output in llm(f"### Instruction:\n{input_text}\n\n### Response:\n", echo=False, **config['chat']):
+        answer = output['choices'][0]['text']
+        response += answer
+        yield response
 
 
 instruct_interface = gr.Interface(
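Since generate_text_instruct is now a generator, Gradio treats it as a streaming function. The actual instruct_interface arguments are truncated in this diff, so the following is only a hedged sketch of how such a wiring typically looks; the inputs/outputs and the demo name are assumptions, not the real code.

import gradio as gr

# Sketch only: real argument values are not shown in the diff above.
instruct_interface = gr.Interface(
    fn=generate_text_instruct,               # generator function; Gradio streams each yielded string
    inputs=gr.Textbox(label="Instruction"),  # assumed component
    outputs=gr.Textbox(label="Response"),    # assumed component
)

# Generator (streaming) outputs require the queue to be enabled before launch,
# which is where the queue settings from config.yml would apply, e.g.:
#   demo.queue(**config['queue']).launch()   # 'demo' is a hypothetical name for the assembled app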