winglian committed
Commit 0a981aa (parent: 51c11ce)

require torch and nvidia-cublas

use nvidia pypi
use ggml-webui gpu llama.cpp build
use blas llama.cpp, streaming chat
fix initial history state
use list, not tuple
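
For context on the "use list, not tuple" item, the chat history entries now have to be mutated in place while tokens stream in, which tuples do not allow. A minimal illustration (not from the repo; `turn` is just a throwaway name):

```python
# Hypothetical illustration of why history entries switched from tuples to lists:
# streaming updates rewrite the assistant half of the last turn in place.
turn = ("hi", "")
try:
    turn[1] = "partial reply"
except TypeError as err:
    print(err)                 # tuples are immutable: item assignment fails

turn = ["hi", ""]
turn[1] = "partial reply"      # lists accept in-place updates as tokens arrive
print(turn)                    # ['hi', 'partial reply']
```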

Files changed (3)
  1. README.md +3 -0
  2. chat.py +36 -26
  3. requirements.txt +5 -1
README.md CHANGED
@@ -11,6 +11,9 @@ pinned: false
 
 # GGML UI Inference w/ HuggingFace Spaces
 
+- Fork this space to use your own GGML models. Simply update the [./config.yml](./config.yml)
+- Contribute at [https://github.com/OpenAccess-AI-Collective/ggml-webui](https://github.com/OpenAccess-AI-Collective/ggml-webui)
+
 Brought to you by [OpenAccess AI Collective](https://github.com/OpenAccess-AI-Collective)
 
 
chat.py CHANGED
@@ -21,25 +21,26 @@ while True:
 llm = Llama(model_path=fp, **config["llama_cpp"])
 
 
-def chat(inp, history, system_message):
+def user(message, history):
     history = history or []
-    history.append((inp, ""))
+    # Append the user's message to the conversation history
+    history.append([message, ""])
+    return "", history
 
-    messages = system_message + \
-        "\n".join(["\n".join(["### User: "+item[0], "Assistant: "+item[1]])
-                   for item in history])
 
+def chat(history, system_message):
     history = history or []
 
-    output = llm(messages, max_tokens=512, stop=["</s>", "<unk>", "### User:"], echo=False)
-    answer = output['choices'][0]['text']
-
-    history.pop() # remove user input only history
-    history.append((inp, answer))
+    messages = system_message + \
+        "\n".join(["\n".join(["### User: "+item[0], "Assistant: "+item[1]])
+                   for item in history])
 
-    message = '' # This clears the message text
+    history[-1][1] = ""
+    for output in llm(messages, max_tokens=512, stop=["</s>", "<unk>", "### User:"], echo=False, stream=True):
+        answer = output['choices'][0]['text']
+        history[-1][1] = answer
 
-    return history, history, message
+        yield history, history
 
 
 def clear_chat(chat_history_state, chat_message):
@@ -65,26 +66,35 @@ with blocks:
         placeholder="Ask me anything.",
         lines=1,
     )
+    with gr.Row():
+        submit = gr.Button(value="Send message", variant="secondary").style(full_width=True)
     clear = gr.Button(value="New topic", variant="secondary").style(full_width=False)
-    submit = gr.Button(value="Send message", variant="secondary").style(full_width=True)
+    stop = gr.Button(value="Stop", variant="secondary").style(full_width=False)
 
     system_msg = gr.Textbox(
         start_message, label="System Message", interactive=False, visible=False)
 
-    # gr.Examples(
-    #     examples=[
-    #         "Tell me a joke about old houses.",
-    #         "Insult me.",
-    #         "What is the future of AI and large language models?",
-    #     ],
-    #     inputs=message,
-    # )
-
     chat_history_state = gr.State()
     clear.click(clear_chat, inputs=[chat_history_state, message], outputs=[chat_history_state, message])
     clear.click(lambda: None, None, chatbot, queue=False)
 
-    submit.click(chat, inputs=[message, chat_history_state, system_msg], outputs=[chatbot, chat_history_state, message])
-    message.submit(chat, inputs=[message, chat_history_state, system_msg], outputs=[chatbot, chat_history_state, message])
-
-blocks.queue(max_size=32, concurrency_count=3).launch(debug=True, server_name="0.0.0.0", server_port=7860)
+    submit_click_event = submit.click(
+        fn=user, inputs=[message, chat_history_state], outputs=[message, chat_history_state], queue=False
+    ).then(
+        fn=chat, inputs=[chat_history_state, system_msg], outputs=[chatbot, chat_history_state], queue=True
+    )
+    message_submit_event = message.submit(
+        fn=user, inputs=[message, chat_history_state], outputs=[message, chat_history_state], queue=False
+    ).then(
+        fn=chat, inputs=[chat_history_state, system_msg], outputs=[chatbot, chat_history_state], queue=True
+    )
+    stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_click_event, message_submit_event], queue=False)
+
+    gr.Markdown("""
+    - This is running on a smaller, shared GPU, so it may take a few seconds to respond.
+    - [Duplicate the Space](https://huggingface.co/spaces/openaccess-ai-collective/ggml-ui?duplicate=true) to skip the queue and run in a private space or to use your own GGML models.
+    - When using your own models, simply update the [./config.yml](./config.yml)
+    - Contribute at [https://github.com/OpenAccess-AI-Collective/ggml-webui](https://github.com/OpenAccess-AI-Collective/ggml-webui)
+    """)
+
+blocks.queue(max_size=8, concurrency_count=2).launch(debug=True, server_name="0.0.0.0", server_port=7860)
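
To make the new wiring easier to follow, here is a minimal, self-contained sketch of the pattern this commit introduces, assuming gradio 3.x APIs and using a stand-in token generator instead of llama-cpp-python: `user` records the turn and clears the textbox, a chained generator streams the growing history into the Chatbot, and the Stop button cancels the in-flight event via `cancels=`.

```python
# Minimal sketch of the submit -> then(stream) -> cancel pattern (gradio 3.x assumed).
import time
import gradio as gr

def user(message, history):
    history = history or []
    history.append([message, ""])        # empty assistant slot, filled during streaming
    return "", history                   # clear the textbox, pass history along

def chat(history):
    history = history or []
    reply = ""
    for token in ["Hello", ",", " ", "world", "!"]:   # stand-in for llm(..., stream=True)
        reply += token
        history[-1][1] = reply
        time.sleep(0.2)
        yield history, history           # each yield refreshes the Chatbot

with gr.Blocks() as blocks:
    chatbot = gr.Chatbot()
    message = gr.Textbox(placeholder="Ask me anything.")
    submit = gr.Button("Send message")
    stop = gr.Button("Stop")
    state = gr.State()

    submit_event = submit.click(
        fn=user, inputs=[message, state], outputs=[message, state], queue=False
    ).then(
        fn=chat, inputs=[state], outputs=[chatbot, state], queue=True
    )
    # cancels= aborts the in-flight generator when Stop is pressed
    stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_event], queue=False)

blocks.queue(max_size=8, concurrency_count=2).launch()
```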
requirements.txt CHANGED
@@ -1,2 +1,6 @@
-llama-cpp-python @ https://github.com/OpenAccess-AI-Collective/ggml-webui/releases/download/v0.1.49-rc6/llama_cpp_python-cpu-0.1.49-cp38-cp38-linux_x86_64.whl
+--extra-index-url https://pypi.ngc.nvidia.com
+nvidia-cuda-runtime
+nvidia-cublas
+llama-cpp-python @ https://github.com/OpenAccess-AI-Collective/ggml-webui/releases/download/v0.1.50-rc3/llama_cpp_python-gpu-0.1.50-cp38-cp38-linux_x86_64.whl
 pyyaml
+torch
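
For reference, a minimal sketch of how chat.py consumes the streaming API of the llama-cpp-python wheel pinned above. The model path and prompt are placeholders, and whether cuBLAS offload is actually active depends on the wheel build; nvidia-cublas, nvidia-cuda-runtime, and torch are presumably there to supply the CUDA libraries the GPU wheel links against.

```python
# Placeholder model path; constructor kwargs normally come from config["llama_cpp"].
from llama_cpp import Llama

llm = Llama(model_path="path/to/model.ggml.bin")

prompt = "### User: Say hello in one sentence.\nAssistant:"
# With stream=True each chunk carries the newly generated text in choices[0]["text"].
for chunk in llm(prompt, max_tokens=64, stop=["</s>", "<unk>", "### User:"], echo=False, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()
```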