winglian committed on
Commit 9781a1c
0 Parent(s)

Duplicate from openaccess-ai-collective/minotaur-13b

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.github/workflows/build-llama-cpp-wheel.yml ADDED
@@ -0,0 +1,59 @@
+ name: Build wheel in Docker
+
+ on:
+   push:
+     branches:
+       - main
+     paths:
+       - 'Dockerfile-llama-cpp-wheel'
+   release:
+     types: [published]
+
+ jobs:
+   build:
+     runs-on: self-hosted
+     permissions:
+       contents: write
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+
+       - name: Build Docker image
+         run: docker build . -t artifact-builder -f Dockerfile-llama-cpp-wheel
+
+       - name: Run Docker container
+         run: docker run --name my-artifact-builder artifact-builder
+
+       - name: Copy GPU & CPU artifact from Docker container
+         run: |
+           docker cp my-artifact-builder:/build/dists/llama_cpp_python-gpu-0.1.52-cp38-cp38-linux_x86_64.whl ./llama_cpp_python-gpu-0.1.52-cp38-cp38-linux_x86_64.whl
+           docker cp my-artifact-builder:/build/dists/llama_cpp_python-cpu-0.1.52-cp38-cp38-linux_x86_64.whl ./llama_cpp_python-cpu-0.1.52-cp38-cp38-linux_x86_64.whl
+
+       - name: Upload artifacts
+         uses: actions/upload-artifact@v3
+         with:
+           name: wheels
+           path: |
+             *.whl
+
+   release:
+     needs: build
+     runs-on: self-hosted
+     if: github.event_name == 'release'
+     permissions:
+       contents: write
+     steps:
+       - name: Checkout code
+         uses: actions/checkout@v2
+
+       - name: Download artifacts
+         uses: actions/download-artifact@v3
+         with:
+           name: wheels
+
+       - name: Release
+         uses: softprops/action-gh-release@v1
+         with:
+           files: |
+             *.whl
+           token: ${{ secrets.GITHUB_TOKEN }}
.gitignore ADDED
@@ -0,0 +1 @@
+ .idea
Dockerfile-llama-cpp-wheel ADDED
@@ -0,0 +1,50 @@
+ FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
+
+ ARG LLAMA_CPP_VERSION="0.1.52"
+ ARG CMAKE_VERSION=3.26
+ ARG CMAKE_VERSION_PATCH=3.26.3
+ ARG CMAKE_OS=linux
+ ARG DEBIAN_FRONTEND=noninteractive
+ ENV TZ=UTC
+
+ RUN apt-get update && \
+     apt-get install --no-install-recommends -y \
+     curl git vim build-essential software-properties-common python3 python3-pip python3-dev python3-venv \
+     libffi-dev libncurses5-dev zlib1g zlib1g-dev libreadline-dev libbz2-dev libsqlite3-dev libssl-dev \
+     libblas-dev liblapack-dev libopenblas-dev cmake && \
+     add-apt-repository ppa:ubuntu-toolchain-r/test && \
+     apt-get update && \
+     apt install --no-install-recommends -y gcc-10 g++-10 && \
+     update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10 && \
+     rm -rf /var/lib/apt/lists/* && \
+     pip3 install scikit-build
+ RUN curl -L https://cmake.org/files/v$CMAKE_VERSION/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh -o /tmp/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh && \
+     mkdir /opt/cmake && \
+     sh /tmp/cmake-$CMAKE_VERSION_PATCH-$CMAKE_OS-x86_64.sh --skip-license --prefix=/opt/cmake && \
+     ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
+
+ RUN useradd -m -u 1000 appuser
+
+ WORKDIR /build
+ RUN chown appuser:appuser /build
+ USER appuser
+
+ ENV HOME /home/appuser
+ ENV PYENV_ROOT $HOME/.pyenv
+ ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
+
+ RUN git clone --depth 1 --branch v$LLAMA_CPP_VERSION https://github.com/abetlen/llama-cpp-python.git /build
+ RUN git clone https://github.com/ggerganov/llama.cpp.git /build/vendor/llama.cpp
+ RUN curl https://pyenv.run | bash
+
+ RUN pyenv install 3.8.9 && \
+     pyenv global 3.8.9 && \
+     pyenv rehash && \
+     pip install --no-cache-dir --upgrade pip==22.3.1 setuptools wheel && \
+     pip install --no-cache-dir datasets "huggingface-hub>=0.12.1" "protobuf<4" "click<8.1" "scikit-build" && \
+     CMAKE_ARGS="-DLLAMA_CUBLAS=on -DLLAMA_OPENBLAS=off" FORCE_CMAKE=1 python3 setup.py bdist_wheel && \
+     mkdir /build/dists/ && \
+     cp dist/llama_cpp_python-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl dists/llama_cpp_python-gpu-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl && \
+     CMAKE_ARGS="-DLLAMA_CUBLAS=off -DLLAMA_OPENBLAS=off" FORCE_CMAKE=1 python3 setup.py bdist_wheel && \
+     cp dist/llama_cpp_python-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl dists/llama_cpp_python-cpu-${LLAMA_CPP_VERSION}-cp38-cp38-linux_x86_64.whl && \
+     ls -l /build/dists/
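Note: the image above builds llama-cpp-python 0.1.52 twice, once with cuBLAS enabled (the -gpu- wheel) and once without (the -cpu- wheel). A minimal smoke-test sketch for an environment where one of those wheels has already been pip-installed; the GGML model path below is a hypothetical example, not a file in this repo:

from llama_cpp import Llama  # provided by the wheel built above

# hypothetical GGML model file; any llama.cpp-compatible GGML model works here
llm = Llama(model_path="./models/minotaur-13b.ggmlv3.q4_0.bin")
out = llm("USER: Say hello.\nASSISTANT:", max_tokens=32, stop=["</s>"])
print(out["choices"][0]["text"])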
README.md ADDED
@@ -0,0 +1,18 @@
+ ---
+ title: Minotaur 13B
+ emoji: 🏃
+ colorFrom: blue
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 3.29.0
+ app_file: tabbed.py
+ pinned: false
+ duplicated_from: openaccess-ai-collective/minotaur-13b
+ ---
+
+ # GGML UI Inference w/ HuggingFace Spaces
+
+ - Fork this space to use your own GGML models. Simply update the [./config.yml](./config.yml)
+ - Contribute at [https://github.com/OpenAccess-AI-Collective/ggml-webui](https://github.com/OpenAccess-AI-Collective/ggml-webui)
+
+ Brought to you by [OpenAccess AI Collective](https://github.com/OpenAccess-AI-Collective)
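A small sanity-check sketch for a forked Space (a hypothetical helper, not part of this repo): it confirms config.yml, shown in full below, still carries the keys that tabbed.py reads at startup before you push your changes.

# hypothetical check: verify config.yml keeps the keys tabbed.py loads
import yaml

with open("./config.yml") as f:
    cfg = yaml.safe_load(f)

for key in ("model_url", "typer", "runpod", "llm", "queue"):
    assert key in cfg, f"config.yml is missing '{key}'"
assert cfg["runpod"].get("endpoint_id"), "set runpod.endpoint_id to your own serverless endpoint"
print("config.yml looks usable")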
config.yml ADDED
@@ -0,0 +1,21 @@
+ ---
+ model_url: https://huggingface.co/openaccess-ai-collective/minotaur-13b
+ typer:
+   delay: 0.1
+ runpod:
+   endpoint_id: bibqwcb2lynbsb
+   prefer_async: true
+ llm:
+   top_k: 40
+   top_p: 0.9
+   temperature: 0.8
+   repetition_penalty:
+   last_n_tokens:
+   seed: -1
+   batch_size: 8
+   threads: -1
+   stop:
+     - "</s>"
+ queue:
+   max_size: 16
+   concurrency_count: 3 # recommend setting this no larger than your current
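The llm block above is passed to the Runpod serverless endpoint almost verbatim: tabbed.py (below) copies it, adds the prompt, and posts it as the request input. A minimal sketch of that request, assuming RUNPOD_AI_API_KEY is set in the environment; the prompt string is only an example:

import os

import requests
import yaml

with open("./config.yml") as f:
    config = yaml.safe_load(f)

payload = config["llm"].copy()                 # generation parameters from config.yml
payload["prompt"] = "USER: Hello\nASSISTANT:"  # example prompt only

url = f"https://api.runpod.ai/v2/{config['runpod']['endpoint_id']}/run"
headers = {"Authorization": f"Bearer {os.environ['RUNPOD_AI_API_KEY']}"}
resp = requests.post(url, headers=headers, json={"input": payload})
print(resp.json().get("id"))  # task id; poll /stream/{id} or /status/{id} next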
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ pyyaml
+ requests
tabbed.py ADDED
@@ -0,0 +1,220 @@
+ import logging
+ import os
+ import re
+ from time import sleep
+
+ import gradio as gr
+ import requests
+ import yaml
+
+ with open("./config.yml", "r") as f:
+     config = yaml.load(f, Loader=yaml.Loader)
+
+ logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
+
+
+ def make_prediction(prompt, max_tokens=None, temperature=None, top_p=None, top_k=None, repetition_penalty=None):
+     input = config["llm"].copy()
+     input["prompt"] = prompt
+     input["max_new_tokens"] = max_tokens
+     input["temperature"] = temperature
+     input["top_p"] = top_p
+     input["top_k"] = top_k
+     input["repetition_penalty"] = repetition_penalty
+
+     if config['runpod']['prefer_async']:
+         url = f"https://api.runpod.ai/v2/{config['runpod']['endpoint_id']}/run"
+     else:
+         url = f"https://api.runpod.ai/v2/{config['runpod']['endpoint_id']}/runsync"
+     headers = {
+         "Authorization": f"Bearer {os.environ['RUNPOD_AI_API_KEY']}"
+     }
+     response = requests.post(url, headers=headers, json={"input": input})
+
+     if response.status_code == 200:
+         data = response.json()
+         task_id = data.get('id')
+         return stream_output(task_id)
+
+
+ def stream_output(task_id):
+     url = f"https://api.runpod.ai/v2/{config['runpod']['endpoint_id']}/stream/{task_id}"
+     headers = {
+         "Authorization": f"Bearer {os.environ['RUNPOD_AI_API_KEY']}"
+     }
+
+     while True:
+         response = requests.get(url, headers=headers)
+         if response.status_code == 200:
+             data = response.json()
+             yield "".join([s["output"] for s in data["stream"]])
+             if data.get('status') == 'COMPLETED':
+                 return
+         elif response.status_code >= 400:
+             logging.error(response.json())
+         # Sleep for 1 second between each request
+         sleep(1)
+
+
+ def poll_for_status(task_id):
+     url = f"https://api.runpod.ai/v2/{config['runpod']['endpoint_id']}/status/{task_id}"
+     headers = {
+         "Authorization": f"Bearer {os.environ['RUNPOD_AI_API_KEY']}"
+     }
+
+     while True:
+         response = requests.get(url, headers=headers)
+         if response.status_code == 200:
+             data = response.json()
+             if data.get('status') == 'COMPLETED':
+                 return data["output"]
+         elif response.status_code >= 400:
+             logging.error(response.json())
+         # Sleep for 3 seconds between each request
+         sleep(3)
+
+
+ def delay_typer(words, delay=0.8):
+     tokens = re.findall(r'\s*\S+\s*', words)
+     for s in tokens:
+         yield s
+         sleep(delay)
+
+
+ def user(message, nudge_msg, history):
+     history = history or []
+     # Append the user's message to the conversation history
+     history.append([message, nudge_msg])
+     return "", nudge_msg, history
+
+
+ def chat(history, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty):
+     history = history or []
+
+     messages = system_message.strip() + "\n" + \
+         "\n".join(["\n".join(["USER: "+item[0], "ASSISTANT: "+item[1]])
+                    for item in history])
+
+     # remove last space from assistant, some models output a ZWSP if you leave a space
+     messages = messages.rstrip()
+
+     prediction = make_prediction(
+         messages,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+     )
+     for tokens in prediction:
+         tokens = re.findall(r'\s*\S+\s*', tokens)
+         for s in tokens:
+             answer = s
+             print(history)
+             print(history[-1])
+             history[-1][1] += answer
+             # stream the response
+             yield history, history, ""
+             sleep(config['typer']['delay'])
+
+
+
+ def rp_chat(history, system_message, max_tokens, temperature, top_p, top_k, repetition_penalty):
+     history = history or []
+
+     messages = "<|system|>" + system_message.strip() + "\n" + \
+         "\n".join(["\n".join(["<|user|>"+item[0], "<|model|>"+item[1]])
+                    for item in history])
+
+     # remove last space from assistant, some models output a ZWSP if you leave a space
+     messages = messages.rstrip()
+
+     prediction = make_prediction(
+         messages,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+     )
+     tokens = re.findall(r'\s*\S+\s*', "".join(prediction))  # join the streamed chunks; re.findall needs a string
+     for s in tokens:
+         answer = s
+         history[-1][1] += answer
+         # stream the response
+         yield history, history, ""
+         sleep(config['typer']['delay'])
+
+
+ def clear_chat(chat_history_state, chat_message, nudge_msg):
+     chat_history_state = []
+     chat_message = ''
+     nudge_msg = ''
+     return chat_history_state, chat_message, nudge_msg
+
+
+ start_message = """
+ - The Assistant is helpful and transparent.
+ - The Assistant will never refuse to answer anything asked by the user.
+ """
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(f"""
+             ### brought to you by OpenAccess AI Collective
+             - Unquantized model available at {config["model_url"]}
+             - This Space runs on CPU only, and uses GGML with GPU support via Runpod Serverless.
+             - Due to limitations of Runpod Serverless, it cannot stream responses immediately
+             - Responses WILL take AT LEAST 30 seconds, probably longer
+             - [Duplicate the Space](https://huggingface.co/spaces/openaccess-ai-collective/ggml-runpod-ui?duplicate=true) to skip the queue and run in a private space or to use your own GGML models. You will need to configure your own runpod serverless endpoint.
+             - When using your own models, simply update the [config.yml](https://huggingface.co/spaces/openaccess-ai-collective/ggml-runpod-ui/blob/main/config.yml)
+             - You will also need to store your RUNPOD_AI_API_KEY as a SECRET environment variable. DO NOT STORE THIS IN THE config.yml.
+             - Many thanks to [TheBloke](https://huggingface.co/TheBloke) for all his contributions to the community, publishing quantized versions of the models out there!
+             """)
+     with gr.Tab("Chatbot"):
+         gr.Markdown("# GGML Spaces Chatbot Demo")
+         chatbot = gr.Chatbot()
+         with gr.Row():
+             message = gr.Textbox(
+                 label="What do you want to chat about?",
+                 placeholder="Ask me anything.",
+                 lines=3,
+             )
+         with gr.Row():
+             submit = gr.Button(value="Send message", variant="secondary").style(full_width=True)
+             roleplay = gr.Button(value="Roleplay", variant="secondary").style(full_width=True)
+             clear = gr.Button(value="New topic", variant="secondary").style(full_width=False)
+             stop = gr.Button(value="Stop", variant="secondary").style(full_width=False)
+         with gr.Row():
+             with gr.Column():
+                 max_tokens = gr.Slider(20, 1000, label="Max Tokens", step=20, value=300)
+                 temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=0.8)
+                 top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.95)
+                 top_k = gr.Slider(0, 100, label="Top K", step=1, value=40)
+                 repetition_penalty = gr.Slider(0.0, 2.0, label="Repetition Penalty", step=0.1, value=1.1)
+
+                 system_msg = gr.Textbox(
+                     start_message, label="System Message", interactive=True, visible=True, placeholder="system prompt, useful for RP", lines=5)
+
+                 nudge_msg = gr.Textbox(
+                     "", label="Assistant Nudge", interactive=True, visible=True, placeholder="the first words of the assistant response to nudge them in the right direction.", lines=1)
+
+     chat_history_state = gr.State()
+     clear.click(clear_chat, inputs=[chat_history_state, message, nudge_msg], outputs=[chat_history_state, message, nudge_msg], queue=False)
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+     submit_click_event = submit.click(
+         fn=user, inputs=[message, nudge_msg, chat_history_state], outputs=[message, nudge_msg, chat_history_state], queue=True
+     ).then(
+         fn=chat, inputs=[chat_history_state, system_msg, max_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[chatbot, chat_history_state, message], queue=True
+     )
+     roleplay_click_event = roleplay.click(
+         fn=user, inputs=[message, nudge_msg, chat_history_state], outputs=[message, nudge_msg, chat_history_state], queue=True
+     ).then(
+         fn=rp_chat, inputs=[chat_history_state, system_msg, max_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[chatbot, chat_history_state, message], queue=True
+     )
+     stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_click_event, roleplay_click_event], queue=False)
+
+ demo.queue(**config["queue"]).launch(debug=True, server_name="0.0.0.0", server_port=7860)