Spaces:
Runtime error
Commit a88480f · 0 Parent(s):
Duplicate from SpacesExamples/llama-cpp-python-cuda-gradio
Co-authored-by: Radamés Ajna <radames@users.noreply.huggingface.co>
- .gitattributes +35 -0
- Dockerfile +45 -0
- README.md +11 -0
- app.py +72 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,45 @@
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
FROM nvidia/cuda:${CUDA_IMAGE}

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3 python3-pip gcc wget \
    ocl-icd-opencl-dev opencl-headers clinfo \
    libclblast-dev libopenblas-dev \
    && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd

COPY . .

# Set build-related env vars
ENV CUDA_DOCKER_ARCH=all
ENV LLAMA_CUBLAS=1

# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings gradio huggingface_hub hf_transfer

# Install llama-cpp-python (build with CUDA)
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces

WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app, setting the owner to the user
COPY --chown=user . $HOME/app

CMD ["python3", "app.py"]
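A quick way to sanity-check the resulting image is to ask llama-cpp-python for its compile-time system info: if the cuBLAS build step above succeeded, the report should mention BLAS support. This is a minimal sketch, not part of the Space itself, and assumes the llama_print_system_info binding exposed by llama-cpp-python; the "BLAS = 1" substring check is an assumption about the info string's format.

    # Minimal sketch (not part of this Space): check inside the container whether
    # llama-cpp-python was built with cuBLAS, assuming the llama_print_system_info
    # binding is available in the installed version.
    import llama_cpp

    # Compile-time feature summary from llama.cpp, returned as bytes.
    info = llama_cpp.llama_print_system_info().decode("utf-8", errors="replace")
    print(info)

    # Assumed format: a cuBLAS-enabled build is expected to report "BLAS = 1".
    if "BLAS = 1" not in info:
        print("Warning: this build does not appear to include BLAS/cuBLAS support")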
README.md
ADDED
@@ -0,0 +1,11 @@
---
title: Llama Cpp Python Cuda
emoji: 🦙
colorFrom: pink
colorTo: indigo
sdk: docker
pinned: false
duplicated_from: SpacesExamples/llama-cpp-python-cuda-gradio
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,72 @@
import os
import gradio as gr
import copy
import time
import llama_cpp
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


llm = Llama(
    model_path=hf_hub_download(
        repo_id=os.environ.get("REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"),
        filename=os.environ.get("MODEL_FILE", "llama-2-7b-chat.ggmlv3.q5_0.bin"),
    ),
    n_ctx=2048,
    n_gpu_layers=50,  # change n_gpu_layers if you have more or less VRAM
)

history = []

system_message = """
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
"""


def generate_text(message, history):
    temp = ""
    input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
    for interaction in history:
        input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s> [INST] "

    input_prompt = input_prompt + str(message) + " [/INST] "

    output = llm(
        input_prompt,
        temperature=0.15,
        top_p=0.1,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=1024,
        stop=[
            "<|prompter|>",
            "<|endoftext|>",
            "<|endoftext|> \n",
            "ASSISTANT:",
            "USER:",
            "SYSTEM:",
        ],
        stream=True,
    )
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp

    history = ["init", input_prompt]


demo = gr.ChatInterface(
    generate_text,
    title="llama-cpp-python on GPU",
    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
    examples=["tell me everything about llamas"],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)
demo.queue(concurrency_count=1, max_size=5)
demo.launch()
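For reference, generate_text assembles its prompt with the Llama-2 chat template shown above. The following standalone sketch (illustrative system message, history, and user message; not part of app.py) prints the string that would be sent to the model after one prior exchange and one new message:

    # Standalone sketch of the prompt format built in generate_text above.
    # The system message, history, and user message here are illustrative only.
    system_message = "You are a helpful, respectful and honest assistant."
    history = [("Hi there", "Hello! How can I help you today?")]
    message = "Tell me everything about llamas."

    input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
    for user_turn, assistant_turn in history:
        input_prompt += f"{user_turn} [/INST] {assistant_turn} </s><s> [INST] "
    input_prompt += f"{message} [/INST] "

    print(input_prompt)

The served model is also configurable without editing this file: app.py reads REPO_ID and MODEL_FILE from the environment, so a duplicated Space can point hf_hub_download at a different GGML checkpoint by setting those variables (for example as Space variables).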