1littlecoder and radames committed
Commit a88480f · 0 Parent(s)

Duplicate from SpacesExamples/llama-cpp-python-cuda-gradio

Co-authored-by: Radamés Ajna <radames@users.noreply.huggingface.co>

Files changed (4)
  1. .gitattributes +35 -0
  2. Dockerfile +45 -0
  3. README.md +11 -0
  4. app.py +72 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+ # We need to set the host to 0.0.0.0 to allow outside access
+ ENV HOST 0.0.0.0
+
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ COPY . .
+
+ # Set build-related env vars
+ ENV CUDA_DOCKER_ARCH=all
+ ENV LLAMA_CUBLAS=1
+
+ # Install dependencies
+ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings gradio huggingface_hub hf_transfer
+
+ # Install llama-cpp-python (built with CUDA)
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python3", "app.py"]
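
For local testing outside of Spaces, a build-and-run sketch along these lines should work; the image tag is arbitrary, port 7860 is Gradio's default, and --gpus all assumes the NVIDIA Container Toolkit is available on the host:

# Build the image from the directory containing this Dockerfile
docker build -t llama-cpp-python-cuda-gradio .

# Run with GPU access and expose the Gradio UI on port 7860
docker run --gpus all -p 7860:7860 llama-cpp-python-cuda-gradio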
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Llama Cpp Python Cuda
+ emoji: 🦙
+ colorFrom: pink
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ duplicated_from: SpacesExamples/llama-cpp-python-cuda-gradio
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,72 @@
+ import os
+ import gradio as gr
+ import copy
+ import time
+ import llama_cpp
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
+
+ llm = Llama(
+     model_path=hf_hub_download(
+         repo_id=os.environ.get("REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"),
+         filename=os.environ.get("MODEL_FILE", "llama-2-7b-chat.ggmlv3.q5_0.bin"),
+     ),
+     n_ctx=2048,
+     n_gpu_layers=50,  # change n_gpu_layers if you have more or less VRAM
+ )
+
+ history = []
+
+ system_message = """
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+ """
+
+
+ def generate_text(message, history):
+     temp = ""
+     input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
+     for interaction in history:
+         input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s> [INST] "
+
+     input_prompt = input_prompt + str(message) + " [/INST] "
+
+     output = llm(
+         input_prompt,
+         temperature=0.15,
+         top_p=0.1,
+         top_k=40,
+         repeat_penalty=1.1,
+         max_tokens=1024,
+         stop=[
+             "<|prompter|>",
+             "<|endoftext|>",
+             "<|endoftext|> \n",
+             "ASSISTANT:",
+             "USER:",
+             "SYSTEM:",
+         ],
+         stream=True,
+     )
+     for out in output:
+         stream = copy.deepcopy(out)
+         temp += stream["choices"][0]["text"]
+         yield temp
+
+     history = ["init", input_prompt]
+
+
+ demo = gr.ChatInterface(
+     generate_text,
+     title="llama-cpp-python on GPU",
+     description="Running LLM with https://github.com/abetlen/llama-cpp-python",
+     examples=["tell me everything about llamas"],
+     cache_examples=True,
+     retry_btn=None,
+     undo_btn="Delete Previous",
+     clear_btn="Clear",
+ )
+ demo.queue(concurrency_count=1, max_size=5)
+ demo.launch()
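
Because app.py reads REPO_ID and MODEL_FILE from the environment, the Space can be pointed at a different GGML model without touching the code. A minimal sketch, reusing the image tag from the build example above; the 13B repo and filename here are illustrative placeholders, not values taken from this commit:

# Override the model selection via environment variables (placeholder values)
docker run --gpus all -p 7860:7860 \
  -e REPO_ID="TheBloke/Llama-2-13B-chat-GGML" \
  -e MODEL_FILE="llama-2-13b-chat.ggmlv3.q5_0.bin" \
  llama-cpp-python-cuda-gradio

On Spaces itself the same values would normally be set as Space variables; larger models may also need n_gpu_layers and n_ctx in app.py adjusted to fit the available VRAM.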