Radamés Ajna committed
Commit eee4d14
0 Parent(s)

Duplicate from radames/llama-cpp-python-cuda-gradio

Files changed (4)
  1. .gitattributes +35 -0
  2. Dockerfile +45 -0
  3. README.md +11 -0
  4. app.py +70 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+ # Set the host to 0.0.0.0 so the server is reachable from outside the container
+ ENV HOST 0.0.0.0
+
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ COPY . .
+
+ # Set build-related env vars
+ ENV CUDA_DOCKER_ARCH=all
+ ENV LLAMA_CUBLAS=1
+
+ # Install dependencies
+ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings gradio huggingface_hub hf_transfer
+
+ # Install llama-cpp-python (built with CUDA support)
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python3", "app.py"]
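
The pip install line above pulls in hf_transfer alongside huggingface_hub. A minimal sketch of how that faster download backend can be switched on at runtime, assuming the hub library's HF_HUB_ENABLE_HF_TRANSFER toggle (nothing in this commit sets it; the sketch is illustrative only):

```python
import os

# Opt in to the hf_transfer backend before the first download call.
# Assumption: hf_transfer is installed, as in the Dockerfile above.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download

# Same model pull that app.py performs, now routed through hf_transfer.
model_path = hf_hub_download(
    repo_id="TheBloke/WizardLM-7B-uncensored-GGML",
    filename="WizardLM-7B-uncensored.ggmlv3.q4_0.bin",
)
print(model_path)
```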
README.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ title: Llama Cpp Python Cuda
+ emoji: 🏆
+ colorFrom: pink
+ colorTo: indigo
+ sdk: docker
+ pinned: false
+ duplicated_from: radames/llama-cpp-python-cuda-gradio
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,70 @@
+ import gradio as gr
+ import copy
+ import time
+ import ctypes  # for calling the llama.cpp C API directly (unused below)
+ import llama_cpp
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download  # to download models from the Hugging Face Hub
+
+
+ llm = Llama(
+     model_path=hf_hub_download(
+         repo_id="TheBloke/WizardLM-7B-uncensored-GGML",
+         filename="WizardLM-7B-uncensored.ggmlv3.q4_0.bin",
+     ),
+     n_ctx=2048,
+ )  # download the model from the Hub; n_ctx=2048 gives a larger context window
+
+ history = []
+
+ pre_prompt = " The user and the AI are having a conversation : <|endoftext|> \n "
+
+
+ def generate_text(input_text, history):
+
+     temp = ""  # accumulated response text
+     if history == []:  # first turn: prepend the system prompt
+         input_text_with_history = (
+             f"SYSTEM:{pre_prompt}"
+             + "\n"
+             + f"USER: {input_text} "
+             + "\n"
+             + " ASSISTANT:"
+         )
+     else:  # later turns: continue from the last assistant reply
+         input_text_with_history = f"{history[-1][1]}" + "\n"
+         input_text_with_history += f"USER: {input_text}" + "\n" + " ASSISTANT:"
+
+     output = llm(
+         input_text_with_history,
+         max_tokens=1024,
+         stop=[
+             "<|prompter|>",
+             "<|endoftext|>",
+             "<|endoftext|> \n",
+             "ASSISTANT:",
+             "USER:",
+             "SYSTEM:",
+         ],
+         stream=True,
+     )
+     for out in output:  # stream partial completions back to the UI
+         stream = copy.deepcopy(out)
+         temp += stream["choices"][0]["text"]
+         yield temp
+
+     history = ["init", input_text_with_history]  # no-op: gr.ChatInterface tracks history itself
+
+
+ demo = gr.ChatInterface(
+     generate_text,
+     title="llama-cpp-python on GPU",
+     description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming was the hardest part to implement",
+     examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
+     cache_examples=True,
+     retry_btn=None,
+     undo_btn="Delete Previous",
+     clear_btn="Clear",
+ )
+ demo.queue(concurrency_count=1, max_size=5)
+ demo.launch()