sofianhw committed on
Commit 2695082
1 Parent(s): 3d55db5
Files changed (2)
  1. Dockerfile +47 -0
  2. main.py +76 -0
Dockerfile ADDED
@@ -0,0 +1,47 @@
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+ # We need to set the host to 0.0.0.0 to allow outside access
+ ENV HOST 0.0.0.0
+
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ COPY . .
+
+ # Set build-related env vars
+ ENV CUDA_DOCKER_ARCH=all
+ ENV LLAMA_CUBLAS=1
+
+ # Install dependencies
+ RUN python3 -m pip install --upgrade pip pytest cmake \
+     scikit-build setuptools fastapi uvicorn sse-starlette \
+     pydantic-settings huggingface_hub hf_transfer
+
+ # Install llama-cpp-python (built with CUDA)
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,76 @@
+ import os
+ import copy
+ import time
+ import llama_cpp
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
+ import uvicorn
+ from fastapi import FastAPI, Request
+
+
+ llm = Llama(
+     model_path=hf_hub_download(
+         repo_id=os.environ.get("REPO_ID", "TheBloke/Llama-2-7b-Chat-GGUF"),
+         filename=os.environ.get("MODEL_FILE", "llama-2-7b-chat.Q5_0.gguf"),
+     ),
+     n_ctx=2048,
+     n_gpu_layers=50,  # change n_gpu_layers if you have more or less VRAM
+ )
+
+ history = []
+
+ system_message = """
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+ """
+
+
+ def generate_text(message, history):
+     temp = ""
+     input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
+     for interaction in history:
+         input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s> [INST] "
+
+     input_prompt = input_prompt + str(message) + " [/INST] "
+
+     output = llm(
+         input_prompt,
+         temperature=0.15,
+         top_p=0.1,
+         top_k=40,
+         repeat_penalty=1.1,
+         max_tokens=1024,
+         stop=[
+             "<|prompter|>",
+             "<|endoftext|>",
+             "<|endoftext|> \n",
+             "ASSISTANT:",
+             "USER:",
+             "SYSTEM:",
+         ],
+     )
+     # for out in output:
+     #     stream = copy.deepcopy(out)
+     #     temp += stream["choices"][0]["text"]
+     #     yield temp
+
+     history = ["init", input_prompt]
+
+     print(history)
+     print(output)
+     return output
+
+ app = FastAPI()
+
+ @app.post("/api/generate")
+ async def generate(request: Request):
+     # Parse the request body as JSON
+     data = await request.json()
+     # Only generate a reply when a message was provided
+     if data.get("message"):
+         response = generate_text(data["message"], history)
+         return {"status": "success", "data": response}
+     else:
+         # No message was supplied, so ignore the request
+         return {"status": "ignored"}
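For reference, a minimal client sketch for the /api/generate endpoint added above (not part of this commit). It assumes the container is running locally on port 7860, as set by the Dockerfile's CMD, and that the requests package is installed on the client side; the example prompt and timeout are placeholders. Since main.py returns the raw llama-cpp completion object under "data", the generated text is at choices[0]["text"].

import requests

# Hypothetical client for the /api/generate endpoint defined in main.py.
# Assumes the container is reachable at localhost:7860.
resp = requests.post(
    "http://localhost:7860/api/generate",
    json={"message": "What is the capital of France?"},
    timeout=300,  # generation can take a while
)
resp.raise_for_status()
payload = resp.json()

if payload["status"] == "success":
    # "data" holds the llama-cpp completion dict; the text is in choices[0]["text"]
    print(payload["data"]["choices"][0]["text"])
else:
    print("Request ignored:", payload)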