Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- .env +14 -0
- .gitignore +6 -0
- LICENSE +21 -0
- README.md +162 -8
- __pycache__/model.cpython-39.pyc +0 -0
- app.py +322 -0
- env_examples/.env.13b_example +14 -0
- env_examples/.env.7b_8bit_example +14 -0
- env_examples/.env.7b_ggmlv3_q4_0_example +14 -0
- env_examples/.env.7b_gptq_example +14 -0
- gradio_cached_examples/19/Chatbot/tmp2twvboxp.json +1 -0
- gradio_cached_examples/19/Chatbot/tmp73v4irik.json +1 -0
- gradio_cached_examples/19/Chatbot/tmpfl3bd2ut.json +1 -0
- gradio_cached_examples/19/Chatbot/tmppuwn4iw8.json +1 -0
- gradio_cached_examples/19/Chatbot/tmptsok1ajp.json +1 -0
- gradio_cached_examples/19/log.csv +6 -0
- model.py +142 -0
- requirements.txt +11 -0
- static/screenshot.png +0 -0
.env
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL_PATH = "../LLM/Llama-2-7b-chat-hf"
|
2 |
+
LOAD_IN_8BIT = True
|
3 |
+
LOAD_IN_4BIT = False
|
4 |
+
LLAMA_CPP = False
|
5 |
+
|
6 |
+
MAX_MAX_NEW_TOKENS = 2048
|
7 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
8 |
+
MAX_INPUT_TOKEN_LENGTH = 4000
|
9 |
+
|
10 |
+
DEFAULT_SYSTEM_PROMPT = "\
|
11 |
+
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
12 |
+
|
13 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
|
14 |
+
"
|
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
models
|
2 |
+
|
3 |
+
.vscode
|
4 |
+
|
5 |
+
__pycache__
|
6 |
+
gradio_cached_examples
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Tom
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,12 +1,166 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji: 📉
|
4 |
-
colorFrom: gray
|
5 |
-
colorTo: gray
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 3.38.0
|
8 |
app_file: app.py
|
9 |
-
|
|
|
10 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
|
|
|
|
|
|
1 |
---
|
2 |
+
title: llama2-webui
|
|
|
|
|
|
|
|
|
|
|
3 |
app_file: app.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 3.37.0
|
6 |
---
|
7 |
+
# llama2-webui
|
8 |
+
|
9 |
+
Running Llama 2 with gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac). Supporting Llama 2 7B, 13B, 70B with 8-bit, 4-bit mode. Supporting GPU inference with at least 6 GB VRAM, and CPU inference with at least 6 GB RAM.
|
10 |
+
|
11 |
+
![screenshot](./static/screenshot.png)
|
12 |
+
|
13 |
+
## Features
|
14 |
+
|
15 |
+
- Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
|
16 |
+
- Supporting model backends
|
17 |
+
- Nvidia GPU: transformers, [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ)
|
18 |
+
- GPU inference with at least 6 GB VRAM
|
19 |
+
|
20 |
+
- CPU, Mac/AMD GPU: [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
21 |
+
- CPU inference [Demo](https://twitter.com/liltom_eth/status/1682791729207070720?s=20) on Macbook Air.
|
22 |
+
|
23 |
+
- Web UI interface: gradio
|
24 |
+
|
25 |
+
## Contents
|
26 |
+
|
27 |
+
- [Install](#install)
|
28 |
+
- [Download Llama-2 Models](#download-llama-2-models)
|
29 |
+
- [Model List](#model-list)
|
30 |
+
- [Download Script](#download-script)
|
31 |
+
- [Usage](#usage)
|
32 |
+
- [Config Examples](#config-examples)
|
33 |
+
- [Start Web UI](#start-web-ui)
|
34 |
+
- [Run on Nvidia GPU](#run-on-nvidia-gpu)
|
35 |
+
- [Run on Low Memory GPU with 8 bit](#run-on-low-memory-gpu-with-8-bit)
|
36 |
+
- [Run on Low Memory GPU with 4 bit](#run-on-low-memory-gpu-with-4-bit)
|
37 |
+
- [Run on CPU](#run-on-cpu)
|
38 |
+
- [Mac GPU and AMD/Nvidia GPU Acceleration](#mac-gpu-and-amdnvidia-gpu-acceleration)
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
## Install
|
43 |
+
|
44 |
+
```
|
45 |
+
pip install -r requirements.txt
|
46 |
+
```
|
47 |
+
|
48 |
+
`bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this:
|
49 |
+
|
50 |
+
- `pip install bitsandbytes==0.38.1`
|
51 |
+
|
52 |
+
If run on CPU, install llama.cpp additionally by `pip install llama-cpp-python`.
|
53 |
+
|
54 |
+
## Download Llama-2 Models
|
55 |
+
|
56 |
+
Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters.
|
57 |
+
|
58 |
+
Llama-2-7b-Chat-GPTQ contains the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 models require less GPU VRAM to run.
|
59 |
+
|
60 |
+
### Model List
|
61 |
+
|
62 |
+
| Model Name | set MODEL_PATH in .env | Download URL |
|
63 |
+
| ------------------------------ | ---------------------------------------- | ------------------------------------------------------------ |
|
64 |
+
| meta-llama/Llama-2-7b-chat-hf | /path-to/Llama-2-7b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf) |
|
65 |
+
| meta-llama/Llama-2-13b-chat-hf | /path-to/Llama-2-13b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf) |
|
66 |
+
| meta-llama/Llama-2-70b-chat-hf | /path-to/Llama-2-70b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf) |
|
67 |
+
| meta-llama/Llama-2-7b-hf | /path-to/Llama-2-7b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) |
|
68 |
+
| meta-llama/Llama-2-13b-hf | /path-to/Llama-2-13b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf) |
|
69 |
+
| meta-llama/Llama-2-70b-hf | /path-to/Llama-2-70b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf) |
|
70 |
+
| TheBloke/Llama-2-7b-Chat-GPTQ | /path-to/Llama-2-7b-Chat-GPTQ | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) |
|
71 |
+
| TheBloke/Llama-2-7B-Chat-GGML | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) |
|
72 |
+
| ... | ... | ... |
|
73 |
+
|
74 |
+
Running 4-bit model `Llama-2-7b-Chat-GPTQ` needs GPU with 6GB VRAM.
|
75 |
+
|
76 |
+
Running 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs CPU with 6GB RAM. There is also a list of other 2, 3, 4, 5, 6, 8-bit GGML models that can be used from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML).
|
77 |
+
|
78 |
+
### Download Script
|
79 |
+
|
80 |
+
These models can be downloaded from the link using CMD like:
|
81 |
+
|
82 |
+
```bash
|
83 |
+
# Make sure you have git-lfs installed (https://git-lfs.com)
|
84 |
+
git lfs install
|
85 |
+
git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf
|
86 |
+
```
|
87 |
+
|
88 |
+
To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also enable access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests will be processed in hours.
|
89 |
+
|
90 |
+
For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can directly download without requesting access.
|
91 |
+
|
92 |
+
For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can directly download without requesting access.
|
93 |
+
|
94 |
+
## Usage
|
95 |
+
|
96 |
+
### Config Examples
|
97 |
+
|
98 |
+
Set up your `MODEL_PATH` and model configs in the `.env` file.
|
99 |
+
|
100 |
+
There are some examples in `./env_examples/` folder.
|
101 |
+
|
102 |
+
| Model Setup | Example .env |
|
103 |
+
| --------------------------------- | --------------------------- |
|
104 |
+
| Llama-2-7b-chat-hf 8-bit on GPU | .env.7b_8bit_example |
|
105 |
+
| Llama-2-7b-Chat-GPTQ 4-bit on GPU | .env.7b_gptq_example |
|
106 |
+
| Llama-2-7B-Chat-GGML 4bit on CPU | .env.7b_ggmlv3_q4_0_example |
|
107 |
+
| Llama-2-13b-chat-hf on GPU | .env.13b_example |
|
108 |
+
| ... | ... |
|
109 |
+
|
110 |
+
### Start Web UI
|
111 |
+
|
112 |
+
Run chatbot with web UI:
|
113 |
+
|
114 |
+
```
|
115 |
+
python app.py
|
116 |
+
```
|
117 |
+
|
118 |
+
### Run on Nvidia GPU
|
119 |
+
|
120 |
+
Running the model requires around 14GB of GPU VRAM for Llama-2-7b and 28GB of GPU VRAM for Llama-2-13b.
|
121 |
+
|
122 |
+
If you are running on multiple GPUs, the model will be loaded automatically on GPUs and split the VRAM usage. That allows you to run Llama-2-7b (requires 14GB of GPU VRAM) on a setup like 2 GPUs (11GB VRAM each).
|
123 |
+
|
124 |
+
#### Run on Low Memory GPU with 8 bit
|
125 |
+
|
126 |
+
If you do not have enough memory, you can set up your `LOAD_IN_8BIT` as `True` in `.env`. This can reduce memory usage by around half with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backend.
|
127 |
+
|
128 |
+
Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080Ti, RTX 4080, T4, V100 (16GB).
|
129 |
+
|
130 |
+
#### Run on Low Memory GPU with 4 bit
|
131 |
+
|
132 |
+
If you want to run 4 bit Llama-2 model like `Llama-2-7b-Chat-GPTQ`, you can set up your `LOAD_IN_4BIT` as `True` in `.env` like example `.env.7b_gptq_example`.
|
133 |
+
|
134 |
+
Make sure you have downloaded the 4-bit model from `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and arguments in `.env` file.
|
135 |
+
|
136 |
+
`Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM.
|
137 |
+
|
138 |
+
### Run on CPU
|
139 |
+
|
140 |
+
Running a Llama-2 model on CPU requires the [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and the [llama.cpp Python Bindings](https://github.com/abetlen/llama-cpp-python).
|
141 |
+
|
142 |
+
```bash
|
143 |
+
pip install llama-cpp-python
|
144 |
+
```
|
145 |
+
|
146 |
+
Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following [Download Llama-2 Models](#download-llama-2-models) section. `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB RAM to run on CPU.
|
147 |
+
|
148 |
+
Set up configs like `.env.7b_ggmlv3_q4_0_example` from `env_examples` as `.env`.
|
149 |
+
|
150 |
+
Run web UI `python app.py` .
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
+
#### Mac GPU and AMD/Nvidia GPU Acceleration
|
155 |
+
|
156 |
+
If you would like to use Mac GPU and AMD/Nvidia GPU for acceleration, check these:
|
157 |
+
|
158 |
+
- [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)
|
159 |
+
|
160 |
+
- [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md)
|
161 |
+
|
162 |
+
## Credits
|
163 |
|
164 |
+
- https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
|
165 |
+
- https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat
|
166 |
+
- https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ
|
__pycache__/model.cpython-39.pyc
ADDED
Binary file (4.04 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Iterator
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from distutils.util import strtobool
|
8 |
+
|
9 |
+
from model import LLAMA2_WRAPPER
|
10 |
+
|
11 |
+
load_dotenv()
|
12 |
+
|
13 |
+
DEFAULT_SYSTEM_PROMPT = (
|
14 |
+
os.getenv("DEFAULT_SYSTEM_PROMPT")
|
15 |
+
if os.getenv("DEFAULT_SYSTEM_PROMPT") is not None
|
16 |
+
else ""
|
17 |
+
)
|
18 |
+
# Upper bound on the number of new tokens a single request may ask for; used as
# the maximum of the "Max new tokens" slider and enforced in generate().
# Read from the MAX_MAX_NEW_TOKENS environment variable, defaulting to 2048.
# The guard must test the same variable that is parsed: testing
# DEFAULT_MAX_NEW_TOKENS instead made int(None) raise TypeError whenever only
# DEFAULT_MAX_NEW_TOKENS was set, and silently ignored a MAX_MAX_NEW_TOKENS that
# was set on its own.
MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", "2048"))
|
23 |
+
DEFAULT_MAX_NEW_TOKENS = (
|
24 |
+
int(os.getenv("DEFAULT_MAX_NEW_TOKENS"))
|
25 |
+
if os.getenv("DEFAULT_MAX_NEW_TOKENS") is not None
|
26 |
+
else 1024
|
27 |
+
)
|
28 |
+
MAX_INPUT_TOKEN_LENGTH = (
|
29 |
+
int(os.getenv("MAX_INPUT_TOKEN_LENGTH"))
|
30 |
+
if os.getenv("MAX_INPUT_TOKEN_LENGTH") is not None
|
31 |
+
else 4000
|
32 |
+
)
|
33 |
+
|
34 |
+
MODEL_PATH = os.getenv("MODEL_PATH")
|
35 |
+
assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
|
36 |
+
|
37 |
+
LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))
|
38 |
+
|
39 |
+
LOAD_IN_4BIT = bool(strtobool(os.getenv("LOAD_IN_4BIT", "True")))
|
40 |
+
|
41 |
+
LLAMA_CPP = bool(strtobool(os.getenv("LLAMA_CPP", "True")))
|
42 |
+
|
43 |
+
if LLAMA_CPP:
|
44 |
+
print("Running on CPU with llama.cpp.")
|
45 |
+
else:
|
46 |
+
import torch
|
47 |
+
|
48 |
+
if torch.cuda.is_available():
|
49 |
+
print("Running on GPU with torch transformers.")
|
50 |
+
else:
|
51 |
+
print("CUDA not found.")
|
52 |
+
|
53 |
+
config = {
|
54 |
+
"model_name": MODEL_PATH,
|
55 |
+
"load_in_8bit": LOAD_IN_8BIT,
|
56 |
+
"load_in_4bit": LOAD_IN_4BIT,
|
57 |
+
"llama_cpp": LLAMA_CPP,
|
58 |
+
"MAX_INPUT_TOKEN_LENGTH": MAX_INPUT_TOKEN_LENGTH,
|
59 |
+
}
|
60 |
+
llama2_wrapper = LLAMA2_WRAPPER(config)
|
61 |
+
llama2_wrapper.init_tokenizer()
|
62 |
+
llama2_wrapper.init_model()
|
63 |
+
|
64 |
+
DESCRIPTION = """
|
65 |
+
# llama2-webui
|
66 |
+
|
67 |
+
This is a chatbot based on Llama-2.
|
68 |
+
- Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
|
69 |
+
- Supporting model backends
|
70 |
+
- Nvidia GPU(at least 6 GB VRAM): tranformers, [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ)
|
71 |
+
- CPU(at least 6 GB RAM), Mac/AMD GPU: [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
72 |
+
"""
|
73 |
+
|
74 |
+
|
75 |
+
def clear_and_save_textbox(message: str) -> tuple[str, str]:
    """Return an empty string followed by ``message`` unchanged.

    The submit handlers map the two results onto ``[textbox, saved_input]``,
    so the first value blanks the textbox and the second keeps the text that
    was just submitted.
    """
    cleared_text = ""
    return cleared_text, message
|
77 |
+
|
78 |
+
|
79 |
+
def display_input(
    message: str, history: list[tuple[str, str]]
) -> list[tuple[str, str]]:
    """Add a pending turn for ``message`` to ``history`` and return the list.

    The new entry pairs the message with an empty reply string. ``history``
    is modified in place and the same list object is returned.
    """
    pending_turn = (message, "")
    history.append(pending_turn)
    return history
|
84 |
+
|
85 |
+
|
86 |
+
def delete_prev_fn(history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
    """Remove the most recent turn from ``history`` and return its user message.

    Returns the (mutated) history together with the message of the removed
    turn. An empty history is returned untouched with ``""`` as the message,
    and a falsy stored message (e.g. ``None``) is also reported as ``""``.
    """
    try:
        previous_message, _reply = history.pop()
    except IndexError:
        # Nothing to remove: there is no previous message to hand back.
        previous_message = ""
    if not previous_message:
        previous_message = ""
    return history, previous_message
|
92 |
+
|
93 |
+
|
94 |
+
def generate(
    message: str,
    history_with_input: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
) -> Iterator[list[tuple[str, str]]]:
    """Yield successive chat histories while the model answers ``message``.

    Args:
        message: The user message to answer.
        history_with_input: Chat history whose last entry is dropped before
            the model is called (the UI event chains run ``display_input``
            first, which appends the ``(message, "")`` placeholder there).
        system_prompt: System prompt forwarded to the model wrapper.
        max_new_tokens: Requested generation budget; must not exceed
            ``MAX_MAX_NEW_TOKENS``.
        temperature: Sampling temperature forwarded to the model wrapper.
        top_p: Nucleus-sampling parameter forwarded to the model wrapper.
        top_k: Top-k sampling parameter forwarded to the model wrapper.

    Yields:
        The prior history plus one ``(message, response)`` pair for every
        response produced by ``llama2_wrapper.run``. If the wrapper produces
        nothing, a single history ending in ``(message, "")`` is yielded.

    Raises:
        ValueError: If ``max_new_tokens`` is greater than ``MAX_MAX_NEW_TOKENS``.
    """
    if max_new_tokens > MAX_MAX_NEW_TOKENS:
        raise ValueError(
            f"max_new_tokens ({max_new_tokens}) exceeds the allowed maximum "
            f"({MAX_MAX_NEW_TOKENS})."
        )

    history = history_with_input[:-1]
    generator = llama2_wrapper.run(
        message, history, system_prompt, max_new_tokens, temperature, top_p, top_k
    )
    # Pull the first response separately so that an empty stream still yields
    # one history entry; only the next() call is guarded.
    try:
        first_response = next(generator)
    except StopIteration:
        yield history + [(message, "")]
    else:
        yield history + [(message, first_response)]
    for response in generator:
        yield history + [(message, response)]
|
117 |
+
|
118 |
+
|
119 |
+
def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
    """Run one example prompt to completion and return its final chat history.

    Calls ``generate`` with an empty history, the default system prompt and
    fixed sampling settings (temperature 1, top_p 0.95, top_k 50), drains the
    stream, and returns ``("", final_history)``.
    """
    # Clamp the example budget to the configured ceiling: generate() raises
    # ValueError for requests above MAX_MAX_NEW_TOKENS, which would otherwise
    # happen here whenever that limit is configured below 1024.
    max_new_tokens = min(1024, MAX_MAX_NEW_TOKENS)
    final_history: list[tuple[str, str]] = []
    for final_history in generate(
        message, [], DEFAULT_SYSTEM_PROMPT, max_new_tokens, 1, 0.95, 50
    ):
        pass  # only the last streamed history is needed
    return "", final_history
|
124 |
+
|
125 |
+
|
126 |
+
def check_input_token_length(
    message: str, chat_history: list[tuple[str, str]], system_prompt: str
) -> None:
    """Reject a request whose input exceeds ``MAX_INPUT_TOKEN_LENGTH``.

    The length is obtained from ``llama2_wrapper.get_input_token_length`` for
    the message, chat history and system prompt together.

    Raises:
        gr.Error: If the reported length is greater than
            ``MAX_INPUT_TOKEN_LENGTH``.
    """
    input_token_length = llama2_wrapper.get_input_token_length(
        message, chat_history, system_prompt
    )
    within_limit = not input_token_length > MAX_INPUT_TOKEN_LENGTH
    if within_limit:
        return
    raise gr.Error(
        f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
    )
|
136 |
+
|
137 |
+
|
138 |
+
with gr.Blocks(css="style.css") as demo:
|
139 |
+
gr.Markdown(DESCRIPTION)
|
140 |
+
|
141 |
+
with gr.Group():
|
142 |
+
chatbot = gr.Chatbot(label="Chatbot")
|
143 |
+
with gr.Row():
|
144 |
+
textbox = gr.Textbox(
|
145 |
+
container=False,
|
146 |
+
show_label=False,
|
147 |
+
placeholder="Type a message...",
|
148 |
+
scale=10,
|
149 |
+
)
|
150 |
+
submit_button = gr.Button("Submit", variant="primary", scale=1, min_width=0)
|
151 |
+
with gr.Row():
|
152 |
+
retry_button = gr.Button("🔄 Retry", variant="secondary")
|
153 |
+
undo_button = gr.Button("↩️ Undo", variant="secondary")
|
154 |
+
clear_button = gr.Button("🗑️ Clear", variant="secondary")
|
155 |
+
|
156 |
+
saved_input = gr.State()
|
157 |
+
|
158 |
+
with gr.Accordion(label="Advanced options", open=False):
|
159 |
+
system_prompt = gr.Textbox(
|
160 |
+
label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6
|
161 |
+
)
|
162 |
+
max_new_tokens = gr.Slider(
|
163 |
+
label="Max new tokens",
|
164 |
+
minimum=1,
|
165 |
+
maximum=MAX_MAX_NEW_TOKENS,
|
166 |
+
step=1,
|
167 |
+
value=DEFAULT_MAX_NEW_TOKENS,
|
168 |
+
)
|
169 |
+
temperature = gr.Slider(
|
170 |
+
label="Temperature",
|
171 |
+
minimum=0.1,
|
172 |
+
maximum=4.0,
|
173 |
+
step=0.1,
|
174 |
+
value=1.0,
|
175 |
+
)
|
176 |
+
top_p = gr.Slider(
|
177 |
+
label="Top-p (nucleus sampling)",
|
178 |
+
minimum=0.05,
|
179 |
+
maximum=1.0,
|
180 |
+
step=0.05,
|
181 |
+
value=0.95,
|
182 |
+
)
|
183 |
+
top_k = gr.Slider(
|
184 |
+
label="Top-k",
|
185 |
+
minimum=1,
|
186 |
+
maximum=1000,
|
187 |
+
step=1,
|
188 |
+
value=50,
|
189 |
+
)
|
190 |
+
|
191 |
+
gr.Examples(
|
192 |
+
examples=[
|
193 |
+
"Hello there! How are you doing?",
|
194 |
+
"Can you explain briefly to me what is the Python programming language?",
|
195 |
+
"Explain the plot of Cinderella in a sentence.",
|
196 |
+
"How many hours does it take a man to eat a Helicopter?",
|
197 |
+
"Write a 100-word article on 'Benefits of Open-Source in AI research'",
|
198 |
+
],
|
199 |
+
inputs=textbox,
|
200 |
+
outputs=[textbox, chatbot],
|
201 |
+
fn=process_example,
|
202 |
+
cache_examples=True,
|
203 |
+
)
|
204 |
+
|
205 |
+
textbox.submit(
|
206 |
+
fn=clear_and_save_textbox,
|
207 |
+
inputs=textbox,
|
208 |
+
outputs=[textbox, saved_input],
|
209 |
+
api_name=False,
|
210 |
+
queue=False,
|
211 |
+
).then(
|
212 |
+
fn=display_input,
|
213 |
+
inputs=[saved_input, chatbot],
|
214 |
+
outputs=chatbot,
|
215 |
+
api_name=False,
|
216 |
+
queue=False,
|
217 |
+
).then(
|
218 |
+
fn=check_input_token_length,
|
219 |
+
inputs=[saved_input, chatbot, system_prompt],
|
220 |
+
api_name=False,
|
221 |
+
queue=False,
|
222 |
+
).success(
|
223 |
+
fn=generate,
|
224 |
+
inputs=[
|
225 |
+
saved_input,
|
226 |
+
chatbot,
|
227 |
+
system_prompt,
|
228 |
+
max_new_tokens,
|
229 |
+
temperature,
|
230 |
+
top_p,
|
231 |
+
top_k,
|
232 |
+
],
|
233 |
+
outputs=chatbot,
|
234 |
+
api_name=False,
|
235 |
+
)
|
236 |
+
|
237 |
+
button_event_preprocess = (
|
238 |
+
submit_button.click(
|
239 |
+
fn=clear_and_save_textbox,
|
240 |
+
inputs=textbox,
|
241 |
+
outputs=[textbox, saved_input],
|
242 |
+
api_name=False,
|
243 |
+
queue=False,
|
244 |
+
)
|
245 |
+
.then(
|
246 |
+
fn=display_input,
|
247 |
+
inputs=[saved_input, chatbot],
|
248 |
+
outputs=chatbot,
|
249 |
+
api_name=False,
|
250 |
+
queue=False,
|
251 |
+
)
|
252 |
+
.then(
|
253 |
+
fn=check_input_token_length,
|
254 |
+
inputs=[saved_input, chatbot, system_prompt],
|
255 |
+
api_name=False,
|
256 |
+
queue=False,
|
257 |
+
)
|
258 |
+
.success(
|
259 |
+
fn=generate,
|
260 |
+
inputs=[
|
261 |
+
saved_input,
|
262 |
+
chatbot,
|
263 |
+
system_prompt,
|
264 |
+
max_new_tokens,
|
265 |
+
temperature,
|
266 |
+
top_p,
|
267 |
+
top_k,
|
268 |
+
],
|
269 |
+
outputs=chatbot,
|
270 |
+
api_name=False,
|
271 |
+
)
|
272 |
+
)
|
273 |
+
|
274 |
+
retry_button.click(
|
275 |
+
fn=delete_prev_fn,
|
276 |
+
inputs=chatbot,
|
277 |
+
outputs=[chatbot, saved_input],
|
278 |
+
api_name=False,
|
279 |
+
queue=False,
|
280 |
+
).then(
|
281 |
+
fn=display_input,
|
282 |
+
inputs=[saved_input, chatbot],
|
283 |
+
outputs=chatbot,
|
284 |
+
api_name=False,
|
285 |
+
queue=False,
|
286 |
+
).then(
|
287 |
+
fn=generate,
|
288 |
+
inputs=[
|
289 |
+
saved_input,
|
290 |
+
chatbot,
|
291 |
+
system_prompt,
|
292 |
+
max_new_tokens,
|
293 |
+
temperature,
|
294 |
+
top_p,
|
295 |
+
top_k,
|
296 |
+
],
|
297 |
+
outputs=chatbot,
|
298 |
+
api_name=False,
|
299 |
+
)
|
300 |
+
|
301 |
+
undo_button.click(
|
302 |
+
fn=delete_prev_fn,
|
303 |
+
inputs=chatbot,
|
304 |
+
outputs=[chatbot, saved_input],
|
305 |
+
api_name=False,
|
306 |
+
queue=False,
|
307 |
+
).then(
|
308 |
+
fn=lambda x: x,
|
309 |
+
inputs=[saved_input],
|
310 |
+
outputs=textbox,
|
311 |
+
api_name=False,
|
312 |
+
queue=False,
|
313 |
+
)
|
314 |
+
|
315 |
+
clear_button.click(
|
316 |
+
fn=lambda: ([], ""),
|
317 |
+
outputs=[chatbot, saved_input],
|
318 |
+
queue=False,
|
319 |
+
api_name=False,
|
320 |
+
)
|
321 |
+
|
322 |
+
demo.queue(max_size=20).launch(share=True)
|
env_examples/.env.13b_example
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL_PATH = "/path-to/Llama-2-13b-chat-hf"
|
2 |
+
LOAD_IN_8BIT = False
|
3 |
+
LOAD_IN_4BIT = False
|
4 |
+
LLAMA_CPP = False
|
5 |
+
|
6 |
+
MAX_MAX_NEW_TOKENS = 2048
|
7 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
8 |
+
MAX_INPUT_TOKEN_LENGTH = 4000
|
9 |
+
|
10 |
+
DEFAULT_SYSTEM_PROMPT = "\
|
11 |
+
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
12 |
+
|
13 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
|
14 |
+
"
|
env_examples/.env.7b_8bit_example
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL_PATH = "/path-to/Llama-2-7b-chat-hf"
|
2 |
+
LOAD_IN_8BIT = True
|
3 |
+
LOAD_IN_4BIT = False
|
4 |
+
LLAMA_CPP = False
|
5 |
+
|
6 |
+
MAX_MAX_NEW_TOKENS = 2048
|
7 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
8 |
+
MAX_INPUT_TOKEN_LENGTH = 4000
|
9 |
+
|
10 |
+
DEFAULT_SYSTEM_PROMPT = "\
|
11 |
+
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
12 |
+
|
13 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
|
14 |
+
"
|
env_examples/.env.7b_ggmlv3_q4_0_example
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL_PATH = "/path-to/llama-2-7b-chat.ggmlv3.q4_0.bin"
|
2 |
+
LOAD_IN_8BIT = False
|
3 |
+
LOAD_IN_4BIT = True
|
4 |
+
LLAMA_CPP = True
|
5 |
+
|
6 |
+
MAX_MAX_NEW_TOKENS = 2048
|
7 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
8 |
+
MAX_INPUT_TOKEN_LENGTH = 4000
|
9 |
+
|
10 |
+
DEFAULT_SYSTEM_PROMPT = "\
|
11 |
+
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
12 |
+
|
13 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
|
14 |
+
"
|
env_examples/.env.7b_gptq_example
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL_PATH = "/path-to/Llama-2-7b-Chat-GPTQ"
|
2 |
+
LOAD_IN_8BIT = False
|
3 |
+
LOAD_IN_4BIT = True
|
4 |
+
LLAMA_CPP = False
|
5 |
+
|
6 |
+
MAX_MAX_NEW_TOKENS = 2048
|
7 |
+
DEFAULT_MAX_NEW_TOKENS = 1024
|
8 |
+
MAX_INPUT_TOKEN_LENGTH = 4000
|
9 |
+
|
10 |
+
DEFAULT_SYSTEM_PROMPT = "\
|
11 |
+
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
12 |
+
|
13 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
|
14 |
+
"
|
gradio_cached_examples/19/Chatbot/tmp2twvboxp.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[["Explain the plot of Cinderella in a sentence.", "Of course! The plot of Cinderella is a classic tale about a kind and hardworking young woman who, despite facing unjust treatment from her stepmother and stepsisters, ultimately finds true love and happiness with her prince charming."]]
|
gradio_cached_examples/19/Chatbot/tmp73v4irik.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[["Can you explain briefly to me what is the Python programming language?", "Of course! Python is a high-level, interpreted programming language that is widely used for various purposes, including web development, data analysis, artificial intelligence, and more. It is known for its simplicity, readability, and ease of use, making it a great language for beginners and experienced programmers alike. Python is also a versatile language that can be used for a wide range of applications, such as building websites, creating games, and developing machine learning models. Its large and active community is a big plus, as it offers a vast number of libraries and resources for solving various problems. Overall, Python is a powerful and popular language that is here to stay! \ud83d\ude0a"]]
|
gradio_cached_examples/19/Chatbot/tmpfl3bd2ut.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[["How many hours does it take a man to eat a Helicopter?", "Thank you for asking! I'm happy to help you with your question. However, I must respectfully point out that it is not possible for a man to eat a helicopter. Helicopters are machines, not food, and they do not have any edible parts. Therefore, the amount of time it would take a man to eat a helicopter is zero hours, as it is not a feasible or safe action to perform. Is there anything else I can help you with?"]]
|
gradio_cached_examples/19/Chatbot/tmppuwn4iw8.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[["Hello there! How are you doing?", "Hello there! *smiling* I'm just an AI, I don't have feelings or emotions like humans do, but I'm here to help you with any questions or concerns you may have! How can I assist you today? Please feel free to ask anything, and I'll do my best to provide helpful and respectful responses. Is there something specific you'd like to talk about or ask about?"]]
|
gradio_cached_examples/19/Chatbot/tmptsok1ajp.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[["Write a 100-word article on 'Benefits of Open-Source in AI research'", "Open-source principles in AI research promote collaboration, transparency, and inclusivity. By making AI models and frameworks available to the public, researchers can accelerate innovation, improve model performance, and reduce duplication of effort. Open-source AI also allows for greater diversity and inclusivity in the development process, as anyone can contribute to the project. Furthermore, open-source AI can enhance accountability and trustworthiness, as the source code is publicly available for scrutiny. Ultimately, open-source AI can lead to more reliable and ethical AI solutions, which is essential for widespread adoption."]]
|
gradio_cached_examples/19/log.csv
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
component 0,Chatbot,flag,username,timestamp
|
2 |
+
,/workspace/llama2-webui/gradio_cached_examples/19/Chatbot/tmppuwn4iw8.json,,,2023-07-25 02:54:38.591953
|
3 |
+
,/workspace/llama2-webui/gradio_cached_examples/19/Chatbot/tmp73v4irik.json,,,2023-07-25 02:54:56.629963
|
4 |
+
,/workspace/llama2-webui/gradio_cached_examples/19/Chatbot/tmp2twvboxp.json,,,2023-07-25 02:55:03.113231
|
5 |
+
,/workspace/llama2-webui/gradio_cached_examples/19/Chatbot/tmpfl3bd2ut.json,,,2023-07-25 02:55:16.443071
|
6 |
+
,/workspace/llama2-webui/gradio_cached_examples/19/Chatbot/tmptsok1ajp.json,,,2023-07-25 02:55:34.266990
|
model.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from threading import Thread
|
2 |
+
from typing import Iterator
|
3 |
+
|
4 |
+
|
5 |
+
class LLAMA2_WRAPPER:
    """Load a Llama 2 chat model and stream chat completions from it.

    The backend is selected from ``config``: llama.cpp when ``llama_cpp`` is
    truthy, AutoGPTQ when ``load_in_4bit`` is truthy, and Hugging Face
    transformers otherwise. Heavy third-party imports are done lazily inside
    the branch that needs them, so only the chosen backend must be installed.
    """

    def __init__(self, config: dict = None):
        """Store the configuration; nothing is loaded until ``init_*`` is called.

        Args:
            config: Settings dict. Keys read by this class are ``model_name``,
                ``load_in_8bit``, ``load_in_4bit``, ``llama_cpp`` and
                ``MAX_INPUT_TOKEN_LENGTH``. When omitted, a fresh empty dict
                is used.
        """
        # A ``{}`` default would be one dict shared by every instance created
        # without an explicit config, so build a new one per instance.
        self.config = {} if config is None else config
        self.model = None
        self.tokenizer = None

    def init_model(self):
        """Create the model on first use and switch non-llama.cpp models to eval mode."""
        if self.model is None:
            self.model = LLAMA2_WRAPPER.create_llama2_model(
                self.config,
            )
        if not self.config.get("llama_cpp"):
            self.model.eval()

    def init_tokenizer(self):
        """Create the tokenizer on first use; skipped for the llama.cpp backend."""
        if self.tokenizer is None and not self.config.get("llama_cpp"):
            self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.config)

    @classmethod
    def create_llama2_model(cls, config):
        """Build and return the model object for the backend chosen in ``config``.

        Args:
            config: Settings dict (see ``__init__``). ``load_in_8bit`` defaults
                to True and ``load_in_4bit`` / ``llama_cpp`` default to False
                when absent.

        Returns:
            A ``llama_cpp.Llama``, an AutoGPTQ model, or a transformers
            ``AutoModelForCausalLM`` instance, depending on the flags.
        """
        model_name = config.get("model_name")
        load_in_8bit = config.get("load_in_8bit", True)
        load_in_4bit = config.get("load_in_4bit", False)
        llama_cpp = config.get("llama_cpp", False)
        if llama_cpp:
            from llama_cpp import Llama

            # Context window and batch size are both taken from the
            # configured maximum input length.
            model = Llama(
                model_path=model_name,
                n_ctx=config.get("MAX_INPUT_TOKEN_LENGTH"),
                n_batch=config.get("MAX_INPUT_TOKEN_LENGTH"),
            )
        elif load_in_4bit:
            from auto_gptq import AutoGPTQForCausalLM

            model = AutoGPTQForCausalLM.from_quantized(
                model_name,
                use_safetensors=True,
                trust_remote_code=True,
                device="cuda:0",
                use_triton=False,
                quantize_config=None,
            )
        else:
            import torch
            from transformers import AutoModelForCausalLM

            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                torch_dtype=torch.float16,
                load_in_8bit=load_in_8bit,
            )
        return model

    @classmethod
    def create_llama2_tokenizer(cls, config):
        """Return the transformers tokenizer for ``config["model_name"]``."""
        model_name = config.get("model_name")
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return tokenizer

    def get_input_token_length(
        self, message: str, chat_history: list[tuple[str, str]], system_prompt: str
    ) -> int:
        """Return how many tokens the fully formatted prompt occupies.

        Args:
            message: The new user message.
            chat_history: Earlier ``(user_input, response)`` pairs.
            system_prompt: System prompt placed at the start of the prompt.

        Returns:
            Token count of the prompt built by ``get_prompt``, measured with
            the active backend's tokenizer.
        """
        prompt = get_prompt(message, chat_history, system_prompt)

        if self.config.get("llama_cpp"):
            input_ids = self.model.tokenize(bytes(prompt, "utf-8"))
            return len(input_ids)
        else:
            input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"]
            return input_ids.shape[-1]

    def run(
        self,
        message: str,
        chat_history: list[tuple[str, str]],
        system_prompt: str,
        max_new_tokens: int = 1024,
        temperature: float = 0.8,
        top_p: float = 0.95,
        top_k: int = 50,
    ) -> Iterator[str]:
        """Stream the model's reply to ``message``.

        Args:
            message: The new user message.
            chat_history: Earlier ``(user_input, response)`` pairs.
            system_prompt: System prompt placed at the start of the prompt.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature.
            top_p: Nucleus sampling threshold.
            top_k: Top-k sampling cutoff.

        Yields:
            The reply accumulated so far; each yielded string extends the
            previous one, so the last value is the complete reply.
        """
        prompt = get_prompt(message, chat_history, system_prompt)
        if self.config.get("llama_cpp"):
            inputs = self.model.tokenize(bytes(prompt, "utf-8"))
            generate_kwargs = dict(
                top_p=top_p,
                top_k=top_k,
                temp=temperature,
            )

            generator = self.model.generate(inputs, **generate_kwargs)
            # Accumulate raw bytes and decode the whole buffer each time: one
            # character may span several tokens, so decoding token by token
            # with strict UTF-8 can raise UnicodeDecodeError. Ignoring errors
            # simply holds back an incomplete trailing sequence until the
            # remaining bytes arrive.
            raw_reply = b""
            for produced, token in enumerate(generator):
                # The llama.cpp generator has no token limit of its own, so
                # enforce max_new_tokens here in addition to stopping at EOS.
                if produced >= max_new_tokens or token == self.model.token_eos():
                    break
                raw_reply += self.model.detokenize([token])
                yield raw_reply.decode("utf-8", errors="ignore")
        else:
            from transformers import TextIteratorStreamer

            inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")

            streamer = TextIteratorStreamer(
                self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
            )
            generate_kwargs = dict(
                inputs,
                streamer=streamer,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=top_p,
                top_k=top_k,
                temperature=temperature,
                num_beams=1,
            )
            # generate() blocks until it finishes, so run it in a background
            # thread and consume decoded text from the streamer as it arrives.
            t = Thread(target=self.model.generate, kwargs=generate_kwargs)
            t.start()

            outputs = []
            for text in streamer:
                outputs.append(text)
                yield "".join(outputs)
|
133 |
+
|
134 |
+
|
135 |
+
def get_prompt(
    message: str, chat_history: list[tuple[str, str]], system_prompt: str
) -> str:
    """Format a conversation as a single Llama 2 chat prompt string.

    Args:
        message: The new user message; surrounding whitespace is stripped.
        chat_history: Earlier ``(user_input, response)`` pairs in order; both
            elements are stripped before being inserted.
        system_prompt: Text placed verbatim inside the ``<<SYS>>`` section.

    Returns:
        The system header, one ``[INST] ... [/INST] ...`` segment per past
        turn, and the new message terminated by ``[/INST]``.
    """
    header = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
    # Each completed turn closes its sequence and opens the next instruction.
    past_turns = "".join(
        f"{question.strip()} [/INST] {answer.strip()} </s><s> [INST] "
        for question, answer in chat_history
    )
    return f"{header}{past_turns}{message.strip()} [/INST]"
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
accelerate==0.21.0
|
2 |
+
# auto-gptq==0.3.0
|
3 |
+
bitsandbytes==0.40.2
|
4 |
+
gradio==3.37.0
|
5 |
+
protobuf==3.20.3
|
6 |
+
scipy==1.11.1
|
7 |
+
sentencepiece==0.1.99
|
8 |
+
torch==2.0.1
|
9 |
+
transformers==4.31.0
|
10 |
+
tqdm==4.65.0
|
11 |
+
python-dotenv==1.0.0
|
static/screenshot.png
ADDED