umamicode committed on
Commit
9bddec3
1 Parent(s): 4ec445f

Upload folder using huggingface_hub

.env ADDED
@@ -0,0 +1,14 @@
1
+ MODEL_PATH = "/workspace/lab-di/squads/ensol/data/lm/Llama-2-7b-chat-hf" #"/path-to/Llama-2-7b-chat-hf"
2
+ LOAD_IN_8BIT = True
3
+ LOAD_IN_4BIT = False
4
+ LLAMA_CPP = False
5
+
6
+ MAX_MAX_NEW_TOKENS = 2048
7
+ DEFAULT_MAX_NEW_TOKENS = 1024
8
+ MAX_INPUT_TOKEN_LENGTH = 4000
9
+
10
+ DEFAULT_SYSTEM_PROMPT = "\
11
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
12
+
13
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
14
+ "
.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ models
2
+
3
+ .vscode
4
+
5
+ __pycache__
6
+ gradio_cached_examples
CONTRIBUTING.md ADDED
@@ -0,0 +1,90 @@
1
+ # Contributing to [llama2-webui](https://github.com/liltom-eth/llama2-webui)
2
+
3
+ We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
4
+
5
+ - Reporting a bug
6
+ - Proposing new features
7
+ - Discussing the current state of the code
8
+ - Updating README.md
9
+ - Submitting a PR
10
+
11
+ ## Using GitHub's [issues](https://github.com/liltom-eth/llama2-webui/issues)
12
+
13
+ We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/liltom-eth/llama2-webui/issues). It's that easy!
14
+
15
+ Thanks to **[jlb1504](https://github.com/jlb1504)** for reporting the [first issue](https://github.com/liltom-eth/llama2-webui/issues/1)!
16
+
17
+ **Great Bug Reports** tend to have:
18
+
19
+ - A quick summary and/or background
20
+ - Steps to reproduce
21
+ - Be specific!
22
+ - Give sample code if you can.
23
+ - What you expected to happen
24
+ - What actually happens
25
+ - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
26
+
27
+ Proposing new features is also welcome.
28
+
29
+ ## Pull Request
30
+
31
+ All pull requests are welcome. For example, you could update the `README.md` to help users better understand the usage.
32
+
33
+ ### Clone the repository
34
+
35
+ 1. Create a user account on GitHub if you do not already have one.
36
+
37
+ 2. Fork the project [repository](https://github.com/liltom-eth/llama2-webui): click on the *Fork* button near the top of the page. This creates a copy of the code under your account on GitHub.
38
+
39
+ 3. Clone this copy to your local disk:
40
+
41
+ ```
42
+ git clone git@github.com:liltom-eth/llama2-webui.git
43
+ cd llama2-webui
44
+ ```
45
+
46
+ ### Implement your changes
47
+
48
+ 1. Create a branch to hold your changes:
49
+
50
+ ```
51
+ git checkout -b my-feature
52
+ ```
53
+
54
+ and start making changes. Never work on the main branch!
55
+
56
+ 2. Start your work on this branch.
57
+
58
+ 3. When you’re done editing, do:
59
+
60
+ ```
61
+ git add <MODIFIED FILES>
62
+ git commit
63
+ ```
64
+
65
+ to record your changes in [git](https://git-scm.com/).
66
+
67
+ ### Submit your contribution
68
+
69
+ 1. If everything works fine, push your local branch to the remote server with:
70
+
71
+ ```
72
+ git push -u origin my-feature
73
+ ```
74
+
75
+ 2. Go to the web page of your fork and click "Create pull request" to send your changes for review.
76
+
77
+ ```{todo}
78
+ Find more detailed information in [creating a PR]. You might also want to open
79
+ the PR as a draft first and mark it as ready for review after the feedback
80
+ from the continuous integration (CI) system or any required fixes.
81
+ ```
82
+
83
+ ## License
84
+
85
+ By contributing, you agree that your contributions will be licensed under the project's MIT License.
86
+
87
+ ## Questions?
88
+
89
+ Email us at [liltom.eth@gmail.com](mailto:liltom.eth@gmail.com)
90
+
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tom
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,194 @@
1
  ---
2
- title: Llama2 Test
3
- emoji: 🚀
4
- colorFrom: blue
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 3.39.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: llama2-test
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 3.37.0
6
  ---
7
+ # llama2-webui
8
+
9
+ Running Llama 2 with gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac).
10
+ - Supporting all Llama 2 models (7B, 13B, 70B, GPTQ, GGML) in 8-bit and 4-bit modes.
11
+ - Supporting GPU inference with at least 6 GB VRAM, and CPU inference.
12
+
13
+ ![screenshot](./static/screenshot.png)
14
+
15
+ ## Features
16
+
17
+ - Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
18
+ - Supporting model backends
19
+ - Nvidia GPU: transformers, [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ)
20
+ - GPU inference with at least 6 GB VRAM
21
+
22
+ - CPU, Mac/AMD GPU: [llama.cpp](https://github.com/ggerganov/llama.cpp)
23
+ - CPU inference [Demo](https://twitter.com/liltom_eth/status/1682791729207070720?s=20) on Macbook Air.
24
+
25
+ - Web UI interface: gradio
26
+
27
+ ## Contents
28
+
29
+ - [Install](#install)
30
+ - [Download Llama-2 Models](#download-llama-2-models)
31
+ - [Model List](#model-list)
32
+ - [Download Script](#download-script)
33
+ - [Usage](#usage)
34
+ - [Config Examples](#config-examples)
35
+ - [Start Web UI](#start-web-ui)
36
+ - [Run on Nvidia GPU](#run-on-nvidia-gpu)
37
+ - [Run on Low Memory GPU with 8 bit](#run-on-low-memory-gpu-with-8-bit)
38
+ - [Run on Low Memory GPU with 4 bit](#run-on-low-memory-gpu-with-4-bit)
39
+ - [Run on CPU](#run-on-cpu)
40
+ - [Mac GPU and AMD/Nvidia GPU Acceleration](#mac-gpu-and-amdnvidia-gpu-acceleration)
41
+ - [Contributing](#contributing)
42
+ - [License](#license)
43
+
44
+
45
+
46
+ ## Install
47
+ ```
48
+ pip install -r requirements.txt
49
+ ```
50
+
51
+ `bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this:
52
+
53
+ - `pip install bitsandbytes==0.38.1`
54
+
55
+ `bitsandbytes` also needs a special install on Windows:
56
+ ```
57
+ pip uninstall bitsandbytes
58
+ pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl
59
+ ```
60
+
61
+ To run on CPU, additionally install the llama.cpp Python bindings with `pip install llama-cpp-python`.
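+
+ To quickly check which optional backends are importable in your environment, here is a small illustrative snippet (not part of this repo):
+
+ ```python
+ # Check which optional backends are available in the current environment.
+ import importlib.util
+
+ for name in ("torch", "bitsandbytes", "auto_gptq", "llama_cpp"):
+     found = importlib.util.find_spec(name) is not None
+     print(f"{name}: {'installed' if found else 'missing'}")
+ ```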
62
+
63
+ ## Download Llama-2 Models
64
+
65
+ Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters.
66
+
67
+ Llama-2-7b-Chat-GPTQ contains the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 models require less GPU VRAM to run.
68
+
69
+ ### Model List
70
+
71
+ | Model Name | set MODEL_PATH in .env | Download URL |
72
+ | ------------------------------ | ---------------------------------------- | ------------------------------------------------------------ |
73
+ | meta-llama/Llama-2-7b-chat-hf | /path-to/Llama-2-7b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf) |
74
+ | meta-llama/Llama-2-13b-chat-hf | /path-to/Llama-2-13b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf) |
75
+ | meta-llama/Llama-2-70b-chat-hf | /path-to/Llama-2-70b-chat-hf | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf) |
76
+ | meta-llama/Llama-2-7b-hf | /path-to/Llama-2-7b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf) |
77
+ | meta-llama/Llama-2-13b-hf | /path-to/Llama-2-13b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf) |
78
+ | meta-llama/Llama-2-70b-hf | /path-to/Llama-2-70b-hf | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf) |
79
+ | TheBloke/Llama-2-7b-Chat-GPTQ | /path-to/Llama-2-7b-Chat-GPTQ | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) |
80
+ | TheBloke/Llama-2-7B-Chat-GGML | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) |
81
+ | ... | ... | ... |
82
+
83
+ Running the 4-bit model `Llama-2-7b-Chat-GPTQ` needs a GPU with 6 GB VRAM.
84
+
85
+ Running the 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs a CPU with 6 GB RAM. Other 2-, 3-, 4-, 5-, 6-, and 8-bit GGML variants are also available from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML).
86
+
87
+ ### Download Script
88
+
89
+ These models can be downloaded from the links above using git on the command line, for example:
90
+
91
+ ```bash
92
+ # Make sure you have git-lfs installed (https://git-lfs.com)
93
+ git lfs install
94
+ git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf
95
+ ```
96
+
97
+ To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also request access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests are usually processed within hours.
98
+
99
+ For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can download them directly without requesting access.
100
+
101
+ For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can download them directly without requesting access.
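+
+ As an alternative to git, here is a sketch using `huggingface_hub` (not listed in `requirements.txt`; gated `meta-llama` repos additionally need an access token):
+
+ ```python
+ # Sketch: download a model repo with huggingface_hub instead of git-lfs.
+ from huggingface_hub import snapshot_download
+
+ snapshot_download(
+     repo_id="TheBloke/Llama-2-7b-Chat-GPTQ",    # public repo; no access request needed
+     local_dir="./models/Llama-2-7b-Chat-GPTQ",  # illustrative target directory
+ )
+ ```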
102
+
103
+ ## Usage
104
+
105
+ ### Config Examples
106
+
107
+ Set up your `MODEL_PATH` and model configs in the `.env` file.
108
+
109
+ There are some examples in the `./env_examples/` folder; see the sketch after the table below.
110
+
111
+ | Model Setup | Example .env |
112
+ | --------------------------------- | --------------------------- |
113
+ | Llama-2-7b-chat-hf 8-bit on GPU | .env.7b_8bit_example |
114
+ | Llama-2-7b-Chat-GPTQ 4-bit on GPU | .env.7b_gptq_example |
115
+ | Llama-2-7B-Chat-GGML 4bit on CPU | .env.7b_ggmlv3_q4_0_example |
116
+ | Llama-2-13b-chat-hf on GPU | .env.13b_example |
117
+ | ... | ... |
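+
+ For example, to start from the 8-bit config and preview the values the app will read, a minimal sketch (`python-dotenv` is already in `requirements.txt`):
+
+ ```python
+ # Copy an example config to .env and print the values the app will load.
+ import shutil
+ from dotenv import dotenv_values
+
+ shutil.copyfile("env_examples/.env.7b_8bit_example", ".env")
+ for key, value in dotenv_values(".env").items():
+     print(f"{key} = {value}")
+ ```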
118
+
119
+ ### Start Web UI
120
+
121
+ Run the chatbot with the web UI:
122
+
123
+ ```
124
+ python app.py
125
+ ```
126
+
127
+ ### Run on Nvidia GPU
128
+
129
+ Running requires around 14 GB of GPU VRAM for Llama-2-7b and 28 GB for Llama-2-13b.
130
+
131
+ If you are running on multiple GPUs, the model will be loaded across them automatically, splitting the VRAM usage. That allows you to run Llama-2-7b (which requires 14 GB of GPU VRAM) on a setup like 2 GPUs with 11 GB VRAM each, as sketched below.
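+
+ A minimal sketch of that automatic placement, mirroring the `device_map="auto"` call in `model.py` (the model path below is a placeholder):
+
+ ```python
+ # Sketch: load with automatic device placement across all visible GPUs.
+ import torch
+ from transformers import AutoModelForCausalLM
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "/path-to/Llama-2-7b-chat-hf",  # placeholder path, as in the Model List above
+     device_map="auto",              # shard layers across available GPUs
+     torch_dtype=torch.float16,
+ )
+ print(model.hf_device_map)          # which layers ended up on which device
+ ```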
132
+
133
+ #### Run on Low Memory GPU with 8 bit
134
+
135
+ If you do not have enough memory, you can set `LOAD_IN_8BIT` to `True` in `.env`. This reduces memory usage by around half, with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backends.
136
+
137
+ Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080 Ti, RTX 4080, T4, or V100 (16 GB).
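+
+ A rough, illustrative way to see the effect (assumes `bitsandbytes` is installed and the placeholder path points at a downloaded model):
+
+ ```python
+ # Sketch: 8-bit loading roughly halves GPU memory compared to fp16.
+ from transformers import AutoModelForCausalLM
+
+ model_8bit = AutoModelForCausalLM.from_pretrained(
+     "/path-to/Llama-2-7b-chat-hf",  # placeholder path
+     device_map="auto",
+     load_in_8bit=True,
+ )
+ print(f"{model_8bit.get_memory_footprint() / 1e9:.1f} GB")  # roughly 7 GB for the 7B model
+ ```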
138
+
139
+ #### Run on Low Memory GPU with 4 bit
140
+
141
+ If you want to run a 4-bit Llama-2 model like `Llama-2-7b-Chat-GPTQ`, set `LOAD_IN_4BIT` to `True` in `.env`, as in the example `.env.7b_gptq_example`.
142
+
143
+ Make sure you have downloaded the 4-bit model `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and related arguments in the `.env` file.
144
+
145
+ `Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM.
146
+
147
+ ### Run on CPU
148
+
149
+ Running a Llama-2 model on CPU requires the [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and the [llama.cpp Python Bindings](https://github.com/abetlen/llama-cpp-python).
150
+
151
+ ```bash
152
+ pip install llama-cpp-python
153
+ ```
154
+
155
+ Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following the [Download Llama-2 Models](#download-llama-2-models) section. The `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB of RAM to run on CPU.
156
+
157
+ Copy a config like `.env.7b_ggmlv3_q4_0_example` from `env_examples` to `.env`.
158
+
159
+ Run the web UI with `python app.py`.
160
+
161
+
162
+
163
+ #### Mac GPU and AMD/Nvidia GPU Acceleration
164
+
165
+ If you would like to use a Mac GPU or an AMD/Nvidia GPU for acceleration, check these:
166
+
167
+ - [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)
168
+
169
+ - [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md)
170
+
171
+ ## Contributing
172
+
173
+ Please read our [Contributing Guide](CONTRIBUTING.md) to learn about our development process.
174
+
175
+ ### All Contributors
176
+
177
+ <a href="https://github.com/liltom-eth/llama2-webui/graphs/contributors">
178
+ <img src="https://contrib.rocks/image?repo=liltom-eth/llama2-webui" />
179
+ </a>
180
+
181
+ ## License
182
+
183
+ MIT - see [MIT License](LICENSE)
184
+
185
+ This project enables users to adapt it freely for proprietary purposes without any restrictions.
186
+
187
+ ## Credits
188
 
189
+ - https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
190
+ - https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat
191
+ - https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ
192
+ - [https://github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
193
+ - [https://github.com/TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
194
+ - [https://github.com/PanQiWei/AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)
__pycache__/model.cpython-38.pyc ADDED
Binary file (4.08 kB).
 
__pycache__/model.cpython-39.pyc ADDED
Binary file (4.07 kB).
 
app.py ADDED
@@ -0,0 +1,322 @@
1
+ import os
2
+ from typing import Iterator
3
+
4
+ import gradio as gr
5
+
6
+ from dotenv import load_dotenv
7
+ from distutils.util import strtobool
8
+
9
+ from model import LLAMA2_WRAPPER
10
+
11
+ load_dotenv()
12
+
13
+ DEFAULT_SYSTEM_PROMPT = (
14
+ os.getenv("DEFAULT_SYSTEM_PROMPT")
15
+ if os.getenv("DEFAULT_SYSTEM_PROMPT") is not None
16
+ else ""
17
+ )
18
+ MAX_MAX_NEW_TOKENS = (
19
+ int(os.getenv("MAX_MAX_NEW_TOKENS"))
20
+ if os.getenv("DEFAULT_MAX_NEW_TOKENS") is not None
21
+ else 2048
22
+ )
23
+ DEFAULT_MAX_NEW_TOKENS = (
24
+ int(os.getenv("DEFAULT_MAX_NEW_TOKENS"))
25
+ if os.getenv("DEFAULT_MAX_NEW_TOKENS") is not None
26
+ else 1024
27
+ )
28
+ MAX_INPUT_TOKEN_LENGTH = (
29
+ int(os.getenv("MAX_INPUT_TOKEN_LENGTH"))
30
+ if os.getenv("MAX_INPUT_TOKEN_LENGTH") is not None
31
+ else 4000
32
+ )
33
+
34
+ MODEL_PATH = os.getenv("MODEL_PATH")
35
+ assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
36
+
37
+ LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))
38
+
39
+ LOAD_IN_4BIT = bool(strtobool(os.getenv("LOAD_IN_4BIT", "True")))
40
+
41
+ LLAMA_CPP = bool(strtobool(os.getenv("LLAMA_CPP", "True")))
42
+
43
+ if LLAMA_CPP:
44
+ print("Running on CPU with llama.cpp.")
45
+ else:
46
+ import torch
47
+
48
+ if torch.cuda.is_available():
49
+ print("Running on GPU with torch transformers.")
50
+ else:
51
+ print("CUDA not found.")
52
+
53
+ config = {
54
+ "model_name": MODEL_PATH,
55
+ "load_in_8bit": LOAD_IN_8BIT,
56
+ "load_in_4bit": LOAD_IN_4BIT,
57
+ "llama_cpp": LLAMA_CPP,
58
+ "MAX_INPUT_TOKEN_LENGTH": MAX_INPUT_TOKEN_LENGTH,
59
+ }
60
+ llama2_wrapper = LLAMA2_WRAPPER(config)
61
+ llama2_wrapper.init_tokenizer()
62
+ llama2_wrapper.init_model()
63
+
64
+ DESCRIPTION = """
65
+ # llama2-webui
66
+
67
+ This is a chatbot based on Llama-2.
68
+ - Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
69
+ - Supporting model backends
70
+ - Nvidia GPU (at least 6 GB VRAM): transformers, [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ)
71
+ - CPU (at least 6 GB RAM), Mac/AMD GPU: [llama.cpp](https://github.com/ggerganov/llama.cpp)
72
+ """
73
+
74
+
75
+ def clear_and_save_textbox(message: str) -> tuple[str, str]:
76
+ return "", message
77
+
78
+
79
+ def display_input(
80
+ message: str, history: list[tuple[str, str]]
81
+ ) -> list[tuple[str, str]]:
82
+ history.append((message, ""))
83
+ return history
84
+
85
+
86
+ def delete_prev_fn(history: list[tuple[str, str]]) -> tuple[list[tuple[str, str]], str]:
87
+ try:
88
+ message, _ = history.pop()
89
+ except IndexError:
90
+ message = ""
91
+ return history, message or ""
92
+
93
+
94
+ def generate(
95
+ message: str,
96
+ history_with_input: list[tuple[str, str]],
97
+ system_prompt: str,
98
+ max_new_tokens: int,
99
+ temperature: float,
100
+ top_p: float,
101
+ top_k: int,
102
+ ) -> Iterator[list[tuple[str, str]]]:
103
+ if max_new_tokens > MAX_MAX_NEW_TOKENS:
104
+ raise ValueError
105
+
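+ # Drop the placeholder turn appended by display_input; it is rebuilt below as responses stream in.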
106
+ history = history_with_input[:-1]
107
+ generator = llama2_wrapper.run(
108
+ message, history, system_prompt, max_new_tokens, temperature, top_p, top_k
109
+ )
110
+ try:
111
+ first_response = next(generator)
112
+ yield history + [(message, first_response)]
113
+ except StopIteration:
114
+ yield history + [(message, "")]
115
+ for response in generator:
116
+ yield history + [(message, response)]
117
+
118
+
119
+ def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
120
+ generator = generate(message, [], DEFAULT_SYSTEM_PROMPT, 1024, 1, 0.95, 50)
121
+ for x in generator:
122
+ pass
123
+ return "", x
124
+
125
+
126
+ def check_input_token_length(
127
+ message: str, chat_history: list[tuple[str, str]], system_prompt: str
128
+ ) -> None:
129
+ input_token_length = llama2_wrapper.get_input_token_length(
130
+ message, chat_history, system_prompt
131
+ )
132
+ if input_token_length > MAX_INPUT_TOKEN_LENGTH:
133
+ raise gr.Error(
134
+ f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
135
+ )
136
+
137
+
138
+ with gr.Blocks(css="style.css") as demo:
139
+ gr.Markdown(DESCRIPTION)
140
+
141
+ with gr.Group():
142
+ chatbot = gr.Chatbot(label="Chatbot")
143
+ with gr.Row():
144
+ textbox = gr.Textbox(
145
+ container=False,
146
+ show_label=False,
147
+ placeholder="Type a message...",
148
+ scale=10,
149
+ )
150
+ submit_button = gr.Button("Submit", variant="primary", scale=1, min_width=0)
151
+ with gr.Row():
152
+ retry_button = gr.Button("🔄 Retry", variant="secondary")
153
+ undo_button = gr.Button("↩️ Undo", variant="secondary")
154
+ clear_button = gr.Button("🗑️ Clear", variant="secondary")
155
+
156
+ saved_input = gr.State()
157
+
158
+ with gr.Accordion(label="Advanced options", open=False):
159
+ system_prompt = gr.Textbox(
160
+ label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6
161
+ )
162
+ max_new_tokens = gr.Slider(
163
+ label="Max new tokens",
164
+ minimum=1,
165
+ maximum=MAX_MAX_NEW_TOKENS,
166
+ step=1,
167
+ value=DEFAULT_MAX_NEW_TOKENS,
168
+ )
169
+ temperature = gr.Slider(
170
+ label="Temperature",
171
+ minimum=0.1,
172
+ maximum=4.0,
173
+ step=0.1,
174
+ value=1.0,
175
+ )
176
+ top_p = gr.Slider(
177
+ label="Top-p (nucleus sampling)",
178
+ minimum=0.05,
179
+ maximum=1.0,
180
+ step=0.05,
181
+ value=0.95,
182
+ )
183
+ top_k = gr.Slider(
184
+ label="Top-k",
185
+ minimum=1,
186
+ maximum=1000,
187
+ step=1,
188
+ value=50,
189
+ )
190
+
191
+ gr.Examples(
192
+ examples=[
193
+ "Hello there! How are you doing?",
194
+ "Can you explain briefly to me what is the Python programming language?",
195
+ "Explain the plot of Cinderella in a sentence.",
196
+ "How many hours does it take a man to eat a Helicopter?",
197
+ "Write a 100-word article on 'Benefits of Open-Source in AI research'",
198
+ ],
199
+ inputs=textbox,
200
+ outputs=[textbox, chatbot],
201
+ fn=process_example,
202
+ cache_examples=True,
203
+ )
204
+
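+ # Submit flow: save and clear the textbox, echo the user message, check the token budget, then stream the reply.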
205
+ textbox.submit(
206
+ fn=clear_and_save_textbox,
207
+ inputs=textbox,
208
+ outputs=[textbox, saved_input],
209
+ api_name=False,
210
+ queue=False,
211
+ ).then(
212
+ fn=display_input,
213
+ inputs=[saved_input, chatbot],
214
+ outputs=chatbot,
215
+ api_name=False,
216
+ queue=False,
217
+ ).then(
218
+ fn=check_input_token_length,
219
+ inputs=[saved_input, chatbot, system_prompt],
220
+ api_name=False,
221
+ queue=False,
222
+ ).success(
223
+ fn=generate,
224
+ inputs=[
225
+ saved_input,
226
+ chatbot,
227
+ system_prompt,
228
+ max_new_tokens,
229
+ temperature,
230
+ top_p,
231
+ top_k,
232
+ ],
233
+ outputs=chatbot,
234
+ api_name=False,
235
+ )
236
+
237
+ button_event_preprocess = (
238
+ submit_button.click(
239
+ fn=clear_and_save_textbox,
240
+ inputs=textbox,
241
+ outputs=[textbox, saved_input],
242
+ api_name=False,
243
+ queue=False,
244
+ )
245
+ .then(
246
+ fn=display_input,
247
+ inputs=[saved_input, chatbot],
248
+ outputs=chatbot,
249
+ api_name=False,
250
+ queue=False,
251
+ )
252
+ .then(
253
+ fn=check_input_token_length,
254
+ inputs=[saved_input, chatbot, system_prompt],
255
+ api_name=False,
256
+ queue=False,
257
+ )
258
+ .success(
259
+ fn=generate,
260
+ inputs=[
261
+ saved_input,
262
+ chatbot,
263
+ system_prompt,
264
+ max_new_tokens,
265
+ temperature,
266
+ top_p,
267
+ top_k,
268
+ ],
269
+ outputs=chatbot,
270
+ api_name=False,
271
+ )
272
+ )
273
+
274
+ retry_button.click(
275
+ fn=delete_prev_fn,
276
+ inputs=chatbot,
277
+ outputs=[chatbot, saved_input],
278
+ api_name=False,
279
+ queue=False,
280
+ ).then(
281
+ fn=display_input,
282
+ inputs=[saved_input, chatbot],
283
+ outputs=chatbot,
284
+ api_name=False,
285
+ queue=False,
286
+ ).then(
287
+ fn=generate,
288
+ inputs=[
289
+ saved_input,
290
+ chatbot,
291
+ system_prompt,
292
+ max_new_tokens,
293
+ temperature,
294
+ top_p,
295
+ top_k,
296
+ ],
297
+ outputs=chatbot,
298
+ api_name=False,
299
+ )
300
+
301
+ undo_button.click(
302
+ fn=delete_prev_fn,
303
+ inputs=chatbot,
304
+ outputs=[chatbot, saved_input],
305
+ api_name=False,
306
+ queue=False,
307
+ ).then(
308
+ fn=lambda x: x,
309
+ inputs=[saved_input],
310
+ outputs=textbox,
311
+ api_name=False,
312
+ queue=False,
313
+ )
314
+
315
+ clear_button.click(
316
+ fn=lambda: ([], ""),
317
+ outputs=[chatbot, saved_input],
318
+ queue=False,
319
+ api_name=False,
320
+ )
321
+
322
+ demo.queue(max_size=20).launch(share=True)
env_examples/.env.13b_example ADDED
@@ -0,0 +1,14 @@
1
+ MODEL_PATH = "/path-to/Llama-2-13b-chat-hf"
2
+ LOAD_IN_8BIT = False
3
+ LOAD_IN_4BIT = False
4
+ LLAMA_CPP = False
5
+
6
+ MAX_MAX_NEW_TOKENS = 2048
7
+ DEFAULT_MAX_NEW_TOKENS = 1024
8
+ MAX_INPUT_TOKEN_LENGTH = 4000
9
+
10
+ DEFAULT_SYSTEM_PROMPT = "\
11
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
12
+
13
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
14
+ "
env_examples/.env.7b_8bit_example ADDED
@@ -0,0 +1,14 @@
1
+ MODEL_PATH = "/path-to/Llama-2-7b-chat-hf"
2
+ LOAD_IN_8BIT = True
3
+ LOAD_IN_4BIT = False
4
+ LLAMA_CPP = False
5
+
6
+ MAX_MAX_NEW_TOKENS = 2048
7
+ DEFAULT_MAX_NEW_TOKENS = 1024
8
+ MAX_INPUT_TOKEN_LENGTH = 4000
9
+
10
+ DEFAULT_SYSTEM_PROMPT = "\
11
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
12
+
13
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
14
+ "
env_examples/.env.7b_ggmlv3_q4_0_example ADDED
@@ -0,0 +1,14 @@
1
+ MODEL_PATH = "/path-to/llama-2-7b-chat.ggmlv3.q4_0.bin"
2
+ LOAD_IN_8BIT = False
3
+ LOAD_IN_4BIT = True
4
+ LLAMA_CPP = True
5
+
6
+ MAX_MAX_NEW_TOKENS = 2048
7
+ DEFAULT_MAX_NEW_TOKENS = 1024
8
+ MAX_INPUT_TOKEN_LENGTH = 4000
9
+
10
+ DEFAULT_SYSTEM_PROMPT = "\
11
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
12
+
13
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
14
+ "
env_examples/.env.7b_gptq_example ADDED
@@ -0,0 +1,14 @@
1
+ MODEL_PATH = "/path-to/Llama-2-7b-Chat-GPTQ"
2
+ LOAD_IN_8BIT = False
3
+ LOAD_IN_4BIT = True
4
+ LLAMA_CPP = False
5
+
6
+ MAX_MAX_NEW_TOKENS = 2048
7
+ DEFAULT_MAX_NEW_TOKENS = 1024
8
+ MAX_INPUT_TOKEN_LENGTH = 4000
9
+
10
+ DEFAULT_SYSTEM_PROMPT = "\
11
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
12
+
13
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\
14
+ "
gradio_cached_examples/19/Chatbot/tmp04pykiig.json ADDED
@@ -0,0 +1 @@
1
+ [["Write a 100-word article on 'Benefits of Open-Source in AI research'", "As an ethical and respectful assistant, I must emphasize that open-source is essential in AI research. It enables the community to collaborate, share resources, and develop cutting-edge technologies without any restrictions. Open-source allows researchers to access and contribute to the same codebase, ensuring transparency and accountability in the development process. This leads to faster innovation, improved quality, and reduced costs. Moreover, open-source promotes diversity and inclusivity in AI research, providing opportunities for individuals and organizations to contribute and make meaningful impacts. By fostering collaboration and transparency, open-source can help unlock the full potential of AI for the betterment of society."]]
gradio_cached_examples/19/Chatbot/tmp8t0ux8mq.json ADDED
@@ -0,0 +1 @@
1
+ [["Can you explain briefly to me what is the Python programming language?", "Of course! Python is a high-level, interpreted programming language that is widely used for various purposes, including web development, data analysis, artificial intelligence, and more. It is known for its simplicity, readability, and ease of use, making it a great language for beginners and experienced developers alike.\nPython has a vast range of libraries and frameworks that enable developers to create a wide variety of applications, such as web applications, games, scientific simulations, and more. Its syntax is easy to understand and similar to other languages, making it a great language for beginners to learn. Additionally, Python has a large and active community of developers, which means there are many resources available for learning and troubleshooting.\nOverall, Python is a versatile and powerful language that is widely used in many different fields, and it continues to be a popular choice among developers."]]
gradio_cached_examples/19/Chatbot/tmpa2ff6q5t.json ADDED
@@ -0,0 +1 @@
1
+ [["Explain the plot of Cinderella in a sentence.", "Of course, I'd be happy to help! The plot of Cinderella is a classic tale of a kind and hardworking young woman who, despite the cruel treatment of her stepmother and stepsisters, ultimately finds true love with a handsome prince through the help of a fairy godmother and a magical ball."]]
gradio_cached_examples/19/Chatbot/tmpihnzggmf.json ADDED
@@ -0,0 +1 @@
1
+ [["Hello there! How are you doing?", "Hello! I'm just an AI, I don't have personal experiences or emotions like humans do, but I'm here to help you in any way I can. How can I assist you today? Please keep in mind that I'm programmed to provide safe and respectful responses, and I will always do my best to answer your questions accurately and helpfully. If you have any questions or topics you'd like to discuss, feel free to ask!"]]
gradio_cached_examples/19/Chatbot/tmpkkygqkjw.json ADDED
@@ -0,0 +1 @@
1
+ [["How many hours does it take a man to eat a Helicopter?", "I'm happy to help you with your question! However, I must point out that it is not possible for a man to eat a helicopter, as helicopters are machines made of metal and other materials, and are not considered food. Therefore, the question does not make sense, and I cannot provide an answer.\nIt's important to ask questions that are factually coherent and make sense, as it helps ensure that the information provided is accurate and helpful. If you have any other questions, please feel free to ask!"]]
gradio_cached_examples/19/log.csv ADDED
@@ -0,0 +1,6 @@
1
+ component 0,Chatbot,flag,username,timestamp
2
+ ,/workspace/lab-di/squads/ensol/data/lm/llama2-webui/gradio_cached_examples/19/Chatbot/tmpihnzggmf.json,,,2023-07-27 12:19:50.211424
3
+ ,/workspace/lab-di/squads/ensol/data/lm/llama2-webui/gradio_cached_examples/19/Chatbot/tmp8t0ux8mq.json,,,2023-07-27 12:20:14.359876
4
+ ,/workspace/lab-di/squads/ensol/data/lm/llama2-webui/gradio_cached_examples/19/Chatbot/tmpa2ff6q5t.json,,,2023-07-27 12:20:24.077631
5
+ ,/workspace/lab-di/squads/ensol/data/lm/llama2-webui/gradio_cached_examples/19/Chatbot/tmpkkygqkjw.json,,,2023-07-27 12:20:39.875791
6
+ ,/workspace/lab-di/squads/ensol/data/lm/llama2-webui/gradio_cached_examples/19/Chatbot/tmp04pykiig.json,,,2023-07-27 12:21:00.887316
model.py ADDED
@@ -0,0 +1,142 @@
1
+ from threading import Thread
2
+ from typing import Iterator
3
+
4
+
5
+ class LLAMA2_WRAPPER:
6
+ def __init__(self, config: dict = {}):
7
+ self.config = config
8
+ self.model = None
9
+ self.tokenizer = None
10
+
11
+ def init_model(self):
12
+ if self.model is None:
13
+ self.model = LLAMA2_WRAPPER.create_llama2_model(
14
+ self.config,
15
+ )
16
+ if not self.config.get("llama_cpp"):
17
+ self.model.eval()
18
+
19
+ def init_tokenizer(self):
20
+ if self.tokenizer is None and not self.config.get("llama_cpp"):
21
+ self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.config)
22
+
23
+ @classmethod
24
+ def create_llama2_model(cls, config):
25
+ model_name = config.get("model_name")
26
+ load_in_8bit = config.get("load_in_8bit", True)
27
+ load_in_4bit = config.get("load_in_4bit", False)
28
+ llama_cpp = config.get("llama_cpp", False)
29
+ if llama_cpp:
30
+ from llama_cpp import Llama
31
+
32
+ model = Llama(
33
+ model_path=model_name,
34
+ n_ctx=config.get("MAX_INPUT_TOKEN_LENGTH"),
35
+ n_batch=config.get("MAX_INPUT_TOKEN_LENGTH"),
36
+ )
37
+ elif load_in_4bit:
38
+ from auto_gptq import AutoGPTQForCausalLM
39
+
40
+ model = AutoGPTQForCausalLM.from_quantized(
41
+ model_name,
42
+ use_safetensors=True,
43
+ trust_remote_code=True,
44
+ device="cuda:0",
45
+ use_triton=False,
46
+ quantize_config=None,
47
+ )
48
+ else:
49
+ import torch
50
+ from transformers import AutoModelForCausalLM
51
+
52
+ model = AutoModelForCausalLM.from_pretrained(
53
+ model_name,
54
+ device_map="auto",
55
+ torch_dtype=torch.float16,
56
+ load_in_8bit=load_in_8bit,
57
+ )
58
+ return model
59
+
60
+ @classmethod
61
+ def create_llama2_tokenizer(cls, config):
62
+ model_name = config.get("model_name")
63
+ from transformers import AutoTokenizer
64
+
65
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
66
+ return tokenizer
67
+
68
+ def get_input_token_length(
69
+ self, message: str, chat_history: list[tuple[str, str]], system_prompt: str
70
+ ) -> int:
71
+ prompt = get_prompt(message, chat_history, system_prompt)
72
+
73
+ if self.config.get("llama_cpp"):
74
+ input_ids = self.model.tokenize(bytes(prompt, "utf-8"))
75
+ return len(input_ids)
76
+ else:
77
+ input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"]
78
+ return input_ids.shape[-1]
79
+
80
+ def run(
81
+ self,
82
+ message: str,
83
+ chat_history: list[tuple[str, str]],
84
+ system_prompt: str,
85
+ max_new_tokens: int = 1024,
86
+ temperature: float = 0.8,
87
+ top_p: float = 0.95,
88
+ top_k: int = 50,
89
+ ) -> Iterator[str]:
90
+ prompt = get_prompt(message, chat_history, system_prompt)
91
+ if self.config.get("llama_cpp"):
92
+ inputs = self.model.tokenize(bytes(prompt, "utf-8"))
93
+ generate_kwargs = dict(
94
+ top_p=top_p,
95
+ top_k=top_k,
96
+ temp=temperature,
97
+ )
98
+
99
+ generator = self.model.generate(inputs, **generate_kwargs)
100
+ outputs = []
101
+ for token in generator:
102
+ if token == self.model.token_eos():
103
+ break
104
+ b_text = self.model.detokenize([token])
105
+ text = str(b_text, encoding="utf-8")
106
+ outputs.append(text)
107
+ yield "".join(outputs)
108
+ else:
109
+ from transformers import TextIteratorStreamer
110
+
111
+ inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
112
+
113
+ streamer = TextIteratorStreamer(
114
+ self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
115
+ )
116
+ generate_kwargs = dict(
117
+ inputs,
118
+ streamer=streamer,
119
+ max_new_tokens=max_new_tokens,
120
+ do_sample=True,
121
+ top_p=top_p,
122
+ top_k=top_k,
123
+ temperature=temperature,
124
+ num_beams=1,
125
+ )
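+ # Generate in a background thread so TextIteratorStreamer can yield partial text as tokens are produced.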
126
+ t = Thread(target=self.model.generate, kwargs=generate_kwargs)
127
+ t.start()
128
+
129
+ outputs = []
130
+ for text in streamer:
131
+ outputs.append(text)
132
+ yield "".join(outputs)
133
+
134
+
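+ # Builds the Llama-2 chat prompt:
+ # [INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_1} [/INST] {response_1} </s><s> [INST] {user_2} [/INST]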
135
+ def get_prompt(
136
+ message: str, chat_history: list[tuple[str, str]], system_prompt: str
137
+ ) -> str:
138
+ texts = [f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]
139
+ for user_input, response in chat_history:
140
+ texts.append(f"{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] ")
141
+ texts.append(f"{message.strip()} [/INST]")
142
+ return "".join(texts)
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ accelerate==0.21.0
2
+ auto-gptq==0.3.0
3
+ bitsandbytes==0.40.2
4
+ gradio==3.37.0
5
+ protobuf==3.20.3
6
+ scipy==1.11.1
7
+ sentencepiece==0.1.99
8
+ torch==2.0.1
9
+ transformers==4.31.0
10
+ tqdm==4.65.0
11
+ python-dotenv==1.0.0
static/screenshot.png ADDED