Spaces:

goavinash5
/

Gradio_LLAMA_Testing

Sleeping

App Files Files Community

goavinash5 commited on Dec 12, 2023

Commit

e97665c

•

1 Parent(s): a575d7a

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env +28 -0
.gitattributes +1 -0
.github/workflows/branch.yml +60 -0
.github/workflows/release.yml +30 -0
.github/workflows/update_space.yml +28 -0
.gitignore +10 -0
CONTRIBUTING.md +90 -0
LICENSE +21 -0
README.md +379 -8
app.py +418 -0
benchmark.py +145 -0
code_completion.py +216 -0
colab/Llama_2_7b_Chat_GPTQ.ipynb +0 -0
colab/ggmlv3_q4_0.ipynb +109 -0
colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb +514 -0
docs/issues.md +0 -0
docs/news.md +38 -0
docs/performance.md +32 -0
docs/pypi.md +187 -0
env_examples/.env.13b_example +13 -0
env_examples/.env.7b_8bit_example +13 -0
env_examples/.env.7b_ggmlv3_q4_0_example +18 -0
env_examples/.env.7b_gptq_example +18 -0
llama2_wrapper/__init__.py +1 -0
llama2_wrapper/__pycache__/__init__.cpython-310.pyc +0 -0
llama2_wrapper/__pycache__/model.cpython-310.pyc +0 -0
llama2_wrapper/__pycache__/types.cpython-310.pyc +0 -0
llama2_wrapper/download/__init__.py +0 -0
llama2_wrapper/download/__main__.py +59 -0
llama2_wrapper/download/__pycache__/__init__.cpython-310.pyc +0 -0
llama2_wrapper/download/__pycache__/__main__.cpython-310.pyc +0 -0
llama2_wrapper/model.py +787 -0
llama2_wrapper/server/__init__.py +0 -0
llama2_wrapper/server/__main__.py +46 -0
llama2_wrapper/server/__pycache__/__init__.cpython-310.pyc +0 -0
llama2_wrapper/server/__pycache__/__main__.cpython-310.pyc +0 -0
llama2_wrapper/server/__pycache__/app.cpython-310.pyc +0 -0
llama2_wrapper/server/app.py +526 -0
llama2_wrapper/types.py +115 -0
models/CodeLlama-7B-Python-GPTQ/.gitattributes +35 -0
models/CodeLlama-7B-Python-GPTQ/LICENSE +1 -0
models/CodeLlama-7B-Python-GPTQ/LICENSE.txt +126 -0
models/CodeLlama-7B-Python-GPTQ/Notice +1 -0
models/CodeLlama-7B-Python-GPTQ/README.md +338 -0
models/CodeLlama-7B-Python-GPTQ/USE_POLICY.md +50 -0
models/CodeLlama-7B-Python-GPTQ/config.json +43 -0
models/CodeLlama-7B-Python-GPTQ/configuration_llama.py +176 -0
models/CodeLlama-7B-Python-GPTQ/generation_config.json +7 -0
models/CodeLlama-7B-Python-GPTQ/modeling_llama.py +1020 -0
models/CodeLlama-7B-Python-GPTQ/quantize_config.json +10 -0

.env ADDED Viewed

	@@ -0,0 +1,28 @@

+MODEL_PATH = ""
+# if MODEL_PATH is "", default llama.cpp/gptq models
+# will be downloaded to: ./models
+# Example ggml path:
+# MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"
+# MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
+# options: llama.cpp, gptq, transformers
+BACKEND_TYPE = "llama.cpp"
+# only for transformers bitsandbytes 8 bit
+LOAD_IN_8BIT = False
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = 4000
+DEFAULT_SYSTEM_PROMPT = "
+You are a movie recommender chatbot. You give movie recommendations to users based on their profile. Your job now is to fully understand the user profile based on the given context and give them recommendations based on their input. Here are some rules for you to follow while generating a response:
+1: Give an explanation for why each of the recommendations is a good fit for the user
+2: Give a maximum of 5 recommendations, unless specified otherwise by the user
+3: Give a predicted rating for the movie on a scale of 1 to 5: this is a rating the user would give to the movie if they watched it
+4: Mention how popular the movie is. Choose from among High, Medium, Low: High being most popular, Low being least
+5: Avoid recommending movies already rated by the user
+''' User Context '''
+"

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+models/llama-2-7b-chat.Q4_0.gguf filter=lfs diff=lfs merge=lfs -text

.github/workflows/branch.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+name: Push
+on: [push]
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.10']
+        poetry-version: ['1.5.1']
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Run image
+        uses: abatilo/actions-poetry@v2.1.4
+        with:
+          poetry-version: ${{ matrix.poetry-version }}
+      - name: Install dependencies
+        run: poetry install
+      - name: Run tests
+        run: poetry run pytest
+      - name: Upload coverage reports to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+      # - name: Upload coverage to Codecov
+      #   uses: codecov/codecov-action@v2
+  code-quality:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.10']
+        poetry-version: ['1.5.1']
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Python Poetry Action
+        uses: abatilo/actions-poetry@v2.1.6
+        with:
+          poetry-version: ${{ matrix.poetry-version }}
+      - name: Install dependencies
+        run: poetry install
+      - name: Run black
+        run: poetry run black . --check
+      # - name: Run isort
+      #   run: poetry run isort . --check-only --profile black
+      # - name: Run flake8
+      #   run: poetry run flake8 .
+      # - name: Run bandit
+      #   run: poetry run bandit .
+      # - name: Run safety
+      #   run: poetry run safety check

.github/workflows/release.yml ADDED Viewed

	@@ -0,0 +1,30 @@

+name: Release
+on:
+  release:
+    types:
+      - created
+jobs:
+  publish:
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.10']
+        poetry-version: ['1.5.1']
+        os: [ubuntu-latest]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Run image
+        uses: abatilo/actions-poetry@v2.1.4
+        with:
+          poetry-version: ${{ matrix.poetry-version }}
+      - name: Publish
+        env:
+          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
+        run: |
+          poetry config pypi-token.pypi $PYPI_TOKEN
+          poetry publish --build

.github/workflows/update_space.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+name: Run Python script
+on:
+  push:
+    branches:
+      - main
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9'
+    - name: Install Gradio
+      run: python -m pip install gradio
+    - name: Log in to Hugging Face
+      run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
+    - name: Deploy to Spaces
+      run: gradio deploy

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+models
+dist
+.DS_Store
+.vscode
+__pycache__
+gradio_cached_examples
+.pytest_cache

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,90 @@

+# Contributing to [llama2-webui](https://github.com/liltom-eth/llama2-webui)
+We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
+- Reporting a bug
+- Proposing new features
+- Discussing the current state of the code
+- Update README.md
+- Submitting a PR
+## Using GitHub's [issues](https://github.com/liltom-eth/llama2-webui/issues)
+We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/liltom-eth/llama2-webui/issues). It's that easy!
+Thanks to **[jlb1504](https://github.com/jlb1504)** for reporting the [first issue](https://github.com/liltom-eth/llama2-webui/issues/1)!
+**Great Bug Reports** tend to have:
+- A quick summary and/or background
+- Steps to reproduce
+  - Be specific!
+  - Give a sample code if you can.
+- What you expected would happen
+- What actually happens
+- Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
+Proposals for new features are also welcome.
+## Pull Request
+All pull requests are welcome. For example, you can update the `README.md` to help users better understand the usage.
+### Clone the repository
+1. Create a user account on GitHub if you do not already have one.
+2. Fork the project [repository](https://github.com/liltom-eth/llama2-webui): click on the *Fork* button near the top of the page. This creates a copy of the code under your account on GitHub.
+3. Clone this copy to your local disk:
+   ```
+   git clone git@github.com:liltom-eth/llama2-webui.git
+   cd llama2-webui
+   ```
+### Implement your changes
+1. Create a branch to hold your changes:
+   ```
+   git checkout -b my-feature
+   ```
+   and start making changes. Never work on the main branch!
+2. Start your work on this branch.
+3. When you’re done editing, do:
+   ```
+   git add <MODIFIED FILES>
+   git commit
+   ```
+   to record your changes in [git](https://git-scm.com/).
+### Submit your contribution
+1. If everything works fine, push your local branch to the remote server with:
+   ```
+   git push -u origin my-feature
+   ```
+2. Go to the web page of your fork and click "Create pull request" to send your changes for review.
+   ```{todo}
+      Find more detailed information in [creating a PR]. You might also want to open
+      the PR as a draft first and mark it as ready for review after the feedback
+      from the continuous integration (CI) system or any required fixes.
+   ```
+## License
+By contributing, you agree that your contributions will be licensed under its MIT License.
+## Questions?
+Email us at [liltom.eth@gmail.com](mailto:liltom.eth@gmail.com)

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Tom
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,12 +1,383 @@
 ---
-title: Gradio LLAMA Testing
-emoji: 📚
-colorFrom: blue
-colorTo: yellow
-sdk: gradio
-sdk_version: 4.8.0
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Gradio_LLAMA_Testing
 app_file: app.py
+sdk: gradio
+sdk_version: 3.37.0
 ---
+# llama2-webui
+Running Llama 2 with gradio web UI on GPU or CPU from anywhere (Linux/Windows/Mac).
+- Supporting all Llama 2 models (7B, 13B, 70B, GPTQ, GGML, GGUF, [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)) with 8-bit, 4-bit mode.
+- Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
+- [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models.
+![screenshot](./static/screenshot.png)
+![code_llama_playground](https://i.imgur.com/FgMUiT6.gif)
+## Features
+- Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML),  [Llama-2-GGUF](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF),  [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ...
+- Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on free Colab T4 GPU](./colab/Llama_2_7b_Chat_GPTQ.ipynb)
+- Use  [llama2-wrapper](https://pypi.org/project/llama2-wrapper/)  as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
+- [Run OpenAI Compatible API](#start-openai-compatible-api) on Llama2 models.
+- [News](./docs/news.md), [Benchmark](./docs/performance.md), [Issue Solutions](./docs/issues.md)
+## Contents
+- [Install](#install)
+- [Usage](#usage)
+  - [Start Chat UI](#start-chat-ui)
+  - [Start Code Llama UI](#start-code-llama-ui)
+  - [Use llama2-wrapper for Your App](#use-llama2-wrapper-for-your-app)
+  - [Start OpenAI Compatible API](#start-openai-compatible-api)
+- [Benchmark](#benchmark)
+- [Download Llama-2 Models](#download-llama-2-models)
+  - [Model List](#model-list)
+  - [Download Script](#download-script)
+- [Tips](#tips)
+  - [Env Examples](#env-examples)
+  - [Run on Nvidia GPU](#run-on-nvidia-gpu)
+    - [Run bitsandbytes 8 bit](#run-bitsandbytes-8-bit)
+    - [Run GPTQ 4 bit](#run-gptq-4-bit)
+  - [Run on CPU](#run-on-cpu)
+    - [Mac Metal Acceleration](#mac-metal-acceleration)
+    - [AMD/Nvidia GPU Acceleration](#amdnvidia-gpu-acceleration)
+- [License](#license)
+- [Contributing](#contributing)
+## Install
+### Method 1: From [PyPI](https://pypi.org/project/llama2-wrapper/)
+```
+pip install llama2-wrapper
+```
+The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models.
+If you would like to use old `ggml` models, install `llama2-wrapper<=0.1.13` or manually install `llama-cpp-python==0.1.77`.
+### Method 2: From Source:
+```
+git clone https://github.com/liltom-eth/llama2-webui.git
+cd llama2-webui
+pip install -r requirements.txt
+```
+### Install Issues:
+`bitsandbytes >= 0.39` may not work on older NVIDIA GPUs. In that case, to use `LOAD_IN_8BIT`, you may have to downgrade like this:
+-  `pip install bitsandbytes==0.38.1`
+`bitsandbytes` also needs a special install for Windows:
+```
+pip uninstall bitsandbytes
+pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.0-py3-none-win_amd64.whl
+```
+## Usage
+### Start Chat UI
+Run chatbot simply with web UI:
+```bash
+python app.py
+```
+`app.py` will load the default config `.env` which uses `llama.cpp` as the backend to run the `llama-2-7b-chat.Q4_0.gguf` model for inference. The model `llama-2-7b-chat.Q4_0.gguf` will be automatically downloaded.
+```bash
+Running on backend llama.cpp.
+Use default model path: ./models/llama-2-7b-chat.Q4_0.gguf
+Start downloading model to: ./models/llama-2-7b-chat.Q4_0.gguf
+```
+You can also customize your `MODEL_PATH`, `BACKEND_TYPE,` and model configs in `.env` file to run different llama2 models on different backends (llama.cpp, transformers, gptq).
+### Start Code Llama UI
+We provide a code completion / filling UI for Code Llama.
+Base model **Code Llama** and extend model **Code Llama — Python** are not fine-tuned to follow instructions. They should be prompted so that the expected answer is the natural continuation of the prompt. That means these two models focus on code filling and code completion.
+Here is an example of running CodeLlama code completion on the llama.cpp backend:
+```
+python code_completion.py --model_path ./models/codellama-7b.Q4_0.gguf
+```
+![code_llama_playground](https://i.imgur.com/FgMUiT6.gif)
+`codellama-7b.Q4_0.gguf` can be downloaded from [TheBloke/CodeLlama-7B-GGUF](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/blob/main/codellama-7b.Q4_0.gguf).
+**Code Llama — Instruct** is trained with “natural language instruction” inputs paired with anticipated outputs. This strategic methodology enhances the model’s capacity to grasp human expectations in prompts. That means instruct models can be used in a chatbot-like app.
+Example of running CodeLlama chat on the gptq backend:
+```
+python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True
+```
+![code_llama_chat](https://i.imgur.com/lQLfemB.gif)
+`CodeLlama-7B-Instruct-GPTQ` can be downloaded from [TheBloke/CodeLlama-7B-Instruct-GPTQ](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)
+### Use llama2-wrapper for Your App
+🔥 For developers, we released `llama2-wrapper`  as a llama2 backend wrapper in [PYPI](https://pypi.org/project/llama2-wrapper/).
+Use  `llama2-wrapper`  as your local llama2 backend to answer questions and more, [colab example](./colab/ggmlv3_q4_0.ipynb):
+```python
+# pip install llama2-wrapper
+from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
+llama2_wrapper = LLAMA2_WRAPPER()
+# Default running on backend llama.cpp.
+# Automatically downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin
+prompt = "Do you know Pytorch"
+answer = llama2_wrapper(get_prompt(prompt), temperature=0.9)
+```
+Run gptq llama2 model on Nvidia GPU, [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb):
+```python
+from llama2_wrapper import LLAMA2_WRAPPER
+llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq")
+# Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ
+```
+Run llama2 7b with bitsandbytes 8 bit with a `model_path`:
+```python
+from llama2_wrapper import LLAMA2_WRAPPER
+llama2_wrapper = LLAMA2_WRAPPER(
+	model_path = "./models/Llama-2-7b-chat-hf",
+  backend_type = "transformers",
+  load_in_8bit = True
+)
+```
+Check [API Document](https://pypi.org/project/llama2-wrapper/) for more usages.
+### Start OpenAI Compatible API
+`llama2-wrapper` offers a web server that acts as a drop-in replacement for the OpenAI API. This allows you to use Llama2 models with any OpenAI compatible clients, libraries or services, etc.
+Start Fast API:
+```
+python -m llama2_wrapper.server
+```
+it will use `llama.cpp` as the backend by default to run `llama-2-7b-chat.ggmlv3.q4_0.bin` model.
+Start Fast API for `gptq` backend:
+```
+python -m llama2_wrapper.server --backend_type gptq
+```
+Navigate to http://localhost:8000/docs to see the OpenAPI documentation.
+#### Basic settings
+| Flag             | Description                                                  |
+| ---------------- | ------------------------------------------------------------ |
+| `-h`, `--help`   | Show this help message.                                      |
+| `--model_path`   | The path to the model to use for generating completions.     |
+| `--backend_type` | Backend for llama2, options: llama.cpp, gptq, transformers   |
+| `--max_tokens`   | Maximum context size.                                        |
+| `--load_in_8bit` | Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models). |
+| `--verbose`      | Whether to print verbose output to stderr.                   |
+| `--host`         | API address                                                  |
+| `--port`         | API port                                                     |
+## Benchmark
+Run the benchmark script to measure performance on your device; `benchmark.py` will load the same `.env` as `app.py`:
+```bash
+python benchmark.py
+```
+You can also select the `iter`, `backend_type` and `model_path` with which the benchmark will be run (these override the `.env` args):
+```bash
+python benchmark.py --iter NB_OF_ITERATIONS --backend_type gptq
+```
+ By default, the number of iterations is 5, but if you want a faster result or a more accurate one
+ you can set it to whatever value you want, but please only report results with at least 5 iterations.
+This [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb) also shows you how to benchmark a gptq model on a free Google Colab T4 GPU.
+Some benchmark performance:
+| Model                       | Precision | Device             | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) |
+| --------------------------- | --------- | ------------------ | -------------- | ------------------ | ------------- |
+| Llama-2-7b-chat-hf          | 8 bit     | NVIDIA RTX 2080 Ti | 7.7 GB VRAM    | 3.76               | 641.36        |
+| Llama-2-7b-Chat-GPTQ        | 4 bit     | NVIDIA RTX 2080 Ti | 5.8 GB VRAM    | 18.85              | 192.91        |
+| Llama-2-7b-Chat-GPTQ        | 4 bit     | Google Colab T4    | 5.8 GB VRAM    | 18.19              | 37.44         |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M1 Pro CPU   | 5.4 GB RAM     | 17.90              | 0.18          |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M2 CPU       | 5.4 GB RAM     | 13.70              | 0.13          |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M2 Metal     | 5.4 GB RAM     | 12.60              | 0.10          |
+| llama-2-7b-chat.ggmlv3.q2_K | 2 bit     | Intel i7-8700      | 4.5 GB RAM     | 7.88               | 31.90         |
+Check/contribute the performance of your device in the full [performance doc](./docs/performance.md).
+## Download Llama-2 Models
+Llama 2 is a collection of pre-trained and fine-tuned generative text models ranging in scale from 7 billion to 70 billion parameters.
+Llama-2-7b-Chat-GPTQ contains the GPTQ model files for [Meta's Llama 2 7b Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf). GPTQ 4-bit Llama-2 models require less GPU VRAM to run.
+### Model List
+| Model Name                          | set MODEL_PATH in .env                   | Download URL                                                 |
+| ----------------------------------- | ---------------------------------------- | ------------------------------------------------------------ |
+| meta-llama/Llama-2-7b-chat-hf       | /path-to/Llama-2-7b-chat-hf              | [Link](https://huggingface.co/llamaste/Llama-2-7b-chat-hf)   |
+| meta-llama/Llama-2-13b-chat-hf      | /path-to/Llama-2-13b-chat-hf             | [Link](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)  |
+| meta-llama/Llama-2-70b-chat-hf      | /path-to/Llama-2-70b-chat-hf             | [Link](https://huggingface.co/llamaste/Llama-2-70b-chat-hf)  |
+| meta-llama/Llama-2-7b-hf            | /path-to/Llama-2-7b-hf                   | [Link](https://huggingface.co/meta-llama/Llama-2-7b-hf)      |
+| meta-llama/Llama-2-13b-hf           | /path-to/Llama-2-13b-hf                  | [Link](https://huggingface.co/meta-llama/Llama-2-13b-hf)     |
+| meta-llama/Llama-2-70b-hf           | /path-to/Llama-2-70b-hf                  | [Link](https://huggingface.co/meta-llama/Llama-2-70b-hf)     |
+| TheBloke/Llama-2-7b-Chat-GPTQ       | /path-to/Llama-2-7b-Chat-GPTQ            | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) |
+| TheBloke/Llama-2-7b-Chat-GGUF       | /path-to/llama-2-7b-chat.Q4_0.gguf       | [Link](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_0.gguf) |
+| TheBloke/Llama-2-7B-Chat-GGML       | /path-to/llama-2-7b-chat.ggmlv3.q4_0.bin | [Link](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) |
+| TheBloke/CodeLlama-7B-Instruct-GPTQ | TheBloke/CodeLlama-7B-Instruct-GPTQ      | [Link](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) |
+| ...                                 | ...                                      | ...                                                          |
+Running 4-bit model `Llama-2-7b-Chat-GPTQ` needs GPU with 6GB VRAM.
+Running 4-bit model `llama-2-7b-chat.ggmlv3.q4_0.bin` needs CPU with 6GB RAM. There is also a list of other 2, 3, 4, 5, 6, 8-bit GGML models that can be used from [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML).
+### Download Script
+These models can be downloaded through:
+```bash
+python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Python-GPTQ
+python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7b-Chat-GGUF --filename llama-2-7b-chat.Q4_0.gguf --save_dir ./models
+```
+Or use CMD like:
+```bash
+# Make sure you have git-lfs installed (https://git-lfs.com)
+git lfs install
+git clone git@hf.co:meta-llama/Llama-2-7b-chat-hf
+```
+To download Llama 2 models, you need to request access from [https://ai.meta.com/llama/](https://ai.meta.com/llama/) and also enable access on repos like [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main). Requests will be processed in hours.
+For GPTQ models like [TheBloke/Llama-2-7b-Chat-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), you can directly download without requesting access.
+For GGML models like [TheBloke/Llama-2-7B-Chat-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), you can directly download without requesting access.
+## Tips
+### Env Examples
+There are some examples in `./env_examples/` folder.
+| Model Setup                                            | Example .env                |
+| ------------------------------------------------------ | --------------------------- |
+| Llama-2-7b-chat-hf 8-bit (transformers backend)        | .env.7b_8bit_example        |
+| Llama-2-7b-Chat-GPTQ 4-bit (gptq transformers backend) | .env.7b_gptq_example        |
+| Llama-2-7B-Chat-GGML 4bit (llama.cpp backend)          | .env.7b_ggmlv3_q4_0_example |
+| Llama-2-13b-chat-hf (transformers backend)             | .env.13b_example            |
+| ...                                                    | ...                         |
+### Run on Nvidia GPU
+The running requires around 14GB of GPU VRAM for Llama-2-7b and 28GB of GPU VRAM for Llama-2-13b.
+If you are running on multiple GPUs, the model will be loaded automatically on GPUs and split the VRAM usage. That allows you to run Llama-2-7b (requires 14GB of GPU VRAM) on a setup like 2 GPUs (11GB VRAM each).
+#### Run bitsandbytes 8 bit
+If you do not have enough memory,  you can set up your `LOAD_IN_8BIT` as `True` in `.env`. This can reduce memory usage by around half with slightly degraded model quality. It is compatible with the CPU, GPU, and Metal backend.
+Llama-2-7b with 8-bit compression can run on a single GPU with 8 GB of VRAM, like an Nvidia RTX 2080Ti, RTX 4080, T4, V100 (16GB).
+#### Run GPTQ 4 bit
+If you want to run 4 bit  Llama-2 model like `Llama-2-7b-Chat-GPTQ`,  you can set up your `BACKEND_TYPE` as `gptq` in `.env` like example `.env.7b_gptq_example`.
+Make sure you have downloaded the 4-bit model from `Llama-2-7b-Chat-GPTQ` and set the `MODEL_PATH` and arguments in `.env` file.
+`Llama-2-7b-Chat-GPTQ` can run on a single GPU with 6 GB of VRAM.
+If you encounter an issue like `NameError: name 'autogptq_cuda_256' is not defined`, please refer to [this discussion](https://huggingface.co/TheBloke/open-llama-13b-open-instruct-GPTQ/discussions/1):
+> pip install https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl
+### Run on CPU
+Running a Llama-2 model on CPU requires the [llama.cpp](https://github.com/ggerganov/llama.cpp) dependency and the [llama.cpp Python Bindings](https://github.com/abetlen/llama-cpp-python), which are already installed.
+Download GGML models like `llama-2-7b-chat.ggmlv3.q4_0.bin` following [Download Llama-2 Models](#download-llama-2-models) section. `llama-2-7b-chat.ggmlv3.q4_0.bin` model requires at least 6 GB RAM to run on CPU.
+Set up configs like `.env.7b_ggmlv3_q4_0_example` from `env_examples` as `.env`.
+Run web UI `python app.py` .
+#### Mac Metal Acceleration
+For Mac users, you can also set up Mac Metal for acceleration; try installing these dependencies:
+```bash
+pip uninstall llama-cpp-python -y
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
+pip install 'llama-cpp-python[server]'
+```
+or check details:
+- [MacOS Install with Metal GPU](https://github.com/abetlen/llama-cpp-python/blob/main/docs/install/macos.md)
+#### AMD/Nvidia GPU Acceleration
+If you would like to use AMD/Nvidia GPU for acceleration, check this:
+- [Installation with OpenBLAS / cuBLAS / CLBlast / Metal](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)
+## License
+MIT - see [MIT License](LICENSE)
+This project enables users to adapt it freely for proprietary purposes without any restrictions.
+## Contributing
+Kindly read our [Contributing Guide](CONTRIBUTING.md) to learn and understand our development process.
+### All Contributors
+<a href="https://github.com/liltom-eth/llama2-webui/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=liltom-eth/llama2-webui" />
+</a>
+### Review
+<a href='https://github.com/repo-reviews/repo-reviews.github.io/blob/main/create.md' target="_blank"><img alt='Github' src='https://img.shields.io/badge/review-100000?style=flat&logo=Github&logoColor=white&labelColor=888888&color=555555'/></a>
+### Star History
+[![Star History Chart](https://api.star-history.com/svg?repos=liltom-eth/llama2-webui&type=Date)](https://star-history.com/#liltom-eth/llama2-webui&Date)
+## Credits
+- https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+- https://huggingface.co/spaces/huggingface-projects/llama-2-7b-chat
+- https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ
+- [https://github.com/ggerganov/llama.cpp](https://github.com/ggerganov/llama.cpp)
+- [https://github.com/TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
+- [https://github.com/PanQiWei/AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)
+- [https://github.com/abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)

app.py ADDED Viewed

	@@ -0,0 +1,418 @@

+import os
+import argparse
+from typing import Iterator
+import gradio as gr
+from dotenv import load_dotenv
+from distutils.util import strtobool
+from llama2_wrapper import LLAMA2_WRAPPER
+import logging
+from prompts.utils import PromtsContainer
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, default="", help="model path")
+    parser.add_argument(
+        "--backend_type",
+        type=str,
+        default="",
+        help="Backend options: llama.cpp, gptq, transformers",
+    )
+    parser.add_argument(
+        "--load_in_8bit",
+        type=bool,
+        default=False,
+        help="Whether to use bitsandbytes 8 bit.",
+    )
+    parser.add_argument(
+        "--share",
+        type=bool,
+        default=False,
+        help="Whether to share public for gradio.",
+    )
+    args = parser.parse_args()
+    load_dotenv()
+    DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
+    MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
+    DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024))
+    MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000))
+    MODEL_PATH = os.getenv("MODEL_PATH")
+    assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
+    BACKEND_TYPE = os.getenv("BACKEND_TYPE")
+    assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}"
+    LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))
+    if args.model_path != "":
+        MODEL_PATH = args.model_path
+    if args.backend_type != "":
+        BACKEND_TYPE = args.backend_type
+    if args.load_in_8bit:
+        LOAD_IN_8BIT = True
+    llama2_wrapper = LLAMA2_WRAPPER(
+        model_path=MODEL_PATH,
+        backend_type=BACKEND_TYPE,
+        max_tokens=MAX_INPUT_TOKEN_LENGTH,
+        load_in_8bit=LOAD_IN_8BIT,
+        # verbose=True,
+    )
+    DESCRIPTION = """
+    # llama2-webui
+    """
+    DESCRIPTION2 = """
+    - Supporting models: [Llama-2-7b](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ) ...
+    - Supporting model backends: [tranformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
+    """
+    def clear_and_save_textbox(message: str) -> tuple[str, str]:
+        return "", message
+    def save_textbox_for_prompt(message: str) -> str:
+        logging.info("start save_textbox_from_prompt")
+        message = convert_summary_to_prompt(message)
+        return message
+    def display_input(
+        message: str, history: list[tuple[str, str]]
+    ) -> list[tuple[str, str]]:
+        history.append((message, ""))
+        return history
+    def delete_prev_fn(
+        history: list[tuple[str, str]]
+    ) -> tuple[list[tuple[str, str]], str]:
+        try:
+            message, _ = history.pop()
+        except IndexError:
+            message = ""
+        return history, message or ""
+    def generate(
+        message: str,
+        history_with_input: list[tuple[str, str]],
+        system_prompt: str,
+        max_new_tokens: int,
+        temperature: float,
+        top_p: float,
+        top_k: int,
+    ) -> Iterator[list[tuple[str, str]]]:
+        if max_new_tokens > MAX_MAX_NEW_TOKENS:
+            raise ValueError
+        try:
+            history = history_with_input[:-1]
+            generator = llama2_wrapper.run(
+                message,
+                history,
+                system_prompt,
+                max_new_tokens,
+                temperature,
+                top_p,
+                top_k,
+            )
+            try:
+                first_response = next(generator)
+                yield history + [(message, first_response)]
+            except StopIteration:
+                yield history + [(message, "")]
+            for response in generator:
+                yield history + [(message, response)]
+        except Exception as e:
+            logging.exception(e)
+    def check_input_token_length(
+        message: str, chat_history: list[tuple[str, str]], system_prompt: str
+    ) -> None:
+        input_token_length = llama2_wrapper.get_input_token_length(
+            message, chat_history, system_prompt
+        )
+        if input_token_length > MAX_INPUT_TOKEN_LENGTH:
+            raise gr.Error(
+                f"The accumulated input is too long ({input_token_length} > {MAX_INPUT_TOKEN_LENGTH}). Clear your chat history and try again."
+            )
+    prompts_container = PromtsContainer()
+    prompts = prompts_container.get_prompts_tab_dict()
+    default_prompts_checkbox = False
+    default_advanced_checkbox = False
+    def convert_summary_to_prompt(summary):
+        return prompts_container.get_prompt_by_summary(summary)
+    def two_columns_list(tab_data, chatbot):
+        result = []
+        for i in range(int(len(tab_data) / 2) + 1):
+            row = gr.Row()
+            with row:
+                for j in range(2):
+                    index = 2 * i + j
+                    if index >= len(tab_data):
+                        break
+                    item = tab_data[index]
+                    with gr.Group():
+                        gr.HTML(
+                            f'<p style="color: black; font-weight: bold;">{item["act"]}</p>'
+                        )
+                        prompt_text = gr.Button(
+                            label="",
+                            value=f"{item['summary']}",
+                            size="sm",
+                            elem_classes="text-left-aligned",
+                        )
+                        prompt_text.click(
+                            fn=save_textbox_for_prompt,
+                            inputs=prompt_text,
+                            outputs=saved_input,
+                            api_name=False,
+                            queue=True,
+                        ).then(
+                            fn=display_input,
+                            inputs=[saved_input, chatbot],
+                            outputs=chatbot,
+                            api_name=False,
+                            queue=True,
+                        ).then(
+                            fn=check_input_token_length,
+                            inputs=[saved_input, chatbot, system_prompt],
+                            api_name=False,
+                            queue=False,
+                        ).success(
+                            fn=generate,
+                            inputs=[
+                                saved_input,
+                                chatbot,
+                                system_prompt,
+                                max_new_tokens,
+                                temperature,
+                                top_p,
+                                top_k,
+                            ],
+                            outputs=chatbot,
+                            api_name=False,
+                        )
+                result.append(row)
+        return result
+    CSS = """
+        .contain { display: flex; flex-direction: column;}
+        #component-0 #component-1 #component-2 #component-4 #component-5 { height:71vh !important; }
+        #component-0 #component-1 #component-24 > div:nth-child(2) { height:80vh !important; overflow-y:auto }
+        .text-left-aligned {text-align: left !important; font-size: 16px;}
+    """
+    with gr.Blocks(css=CSS) as demo:
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=2):
+                gr.Markdown(DESCRIPTION)
+                with gr.Group():
+                    chatbot = gr.Chatbot(label="Chatbot")
+                    with gr.Row():
+                        textbox = gr.Textbox(
+                            container=False,
+                            show_label=False,
+                            placeholder="Type a message...",
+                            scale=10,
+                        )
+                        submit_button = gr.Button(
+                            "Submit", variant="primary", scale=1, min_width=0
+                        )
+                with gr.Row():
+                    retry_button = gr.Button("🔄  Retry", variant="secondary")
+                    undo_button = gr.Button("↩️ Undo", variant="secondary")
+                    clear_button = gr.Button("🗑️  Clear", variant="secondary")
+                saved_input = gr.State()
+                with gr.Row():
+                    advanced_checkbox = gr.Checkbox(
+                        label="Advanced",
+                        value=default_prompts_checkbox,
+                        container=False,
+                        elem_classes="min_check",
+                    )
+                    prompts_checkbox = gr.Checkbox(
+                        label="Prompts",
+                        value=default_prompts_checkbox,
+                        container=False,
+                        elem_classes="min_check",
+                    )
+                with gr.Column(visible=default_advanced_checkbox) as advanced_column:
+                    system_prompt = gr.Textbox(
+                        label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6
+                    )
+                    max_new_tokens = gr.Slider(
+                        label="Max new tokens",
+                        minimum=1,
+                        maximum=MAX_MAX_NEW_TOKENS,
+                        step=1,
+                        value=DEFAULT_MAX_NEW_TOKENS,
+                    )
+                    temperature = gr.Slider(
+                        label="Temperature",
+                        minimum=0.1,
+                        maximum=4.0,
+                        step=0.1,
+                        value=1.0,
+                    )
+                    top_p = gr.Slider(
+                        label="Top-p (nucleus sampling)",
+                        minimum=0.05,
+                        maximum=1.0,
+                        step=0.05,
+                        value=0.95,
+                    )
+                    top_k = gr.Slider(
+                        label="Top-k",
+                        minimum=1,
+                        maximum=1000,
+                        step=1,
+                        value=50,
+                    )
+            with gr.Column(scale=1, visible=default_prompts_checkbox) as prompt_column:
+                gr.HTML(
+                    '<p style="color: green; font-weight: bold;font-size: 16px;">\N{four leaf clover} prompts</p>'
+                )
+                for k, v in prompts.items():
+                    with gr.Tab(k, scroll_to_output=True):
+                        lst = two_columns_list(v, chatbot)
+            prompts_checkbox.change(
+                lambda x: gr.update(visible=x),
+                prompts_checkbox,
+                prompt_column,
+                queue=False,
+            )
+            advanced_checkbox.change(
+                lambda x: gr.update(visible=x),
+                advanced_checkbox,
+                advanced_column,
+                queue=False,
+            )
+        textbox.submit(
+            fn=clear_and_save_textbox,
+            inputs=textbox,
+            outputs=[textbox, saved_input],
+            api_name=False,
+            queue=False,
+        ).then(
+            fn=display_input,
+            inputs=[saved_input, chatbot],
+            outputs=chatbot,
+            api_name=False,
+            queue=False,
+        ).then(
+            fn=check_input_token_length,
+            inputs=[saved_input, chatbot, system_prompt],
+            api_name=False,
+            queue=False,
+        ).success(
+            fn=generate,
+            inputs=[
+                saved_input,
+                chatbot,
+                system_prompt,
+                max_new_tokens,
+                temperature,
+                top_p,
+                top_k,
+            ],
+            outputs=chatbot,
+            api_name=False,
+        )
+        button_event_preprocess = (
+            submit_button.click(
+                fn=clear_and_save_textbox,
+                inputs=textbox,
+                outputs=[textbox, saved_input],
+                api_name=False,
+                queue=False,
+            )
+            .then(
+                fn=display_input,
+                inputs=[saved_input, chatbot],
+                outputs=chatbot,
+                api_name=False,
+                queue=False,
+            )
+            .then(
+                fn=check_input_token_length,
+                inputs=[saved_input, chatbot, system_prompt],
+                api_name=False,
+                queue=False,
+            )
+            .success(
+                fn=generate,
+                inputs=[
+                    saved_input,
+                    chatbot,
+                    system_prompt,
+                    max_new_tokens,
+                    temperature,
+                    top_p,
+                    top_k,
+                ],
+                outputs=chatbot,
+                api_name=False,
+            )
+        )
+        retry_button.click(
+            fn=delete_prev_fn,
+            inputs=chatbot,
+            outputs=[chatbot, saved_input],
+            api_name=False,
+            queue=False,
+        ).then(
+            fn=display_input,
+            inputs=[saved_input, chatbot],
+            outputs=chatbot,
+            api_name=False,
+            queue=False,
+        ).then(
+            fn=generate,
+            inputs=[
+                saved_input,
+                chatbot,
+                system_prompt,
+                max_new_tokens,
+                temperature,
+                top_p,
+                top_k,
+            ],
+            outputs=chatbot,
+            api_name=False,
+        )
+        undo_button.click(
+            fn=delete_prev_fn,
+            inputs=chatbot,
+            outputs=[chatbot, saved_input],
+            api_name=False,
+            queue=False,
+        ).then(
+            fn=lambda x: x,
+            inputs=[saved_input],
+            outputs=textbox,
+            api_name=False,
+            queue=False,
+        )
+        clear_button.click(
+            fn=lambda: ([], ""),
+            outputs=[chatbot, saved_input],
+            queue=False,
+            api_name=False,
+        )
+    demo.queue(max_size=20).launch(share=args.share)
+if __name__ == "__main__":
+    main()

benchmark.py ADDED Viewed

	@@ -0,0 +1,145 @@

+import os
+import time
+import argparse
+from dotenv import load_dotenv
+from distutils.util import strtobool
+from memory_profiler import memory_usage
+from tqdm import tqdm
+from llama2_wrapper import LLAMA2_WRAPPER
+def run_iteration(
+    llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS
+):
+    def generation():
+        generator = llama2_wrapper.run(
+            prompt_example,
+            [],
+            DEFAULT_SYSTEM_PROMPT,
+            DEFAULT_MAX_NEW_TOKENS,
+            1,
+            0.95,
+            50,
+        )
+        model_response = None
+        try:
+            first_model_response = next(generator)
+        except StopIteration:
+            pass
+        for model_response in generator:
+            pass
+        return llama2_wrapper.get_token_length(model_response), model_response
+    tic = time.perf_counter()
+    mem_usage, (output_token_length, model_response) = memory_usage(
+        (generation,), max_usage=True, retval=True
+    )
+    toc = time.perf_counter()
+    generation_time = toc - tic
+    tokens_per_second = output_token_length / generation_time
+    return generation_time, tokens_per_second, mem_usage, model_response
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--iter", type=int, default=5, help="Number of iterations")
+    parser.add_argument("--model_path", type=str, default="", help="model path")
+    parser.add_argument(
+        "--backend_type",
+        type=str,
+        default="",
+        help="Backend options: llama.cpp, gptq, transformers",
+    )
+    parser.add_argument(
+        "--load_in_8bit",
+        type=bool,
+        default=False,
+        help="Whether to use bitsandbytes 8 bit.",
+    )
+    args = parser.parse_args()
+    load_dotenv()
+    DEFAULT_SYSTEM_PROMPT = os.getenv("DEFAULT_SYSTEM_PROMPT", "")
+    MAX_MAX_NEW_TOKENS = int(os.getenv("MAX_MAX_NEW_TOKENS", 2048))
+    DEFAULT_MAX_NEW_TOKENS = int(os.getenv("DEFAULT_MAX_NEW_TOKENS", 1024))
+    MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", 4000))
+    MODEL_PATH = os.getenv("MODEL_PATH")
+    assert MODEL_PATH is not None, f"MODEL_PATH is required, got: {MODEL_PATH}"
+    BACKEND_TYPE = os.getenv("BACKEND_TYPE")
+    assert BACKEND_TYPE is not None, f"BACKEND_TYPE is required, got: {BACKEND_TYPE}"
+    LOAD_IN_8BIT = bool(strtobool(os.getenv("LOAD_IN_8BIT", "True")))
+    if args.model_path != "":
+        MODEL_PATH = args.model_path
+    if args.backend_type != "":
+        BACKEND_TYPE = args.backend_type
+    if args.load_in_8bit:
+        LOAD_IN_8BIT = True
+    # Initialization
+    init_tic = time.perf_counter()
+    llama2_wrapper = LLAMA2_WRAPPER(
+        model_path=MODEL_PATH,
+        backend_type=BACKEND_TYPE,
+        max_tokens=MAX_INPUT_TOKEN_LENGTH,
+        load_in_8bit=LOAD_IN_8BIT,
+        # verbose=True,
+    )
+    init_toc = time.perf_counter()
+    initialization_time = init_toc - init_tic
+    total_time = 0
+    total_tokens_per_second = 0
+    total_memory_gen = 0
+    prompt_example = (
+        "Can you explain briefly to me what is the Python programming language?"
+    )
+    # Cold run
+    print("Performing cold run...")
+    run_iteration(
+        llama2_wrapper, prompt_example, DEFAULT_SYSTEM_PROMPT, DEFAULT_MAX_NEW_TOKENS
+    )
+    # Timed runs
+    print(f"Performing {args.iter} timed runs...")
+    for i in tqdm(range(args.iter)):
+        try:
+            gen_time, tokens_per_sec, mem_gen, model_response = run_iteration(
+                llama2_wrapper,
+                prompt_example,
+                DEFAULT_SYSTEM_PROMPT,
+                DEFAULT_MAX_NEW_TOKENS,
+            )
+            total_time += gen_time
+            total_tokens_per_second += tokens_per_sec
+            total_memory_gen += mem_gen
+        except:
+            break
+    avg_time = total_time / (i + 1)
+    avg_tokens_per_second = total_tokens_per_second / (i + 1)
+    avg_memory_gen = total_memory_gen / (i + 1)
+    print(f"Last model response: {model_response}")
+    print(f"Initialization time: {initialization_time:0.4f} seconds.")
+    print(
+        f"Average generation time over {(i + 1)} iterations: {avg_time:0.4f} seconds."
+    )
+    print(
+        f"Average speed over {(i + 1)} iterations: {avg_tokens_per_second:0.4f} tokens/sec."
+    )
+    print(f"Average memory usage during generation: {avg_memory_gen:.2f} MiB")
+if __name__ == "__main__":
+    main()

code_completion.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import argparse
+import gradio as gr
+from llama2_wrapper import LLAMA2_WRAPPER
+FIM_PREFIX = "<PRE> "
+FIM_MIDDLE = " <MID>"
+FIM_SUFFIX = " <SUF>"
+FIM_INDICATOR = "<FILL_ME>"
+EOS_STRING = "</s>"
+EOT_STRING = "<EOT>"
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="./models/codellama-7b-instruct.ggmlv3.Q4_0.bin",
+        help="model path",
+    )
+    parser.add_argument(
+        "--backend_type",
+        type=str,
+        default="llama.cpp",
+        help="Backend options: llama.cpp, gptq, transformers",
+    )
+    parser.add_argument(
+        "--max_tokens",
+        type=int,
+        default=4000,
+        help="Maximum context size.",
+    )
+    parser.add_argument(
+        "--load_in_8bit",
+        type=bool,
+        default=False,
+        help="Whether to use bitsandbytes 8 bit.",
+    )
+    parser.add_argument(
+        "--share",
+        type=bool,
+        default=False,
+        help="Whether to share public for gradio.",
+    )
+    args = parser.parse_args()
+    llama2_wrapper = LLAMA2_WRAPPER(
+        model_path=args.model_path,
+        backend_type=args.backend_type,
+        max_tokens=args.max_tokens,
+        load_in_8bit=args.load_in_8bit,
+    )
+    def generate(
+        prompt,
+        temperature=0.9,
+        max_new_tokens=256,
+        top_p=0.95,
+        repetition_penalty=1.0,
+    ):
+        temperature = float(temperature)
+        if temperature < 1e-2:
+            temperature = 1e-2
+        top_p = float(top_p)
+        fim_mode = False
+        generate_kwargs = dict(
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            stream=True,
+        )
+        if FIM_INDICATOR in prompt:
+            fim_mode = True
+            try:
+                prefix, suffix = prompt.split(FIM_INDICATOR)
+            except:
+                raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!")
+            prompt = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
+        stream = llama2_wrapper.__call__(prompt, **generate_kwargs)
+        if fim_mode:
+            output = prefix
+        else:
+            output = prompt
+        # for response in stream:
+        #     output += response
+        #     yield output
+        # return output
+        previous_token = ""
+        for response in stream:
+            if any([end_token in response for end_token in [EOS_STRING, EOT_STRING]]):
+                if fim_mode:
+                    output += suffix
+                    yield output
+                    return output
+                    print("output", output)
+                else:
+                    return output
+            else:
+                output += response
+            previous_token = response
+            yield output
+        return output
+    examples = [
+        'def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\nprint(remove_non_ascii(\'afkdj$$(\'))',
+        "X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1)\n\n# Train a logistic regression model, predict the labels on the test set and compute the accuracy score",
+        "// Returns every other value in the array as a new array.\nfunction everyOther(arr) {",
+        "Poor English: She no went to the market. Corrected English:",
+        "def alternating(list1, list2):\n   results = []\n   for i in range(min(len(list1), len(list2))):\n       results.append(list1[i])\n       results.append(list2[i])\n   if len(list1) > len(list2):\n       <FILL_ME>\n   else:\n       results.extend(list2[i+1:])\n   return results",
+    ]
+    def process_example(args):
+        for x in generate(args):
+            pass
+        return x
+    description = """
+    <div style="text-align: center;">
+        <h1>Code Llama Playground</h1>
+    </div>
+    <div style="text-align: center;">
+        <p>This is a demo to complete code with Code Llama. For instruction purposes, please use llama2-webui app.py with CodeLlama-Instruct models. </p>
+    </div>
+    """
+    with gr.Blocks() as demo:
+        with gr.Column():
+            gr.Markdown(description)
+            with gr.Row():
+                with gr.Column():
+                    instruction = gr.Textbox(
+                        placeholder="Enter your code here",
+                        lines=5,
+                        label="Input",
+                        elem_id="q-input",
+                    )
+                    submit = gr.Button("Generate", variant="primary")
+                    output = gr.Code(elem_id="q-output", lines=30, label="Output")
+                    with gr.Row():
+                        with gr.Column():
+                            with gr.Accordion("Advanced settings", open=False):
+                                with gr.Row():
+                                    column_1, column_2 = gr.Column(), gr.Column()
+                                    with column_1:
+                                        temperature = gr.Slider(
+                                            label="Temperature",
+                                            value=0.1,
+                                            minimum=0.0,
+                                            maximum=1.0,
+                                            step=0.05,
+                                            interactive=True,
+                                            info="Higher values produce more diverse outputs",
+                                        )
+                                        max_new_tokens = gr.Slider(
+                                            label="Max new tokens",
+                                            value=256,
+                                            minimum=0,
+                                            maximum=8192,
+                                            step=64,
+                                            interactive=True,
+                                            info="The maximum numbers of new tokens",
+                                        )
+                                    with column_2:
+                                        top_p = gr.Slider(
+                                            label="Top-p (nucleus sampling)",
+                                            value=0.90,
+                                            minimum=0.0,
+                                            maximum=1,
+                                            step=0.05,
+                                            interactive=True,
+                                            info="Higher values sample more low-probability tokens",
+                                        )
+                                        repetition_penalty = gr.Slider(
+                                            label="Repetition penalty",
+                                            value=1.05,
+                                            minimum=1.0,
+                                            maximum=2.0,
+                                            step=0.05,
+                                            interactive=True,
+                                            info="Penalize repeated tokens",
+                                        )
+                    gr.Examples(
+                        examples=examples,
+                        inputs=[instruction],
+                        cache_examples=False,
+                        fn=process_example,
+                        outputs=[output],
+                    )
+        submit.click(
+            generate,
+            inputs=[
+                instruction,
+                temperature,
+                max_new_tokens,
+                top_p,
+                repetition_penalty,
+            ],
+            outputs=[output],
+        )
+    demo.queue(concurrency_count=16).launch(share=args.share)
+if __name__ == "__main__":
+    main()

colab/Llama_2_7b_Chat_GPTQ.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

colab/ggmlv3_q4_0.ipynb ADDED Viewed

	@@ -0,0 +1,109 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "toc_visible": true,
+      "authorship_tag": "ABX9TyM9WbudQYrVFksXUrt4Opt3",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "7O5JSosg5-rx"
+      },
+      "outputs": [],
+      "source": [
+        "%cd /content\n",
+        "!pip install llama2-wrapper\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from llama2_wrapper import LLAMA2_WRAPPER, get_prompt\n",
+        "\n",
+        "llama2_wrapper = LLAMA2_WRAPPER()"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "8rgb1ckl72wC",
+        "outputId": "d9ca2e20-26a5-490b-86f2-1a182e533b20"
+      },
+      "execution_count": 5,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Running on backend llama.cpp.\n",
+            "Use default model path: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n",
+            "Start downloading model to: ./models/llama-2-7b-chat.ggmlv3.q4_0.bin\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "prompt = get_prompt(\"Hi do you know Pytorch?\")\n",
+        "print(llama2_wrapper(prompt))"
+      ],
+      "metadata": {
+        "id": "Qz2xAqozTIf6",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "1380fa52-3d4a-4ac5-ed02-7faefe7ec2f6"
+      },
+      "execution_count": 3,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "  Yes, I'm familiar with PyTorch! PyTorch is an open-source deep learning framework that is widely used for building and training neural networks. It was originally developed by Facebook and is now maintained by the PyTorch Foundation.\n",
+            "\n",
+            "Here are some key features and capabilities of PyTorch:\n",
+            "\n",
+            "1. **Tensor Computation**: PyTorch provides a powerful tensor computation engine that allows for complex mathematical operations on large datasets.\n",
+            "2. **Autograd**: PyTorch's autograd system automatically computes gradients, which can save a lot of time and effort during training.\n",
+            "3. **Dynamic Compute**: PyTorch's dynamic compute system allows for more efficient computation by only computing the necessary computations at runtime.\n",
+            "4. **Memory-efficient**: PyTorch is designed to be memory-efficient, which is important for training large models that require a lot of memory.\n",
+            "5. **Accelerators**: PyTorch supports a wide range of accelerators, including GPUs, TPUs, and FPGAs, which can significantly speed up training times.\n",
+            "6. **Modules**: PyTorch provides a wide range of pre-built modules for common tasks, such as convolutional layers, recurrent neural networks, and more.\n",
+            "7. **Extensive Community**: PyTorch has a large and active community of developers and users, which can be helpful for getting support and staying up-to-date with the latest developments.\n",
+            "8. **Easy Integration**: PyTorch can be easily integrated with other popular deep learning frameworks, such as TensorFlow and Keras.\n",
+            "9. **Pythonic**: PyTorch is written in Python, which is a popular and easy-to-learn programming language.\n",
+            "10. **Flexible**: PyTorch allows for a wide range of customization options, which can be useful for building and training unique models.\n",
+            "\n",
+            "Overall, PyTorch is a powerful and flexible deep learning framework that can be used for a wide range of applications, including computer vision, natural language processing, and more.\n"
+          ]
+        }
+      ]
+    }
+  ]
+}

colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb ADDED Viewed

	@@ -0,0 +1,514 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4",
+      "authorship_tag": "ABX9TyOZhPcZe61RhDjhEFQv0vrl",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/liltom-eth/llama2-webui/blob/main/colab/webui_CodeLlama_7B_Instruct_GPTQ.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "7O5JSosg5-rx"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -U llama2-wrapper==0.1.12"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%cd /content\n",
+        "!git clone https://github.com/liltom-eth/llama2-webui\n",
+        "\n",
+        "%cd /content/llama2-webui\n",
+        "!python -m llama2_wrapper.download --repo_id TheBloke/CodeLlama-7B-Instruct-GPTQ\n",
+        "\n",
+        "%cd /content/llama2-webui\n",
+        "!python app.py --backend_type gptq --model_path ./models/CodeLlama-7B-Instruct-GPTQ/ --share True"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Y6A7bJdkmzY8",
+        "outputId": "0d702a7d-68ab-4747-f012-246d4dee3718"
+      },
+      "execution_count": 4,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "/content\n",
+            "fatal: destination path 'llama2-webui' already exists and is not an empty directory.\n",
+            "/content/llama2-webui\n",
+            "Start downloading model TheBloke/CodeLlama-7B-Instruct-GPTQ to: ./models/CodeLlama-7B-Instruct-GPTQ\n",
+            "Fetching 15 files:   0% 0/15 [00:00<?, ?it/s]\n",
+            "Downloading (…)d0d05/.gitattributes: 100% 1.52k/1.52k [00:00<00:00, 7.94MB/s]\n",
+            "Fetching 15 files:   7% 1/15 [00:01<00:16,  1.15s/it]\n",
+            "Downloading (…)478d0d05/LICENSE.txt: 100% 7.02k/7.02k [00:00<00:00, 31.6MB/s]\n",
+            "\n",
+            "Downloading (…)478d0d05/config.json: 100% 1.25k/1.25k [00:00<00:00, 7.95MB/s]\n",
+            "\n",
+            "Downloading (…)nfiguration_llama.py: 100% 8.56k/8.56k [00:00<00:00, 41.7MB/s]\n",
+            "\n",
+            "Downloading (…)81b84478d0d05/Notice: 100% 112/112 [00:00<00:00, 750kB/s]\n",
+            "\n",
+            "Downloading (…)neration_config.json: 100% 132/132 [00:00<00:00, 836kB/s]\n",
+            "\n",
+            "Downloading (…)8d0d05/USE_POLICY.md: 100% 105/105 [00:00<00:00, 686kB/s]\n",
+            "\n",
+            "Downloading (…)84478d0d05/README.md: 100% 22.0k/22.0k [00:00<00:00, 59.5MB/s]\n",
+            "\n",
+            "Downloading (…)05/modeling_llama.py: 100% 45.9k/45.9k [00:00<00:00, 27.5MB/s]\n",
+            "\n",
+            "Downloading (…)quantize_config.json: 100% 187/187 [00:00<00:00, 1.34MB/s]\n",
+            "\n",
+            "Downloading (…)cial_tokens_map.json: 100% 411/411 [00:00<00:00, 2.82MB/s]\n",
+            "\n",
+            "Downloading (…)d0d05/tokenizer.json:   0% 0.00/1.84M [00:00<?, ?B/s]\u001b[A\n",
+            "\n",
+            "Downloading (…)okenizer_config.json: 100% 824/824 [00:00<00:00, 5.75MB/s]\n",
+            "\n",
+            "\n",
+            "Downloading model.safetensors:   0% 0.00/3.90G [00:00<?, ?B/s]\u001b[A\u001b[A\n",
+            "\n",
+            "\n",
+            "Downloading tokenizer.model: 100% 500k/500k [00:00<00:00, 16.3MB/s]\n",
+            "\n",
+            "Downloading (…)d0d05/tokenizer.json: 100% 1.84M/1.84M [00:00<00:00, 5.47MB/s]\n",
+            "\n",
+            "\n",
+            "Downloading model.safetensors:   0% 10.5M/3.90G [00:00<01:08, 56.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   1% 21.0M/3.90G [00:00<00:57, 67.1MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   1% 31.5M/3.90G [00:00<00:51, 75.5MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   1% 52.4M/3.90G [00:00<00:40, 94.5MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   2% 73.4M/3.90G [00:00<00:33, 113MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   2% 94.4M/3.90G [00:00<00:28, 133MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   3% 115M/3.90G [00:00<00:25, 148MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   3% 136M/3.90G [00:01<00:24, 156MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   4% 157M/3.90G [00:01<00:22, 167MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   5% 178M/3.90G [00:01<00:22, 168MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   5% 199M/3.90G [00:01<00:21, 169MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   6% 220M/3.90G [00:01<00:21, 170MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   6% 241M/3.90G [00:01<00:21, 174MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   7% 262M/3.90G [00:01<00:20, 177MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   7% 283M/3.90G [00:02<01:08, 52.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   8% 315M/3.90G [00:02<00:47, 75.6MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   9% 346M/3.90G [00:03<00:36, 97.8MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:   9% 367M/3.90G [00:03<00:31, 111MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  10% 388M/3.90G [00:03<00:28, 122MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  10% 409M/3.90G [00:03<00:26, 134MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  11% 430M/3.90G [00:03<00:24, 141MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  12% 461M/3.90G [00:03<00:21, 160MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  12% 482M/3.90G [00:03<00:20, 165MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  13% 503M/3.90G [00:04<00:20, 166MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  13% 524M/3.90G [00:04<00:19, 170MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  14% 556M/3.90G [00:04<00:18, 181MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  15% 577M/3.90G [00:04<00:18, 182MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  15% 598M/3.90G [00:04<00:18, 183MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  16% 619M/3.90G [00:04<00:17, 184MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  16% 640M/3.90G [00:04<00:17, 184MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  17% 661M/3.90G [00:04<00:18, 178MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  17% 682M/3.90G [00:04<00:17, 180MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  18% 703M/3.90G [00:05<00:17, 180MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  19% 724M/3.90G [00:05<00:17, 181MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  19% 744M/3.90G [00:05<00:18, 171MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  20% 765M/3.90G [00:05<00:18, 173MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  20% 786M/3.90G [00:05<00:17, 175MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  21% 807M/3.90G [00:05<00:17, 178MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  21% 828M/3.90G [00:05<00:17, 180MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  22% 849M/3.90G [00:05<00:16, 182MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  22% 870M/3.90G [00:07<01:37, 30.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  23% 891M/3.90G [00:08<01:13, 40.8MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  24% 923M/3.90G [00:08<00:50, 59.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  24% 944M/3.90G [00:08<00:42, 70.2MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  25% 975M/3.90G [00:08<00:30, 94.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  26% 996M/3.90G [00:08<00:27, 107MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  26% 1.02G/3.90G [00:08<00:23, 121MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  27% 1.04G/3.90G [00:08<00:21, 134MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  27% 1.06G/3.90G [00:08<00:20, 141MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  28% 1.08G/3.90G [00:09<00:18, 151MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  28% 1.10G/3.90G [00:09<00:17, 160MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  29% 1.12G/3.90G [00:09<00:16, 166MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  29% 1.14G/3.90G [00:09<00:16, 171MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  30% 1.16G/3.90G [00:09<00:15, 175MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  30% 1.18G/3.90G [00:09<00:15, 178MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  31% 1.21G/3.90G [00:09<00:15, 179MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  31% 1.23G/3.90G [00:09<00:14, 181MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  32% 1.25G/3.90G [00:09<00:14, 182MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  33% 1.27G/3.90G [00:10<00:23, 113MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  33% 1.29G/3.90G [00:10<00:20, 128MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  34% 1.31G/3.90G [00:10<00:18, 139MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  34% 1.33G/3.90G [00:10<00:17, 150MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  35% 1.35G/3.90G [00:10<00:16, 158MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  35% 1.37G/3.90G [00:12<01:24, 29.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  36% 1.41G/3.90G [00:12<00:55, 45.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  37% 1.44G/3.90G [00:13<00:39, 63.0MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  37% 1.46G/3.90G [00:13<00:33, 72.6MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  38% 1.48G/3.90G [00:13<00:29, 82.0MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  38% 1.50G/3.90G [00:13<00:24, 98.6MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  39% 1.53G/3.90G [00:13<00:19, 124MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  40% 1.55G/3.90G [00:13<00:17, 132MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  40% 1.57G/3.90G [00:13<00:16, 143MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  41% 1.59G/3.90G [00:14<00:15, 153MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  41% 1.61G/3.90G [00:14<00:14, 160MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  42% 1.64G/3.90G [00:14<00:13, 167MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  43% 1.66G/3.90G [00:14<00:13, 171MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  43% 1.68G/3.90G [00:14<00:12, 177MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  44% 1.70G/3.90G [00:14<00:12, 174MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  44% 1.72G/3.90G [00:14<00:12, 173MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  45% 1.74G/3.90G [00:14<00:12, 175MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  45% 1.76G/3.90G [00:14<00:11, 179MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  46% 1.78G/3.90G [00:15<00:12, 172MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  46% 1.80G/3.90G [00:15<00:12, 174MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  47% 1.82G/3.90G [00:15<00:11, 177MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  47% 1.85G/3.90G [00:16<00:28, 71.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  48% 1.87G/3.90G [00:16<00:23, 87.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  49% 1.90G/3.90G [00:16<00:16, 118MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  49% 1.92G/3.90G [00:16<00:14, 132MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  50% 1.94G/3.90G [00:16<00:13, 143MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  50% 1.96G/3.90G [00:16<00:12, 152MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  51% 1.98G/3.90G [00:16<00:13, 142MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  51% 2.00G/3.90G [00:16<00:13, 144MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  52% 2.02G/3.90G [00:17<00:12, 144MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  52% 2.04G/3.90G [00:17<00:12, 148MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  53% 2.07G/3.90G [00:17<00:12, 152MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  54% 2.09G/3.90G [00:17<00:22, 81.2MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  54% 2.12G/3.90G [00:18<00:16, 107MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  55% 2.14G/3.90G [00:18<00:14, 119MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  55% 2.16G/3.90G [00:18<00:14, 123MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  56% 2.18G/3.90G [00:18<00:13, 131MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  57% 2.21G/3.90G [00:18<00:10, 156MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  57% 2.23G/3.90G [00:18<00:10, 162MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  58% 2.25G/3.90G [00:18<00:10, 160MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  59% 2.29G/3.90G [00:18<00:09, 174MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  59% 2.31G/3.90G [00:19<00:08, 178MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  60% 2.33G/3.90G [00:19<00:08, 180MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  60% 2.35G/3.90G [00:19<00:08, 181MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  61% 2.37G/3.90G [00:19<00:08, 181MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  61% 2.39G/3.90G [00:19<00:08, 181MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  62% 2.41G/3.90G [00:19<00:08, 182MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  62% 2.43G/3.90G [00:19<00:08, 182MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  63% 2.45G/3.90G [00:19<00:08, 177MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  64% 2.47G/3.90G [00:20<00:11, 124MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  64% 2.51G/3.90G [00:20<00:09, 149MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  65% 2.53G/3.90G [00:22<00:40, 34.2MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  66% 2.56G/3.90G [00:22<00:26, 50.1MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  66% 2.58G/3.90G [00:22<00:21, 60.1MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  67% 2.60G/3.90G [00:22<00:18, 69.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  67% 2.62G/3.90G [00:22<00:15, 84.0MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  68% 2.64G/3.90G [00:22<00:12, 99.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  68% 2.66G/3.90G [00:23<00:12, 96.0MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  69% 2.68G/3.90G [00:23<00:12, 95.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  69% 2.71G/3.90G [00:23<00:14, 84.2MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  70% 2.73G/3.90G [00:23<00:14, 82.0MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  70% 2.74G/3.90G [00:24<00:14, 80.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  70% 2.75G/3.90G [00:24<00:15, 75.8MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  71% 2.76G/3.90G [00:24<00:15, 75.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  71% 2.77G/3.90G [00:24<00:15, 72.2MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  71% 2.78G/3.90G [00:24<00:14, 74.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  72% 2.79G/3.90G [00:24<00:14, 74.7MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  72% 2.80G/3.90G [00:25<00:15, 69.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  72% 2.81G/3.90G [00:25<00:15, 71.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  72% 2.82G/3.90G [00:25<00:13, 77.5MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  73% 2.84G/3.90G [00:25<00:12, 84.6MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  73% 2.85G/3.90G [00:25<00:12, 83.8MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  73% 2.86G/3.90G [00:25<00:12, 81.6MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  74% 2.88G/3.90G [00:25<00:10, 97.2MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  75% 2.90G/3.90G [00:26<00:08, 118MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  75% 2.93G/3.90G [00:26<00:07, 134MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  76% 2.95G/3.90G [00:26<00:06, 149MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  76% 2.97G/3.90G [00:26<00:05, 159MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  77% 2.99G/3.90G [00:27<00:23, 37.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  77% 3.02G/3.90G [00:27<00:15, 57.4MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  78% 3.04G/3.90G [00:28<00:12, 67.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  79% 3.06G/3.90G [00:28<00:10, 78.8MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  79% 3.08G/3.90G [00:28<00:08, 92.9MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  80% 3.10G/3.90G [00:28<00:07, 109MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  80% 3.14G/3.90G [00:28<00:05, 138MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  81% 3.16G/3.90G [00:28<00:05, 146MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  82% 3.18G/3.90G [00:28<00:04, 152MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  82% 3.20G/3.90G [00:29<00:04, 161MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  83% 3.22G/3.90G [00:29<00:03, 170MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  83% 3.24G/3.90G [00:29<00:04, 158MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  84% 3.26G/3.90G [00:29<00:04, 156MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  84% 3.28G/3.90G [00:29<00:03, 160MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  85% 3.30G/3.90G [00:29<00:03, 162MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  85% 3.32G/3.90G [00:29<00:03, 160MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  86% 3.34G/3.90G [00:29<00:03, 171MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  87% 3.38G/3.90G [00:30<00:02, 191MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  87% 3.40G/3.90G [00:30<00:02, 188MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  88% 3.42G/3.90G [00:30<00:02, 187MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  88% 3.44G/3.90G [00:30<00:02, 182MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  89% 3.46G/3.90G [00:30<00:02, 183MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  89% 3.48G/3.90G [00:30<00:02, 183MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  90% 3.50G/3.90G [00:30<00:02, 184MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  90% 3.52G/3.90G [00:30<00:02, 185MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  91% 3.54G/3.90G [00:30<00:01, 183MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  91% 3.57G/3.90G [00:31<00:05, 55.5MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  92% 3.59G/3.90G [00:32<00:08, 38.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  93% 3.61G/3.90G [00:32<00:05, 50.7MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  93% 3.63G/3.90G [00:33<00:04, 65.0MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  94% 3.65G/3.90G [00:33<00:03, 80.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  94% 3.67G/3.90G [00:33<00:02, 97.3MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  95% 3.69G/3.90G [00:33<00:01, 113MB/s] \u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  95% 3.71G/3.90G [00:33<00:01, 128MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  96% 3.73G/3.90G [00:33<00:01, 139MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  96% 3.75G/3.90G [00:33<00:00, 153MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  97% 3.77G/3.90G [00:33<00:00, 158MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  97% 3.80G/3.90G [00:34<00:00, 165MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  98% 3.82G/3.90G [00:34<00:00, 167MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  98% 3.84G/3.90G [00:34<00:00, 169MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors:  99% 3.86G/3.90G [00:34<00:00, 174MB/s]\u001b[A\u001b[A\n",
+            "\n",
+            "Downloading model.safetensors: 100% 3.90G/3.90G [00:34<00:00, 113MB/s]\n",
+            "Fetching 15 files: 100% 15/15 [00:36<00:00,  2.41s/it]\n",
+            "/content/llama2-webui\n",
+            "Running on GPU with backend torch transformers.\n",
+            "2023-08-26 07:14:25.222792: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
+            "skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.\n",
+            "Caching examples at: '/content/llama2-webui/gradio_cached_examples/19'\n",
+            "Caching example 1/5\n",
+            "Caching example 2/5\n",
+            "Caching example 3/5\n",
+            "Caching example 4/5\n",
+            "Caching example 5/5\n",
+            "Caching complete\n",
+            "\n",
+            "Running on local URL:  http://127.0.0.1:7860\n",
+            "Running on public URL: https://71c3606942c440e7dd.gradio.live\n",
+            "\n",
+            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n",
+            "Keyboard interruption in main thread... closing server.\n",
+            "Traceback (most recent call last):\n",
+            "  File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2130, in block_thread\n",
+            "    time.sleep(0.1)\n",
+            "KeyboardInterrupt\n",
+            "\n",
+            "During handling of the above exception, another exception occurred:\n",
+            "\n",
+            "Traceback (most recent call last):\n",
+            "  File \"/content/llama2-webui/app.py\", line 322, in <module>\n",
+            "    main()\n",
+            "  File \"/content/llama2-webui/app.py\", line 318, in main\n",
+            "    demo.queue(max_size=20).launch(share=args.share)\n",
+            "  File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2046, in launch\n",
+            "    self.block_thread()\n",
+            "  File \"/usr/local/lib/python3.10/dist-packages/gradio/blocks.py\", line 2132, in block_thread\n",
+            "    print(\"Keyboard interruption in main thread... closing server.\")\n",
+            "KeyboardInterrupt\n",
+            "Killing tunnel 127.0.0.1:7860 <> https://71c3606942c440e7dd.gradio.live\n",
+            "terminate called without an active exception\n"
+          ]
+        }
+      ]
+    }
+  ]
+}

docs/issues.md ADDED Viewed

File without changes

docs/news.md ADDED Viewed

	@@ -0,0 +1,38 @@

+# News
+- [2023/09] The newest `llama2-wrapper>=0.1.14` supports llama.cpp's `gguf` models.
+- [2023/08] 🔥 For developers, we offer a web server that acts as a drop-in replacement for the OpenAI API.
+  - Usage:
+    ```
+    python3 -m llama2_wrapper.server
+    ```
+- [2023/08] 🔥 For developers, we released `llama2-wrapper` as a llama2 backend wrapper on [PyPI](https://pypi.org/project/llama2-wrapper/).
+  - Install: `pip install llama2-wrapper`
+  - Usage:
+    ```python
+    from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
+    llama2_wrapper = LLAMA2_WRAPPER(
+        model_path="./models/Llama-2-7B-Chat-GGML/llama-2-7b-chat.ggmlv3.q4_0.bin",
+        backend_type="llama.cpp", #options: llama.cpp, transformers, gptq
+    )
+    prompt = "Do you know Pytorch"
+    llama2_prompt = get_prompt(prompt)
+    answer = llama2_wrapper(llama2_prompt, temperature=0.9)
+    ```
+- [2023/08] 🔥 We added `benchmark.py` for users to benchmark llama2 models on their local devices.
+  - Check, or contribute, performance results for your device in the full [performance doc](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md).
+- [2023/07] We released **[llama2-webui](https://github.com/liltom-eth/llama2-webui)**, a gradio web UI to run Llama 2 on GPU or CPU from anywhere (Linux/Windows/Mac).
+  - Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), all [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), all [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML) ...
+  - Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)

docs/performance.md ADDED Viewed

	@@ -0,0 +1,32 @@

+# Benchmark Performance
+## Performance on Nvidia GPU
+| Model                             | Precision | Device | GPU VRAM | Speed (tokens/sec) | load time (s) |
+| --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- |
+| Llama-2-7b-chat-hf | 16 bit |  |  |              |              |
+| Llama-2-7b-chat-hf          |   8bit   | NVIDIA RTX 2080 Ti    | 7.7 GB VRAM | 3.76 | 641.36 |
+| Llama-2-7b-Chat-GPTQ        |   4bit   | NVIDIA RTX 2080 Ti    | 5.8 GB VRAM | 18.85 | 192.91 |
+| Llama-2-7b-Chat-GPTQ        |   4bit   | NVIDIA GTX 1660 Super | 4.8 GB VRAM | 8.5   | 262.74        |
+| Llama-2-7b-Chat-GPTQ | 4 bit | Google Colab T4 | 5.8 GB VRAM | 18.19 | 37.44 |
+| Llama-2-13b-chat-hf               |   16 bit   |  |                  |                  |                  |
+|  |  | |  | | |
+## Performance on CPU / OpenBLAS / cuBLAS / CLBlast / Metal
+| Model                             | Precision | Device | RAM / GPU VRAM | Speed (tokens/sec) | load time (s) |
+| --------------------------------- | --------- | ---------- | ---------------------- | ---------------- | ---------------- |
+| llama-2-7b-chat.ggmlv3.q2_K | 2 bit     | Intel i7-8700 | 4.5 GB RAM     | 7.88               | 31.90         |
+| llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 CPU | 4.5 GB RAM | 11.10 | 0.10 |
+| llama-2-7b-chat.ggmlv3.q2_K | 2 bit | Apple M2 Metal | 4.5 GB RAM | 12.10 | 0.12 |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Intel i7-8700 | 5.4 GB RAM     | 6.27            | 173.15 |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Intel i7-9700 | 4.8 GB RAM   | 4.2                 | 87.9        |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M1 Pro CPU | 5.4 GB RAM | 17.90 | 0.18 |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit     | Apple M2 CPU | 5.4 GB RAM | 13.70 | 0.13 |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Apple M2 Metal | 5.4 GB RAM | 12.60 | 0.10 |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | AMD Ryzen 9 5900HS | 4.1 GB RAM | 6.01 | 0.15 |
+| llama-2-7b-chat.ggmlv3.q4_0 | 4 bit | Intel vServer 4 threads, eth services | 8 GB RAM | 1.31 | 0.5|
+| llama-2-7b-chat.ggmlv3.q8_0 | 8 bit | Intel i7-8700 | 8.6 GB RAM | 2.63 | 336.57 |
+| llama-2-7b-chat.ggmlv3.q8_0 | 8 bit     | Intel i7-9700 | 7.6 GB RAM   | 2.05              | 302.9    |
+|  |  |  |  |  |  |

docs/pypi.md ADDED Viewed

	@@ -0,0 +1,187 @@

+# llama2-wrapper
+- Use [llama2-wrapper](https://pypi.org/project/llama2-wrapper/) as your local llama2 backend for Generative Agents/Apps, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb).
+- [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models.
+## Features
+- Supporting models: [Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)/[13b](https://huggingface.co/llamaste/Llama-2-13b-chat-hf)/[70b](https://huggingface.co/llamaste/Llama-2-70b-chat-hf), [Llama-2-GPTQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ), [Llama-2-GGML](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML), [CodeLlama](https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GPTQ)...
+- Supporting model backends: [transformers](https://github.com/huggingface/transformers), [bitsandbytes(8-bit inference)](https://github.com/TimDettmers/bitsandbytes), [AutoGPTQ(4-bit inference)](https://github.com/PanQiWei/AutoGPTQ), [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Demos: [Run Llama2 on MacBook Air](https://twitter.com/liltom_eth/status/1682791729207070720?s=20); [Run Llama2 on Colab T4 GPU](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb)
+- Use  [llama2-wrapper](https://pypi.org/project/llama2-wrapper/)  as your local llama2 backend for Generative Agents/Apps; [colab example](./colab/Llama_2_7b_Chat_GPTQ.ipynb).
+- [Run OpenAI Compatible API](https://github.com/liltom-eth/llama2-webui#start-openai-compatible-api) on Llama2 models.
+- [News](https://github.com/liltom-eth/llama2-webui/blob/main/docs/news.md), [Benchmark](https://github.com/liltom-eth/llama2-webui/blob/main/docs/performance.md), [Issue Solutions](https://github.com/liltom-eth/llama2-webui/blob/main/docs/issues.md)
+[llama2-wrapper](https://pypi.org/project/llama2-wrapper/)  is the backend and part of [llama2-webui](https://github.com/liltom-eth/llama2-webui), which can run any Llama 2 locally with gradio UI on GPU or CPU from anywhere (Linux/Windows/Mac).
+## Install
+```bash
+pip install llama2-wrapper
+```
+## Start OpenAI Compatible  API
+```
+python -m llama2_wrapper.server
+```
+By default, it uses `llama.cpp` as the backend to run the `llama-2-7b-chat.Q4_0.gguf` model.
+Start Fast API for `gptq` backend:
+```
+python -m llama2_wrapper.server --backend_type gptq
+```
+Navigate to http://localhost:8000/docs to see the OpenAPI documentation.
+## API Usage
+###  `__call__`
+`__call__()` is the function to generate text from a prompt.
+For example, run ggml llama2 model on CPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/ggmlv3_q4_0.ipynb):
+```python
+from llama2_wrapper import LLAMA2_WRAPPER, get_prompt
+llama2_wrapper = LLAMA2_WRAPPER()
+# Default running on backend llama.cpp.
+# Automatically downloading model to: ./models/llama-2-7b-chat.Q4_0.gguf
+prompt = "Do you know Pytorch"
+# llama2_wrapper() will run __call__()
+answer = llama2_wrapper(get_prompt(prompt), temperature=0.9)
+```
+Run gptq llama2 model on Nvidia GPU, [colab example](https://github.com/liltom-eth/llama2-webui/blob/main/colab/Llama_2_7b_Chat_GPTQ.ipynb):
+```python
+from llama2_wrapper import LLAMA2_WRAPPER
+llama2_wrapper = LLAMA2_WRAPPER(backend_type="gptq")
+# Automatically downloading model to: ./models/Llama-2-7b-Chat-GPTQ
+```
+Run llama2 7b with bitsandbytes 8 bit with a `model_path`:
+```python
+from llama2_wrapper import LLAMA2_WRAPPER
+llama2_wrapper = LLAMA2_WRAPPER(
+	model_path = "./models/Llama-2-7b-chat-hf",
+  backend_type = "transformers",
+  load_in_8bit = True
+)
+```
+### completion
+  `completion()`  is the function to generate text from a prompt for OpenAI compatible API `/v1/completions`.
+```python
+llama2_wrapper = LLAMA2_WRAPPER()
+prompt = get_prompt("Hi do you know Pytorch?")
+print(llama2_wrapper.completion(prompt))
+```
+### chat_completion
+  `chat_completion()`  is the function to generate text from a dialog (chat history) for OpenAI compatible API `/v1/chat/completions`.
+```python
+llama2_wrapper = LLAMA2_WRAPPER()
+dialog = [
+    {
+        "role":"system",
+        "content":"You are a helpful, respectful and honest assistant. "
+    },{
+        "role":"user",
+        "content":"Hi do you know Pytorch?",
+    },
+]
+print(llama2_wrapper.chat_completion(dialog))
+```
+### generate
+`generate()` is the function to create a generator of response from a prompt.
+This is useful when you want to stream the output like typing in the chatbot.
+```python
+llama2_wrapper = LLAMA2_WRAPPER()
+prompt = get_prompt("Hi do you know Pytorch?")
+for response in llama2_wrapper.generate(prompt):
+	print(response)
+```
+The response will be like:
+```
+Yes,
+Yes, I'm
+Yes, I'm familiar
+Yes, I'm familiar with
+Yes, I'm familiar with PyTorch!
+...
+```
+### run
+`run()` is similar to `generate()`, but `run()` can also accept `chat_history` and `system_prompt` from the users.
+It will process the input message to llama2 prompt template with `chat_history` and `system_prompt` for a chatbot-like app.
+### get_prompt
+`get_prompt()` will process the input message to llama2 prompt with `chat_history` and `system_prompt` for chatbot.
+By default, `chat_history` and `system_prompt` are empty and `get_prompt()` will add llama2 prompt template to your message:
+```python
+prompt = get_prompt("Hi do you know Pytorch?")
+```
+prompt will be:
+```
+[INST] <<SYS>>
+<</SYS>>
+Hi do you know Pytorch? [/INST]
+```
+If you use `get_prompt("Hi do you know Pytorch?", system_prompt="You are a helpful...")`, the prompt will be:
+```
+[INST] <<SYS>>
+You are a helpful, respectful and honest assistant.
+<</SYS>>
+Hi do you know Pytorch? [/INST]
+```
+### get_prompt_for_dialog
+`get_prompt_for_dialog()` will process dialog (chat history) to llama2 prompt for OpenAI compatible API `/v1/chat/completions`.
+```python
+dialog = [
+    {
+        "role":"system",
+        "content":"You are a helpful, respectful and honest assistant. "
+    },{
+        "role":"user",
+        "content":"Hi do you know Pytorch?",
+    },
+]
+prompt = get_prompt_for_dialog(dialog)
+# [INST] <<SYS>>
+# You are a helpful, respectful and honest assistant.
+# <</SYS>>
+#
+# Hi do you know Pytorch? [/INST]
+```

env_examples/.env.13b_example ADDED Viewed

	@@ -0,0 +1,13 @@

+MODEL_PATH = "./models/Llama-2-13b-chat-hf"
+# options: llama.cpp, gptq, transformers
+BACKEND_TYPE = "transformers"
+# only for transformers bitsandbytes 8 bit
+LOAD_IN_8BIT = True
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = 4000
+DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

env_examples/.env.7b_8bit_example ADDED Viewed

	@@ -0,0 +1,13 @@

+MODEL_PATH = "./models/Llama-2-7b-chat-hf"
+# options: llama.cpp, gptq, transformers
+BACKEND_TYPE = "transformers"
+# only for transformers bitsandbytes 8 bit
+LOAD_IN_8BIT = True
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = 4000
+DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

env_examples/.env.7b_ggmlv3_q4_0_example ADDED Viewed

	@@ -0,0 +1,18 @@

+MODEL_PATH = ""
+# if MODEL_PATH is "", default llama.cpp/gptq models
+# will be downloaded to: ./models
+# Example ggml path:
+# MODEL_PATH = "./models/llama-2-7b-chat.ggmlv3.q4_0.bin"
+# options: llama.cpp, gptq, transformers
+BACKEND_TYPE = "llama.cpp"
+# only for transformers bitsandbytes 8 bit
+LOAD_IN_8BIT = False
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = 4000
+DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

env_examples/.env.7b_gptq_example ADDED Viewed

	@@ -0,0 +1,18 @@

+MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
+# if MODEL_PATH is "", default llama.cpp/gptq models
+# will be downloaded to: ./models
+# Example gptq path:
+# MODEL_PATH = "./models/Llama-2-7b-Chat-GPTQ"
+# options: llama.cpp, gptq, transformers
+BACKEND_TYPE = "gptq"
+# only for transformers bitsandbytes 8 bit
+LOAD_IN_8BIT = False
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = 4000
+DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

llama2_wrapper/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .model import LLAMA2_WRAPPER, get_prompt, get_prompt_for_dialog

llama2_wrapper/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (294 Bytes). View file

llama2_wrapper/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (18.7 kB). View file

llama2_wrapper/__pycache__/types.cpython-310.pyc ADDED Viewed

Binary file (4.1 kB). View file

llama2_wrapper/download/__init__.py ADDED Viewed

File without changes

llama2_wrapper/download/__main__.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import os
+import argparse
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="",
+        required=True,
+        help="Repo ID like 'TheBloke/Llama-2-7B-Chat-GGML' ",
+    )
+    parser.add_argument(
+        "--filename",
+        type=str,
+        default=None,
+        help="Filename like llama-2-7b-chat.ggmlv3.q4_0.bin",
+    )
+    parser.add_argument(
+        "--save_dir", type=str, default="./models", help="Directory to save models"
+    )
+    args = parser.parse_args()
+    repo_id = args.repo_id
+    save_dir = args.save_dir
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+    if args.filename:
+        filename = args.filename
+        from huggingface_hub import hf_hub_download
+        print(f"Start downloading model {repo_id} {filename} to: {save_dir}")
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir=save_dir,
+        )
+    else:
+        repo_name = repo_id.split("/")[1]
+        save_path = os.path.join(save_dir, repo_name)
+        if not os.path.exists(save_path):
+            os.makedirs(save_path)
+        print(f"Start downloading model {repo_id} to: {save_path}")
+        from huggingface_hub import snapshot_download
+        snapshot_download(
+            repo_id=repo_id,
+            local_dir=save_path,
+        )
+if __name__ == "__main__":
+    main()

llama2_wrapper/download/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (206 Bytes). View file

llama2_wrapper/download/__pycache__/__main__.cpython-310.pyc ADDED Viewed

Binary file (1.29 kB). View file

llama2_wrapper/model.py ADDED Viewed

	@@ -0,0 +1,787 @@

+import os
+import time
+import uuid
+from enum import Enum
+from threading import Thread
+from typing import Any, Iterator, Union, List
+from llama2_wrapper.types import (
+    Completion,
+    CompletionChunk,
+    ChatCompletion,
+    ChatCompletionChunk,
+    # ChatCompletionMessage,
+    Message,
+    B_INST,
+    E_INST,
+    B_SYS,
+    E_SYS,
+)
+class LLAMA2_WRAPPER:
+    def __init__(
+        self,
+        model_path: str = "",
+        backend_type: str = "llama.cpp",
+        max_tokens: int = 4000,
+        load_in_8bit: bool = True,
+        verbose: bool = False,
+    ):
+        """Load a llama2 model from `model_path`.
+        Args:
+            model_path: Path to the model.
+            backend_type: Backend for llama2, options: llama.cpp, gptq, transformers
+            max_tokens: Maximum context size.
+            load_in_8bit: Use bitsandbytes to run model in 8 bit mode (only for transformers models).
+            verbose: Print verbose output to stderr.
+        Raises:
+            ValueError: If the model path does not exist.
+        Returns:
+            A LLAMA2_WRAPPER instance.
+        """
+        self.model_path = model_path
+        self.backend_type = BackendType.get_type(backend_type)
+        self.max_tokens = max_tokens
+        self.load_in_8bit = load_in_8bit
+        self.model = None
+        self.tokenizer = None
+        self.verbose = verbose
+        if self.backend_type is BackendType.LLAMA_CPP:
+            print("Running on backend llama.cpp.")
+        else:
+            import torch
+            if torch.cuda.is_available():
+                print("Running on GPU with backend torch transformers.")
+            else:
+                print("GPU CUDA not found.")
+        self.default_llamacpp_path = "./models/llama-2-7b-chat.Q4_0.gguf"
+        self.default_gptq_path = "./models/Llama-2-7b-Chat-GPTQ"
+        # Download default ggml/gptq model
+        if self.model_path == "":
+            print("Model path is empty.")
+            if self.backend_type is BackendType.LLAMA_CPP:
+                print("Use default llama.cpp model path: " + self.default_llamacpp_path)
+                if not os.path.exists(self.default_llamacpp_path):
+                    print("Start downloading model to: " + self.default_llamacpp_path)
+                    from huggingface_hub import hf_hub_download
+                    hf_hub_download(
+                        repo_id="TheBloke/Llama-2-7b-Chat-GGUF",
+                        filename="llama-2-7b-chat.Q4_0.gguf",
+                        local_dir="./models/",
+                    )
+                else:
+                    print("Model exists in ./models/llama-2-7b-chat.Q4_0.gguf.")
+                self.model_path = self.default_llamacpp_path
+            elif self.backend_type is BackendType.GPTQ:
+                print("Use default gptq model path: " + self.default_gptq_path)
+                if not os.path.exists(self.default_gptq_path):
+                    print("Start downloading model to: " + self.default_gptq_path)
+                    from huggingface_hub import snapshot_download
+                    snapshot_download(
+                        "TheBloke/Llama-2-7b-Chat-GPTQ",
+                        local_dir=self.default_gptq_path,
+                    )
+                else:
+                    print("Model exists in " + self.default_gptq_path)
+                self.model_path = self.default_gptq_path
+        self.init_tokenizer()
+        self.init_model()
+    def init_model(self):
+        if self.model is None:
+            self.model = LLAMA2_WRAPPER.create_llama2_model(
+                self.model_path,
+                self.backend_type,
+                self.max_tokens,
+                self.load_in_8bit,
+                self.verbose,
+            )
+        if self.backend_type is not BackendType.LLAMA_CPP:
+            self.model.eval()
+    def init_tokenizer(self):
+        if self.backend_type is not BackendType.LLAMA_CPP:
+            if self.tokenizer is None:
+                self.tokenizer = LLAMA2_WRAPPER.create_llama2_tokenizer(self.model_path)
+    @classmethod
+    def create_llama2_model(
+        cls, model_path, backend_type, max_tokens, load_in_8bit, verbose
+    ):
+        if backend_type is BackendType.LLAMA_CPP:
+            from llama_cpp import Llama
+            model = Llama(
+                model_path=model_path,
+                n_ctx=max_tokens,
+                n_batch=max_tokens,
+                verbose=verbose,
+            )
+        elif backend_type is BackendType.GPTQ:
+            from auto_gptq import AutoGPTQForCausalLM
+            model = AutoGPTQForCausalLM.from_quantized(
+                model_path,
+                use_safetensors=True,
+                trust_remote_code=True,
+                device="cuda:0",
+                use_triton=False,
+                quantize_config=None,
+            )
+        elif backend_type is BackendType.TRANSFORMERS:
+            import torch
+            from transformers import AutoModelForCausalLM
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                device_map="auto",
+                torch_dtype=torch.float16,
+                load_in_8bit=load_in_8bit,
+            )
+        else:
+            print(backend_type + "not implemented.")
+        return model
+    @classmethod
+    def create_llama2_tokenizer(cls, model_path):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        return tokenizer
+    def get_token_length(
+        self,
+        prompt: str,
+    ) -> int:
+        if self.backend_type is BackendType.LLAMA_CPP:
+            input_ids = self.model.tokenize(bytes(prompt, "utf-8"))
+            return len(input_ids)
+        else:
+            input_ids = self.tokenizer([prompt], return_tensors="np")["input_ids"]
+            return input_ids.shape[-1]
+    def get_input_token_length(
+        self,
+        message: str,
+        chat_history: list[tuple[str, str]] = [],
+        system_prompt: str = "",
+    ) -> int:
+        prompt = get_prompt(message, chat_history, system_prompt)
+        return self.get_token_length(prompt)
+    def generate(
+        self,
+        prompt: str,
+        max_new_tokens: int = 1000,
+        temperature: float = 0.9,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        repetition_penalty: float = 1.0,
+        **kwargs: Any,
+    ) -> Iterator[str]:
+        """Create a generator of response from a prompt.
+        Examples:
+            >>> llama2_wrapper = LLAMA2_WRAPPER()
+            >>> prompt = get_prompt("Hi do you know Pytorch?")
+            >>> for response in llama2_wrapper.generate(prompt):
+            ...     print(response)
+        Args:
+            prompt: The prompt to generate text from.
+            max_new_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            top_k: The top-k value to use for sampling.
+            repetition_penalty: The penalty to apply to repeated tokens.
+            kwargs: all other arguments.
+        Yields:
+            The generated text.
+        """
+        if self.backend_type is BackendType.LLAMA_CPP:
+            result = self.model(
+                prompt=prompt,
+                stream=True,
+                max_tokens=max_new_tokens,
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature,
+                repeat_penalty=repetition_penalty,
+                **kwargs,
+            )
+            outputs = []
+            for part in result:
+                text = part["choices"][0]["text"]
+                outputs.append(text)
+                yield "".join(outputs)
+        else:
+            from transformers import TextIteratorStreamer
+            inputs = self.tokenizer([prompt], return_tensors="pt").to("cuda")
+            streamer = TextIteratorStreamer(
+                self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+            )
+            generate_kwargs = dict(
+                inputs,
+                streamer=streamer,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                # num_beams=1,
+            )
+            generate_kwargs = (
+                generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
+            )
+            t = Thread(target=self.model.generate, kwargs=generate_kwargs)
+            t.start()
+            outputs = []
+            for text in streamer:
+                outputs.append(text)
+                yield "".join(outputs)
+    def run(
+        self,
+        message: str,
+        chat_history: list[tuple[str, str]] = [],
+        system_prompt: str = "",
+        max_new_tokens: int = 1000,
+        temperature: float = 0.9,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        repetition_penalty: float = 1.0,
+    ) -> Iterator[str]:
+        """Create a generator of response from a chat message.
+        Process message to llama2 prompt with chat history
+        and system_prompt for chatbot.
+        Args:
+            message: The origianl chat message to generate text from.
+            chat_history: Chat history list from chatbot.
+            system_prompt: System prompt for chatbot.
+            max_new_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            top_k: The top-k value to use for sampling.
+            repetition_penalty: The penalty to apply to repeated tokens.
+            kwargs: all other arguments.
+        Yields:
+            The generated text.
+        """
+        prompt = get_prompt(message, chat_history, system_prompt)
+        return self.generate(
+            prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty
+        )
+    def __call__(
+        self,
+        prompt: str,
+        stream: bool = False,
+        max_new_tokens: int = 1000,
+        temperature: float = 0.9,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        repetition_penalty: float = 1.0,
+        **kwargs: Any,
+    ) -> Union[str, Iterator[str]]:
+        """Generate text from a prompt.
+        Examples:
+            >>> llama2_wrapper = LLAMA2_WRAPPER()
+            >>> prompt = get_prompt("Hi do you know Pytorch?")
+            >>> print(llama2_wrapper(prompt))
+        Args:
+            prompt: The prompt to generate text from.
+            stream: Whether to stream the results.
+            max_new_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            top_k: The top-k value to use for sampling.
+            repetition_penalty: The penalty to apply to repeated tokens.
+            kwargs: all other arguments.
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+        Returns:
+            Generated text.
+        """
+        if self.backend_type is BackendType.LLAMA_CPP:
+            completion_or_chunks = self.model.__call__(
+                prompt,
+                stream=stream,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repeat_penalty=repetition_penalty,
+                **kwargs,
+            )
+            if stream:
+                def chunk_generator(chunks):
+                    for part in chunks:
+                        chunk = part["choices"][0]["text"]
+                        yield chunk
+                chunks: Iterator[str] = chunk_generator(completion_or_chunks)
+                return chunks
+            return completion_or_chunks["choices"][0]["text"]
+        else:
+            inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
+            prompt_tokens_len = len(inputs[0])
+            inputs = inputs.to("cuda")
+            generate_kwargs = dict(
+                inputs=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                # num_beams=1,
+            )
+            generate_kwargs = (
+                generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
+            )
+            if stream:
+                from transformers import TextIteratorStreamer
+                streamer = TextIteratorStreamer(
+                    self.tokenizer,
+                    timeout=10.0,
+                    skip_prompt=True,
+                    skip_special_tokens=True,
+                )
+                generate_kwargs["streamer"] = streamer
+                t = Thread(target=self.model.generate, kwargs=generate_kwargs)
+                t.start()
+                return streamer
+            else:
+                output_ids = self.model.generate(
+                    **generate_kwargs,
+                )
+                # skip prompt, skip special tokens
+                output = self.tokenizer.decode(
+                    output_ids[0][prompt_tokens_len:], skip_special_tokens=True
+                )
+                return output
+    def completion(
+        self,
+        prompt: str,
+        stream: bool = False,
+        max_new_tokens: int = 1000,
+        temperature: float = 0.9,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        repetition_penalty: float = 1.0,
+        **kwargs: Any,
+    ) -> Union[Completion, Iterator[CompletionChunk]]:
+        """For OpenAI compatible API /v1/completions
+        Generate text from a prompt.
+        Examples:
+            >>> llama2_wrapper = LLAMA2_WRAPPER()
+            >>> prompt = get_prompt("Hi do you know Pytorch?")
+            >>> print(llm.completion(prompt))
+        Args:
+            prompt: The prompt to generate text from.
+            stream: Whether to stream the results.
+            max_new_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            top_k: The top-k value to use for sampling.
+            repetition_penalty: The penalty to apply to repeated tokens.
+            kwargs: all other arguments.
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+        Returns:
+            Response object containing the generated text.
+        """
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        model_name: str = (
+            self.backend_type + " default model"
+            if self.model_path == ""
+            else self.model_path
+        )
+        if self.backend_type is BackendType.LLAMA_CPP:
+            completion_or_chunks = self.model.__call__(
+                prompt,
+                stream=stream,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repeat_penalty=repetition_penalty,
+                **kwargs,
+            )
+            if stream:
+                chunks: Iterator[CompletionChunk] = completion_or_chunks
+                return chunks
+            return completion_or_chunks
+        else:
+            inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
+            prompt_tokens_len = len(inputs[0])
+            inputs = inputs.to("cuda")
+            generate_kwargs = dict(
+                inputs=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                # num_beams=1,
+            )
+            generate_kwargs = (
+                generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
+            )
+            if stream:
+                from transformers import TextIteratorStreamer
+                streamer = TextIteratorStreamer(
+                    self.tokenizer,
+                    timeout=10.0,
+                    skip_prompt=True,
+                    skip_special_tokens=True,
+                )
+                generate_kwargs["streamer"] = streamer
+                t = Thread(target=self.model.generate, kwargs=generate_kwargs)
+                t.start()
+                def chunk_generator(chunks):
+                    for part in chunks:
+                        yield {
+                            "id": completion_id,
+                            "object": "text_completion",
+                            "created": created,
+                            "model": model_name,
+                            "choices": [
+                                {
+                                    "text": part,
+                                    "index": 0,
+                                    "logprobs": None,
+                                    "finish_reason": None,
+                                }
+                            ],
+                        }
+                chunks: Iterator[CompletionChunk] = chunk_generator(streamer)
+                return chunks
+            else:
+                output_ids = self.model.generate(
+                    **generate_kwargs,
+                )
+                total_tokens_len = len(output_ids[0])
+                output = self.tokenizer.decode(
+                    output_ids[0][prompt_tokens_len:], skip_special_tokens=True
+                )
+                completion: Completion = {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "text": output,
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens_len,
+                        "completion_tokens": total_tokens_len - prompt_tokens_len,
+                        "total_tokens": total_tokens_len,
+                    },
+                }
+                return completion
+    def chat_completion(
+        self,
+        messages: List[Message],
+        stream: bool = False,
+        max_new_tokens: int = 1000,
+        temperature: float = 0.9,
+        top_p: float = 1.0,
+        top_k: int = 40,
+        repetition_penalty: float = 1.0,
+        **kwargs: Any,
+    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        """For OpenAI compatible API /v1/chat/completions
+        Generate text from a dialog (chat history).
+        Examples:
+            >>> llama2_wrapper = LLAMA2_WRAPPER()
+            >>> dialog = [
+                    {
+                        "role":"system",
+                        "content":"You are a helpful, respectful and honest assistant. "
+                    },{
+                        "role":"user",
+                        "content":"Hi do you know Pytorch?",
+                    },
+                ]
+            >>> print(llm.chat_completion(dialog))
+        Args:
+            dialog: The dialog (chat history) to generate text from.
+            stream: Whether to stream the results.
+            max_new_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            top_k: The top-k value to use for sampling.
+            repetition_penalty: The penalty to apply to repeated tokens.
+            kwargs: all other arguments.
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+        Returns:
+            Response object containing the generated text.
+        """
+        completion_id: str = f"cmpl-{str(uuid.uuid4())}"
+        created: int = int(time.time())
+        model_name: str = (
+            self.backend_type + " default model"
+            if self.model_path == ""
+            else self.model_path
+        )
+        if self.backend_type is BackendType.LLAMA_CPP:
+            completion_or_chunks = self.model.create_chat_completion(
+                messages,
+                stream=stream,
+                max_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repeat_penalty=repetition_penalty,
+                **kwargs,
+            )
+            if stream:
+                chunks: Iterator[ChatCompletionChunk] = completion_or_chunks
+                return chunks
+            return completion_or_chunks
+        else:
+            prompt = get_prompt_for_dialog(messages)
+            inputs = self.tokenizer([prompt], return_tensors="pt").input_ids
+            prompt_tokens_len = len(inputs[0])
+            inputs = inputs.to("cuda")
+            generate_kwargs = dict(
+                inputs=inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                # num_beams=1,
+            )
+            generate_kwargs = (
+                generate_kwargs if kwargs is None else {**generate_kwargs, **kwargs}
+            )
+            if stream:
+                from transformers import TextIteratorStreamer
+                streamer = TextIteratorStreamer(
+                    self.tokenizer,
+                    timeout=10.0,
+                    skip_prompt=True,
+                    skip_special_tokens=True,
+                )
+                generate_kwargs["streamer"] = streamer
+                t = Thread(target=self.model.generate, kwargs=generate_kwargs)
+                t.start()
+                def chunk_generator(chunks):
+                    yield {
+                        "id": "chat" + completion_id,
+                        "model": model_name,
+                        "created": created,
+                        "object": "chat.completion.chunk",
+                        "choices": [
+                            {
+                                "index": 0,
+                                "delta": {
+                                    "role": "assistant",
+                                },
+                                "finish_reason": None,
+                            }
+                        ],
+                    }
+                    for part in enumerate(chunks):
+                        yield {
+                            "id": "chat" + completion_id,
+                            "model": model_name,
+                            "created": created,
+                            "object": "chat.completion.chunk",
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "delta": {
+                                        "content": part,
+                                    },
+                                    "finish_reason": None,
+                                }
+                            ],
+                        }
+                chunks: Iterator[ChatCompletionChunk] = chunk_generator(streamer)
+                return chunks
+            else:
+                output_ids = self.model.generate(
+                    **generate_kwargs,
+                )
+                total_tokens_len = len(output_ids[0])
+                output = self.tokenizer.decode(
+                    output_ids[0][prompt_tokens_len:], skip_special_tokens=True
+                )
+                chatcompletion: ChatCompletion = {
+                    "id": "chat" + completion_id,
+                    "object": "chat.completion",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "index": 0,
+                            "message": {
+                                "role": "assistant",
+                                "content": output,
+                            },
+                            "finish_reason": None,
+                        }
+                    ],
+                    "usage": {
+                        "prompt_tokens": prompt_tokens_len,
+                        "completion_tokens": total_tokens_len - prompt_tokens_len,
+                        "total_tokens": total_tokens_len,
+                    },
+                }
+                return chatcompletion
+def get_prompt_for_dialog(dialog: List[Message]) -> str:
+    """Process dialog (chat history) to llama2 prompt for
+    OpenAI compatible API /v1/chat/completions.
+    Examples:
+        >>> dialog = [
+                {
+                    "role":"system",
+                    "content":"You are a helpful, respectful and honest assistant. "
+                },{
+                    "role":"user",
+                    "content":"Hi do you know Pytorch?",
+                },
+            ]
+        >>> prompt = get_prompt_for_dialog("Hi do you know Pytorch?")
+    Args:
+        dialog: The dialog (chat history) to generate text from.
+    Yields:
+        prompt string.
+    """
+    # add "<<SYS>>\n{system_prompt}\n<</SYS>>\n\n" in first dialog
+    if dialog[0]["role"] == "system":
+        dialog = [
+            {
+                "role": dialog[1]["role"],
+                "content": B_SYS + dialog[0]["content"] + E_SYS + dialog[1]["content"],
+            }
+        ] + dialog[2:]
+    # check roles
+    assert all([msg["role"] == "user" for msg in dialog[::2]]) and all(
+        [msg["role"] == "assistant" for msg in dialog[1::2]]
+    ), (
+        "model only supports 'system', 'user' and 'assistant' roles, "
+        "starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
+    )
+    # add chat history
+    texts = []
+    for prompt, answer in zip(
+        dialog[::2],
+        dialog[1::2],
+    ):
+        texts.append(
+            f"{B_INST} {(prompt['content']).strip()} {E_INST} {(answer['content']).strip()} "
+        )
+    # check last message if role is user, then add it to prompt text
+    assert (
+        dialog[-1]["role"] == "user"
+    ), f"Last message must be from user, got {dialog[-1]['role']}"
+    texts.append(f"{B_INST} {(dialog[-1]['content']).strip()} {E_INST}")
+    return "".join(texts)
+def get_prompt(
+    message: str, chat_history: list[tuple[str, str]] = [], system_prompt: str = ""
+) -> str:
+    """Process message to llama2 prompt with chat history
+    and system_prompt for chatbot.
+    Examples:
+        >>> prompt = get_prompt("Hi do you know Pytorch?")
+    Args:
+        message: The origianl chat message to generate text from.
+        chat_history: Chat history list from chatbot.
+        system_prompt: System prompt for chatbot.
+    Yields:
+        prompt string.
+    """
+    texts = [f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]
+    for user_input, response in chat_history:
+        texts.append(f"{user_input.strip()} [/INST] {response.strip()} </s><s> [INST] ")
+    texts.append(f"{message.strip()} [/INST]")
+    return "".join(texts)
+class BackendType(Enum):
+    UNKNOWN = 0
+    TRANSFORMERS = 1
+    GPTQ = 2
+    LLAMA_CPP = 3
+    @classmethod
+    def get_type(cls, backend_name: str):
+        backend_type = None
+        backend_name_lower = backend_name.lower()
+        if "transformers" in backend_name_lower:
+            backend_type = BackendType.TRANSFORMERS
+        elif "gptq" in backend_name_lower:
+            backend_type = BackendType.GPTQ
+        elif "cpp" in backend_name_lower:
+            backend_type = BackendType.LLAMA_CPP
+        else:
+            raise Exception("Unknown backend: " + backend_name)
+            # backend_type = BackendType.UNKNOWN
+        return backend_type

llama2_wrapper/server/__init__.py ADDED Viewed

File without changes

llama2_wrapper/server/__main__.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""Example FastAPI server for llama2_wrapper.
+To run this example:
+```
+python3 -m llama2_wrapper.server
+```
+or
+```
+uvicorn llama2_wrapper.server.app:app --reload
+```
+Then visit http://localhost:8000/docs to see the interactive API docs.
+"""
+import os
+import argparse
+import uvicorn
+from llama2_wrapper.server.app import create_app, Settings
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    for name, field in Settings.model_fields.items():
+        description = field.description
+        if field.default is not None and description is not None:
+            description += f" (default: {field.default})"
+        parser.add_argument(
+            f"--{name}",
+            dest=name,
+            type=field.annotation if field.annotation is not None else str,
+            help=description,
+        )
+    args = parser.parse_args()
+    settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
+    app = create_app(settings=settings)
+    uvicorn.run(
+        app,
+        host=os.getenv("HOST", settings.host),
+        port=int(os.getenv("PORT", settings.port)),
+    )

llama2_wrapper/server/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (204 Bytes). View file

llama2_wrapper/server/__pycache__/__main__.cpython-310.pyc ADDED Viewed

Binary file (1.25 kB). View file

llama2_wrapper/server/__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (13 kB). View file

llama2_wrapper/server/app.py ADDED Viewed

	@@ -0,0 +1,526 @@

+import json
+import multiprocessing
+from re import compile, Match, Pattern
+from threading import Lock
+from functools import partial
+from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict
+from typing_extensions import TypedDict, Literal
+import anyio
+from anyio.streams.memory import MemoryObjectSendStream
+from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
+from fastapi import Depends, FastAPI, APIRouter, Request, Response
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from fastapi.routing import APIRoute
+from pydantic import BaseModel, Field
+from pydantic_settings import BaseSettings
+from sse_starlette.sse import EventSourceResponse
+from llama2_wrapper.model import LLAMA2_WRAPPER
+from llama2_wrapper.types import (
+    Completion,
+    CompletionChunk,
+    ChatCompletion,
+    ChatCompletionChunk,
+)
+class Settings(BaseSettings):
+    model_path: str = Field(
+        default="",
+        description="The path to the model to use for generating completions.",
+    )
+    backend_type: str = Field(
+        default="llama.cpp",
+        description="Backend for llama2, options: llama.cpp, gptq, transformers",
+    )
+    max_tokens: int = Field(default=4000, ge=1, description="Maximum context size.")
+    load_in_8bit: bool = Field(
+        default=False,
+        description="`Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).",
+    )
+    verbose: bool = Field(
+        default=False,
+        description="Whether to print verbose output to stderr.",
+    )
+    host: str = Field(default="localhost", description="API address")
+    port: int = Field(default=8000, description="API port")
+    interrupt_requests: bool = Field(
+        default=True,
+        description="Whether to interrupt requests when a new request is received.",
+    )
+class ErrorResponse(TypedDict):
+    """OpenAI style error response"""
+    # Human-readable explanation of the failure.
+    message: str
+    # Error category; this module uses "invalid_request_error" and "internal_server_error".
+    type: str
+    # Name of the request field the error refers to, or None.
+    param: Optional[str]
+    # Machine-readable code such as "context_length_exceeded", or None.
+    code: Optional[str]
+class ErrorResponseFormatters:
+    """Collection of formatters for error responses.
+    Args:
+        request (Union[CreateCompletionRequest, CreateChatCompletionRequest]):
+            Request body
+        match (Match[str]): Match object from regex pattern
+    Returns:
+        Tuple[int, ErrorResponse]: Status code and error response
+    """
+    @staticmethod
+    def context_length_exceeded(
+        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
+        match,  # type: Match[str] # type: ignore
+    ) -> Tuple[int, ErrorResponse]:
+        """Formatter for context length exceeded error"""
+        context_window = int(match.group(2))
+        prompt_tokens = int(match.group(1))
+        completion_tokens = request.max_new_tokens
+        if hasattr(request, "messages"):
+            # Chat completion
+            message = (
+                "This model's maximum context length is {} tokens. "
+                "However, you requested {} tokens "
+                "({} in the messages, {} in the completion). "
+                "Please reduce the length of the messages or completion."
+            )
+        else:
+            # Text completion
+            message = (
+                "This model's maximum context length is {} tokens, "
+                "however you requested {} tokens "
+                "({} in your prompt; {} for the completion). "
+                "Please reduce your prompt; or completion length."
+            )
+        return 400, ErrorResponse(
+            message=message.format(
+                context_window,
+                completion_tokens + prompt_tokens,
+                prompt_tokens,
+                completion_tokens,
+            ),
+            type="invalid_request_error",
+            param="messages",
+            code="context_length_exceeded",
+        )
+    @staticmethod
+    def model_not_found(
+        request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
+        match,  # type: Match[str] # type: ignore
+    ) -> Tuple[int, ErrorResponse]:
+        """Formatter for model_not_found error"""
+        model_path = str(match.group(1))
+        message = f"The model `{model_path}` does not exist"
+        return 400, ErrorResponse(
+            message=message,
+            type="invalid_request_error",
+            param=None,
+            code="model_not_found",
+        )
+class RouteErrorHandler(APIRoute):
+    """Custom APIRoute that handles application errors and exceptions"""
+    # key: regex pattern for original error message from llama_cpp
+    # value: formatter function
+    pattern_and_formatters: Dict[
+        "Pattern",
+        Callable[
+            [
+                Union["CreateCompletionRequest", "CreateChatCompletionRequest"],
+                "Match[str]",
+            ],
+            Tuple[int, ErrorResponse],
+        ],
+    ] = {
+        compile(
+            r"Requested tokens \((\d+)\) exceed context window of (\d+)"
+        ): ErrorResponseFormatters.context_length_exceeded,
+        compile(
+            r"Model path does not exist: (.+)"
+        ): ErrorResponseFormatters.model_not_found,
+    }
+    def error_message_wrapper(
+        self,
+        error: Exception,
+        body: Optional[
+            Union[
+                "CreateChatCompletionRequest",
+                "CreateCompletionRequest",
+            ]
+        ] = None,
+    ) -> Tuple[int, ErrorResponse]:
+        """Wraps error message in OpenAI style error response"""
+        if body is not None and isinstance(
+            body,
+            (
+                CreateCompletionRequest,
+                CreateChatCompletionRequest,
+            ),
+        ):
+            # When text completion or chat completion
+            for pattern, callback in self.pattern_and_formatters.items():
+                match = pattern.search(str(error))
+                if match is not None:
+                    return callback(body, match)
+        # Wrap other errors as internal server error
+        return 500, ErrorResponse(
+            message=str(error),
+            type="internal_server_error",
+            param=None,
+            code=None,
+        )
+    def get_route_handler(
+        self,
+    ) -> Callable[[Request], Coroutine[None, None, Response]]:
+        """Defines custom route handler that catches exceptions and formats
+        in OpenAI style error response"""
+        original_route_handler = super().get_route_handler()
+        async def custom_route_handler(request: Request) -> Response:
+            try:
+                return await original_route_handler(request)
+            except Exception as exc:
+                json_body = await request.json()
+                try:
+                    if "messages" in json_body:
+                        # Chat completion
+                        body: Optional[
+                            Union[
+                                CreateChatCompletionRequest,
+                                CreateCompletionRequest,
+                            ]
+                        ] = CreateChatCompletionRequest(**json_body)
+                    elif "prompt" in json_body:
+                        # Text completion
+                        body = CreateCompletionRequest(**json_body)
+                    # else:
+                    #     # Embedding
+                    #     body = CreateEmbeddingRequest(**json_body)
+                except Exception:
+                    # Invalid request body
+                    body = None
+                # Get proper error message from the exception
+                (
+                    status_code,
+                    error_message,
+                ) = self.error_message_wrapper(error=exc, body=body)
+                return JSONResponse(
+                    {"error": error_message},
+                    status_code=status_code,
+                )
+        return custom_route_handler
+# Router whose routes are RouteErrorHandler instances, so exceptions raised by
+# endpoints are returned as JSON error bodies.
+router = APIRouter(route_class=RouteErrorHandler)
+# Module-level state: both are None until create_app() assigns them.
+settings: Optional[Settings] = None
+llama2: Optional[LLAMA2_WRAPPER] = None
+def create_app(settings: Optional[Settings] = None):
+    if settings is None:
+        settings = Settings()
+    app = FastAPI(
+        title="llama2-wrapper Fast API",
+        version="0.0.1",
+    )
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+    app.include_router(router)
+    global llama2
+    llama2 = LLAMA2_WRAPPER(
+        model_path=settings.model_path,
+        backend_type=settings.backend_type,
+        max_tokens=settings.max_tokens,
+        load_in_8bit=settings.load_in_8bit,
+        verbose=settings.load_in_8bit,
+    )
+    def set_settings(_settings: Settings):
+        global settings
+        settings = _settings
+    set_settings(settings)
+    return app
+# Two locks guard the single shared model: the inner lock is held for the whole
+# time a request uses the model, the outer lock only while a request is waiting
+# to take the inner one.
+llama_outer_lock = Lock()
+llama_inner_lock = Lock()
+def get_llama():
+    """Generator dependency that yields the shared ``llama2`` object while
+    holding ``llama_inner_lock``.
+    The outer lock is acquired first, then the inner lock; the outer lock is
+    released again before yielding, and the inner lock is released once the
+    generator is resumed or closed.
+    """
+    # NOTE: This double lock allows the currently streaming llama model to
+    # check if any other requests are pending in the same thread and cancel
+    # the stream if so.
+    llama_outer_lock.acquire()
+    # Tracks whether the outer lock still has to be released in the outer `finally`.
+    release_outer_lock = True
+    try:
+        llama_inner_lock.acquire()
+        try:
+            # Inner lock obtained: release the outer lock so it no longer
+            # appears locked while this request runs.
+            llama_outer_lock.release()
+            release_outer_lock = False
+            yield llama2
+        finally:
+            llama_inner_lock.release()
+    finally:
+        # Only reached with the flag still set if acquiring the inner lock
+        # (or releasing the outer one) raised.
+        if release_outer_lock:
+            llama_outer_lock.release()
+def get_settings():
+    """Generator dependency that yields the module-level ``settings`` object."""
+    yield settings
+async def get_event_publisher(
+    request: Request,
+    inner_send_chan: MemoryObjectSendStream,
+    iterator: Iterator,
+):
+    """Forward chunks from ``iterator`` to ``inner_send_chan`` as event payloads.
+    Each chunk is sent as ``{"data": <JSON text>}``; a final ``{"data": "[DONE]"}``
+    is sent when the iterator is exhausted.
+    Args:
+        request: The incoming request, polled for client disconnects.
+        inner_send_chan: Send side of the memory stream read by the response.
+        iterator: Synchronous iterator of JSON-serialisable chunks; it is
+            consumed through ``iterate_in_threadpool``.
+    """
+    async with inner_send_chan:
+        try:
+            async for chunk in iterate_in_threadpool(iterator):
+                await inner_send_chan.send(dict(data=json.dumps(chunk)))
+                # Stop as soon as the client has gone away.
+                if await request.is_disconnected():
+                    raise anyio.get_cancelled_exc_class()()
+                # A locked outer lock is taken to mean another request is
+                # waiting in get_llama(); end this stream early with [DONE].
+                if settings.interrupt_requests and llama_outer_lock.locked():
+                    await inner_send_chan.send(dict(data="[DONE]"))
+                    raise anyio.get_cancelled_exc_class()()
+            await inner_send_chan.send(dict(data="[DONE]"))
+        except anyio.get_cancelled_exc_class() as e:
+            print("disconnected")
+            # Log inside a shielded scope with a 1 second deadline, then
+            # re-raise the cancellation.
+            with anyio.move_on_after(1, shield=True):
+                print(f"Disconnected from client (via refresh/close) {request.client}")
+                raise e
+# Field definitions shared by CreateCompletionRequest and
+# CreateChatCompletionRequest, so both request models expose the same
+# defaults, bounds and descriptions.
+stream_field = Field(
+    default=False,
+    description="Whether to stream the results as they are generated. Useful for chatbots.",
+)
+max_new_tokens_field = Field(
+    default=1000, ge=1, description="The maximum number of tokens to generate."
+)
+temperature_field = Field(
+    default=0.9,
+    ge=0.0,
+    le=2.0,
+    description="The temperature to use for sampling.",
+)
+top_p_field = Field(
+    default=1.0,
+    ge=0.0,
+    le=1.0,
+    description="The top-p value to use for sampling.",
+)
+top_k_field = Field(
+    default=40,
+    ge=0,
+    description="The top-k value to use for sampling.",
+)
+repetition_penalty_field = Field(
+    default=1.0,
+    ge=0.0,
+    description="The penalty to apply to repeated tokens.",
+)
+# Stop sequences are currently disabled; the field is kept commented out.
+# stop_field = Field(
+#     default=None,
+#     description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
+# )
+class CreateCompletionRequest(BaseModel):
+    """Request body for POST /v1/completions.
+    The sampling fields reuse the shared module-level Field definitions.
+    """
+    # Either a single string or a list; the endpoint accepts at most one list entry.
+    prompt: Union[str, List[str]] = Field(
+        default="", description="The prompt to generate text from."
+    )
+    stream: bool = stream_field
+    max_new_tokens: int = max_new_tokens_field
+    temperature: float = temperature_field
+    top_p: float = top_p_field
+    top_k: int = top_k_field
+    repetition_penalty: float = repetition_penalty_field
+    # stop: Optional[Union[str, List[str]]] = stop_field
+    # Example payload added to the generated JSON schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                    # "stop": ["\n", "###"],
+                }
+            ]
+        }
+    }
+@router.post(
+    "/v1/completions",
+)
+async def create_completion(
+    request: Request,
+    body: CreateCompletionRequest,
+    llama2: LLAMA2_WRAPPER = Depends(get_llama),
+) -> Completion:
+    if isinstance(body.prompt, list):
+        assert len(body.prompt) <= 1
+        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
+    kwargs = body.model_dump()
+    iterator_or_completion: Union[
+        Completion, Iterator[CompletionChunk]
+    ] = await run_in_threadpool(llama2.completion, **kwargs)
+    if isinstance(iterator_or_completion, Iterator):
+        first_response = await run_in_threadpool(next, iterator_or_completion)
+        # If no exception was raised from first_response, we can assume that
+        # the iterator is valid and we can use it to stream the response.
+        def iterator() -> Iterator[CompletionChunk]:
+            yield first_response
+            yield from iterator_or_completion
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        return EventSourceResponse(
+            recv_chan,
+            data_sender_callable=partial(  # type: ignore
+                get_event_publisher,
+                request=request,
+                inner_send_chan=send_chan,
+                iterator=iterator(),
+            ),
+        )
+    else:
+        return iterator_or_completion
+class ChatCompletionRequestMessage(BaseModel):
+    """One chat message in a chat completion request: a role plus its text."""
+    role: Literal["system", "user", "assistant"] = Field(
+        default="user", description="The role of the message."
+    )
+    content: str = Field(default="", description="The content of the message.")
+class CreateChatCompletionRequest(BaseModel):
+    """Request body for POST /v1/chat/completions.
+    The sampling fields reuse the shared module-level Field definitions.
+    """
+    messages: List[ChatCompletionRequestMessage] = Field(
+        default=[], description="A list of messages to generate completions for."
+    )
+    stream: bool = stream_field
+    max_new_tokens: int = max_new_tokens_field
+    temperature: float = temperature_field
+    top_p: float = top_p_field
+    top_k: int = top_k_field
+    repetition_penalty: float = repetition_penalty_field
+    # stop: Optional[List[str]] = stop_field
+    # Example payload (system + user message) added to the generated JSON schema.
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        ChatCompletionRequestMessage(
+                            role="system", content="You are a helpful assistant."
+                        ).model_dump(),
+                        ChatCompletionRequestMessage(
+                            role="user", content="What is the capital of France?"
+                        ).model_dump(),
+                    ]
+                }
+            ]
+        }
+    }
+@router.post(
+    "/v1/chat/completions",
+)
+async def create_chat_completion(
+    request: Request,
+    body: CreateChatCompletionRequest,
+    llama2: LLAMA2_WRAPPER = Depends(get_llama),
+    settings: Settings = Depends(get_settings),
+) -> ChatCompletion:
+    kwargs = body.model_dump()
+    iterator_or_completion: Union[
+        ChatCompletion, Iterator[ChatCompletionChunk]
+    ] = await run_in_threadpool(llama2.chat_completion, **kwargs)
+    if isinstance(iterator_or_completion, Iterator):
+        first_response = await run_in_threadpool(next, iterator_or_completion)
+        # If no exception was raised from first_response, we can assume that
+        # the iterator is valid and we can use it to stream the response.
+        def iterator() -> Iterator[ChatCompletionChunk]:
+            yield first_response
+            yield from iterator_or_completion
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        return EventSourceResponse(
+            recv_chan,
+            data_sender_callable=partial(  # type: ignore
+                get_event_publisher,
+                request=request,
+                inner_send_chan=send_chan,
+                iterator=iterator(),
+            ),
+        )
+    else:
+        return iterator_or_completion
+class ModelData(TypedDict):
+    """One entry in the list returned by GET /v1/models."""
+    id: str
+    object: Literal["model"]
+    owned_by: str
+    permissions: List[str]
+class ModelList(TypedDict):
+    """Response body of GET /v1/models: a "list" object wrapping ModelData entries."""
+    object: Literal["list"]
+    data: List[ModelData]
+@router.get("/v1/models")
+async def get_models(
+    settings: Settings = Depends(get_settings),
+) -> ModelList:
+    assert llama2 is not None
+    return {
+        "object": "list",
+        "data": [
+            {
+                "id": settings.backend_type + " default model"
+                if settings.model_path == ""
+                else settings.model_path,
+                "object": "model",
+                "owned_by": "me",
+                "permissions": [],
+            }
+        ],
+    }

llama2_wrapper/types.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from typing import Any, List, Optional, Dict, Union
+from typing_extensions import TypedDict, NotRequired, Literal
+# Markers opening and closing an instruction (user turn) in the prompt text.
+B_INST, E_INST = "[INST]", "[/INST]"
+# Markers wrapped around the system prompt, including their newlines.
+B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
+# Role = Literal["system", "user", "assistant"]
+# class Message(TypedDict):
+#     role: Role
+#     content: str
+class ChatCompletionMessage(TypedDict):
+    """A single chat message: who said it and what was said."""
+    role: Literal["assistant", "user", "system"]
+    content: str
+    # Optional key; may be absent from the dict.
+    user: NotRequired[str]
+# transformers: Message; llama.cpp: ChatCompletionMessage
+# Both names refer to the same TypedDict; Dialog is a list of such messages.
+Message = ChatCompletionMessage
+Dialog = List[Message]
+class EmbeddingUsage(TypedDict):
+    """Token counts stored under ``Embedding.usage``."""
+    prompt_tokens: int
+    total_tokens: int
+class EmbeddingData(TypedDict):
+    """One element of ``Embedding.data``: an index, an object tag and the vector."""
+    index: int
+    object: str
+    embedding: List[float]
+class Embedding(TypedDict):
+    """Embedding result: a "list" object holding EmbeddingData entries plus usage."""
+    object: Literal["list"]
+    model: str
+    data: List[EmbeddingData]
+    usage: EmbeddingUsage
+class CompletionLogprobs(TypedDict):
+    """Log-probability details that a CompletionChoice may carry in ``logprobs``.
+    All four fields are lists; presumably they are aligned per token -- confirm
+    against the producing backend.
+    """
+    text_offset: List[int]
+    token_logprobs: List[Optional[float]]
+    tokens: List[str]
+    top_logprobs: List[Optional[Dict[str, float]]]
+class CompletionChoice(TypedDict):
+    """One generated alternative inside a Completion or CompletionChunk."""
+    text: str
+    index: int
+    # None when no log-probabilities are reported.
+    logprobs: Optional[CompletionLogprobs]
+    # None when no finish reason is reported.
+    finish_reason: Optional[str]
+class CompletionUsage(TypedDict):
+    """Token counts stored under ``usage`` in Completion and ChatCompletion."""
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+class CompletionChunk(TypedDict):
+    """One streamed piece of a text completion.
+    Same fields as Completion, minus ``usage``.
+    """
+    id: str
+    object: Literal["text_completion"]
+    created: int
+    model: str
+    choices: List[CompletionChoice]
+class Completion(TypedDict):
+    """Complete (non-streamed) text completion result, including token usage."""
+    id: str
+    object: Literal["text_completion"]
+    created: int
+    model: str
+    choices: List[CompletionChoice]
+    usage: CompletionUsage
+class ChatCompletionChoice(TypedDict):
+    """One generated alternative inside a ChatCompletion: the reply message."""
+    index: int
+    message: ChatCompletionMessage
+    # None when no finish reason is reported.
+    finish_reason: Optional[str]
+class ChatCompletion(TypedDict):
+    """Complete (non-streamed) chat completion result, including token usage."""
+    id: str
+    object: Literal["chat.completion"]
+    created: int
+    model: str
+    choices: List[ChatCompletionChoice]
+    usage: CompletionUsage
+class ChatCompletionChunkDeltaEmpty(TypedDict):
+    """A delta with no keys at all; one of the two shapes allowed for
+    ``ChatCompletionChunkChoice.delta``."""
+    pass
+class ChatCompletionChunkDelta(TypedDict):
+    """Incremental update in a streamed chat chunk; both keys are optional."""
+    role: NotRequired[Literal["assistant"]]
+    content: NotRequired[str]
+class ChatCompletionChunkChoice(TypedDict):
+    """One choice inside a ChatCompletionChunk, carrying a delta instead of a
+    full message."""
+    index: int
+    delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty]
+    # None when no finish reason is reported.
+    finish_reason: Optional[str]
+class ChatCompletionChunk(TypedDict):
+    """One streamed piece of a chat completion.
+    Unlike ChatCompletion it has no ``usage`` and its choices carry deltas.
+    """
+    id: str
+    model: str
+    object: Literal["chat.completion.chunk"]
+    created: int
+    choices: List[ChatCompletionChunkChoice]

models/CodeLlama-7B-Python-GPTQ/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

models/CodeLlama-7B-Python-GPTQ/LICENSE ADDED Viewed

	@@ -0,0 +1 @@


1	+ Please refer to license: https://github.com/facebookresearch/llama/blob/main/LICENSE

models/CodeLlama-7B-Python-GPTQ/LICENSE.txt ADDED Viewed

	@@ -0,0 +1,126 @@

+LLAMA 2 COMMUNITY LICENSE AGREEMENT
+Llama 2 Version Release Date: July 18, 2023
+"Agreement" means the terms and conditions for use, reproduction, distribution and
+modification of the Llama Materials set forth herein.
+"Documentation" means the specifications, manuals and documentation
+accompanying Llama 2 distributed by Meta at ai.meta.com/resources/models-and-
+libraries/llama-downloads/.
+"Licensee" or "you" means you, or your employer or any other person or entity (if
+you are entering into this Agreement on such person or entity's behalf), of the age
+required under applicable laws, rules or regulations to provide legal consent and that
+has legal authority to bind your employer or such other person or entity if you are
+entering in this Agreement on their behalf.
+"Llama 2" means the foundational large language models and software and
+algorithms, including machine-learning model code, trained model weights,
+inference-enabling code, training-enabling code, fine-tuning enabling code and other
+elements of the foregoing distributed by Meta at ai.meta.com/resources/models-and-
+libraries/llama-downloads/.
+"Llama Materials" means, collectively, Meta's proprietary Llama 2 and
+Documentation (and any portion thereof) made available under this Agreement.
+"Meta" or "we" means Meta Platforms Ireland Limited (if you are located in or, if you
+are an entity, your principal place of business is in the EEA or Switzerland) and Meta
+Platforms, Inc. (if you are located outside of the EEA or Switzerland).
+By clicking "I Accept" below or by using or distributing any portion or element of the
+Llama Materials, you agree to be bound by this Agreement.
+1. License Rights and Redistribution.
+      a. Grant of Rights. You are granted a non-exclusive, worldwide, non-
+transferable and royalty-free limited license under Meta's intellectual property or
+other rights owned by Meta embodied in the Llama Materials to use, reproduce,
+distribute, copy, create derivative works of, and make modifications to the Llama
+Materials.
+      b. Redistribution and Use.
+            i. If you distribute or make the Llama Materials, or any derivative works
+thereof, available to a third party, you shall provide a copy of this Agreement to such
+third party.
+            ii.  If you receive Llama Materials, or any derivative works thereof, from
+a Licensee as part of an integrated end user product, then Section 2 of this
+Agreement will not apply to you.
+            iii. You must retain in all copies of the Llama Materials that you
+distribute the following attribution notice within a "Notice" text file distributed as a
+part of such copies: "Llama 2 is licensed under the LLAMA 2 Community License,
+Copyright (c) Meta Platforms, Inc. All Rights Reserved."
+            iv. Your use of the Llama Materials must comply with applicable laws
+and regulations (including trade compliance laws and regulations) and adhere to the
+Acceptable Use Policy for the Llama Materials (available at
+https://ai.meta.com/llama/use-policy), which is hereby incorporated by reference into
+this Agreement.
+            v. You will not use the Llama Materials or any output or results of the
+Llama Materials to improve any other large language model (excluding Llama 2 or
+derivative works thereof).
+2. Additional Commercial Terms. If, on the Llama 2 version release date, the
+monthly active users of the products or services made available by or for Licensee,
+or Licensee's affiliates, is greater than 700 million monthly active users in the
+preceding calendar month, you must request a license from Meta, which Meta may
+grant to you in its sole discretion, and you are not authorized to exercise any of the
+rights under this Agreement unless or until Meta otherwise expressly grants you
+such rights.
+3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE
+LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE
+PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY
+WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR
+FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE
+FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING
+THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR
+USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS.
+4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE
+LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT,
+NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS
+AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL,
+CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN
+IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF
+ANY OF THE FOREGOING.
+5. Intellectual Property.
+      a. No trademark licenses are granted under this Agreement, and in
+connection with the Llama Materials, neither Meta nor Licensee may use any name
+or mark owned by or associated with the other or any of its affiliates, except as
+required for reasonable and customary use in describing and redistributing the
+Llama Materials.
+      b. Subject to Meta's ownership of Llama Materials and derivatives made by or
+for Meta, with respect to any derivative works and modifications of the Llama
+Materials that are made by you, as between you and Meta, you are and will be the
+owner of such derivative works and modifications.
+      c. If you institute litigation or other proceedings against Meta or any entity
+(including a cross-claim or counterclaim in a lawsuit) alleging that the Llama
+Materials or Llama 2 outputs or results, or any portion of any of the foregoing,
+constitutes infringement of intellectual property or other rights owned or licensable
+by you, then any licenses granted to you under this Agreement shall terminate as of
+the date such litigation or claim is filed or instituted. You will indemnify and hold
+harmless Meta from and against any claim by any third party arising out of or related
+to your use or distribution of the Llama Materials.
+6. Term and Termination. The term of this Agreement will commence upon your
+acceptance of this Agreement or access to the Llama Materials and will continue in
+full force and effect until terminated in accordance with the terms and conditions
+herein. Meta may terminate this Agreement if you are in breach of any term or
+condition of this Agreement. Upon termination of this Agreement, you shall delete
+and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the
+termination of this Agreement.
+7. Governing Law and Jurisdiction. This Agreement will be governed and
+construed under the laws of the State of California without regard to choice of law
+principles, and the UN Convention on Contracts for the International Sale of Goods
+does not apply to this Agreement. The courts of California shall have exclusive
+jurisdiction of any dispute arising out of this Agreement.

models/CodeLlama-7B-Python-GPTQ/Notice ADDED Viewed

	@@ -0,0 +1 @@


1	+ Llama 2 is licensed under the LLAMA 2 Community License, Copyright © Meta Platforms, Inc. All Rights Reserved.

models/CodeLlama-7B-Python-GPTQ/README.md ADDED Viewed

	@@ -0,0 +1,338 @@

+---
+language:
+- code
+license: llama2
+tags:
+- llama-2
+model_name: CodeLlama 7B Python
+base_model: codellama/CodeLlama-7b-python-hf
+inference: false
+model_creator: Meta
+model_type: llama
+pipeline_tag: text-generation
+prompt_template: '[INST] Write code to solve the following coding problem that obeys
+  the constraints and passes the example test cases. Please wrap your code answer
+  using ```:
+  {prompt}
+  [/INST]
+  '
+quantized_by: TheBloke
+---
+<!-- header start -->
+<!-- 200823 -->
+<div style="width: auto; margin-left: auto; margin-right: auto">
+<img src="https://i.imgur.com/EBdldam.jpg" alt="TheBlokeAI" style="width: 100%; min-width: 400px; display: block; margin: auto;">
+</div>
+<div style="display: flex; justify-content: space-between; width: 100%;">
+    <div style="display: flex; flex-direction: column; align-items: flex-start;">
+        <p style="margin-top: 0.5em; margin-bottom: 0em;"><a href="https://discord.gg/theblokeai">Chat & support: TheBloke's Discord server</a></p>
+    </div>
+    <div style="display: flex; flex-direction: column; align-items: flex-end;">
+        <p style="margin-top: 0.5em; margin-bottom: 0em;"><a href="https://www.patreon.com/TheBlokeAI">Want to contribute? TheBloke's Patreon page</a></p>
+    </div>
+</div>
+<div style="text-align:center; margin-top: 0em; margin-bottom: 0em"><p style="margin-top: 0.25em; margin-bottom: 0em;">TheBloke's LLM work is generously supported by a grant from <a href="https://a16z.com">andreessen horowitz (a16z)</a></p></div>
+<hr style="margin-top: 1.0em; margin-bottom: 1.0em;">
+<!-- header end -->
+# CodeLlama 7B Python - GPTQ
+- Model creator: [Meta](https://huggingface.co/meta-llama)
+- Original model: [CodeLlama 7B Python](https://huggingface.co/codellama/CodeLlama-7b-python-hf)
+<!-- description start -->
+## Description
+This repo contains GPTQ model files for [Meta's CodeLlama 7B Python](https://huggingface.co/codellama/CodeLlama-7b-python-hf).
+Multiple GPTQ parameter permutations are provided; see Provided Files below for details of the options provided, their parameters, and the software used to create them.
+<!-- description end -->
+<!-- repositories-available start -->
+## Repositories available
+* [AWQ model(s) for GPU inference.](https://huggingface.co/TheBloke/CodeLlama-7B-Python-AWQ)
+* [GPTQ models for GPU inference, with multiple quantisation parameter options.](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ)
+* [2, 3, 4, 5, 6 and 8-bit GGUF models for CPU+GPU inference](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GGUF)
+* [Meta's original unquantised fp16 model in pytorch format, for GPU inference and for further conversions](https://huggingface.co/codellama/CodeLlama-7b-python-hf)
+<!-- repositories-available end -->
+<!-- prompt-template start -->
+## Prompt template: CodeLlama
+```
+[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
+{prompt}
+[/INST]
+```
+<!-- prompt-template end -->
+<!-- README_GPTQ.md-provided-files start -->
+## Provided files and GPTQ parameters
+Multiple quantisation parameters are provided, to allow you to choose the best one for your hardware and requirements.
+Each separate quant is in a different branch.  See below for instructions on fetching from different branches.
+All recent GPTQ files are made with AutoGPTQ, and all files in non-main branches are made with AutoGPTQ. Files in the `main` branch which were uploaded before August 2023 were made with GPTQ-for-LLaMa.
+<details>
+  <summary>Explanation of GPTQ parameters</summary>
+- Bits: The bit size of the quantised model.
+- GS: GPTQ group size. Higher numbers use less VRAM, but have lower quantisation accuracy. "None" means no grouping is used, which gives the lowest VRAM usage.
+- Act Order: True or False. Also known as `desc_act`. True results in better quantisation accuracy. Some GPTQ clients have had issues with models that use Act Order plus Group Size, but this is generally resolved now.
+- Damp %: A GPTQ parameter that affects how samples are processed for quantisation. 0.01 is default, but 0.1 results in slightly better accuracy.
+- GPTQ dataset: The dataset used for quantisation. Using a dataset more appropriate to the model's training can improve quantisation accuracy. Note that the GPTQ dataset is not the same as the dataset used to train the model - please refer to the original model repo for details of the training dataset(s).
+- Sequence Length: The length of the dataset sequences used for quantisation. Ideally this is the same as the model sequence length. For some very long sequence models (16+K), a lower sequence length may have to be used.  Note that a lower sequence length does not limit the sequence length of the quantised model. It only impacts the quantisation accuracy on longer inference sequences.
+- ExLlama Compatibility: Whether this file can be loaded with ExLlama, which currently only supports Llama models in 4-bit.
+</details>
+| Branch | Bits | GS | Act Order | Damp % | GPTQ Dataset | Seq Len | Size | ExLlama | Desc |
+| ------ | ---- | -- | --------- | ------ | ------------ | ------- | ---- | ------- | ---- |
+| [main](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ/tree/main) | 4 | 128 | No | 0.1 | [Evol Instruct Code](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) | 8192 | 3.90 GB | Yes | 4-bit, without Act Order and group size 128g. |
+| [gptq-4bit-32g-actorder_True](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ/tree/gptq-4bit-32g-actorder_True) | 4 | 32 | Yes | 0.1 | [Evol Instruct Code](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) | 8192 | 4.28 GB | Yes | 4-bit, with Act Order and group size 32g. Gives highest possible inference quality, with maximum VRAM usage. |
+| [gptq-4bit-64g-actorder_True](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ/tree/gptq-4bit-64g-actorder_True) | 4 | 64 | Yes | 0.1 | [Evol Instruct Code](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) | 8192 | 4.02 GB | Yes | 4-bit, with Act Order and group size 64g. Uses less VRAM than 32g, but with slightly lower accuracy. |
+| [gptq-4bit-128g-actorder_True](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ/tree/gptq-4bit-128g-actorder_True) | 4 | 128 | Yes | 0.1 | [Evol Instruct Code](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) | 8192 | 3.90 GB | Yes | 4-bit, with Act Order and group size 128g. Uses even less VRAM than 64g, but with slightly lower accuracy. |
+| [gptq-8bit--1g-actorder_True](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ/tree/gptq-8bit--1g-actorder_True) | 8 | None | Yes | 0.1 | [Evol Instruct Code](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) | 8192 | 7.01 GB | No | 8-bit, with Act Order. No group size, to lower VRAM requirements. |
+| [gptq-8bit-128g-actorder_True](https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ/tree/gptq-8bit-128g-actorder_True) | 8 | 128 | Yes | 0.1 | [Evol Instruct Code](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) | 8192 | 7.16 GB | No | 8-bit, with group size 128g for higher inference quality and with Act Order for even higher accuracy. |
+<!-- README_GPTQ.md-provided-files end -->
+<!-- README_GPTQ.md-download-from-branches start -->
+## How to download from branches
+- In text-generation-webui, you can add `:branch` to the end of the download name, eg `TheBloke/CodeLlama-7B-Python-GPTQ:main`
+- With Git, you can clone a branch with:
+```
+git clone --single-branch --branch main https://huggingface.co/TheBloke/CodeLlama-7B-Python-GPTQ
+```
+- In Python Transformers code, the branch is the `revision` parameter; see below.
+<!-- README_GPTQ.md-download-from-branches end -->
+<!-- README_GPTQ.md-text-generation-webui start -->
+## How to easily download and use this model in [text-generation-webui](https://github.com/oobabooga/text-generation-webui).
+Please make sure you're using the latest version of [text-generation-webui](https://github.com/oobabooga/text-generation-webui).
+It is strongly recommended to use the text-generation-webui one-click-installers unless you're sure you know how to make a manual install.
+1. Click the **Model tab**.
+2. Under **Download custom model or LoRA**, enter `TheBloke/CodeLlama-7B-Python-GPTQ`.
+  - To download from a specific branch, enter for example `TheBloke/CodeLlama-7B-Python-GPTQ:main`
+  - see Provided Files above for the list of branches for each option.
+3. Click **Download**.
+4. The model will start downloading. Once it's finished it will say "Done".
+5. In the top left, click the refresh icon next to **Model**.
+6. In the **Model** dropdown, choose the model you just downloaded: `CodeLlama-7B-Python-GPTQ`
+7. The model will automatically load, and is now ready for use!
+8. If you want any custom settings, set them and then click **Save settings for this model** followed by **Reload the Model** in the top right.
+  * Note that you do not need to and should not set manual GPTQ parameters any more. These are set automatically from the file `quantize_config.json`.
+9. Once you're ready, click the **Text Generation tab** and enter a prompt to get started!
+<!-- README_GPTQ.md-text-generation-webui end -->
+<!-- README_GPTQ.md-use-from-python start -->
+## How to use this GPTQ model from Python code
+### Install the necessary packages
+Requires: Transformers 4.32.0 or later, Optimum 1.12.0 or later, and AutoGPTQ 0.4.2 or later.
+```shell
+pip3 install "transformers>=4.32.0" "optimum>=1.12.0"
+pip3 install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7
+```
+If you have problems installing AutoGPTQ using the pre-built wheels, install it from source instead:
+```shell
+pip3 uninstall -y auto-gptq
+git clone https://github.com/PanQiWei/AutoGPTQ
+cd AutoGPTQ
+pip3 install .
+```
+### For CodeLlama models only: you must use Transformers 4.33.0 or later.
+If 4.33.0 is not yet released when you read this, you will need to install Transformers from source:
+```shell
+pip3 uninstall -y transformers
+pip3 install git+https://github.com/huggingface/transformers.git
+```
+### You can then use the following code
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+model_name_or_path = "TheBloke/CodeLlama-7B-Python-GPTQ"
+# To use a different branch, change revision
+# For example: revision="main"
+model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
+                                             device_map="auto",
+                                             trust_remote_code=True,
+                                             revision="main")
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
+prompt = "Tell me about AI"
+prompt_template=f'''[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
+{prompt}
+[/INST]
+'''
+print("\n\n*** Generate:")
+input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
+print(tokenizer.decode(output[0]))
+# Inference can also be done using transformers' pipeline
+print("*** Pipeline:")
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    do_sample=True,
+    temperature=0.7,
+    top_p=0.95,
+    top_k=40,
+    repetition_penalty=1.1
+)
+print(pipe(prompt_template)[0]['generated_text'])
+```
+<!-- README_GPTQ.md-use-from-python end -->
+<!-- README_GPTQ.md-compatibility start -->
+## Compatibility
+The files provided are tested to work with AutoGPTQ, both via Transformers and using AutoGPTQ directly. They should also work with [0cc4m's GPTQ-for-LLaMa fork](https://github.com/0cc4m/KoboldAI).
+[ExLlama](https://github.com/turboderp/exllama) is compatible with Llama models in 4-bit. Please see the Provided Files table above for per-file compatibility.
+[Huggingface Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference) is compatible with all GPTQ models.
+<!-- README_GPTQ.md-compatibility end -->
+<!-- footer start -->
+<!-- 200823 -->
+## Discord
+For further support, and discussions on these models and AI in general, join us at:
+[TheBloke AI's Discord server](https://discord.gg/theblokeai)
+## Thanks, and how to contribute
+Thanks to the [chirper.ai](https://chirper.ai) team!
+Thanks to Clay from [gpus.llm-utils.org](https://gpus.llm-utils.org)!
+I've had a lot of people ask if they can contribute. I enjoy providing models and helping people, and would love to be able to spend even more time doing it, as well as expanding into new projects like fine tuning/training.
+If you're able and willing to contribute it will be most gratefully received and will help me to keep providing more models, and to start work on new AI projects.
+Donors will get priority support on any and all AI/LLM/model questions and requests, access to a private Discord room, plus other benefits.
+* Patreon: https://patreon.com/TheBlokeAI
+* Ko-Fi: https://ko-fi.com/TheBlokeAI
+**Special thanks to**: Aemon Algiz.
+**Patreon special mentions**: Alicia Loh, Stephen Murray, K, Ajan Kanaga, RoA, Magnesian, Deo Leter, Olakabola, Eugene Pentland, zynix, Deep Realms, Raymond Fosdick, Elijah Stavena, Iucharbius, Erik Bjäreholt, Luis Javier Navarrete Lozano, Nicholas, theTransient, John Detwiler, alfie_i, knownsqashed, Mano Prime, Willem Michiel, Enrico Ros, LangChain4j, OG, Michael Dempsey, Pierre Kircher, Pedro Madruga, James Bentley, Thomas Belote, Luke @flexchar, Leonard Tan, Johann-Peter Hartmann, Illia Dulskyi, Fen Risland, Chadd, S_X, Jeff Scroggin, Ken Nordquist, Sean Connelly, Artur Olbinski, Swaroop Kallakuri, Jack West, Ai Maven, David Ziegler, Russ Johnson, transmissions 11, John Villwock, Alps Aficionado, Clay Pascal, Viktor Bowallius, Subspace Studios, Rainer Wilmers, Trenton Dambrowitz, vamX, Michael Levine, 준교 김, Brandon Frisco, Kalila, Trailburnt, Randy H, Talal Aujan, Nathan Dryer, Vadim, 阿明, ReadyPlayerEmma, Tiffany J. Kim, George Stoitzev, Spencer Kim, Jerry Meng, Gabriel Tamborski, Cory Kujawski, Jeffrey Morgan, Spiking Neurons AB, Edmond Seymore, Alexandros Triantafyllidis, Lone Striker, Cap'n Zoog, Nikolai Manek, danny, ya boyyy, Derek Yates, usrbinkat, Mandus, TL, Nathan LeClaire, subjectnull, Imad Khwaja, webtim, Raven Klaugh, Asp the Wyvern, Gabriel Puliatti, Caitlyn Gatomon, Joseph William Delisle, Jonathan Leane, Luke Pendergrass, SuperWojo, Sebastain Graf, Will Dee, Fred von Graf, Andrey, Dan Guido, Daniel P. Andersen, Nitin Borwankar, Elle, Vitor Caleffi, biorpg, jjj, NimbleBox.ai, Pieter, Matthew Berman, terasurfer, Michael Davis, Alex, Stanislav Ovsiannikov
+Thank you to all my generous patrons and donors!
+And thank you again to a16z for their generous grant.
+<!-- footer end -->
+# Original model card: Meta's CodeLlama 7B Python
+# **Code Llama**
+Code Llama is a collection of pretrained and fine-tuned generative text models ranging in scale from 7 billion to 34 billion parameters. This is the repository for the 7B Python specialist version in the Hugging Face Transformers format. This model is designed for general code synthesis and understanding. Links to other models can be found in the index at the bottom.
+|     | Base Model                                                                    | Python                                                                                      | Instruct                                                                                        |
+| --- | ----------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- |
+| 7B  | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) | [codellama/CodeLlama-7b-Python-hf](https://huggingface.co/codellama/CodeLlama-7b-Python-hf) | [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) |
+| 13B  | [codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) | [codellama/CodeLlama-13b-Python-hf](https://huggingface.co/codellama/CodeLlama-13b-Python-hf) | [codellama/CodeLlama-13b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf) |
+| 34B  | [codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | [codellama/CodeLlama-34b-Python-hf](https://huggingface.co/codellama/CodeLlama-34b-Python-hf) | [codellama/CodeLlama-34b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf) |
+## Model Use
+To use this model, please make sure to install transformers from `main` until the next version is released:
+```bash
+pip install git+https://github.com/huggingface/transformers.git@main accelerate
+```
+Model capabilities:
+- [x] Code completion.
+- [ ] Infilling.
+- [ ] Instructions / chat.
+- [x] Python specialist.
+## Model Details
+*Note: Use of this model is governed by the Meta license. Meta developed and publicly released the Code Llama family of large language models (LLMs).*
+**Model Developers** Meta
+**Variations** Code Llama comes in three model sizes, and three variants:
+* Code Llama: base models designed for general code synthesis and understanding
+* Code Llama - Python: designed specifically for Python
+* Code Llama - Instruct: for instruction following and safer deployment
+All variants are available in sizes of 7B, 13B and 34B parameters.
+**This repository contains the Python version of the 7B parameters model.**
+**Input** Models input text only.
+**Output** Models generate text only.
+**Model Architecture** Code Llama is an auto-regressive language model that uses an optimized transformer architecture.
+**Model Dates** Code Llama and its variants have been trained between January 2023 and July 2023.
+**Status** This is a static model trained on an offline dataset. Future versions of Code Llama - Instruct will be released as we improve model safety with community feedback.
+**License** A custom commercial license is available at: [https://ai.meta.com/resources/models-and-libraries/llama-downloads/](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)
+**Research Paper** More information can be found in the paper "[Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/)" or its [arXiv page](https://arxiv.org/abs/2308.12950).
+## Intended Use
+**Intended Use Cases** Code Llama and its variants are intended for commercial and research use in English and relevant programming languages. The base model Code Llama can be adapted for a variety of code synthesis and understanding tasks, Code Llama - Python is designed specifically to handle the Python programming language, and Code Llama - Instruct is intended to be safer to use for code assistant and generation applications.
+**Out-of-Scope Uses** Use in any manner that violates applicable laws or regulations (including trade compliance laws). Use in languages other than English. Use in any other way that is prohibited by the Acceptable Use Policy and Licensing Agreement for Code Llama and its variants.
+## Hardware and Software
+**Training Factors** We used custom training libraries. The training and fine-tuning of the released models have been performed on Meta’s Research Super Cluster.
+**Carbon Footprint** In aggregate, training all 9 Code Llama models required 400K GPU hours of computation on hardware of type A100-80GB (TDP of 350-400W). Estimated total emissions were 65.3 tCO2eq, 100% of which were offset by Meta’s sustainability program.
+## Training Data
+All experiments reported here and the released models have been trained and fine-tuned using the same data as Llama 2 with different weights (see Section 2 and Table 1 in the [research paper](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) for details).
+## Evaluation Results
+See evaluations for the main models and detailed ablations in Section 3 and safety evaluations in Section 4 of the research paper.
+## Ethical Considerations and Limitations
+Code Llama and its variants are a new technology that carries risks with use. Testing conducted to date has been in English, and has not covered, nor could it cover all scenarios. For these reasons, as with all LLMs, Code Llama’s potential outputs cannot be predicted in advance, and the model may in some instances produce inaccurate or objectionable responses to user prompts. Therefore, before deploying any applications of Code Llama, developers should perform safety testing and tuning tailored to their specific applications of the model.
+Please see the Responsible Use Guide available at [https://ai.meta.com/llama/responsible-user-guide](https://ai.meta.com/llama/responsible-user-guide).

models/CodeLlama-7B-Python-GPTQ/USE_POLICY.md ADDED Viewed

	@@ -0,0 +1,50 @@

+# Llama 2 Acceptable Use Policy
+Meta is committed to promoting safe and fair use of its tools and features, including Llama 2. If you access or use Llama 2, you agree to this Acceptable Use Policy (“Policy”). The most recent copy of this policy can be found at [ai.meta.com/llama/use-policy](http://ai.meta.com/llama/use-policy).
+## Prohibited Uses
+We want everyone to use Llama 2 safely and responsibly. You agree you will not use, or allow others to use, Llama 2 to:
+1. Violate the law or others’ rights, including to:
+    1. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal or unlawful activity or content, such as:
+        1. Violence or terrorism
+        2. Exploitation or harm to children, including the solicitation, creation, acquisition, or dissemination of child exploitative content or failure to report Child Sexual Abuse Material
+        3. Human trafficking, exploitation, and sexual violence
+        4. The illegal distribution of information or materials to minors, including obscene materials, or failure to employ legally required age-gating in connection with such information or materials.
+        5. Sexual solicitation
+        6. Any other criminal activity
+    2. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or bullying of individuals or groups of individuals
+    3. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful conduct in the provision of employment, employment benefits, credit, housing, other economic benefits, or other essential goods and services
+    4. Engage in the unauthorized or unlicensed practice of any profession including, but not limited to, financial, legal, medical/health, or related professional practices
+    5. Collect, process, disclose, generate, or infer health, demographic, or other sensitive personal or private information about individuals without rights and consents required by applicable laws
+    6. Engage in or facilitate any action or generate any content that infringes, misappropriates, or otherwise violates any third-party rights, including the outputs or results of any products or services using the Llama 2 Materials
+    7. Create, generate, or facilitate the creation of malicious code, malware, computer viruses or do anything else that could disable, overburden, interfere with or impair the proper working, integrity, operation or appearance of a website or computer system
+2. Engage in, promote, incite, facilitate, or assist in the planning or development of activities that present a risk of death or bodily harm to individuals, including use of Llama 2 related to the following:
+    1. Military, warfare, nuclear industries or applications, espionage, use for materials or activities that are subject to the International Traffic Arms Regulations (ITAR) maintained by the United States Department of State
+    2. Guns and illegal weapons (including weapon development)
+    3. Illegal drugs and regulated/controlled substances
+    4. Operation of critical infrastructure, transportation technologies, or heavy machinery
+    5. Self-harm or harm to others, including suicide, cutting, and eating disorders
+    6. Any content intended to incite or promote violence, abuse, or any infliction of bodily harm to an individual
+3. Intentionally deceive or mislead others, including use of Llama 2 related to the following:
+    1. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
+    2. Generating, promoting, or furthering defamatory content, including the creation of defamatory statements, images, or other content
+    3. Generating, promoting, or further distributing spam
+    4. Impersonating another individual without consent, authorization, or legal right
+    5. Representing that the use of Llama 2 or outputs are human-generated
+    6. Generating or facilitating false online engagement, including fake reviews and other means of fake online engagement
+4. Fail to appropriately disclose to end users any known dangers of your AI system
+Please report any violation of this Policy, software “bug,” or other problems that could lead to a violation of this Policy through one of the following means:
+* Reporting issues with the model: [github.com/facebookresearch/llama](http://github.com/facebookresearch/llama)
+* Reporting risky content generated by the model: [developers.facebook.com/llama_output_feedback](http://developers.facebook.com/llama_output_feedback)
+* Reporting bugs and security concerns: [facebook.com/whitehat/info](http://facebook.com/whitehat/info)
+* Reporting violations of the Acceptable Use Policy or unlicensed uses of Llama: [LlamaUseReport@meta.com](mailto:LlamaUseReport@meta.com)

models/CodeLlama-7B-Python-GPTQ/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+    "architectures": [
+        "LlamaForCausalLM"
+    ],
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 11008,
+    "max_position_embeddings": 16384,
+    "model_type": "llama",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 32,
+    "pretraining_tp": 1,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "tie_word_embeddings": false,
+    "torch_dtype": "float16",
+    "transformers_version": "4.32.0",
+    "use_cache": true,
+    "vocab_size": 32000,
+    "auto_map": {
+        "AutoConfig": "configuration_llama.LlamaConfig",
+        "AutoModel": "modeling_llama.LlamaModel",
+        "AutoModelForCausalLM": "modeling_llama.LlamaForCausalLM",
+        "AutoModelForSequenceClassification": "modeling_llama.LlamaForSequenceClassification"
+    },
+    "rope_theta": 1000000,
+    "quantization_config": {
+        "bits": 4,
+        "group_size": 128,
+        "damp_percent": 0.1,
+        "desc_act": false,
+        "sym": true,
+        "true_sequential": true,
+        "model_name_or_path": null,
+        "model_file_base_name": "model",
+        "quant_method": "gptq"
+    },
+    "pad_token_id": 0
+}

models/CodeLlama-7B-Python-GPTQ/configuration_llama.py ADDED Viewed

	@@ -0,0 +1,176 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" LLaMA model configuration"""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+# Module-level logger, created through the `transformers.utils.logging` helper imported above.
+logger = logging.get_logger(__name__)
+# Archive map for pretrained LLaMA configurations; it is left empty in this file.
+LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+class LlamaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LLaMA-7B.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`LlamaModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        pretraining_tp (`int`, *optional*, defaults to `1`):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
+            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        Example:
+    ```python
+    >>> from transformers import LlamaModel, LlamaConfig
+    >>> # Initializing a LLaMA llama-7b style configuration
+    >>> configuration = LlamaConfig()
+    >>> # Initializing a model from the llama-7b style configuration
+    >>> model = LlamaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "llama"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_scaling=None,
+        rope_theta=10000,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.rope_theta = rope_theta
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")

models/CodeLlama-7B-Python-GPTQ/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "pad_token_id": 0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.32.0"
+}

models/CodeLlama-7B-Python-GPTQ/modeling_llama.py ADDED Viewed

	@@ -0,0 +1,1020 @@

+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch LLaMA model."""
+import math
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_llama import LlamaConfig
+# Module-level logger, created through the `transformers.utils.logging` helper imported above.
+logger = logging.get_logger(__name__)
+# Name of the configuration class as a string; presumably consumed by the docstring decorators imported above.
+_CONFIG_FOR_DOC = "LlamaConfig"
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    inverted_mask = 1.0 - expanded_mask
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+class LlamaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        LlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+class LlamaRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
+    """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        t = t / self.scaling_factor
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
+    """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class LlamaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        if self.config.pretraining_tp > 1:
+            slice = self.intermediate_size // self.config.pretraining_tp
+            gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
+            up_proj_slices = self.up_proj.weight.split(slice, dim=0)
+            down_proj_slices = self.down_proj.weight.split(slice, dim=1)
+            gate_proj = torch.cat(
+                [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
+            )
+            up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
+            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
+            down_proj = [
+                F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
+            ]
+            down_proj = sum(down_proj)
+        else:
+            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class LlamaAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper
+    Queries use `num_attention_heads` heads while keys and values use `num_key_value_heads` heads, which are
+    repeated `num_heads // num_key_value_heads` times before the attention product. Rotary position embeddings
+    are applied to queries and keys; the rotary variant is chosen from `config.rope_scaling`.
+    """
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        # Number of query heads that share one key/value head.
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        # The integer division above must be exact, otherwise the heads cannot tile the hidden dimension.
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self._init_rope()
+    def _init_rope(self):
+        """
+        Create `self.rotary_emb` according to `config.rope_scaling`: the plain rotary embedding when it is
+        `None`, otherwise the linear or dynamic-NTK variant selected by its `"type"` entry.
+        Raises:
+            ValueError: If the scaling type is neither `"linear"` nor `"dynamic"`.
+        """
+        if self.config.rope_scaling is None:
+            self.rotary_emb = LlamaRotaryEmbedding(
+                self.head_dim, max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings,
+                    base=self.rope_theta, scaling_factor=scaling_factor
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim, max_position_embeddings=self.max_position_embeddings,
+                    base=self.rope_theta, scaling_factor=scaling_factor
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        """Reshape `tensor` to `(bsz, num_heads, seq_len, head_dim)`. Not called elsewhere in this class."""
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Compute self-attention over `hidden_states`.
+        Args:
+            hidden_states: input of shape `(bsz, q_len, hidden_size)`.
+            attention_mask: optional additive mask; must have shape `(bsz, 1, q_len, kv_seq_len)`.
+            position_ids: indices used to select rows of the rotary cos/sin tables.
+            past_key_value: optional `(key, value)` pair from earlier steps, concatenated in front of the new
+                keys and values along the sequence dimension.
+            output_attentions: when `False`, the returned attention weights are `None`.
+            use_cache: when `True`, the updated `(key, value)` pair is returned; otherwise `None`.
+        Returns:
+            `(attn_output, attn_weights, past_key_value)` where `attn_output` has shape
+            `(bsz, q_len, hidden_size)`.
+        Raises:
+            ValueError: If the attention weights, the attention mask or the attention output do not have the
+                expected shape.
+        """
+        bsz, q_len, _ = hidden_states.size()
+        if self.config.pretraining_tp > 1:
+            # Sliced path: split each projection weight into `pretraining_tp` row blocks, project with each
+            # block separately and concatenate the partial results along the feature dimension.
+            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
+            query_slices = self.q_proj.weight.split(
+                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
+            )
+            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
+            query_states = torch.cat(query_states, dim=-1)
+            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
+            key_states = torch.cat(key_states, dim=-1)
+            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
+            value_states = torch.cat(value_states, dim=-1)
+        else:
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+        # Split the feature dimension into heads: (bsz, heads, q_len, head_dim).
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        # Total key/value length = new tokens + cached tokens.
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        # `value_states` is passed to the rotary module only for its dtype and device.
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        # Only the new keys are rotated here; the cached keys are concatenated afterwards unchanged.
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        # The cache is captured before the heads are repeated, so it holds `num_key_value_heads` heads.
+        past_key_value = (key_states, value_states) if use_cache else None
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        # Scaled dot-product scores: (bsz, num_heads, q_len, kv_seq_len).
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            # The mask is additive and broadcasts over the head dimension.
+            attn_weights = attn_weights + attention_mask
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+        # Merge the heads back into the feature dimension: (bsz, q_len, hidden_size).
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        if self.config.pretraining_tp > 1:
+            # Sliced output projection: split input features and weight columns, then sum the partial products.
+            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
+            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
+            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
+        else:
+            attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaDecoderLayer(nn.Module):
+    # One decoder block: a self-attention sub-block followed by an MLP sub-block. Each sub-block
+    # normalizes its input with an RMS norm first and adds the un-normalized input back afterwards.
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = LlamaAttention(config=config)
+        self.mlp = LlamaMLP(config)
+        # Two independent norms (one per sub-block), both sized and configured from `config`.
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        # `position_ids` is not described in the docstring above; it is forwarded unchanged to `self.self_attn`.
+        # Keep the un-normalized input so it can be added back after attention.
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        # Same pattern as above: save the input, normalize, transform, add the saved input back.
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        # Output layout: (hidden_states, [self_attn_weights], [present_key_value]).
+        # The bracketed entries are appended only when the matching flag is truthy, so the index of
+        # the cache entry is 1 or 2 depending on `output_attentions`.
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Documentation text shared by the model classes in this file; it is passed as an argument to the
+# `add_start_docstrings` class decorator.
+LLAMA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`LlamaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaPreTrainedModel(PreTrainedModel):
+    # Common base for the Llama model classes in this file: binds the config class and provides
+    # weight initialization and the gradient-checkpointing toggle.
+    # The class attributes below are read by the `PreTrainedModel` base class, which is not shown
+    # here; their exact effect should be checked against that class.
+    config_class = LlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    def _init_weights(self, module):
+        # Initialize a single sub-module in place. Only `nn.Linear` and `nn.Embedding` are handled;
+        # any other module type is left untouched.
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            # Weights ~ N(0, std); bias (when present) is zeroed.
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            # The embedding row of the padding index is reset to zeros after the random init.
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        # Only `LlamaModel` instances carry the `gradient_checkpointing` flag; other modules are ignored.
+        if isinstance(module, LlamaModel):
+            module.gradient_checkpointing = value
+# Argument documentation attached to the `forward` methods in this file through
+# `add_start_docstrings_to_model_forward`.
+# NOTE(review): parts of this text mention `decoder_input_ids`, `modeling_opt`, head masking,
+# cross-attention blocks and an encoder sequence length, none of which correspond to a parameter
+# of the `forward` methods in this file. It looks carried over from an encoder-decoder model's
+# documentation; confirm before relying on those passages.
+LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+    Args:
+        config: LlamaConfig
+    """
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        # Token embedding table; the third positional argument of `nn.Embedding` is `padding_idx`.
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        # Final norm applied once, after the last decoder layer (see `forward`).
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # Combine the causal mask with the caller's padding mask. `inputs_embeds` is used only for
+        # its dtype and device. Returns None when neither mask applies.
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        # A causal mask is only built when more than one new token is being processed.
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            # The two masks are summed, so both are presumably additive (large negative = masked);
+            # confirm against `_make_causal_mask` / `_expand_mask`, which are defined outside this section.
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # Flags left as None fall back to the corresponding value on `self.config`.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        # Exactly one of `input_ids` / `inputs_embeds` must be given.
+        # NOTE(review): the error messages below say `decoder_input_ids` / `decoder_inputs_embeds`,
+        # but the parameters of this method are `input_ids` / `inputs_embeds`.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            # Cached length is read from dim 2 of the first tensor of the first layer's cache entry.
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if position_ids is None:
+            # Default positions continue counting from the end of the cache.
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # build the attention mask; when none is given, every position (cached ones included) is attended to
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+        hidden_states = inputs_embeds
+        # Caching is force-disabled (with a one-time warning) when checkpointing during training.
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # decoder layers
+        # Accumulators stay None when the matching output was not requested.
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            # Records the input of each layer; the final (normed) output is appended after the loop.
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                # The wrapper appends `past_key_value` and `output_attentions` positionally after the
+                # checkpointed tensor inputs; `use_cache` is not passed on this path.
+                # NOTE(review): `past_key_value` is read from the enclosing scope when the wrapper
+                # runs, not when it is created - confirm this is safe if the wrapper is re-run later.
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, past_key_value, output_attentions)
+                    return custom_forward
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            # The cache entry sits after the attention weights when those are returned too.
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        # Tuple form drops every None entry, so positions after index 0 depend on which outputs were requested.
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class LlamaForCausalLM(LlamaPreTrainedModel):
+    # `LlamaModel` backbone plus a bias-free linear head projecting hidden states to vocabulary logits.
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        # Unset flags fall back to the config; `use_cache` is forwarded as given, without a fallback here.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 0 is the final hidden state in both the tuple and the output-object form.
+        hidden_states = outputs[0]
+        if self.config.pretraining_tp > 1:
+            # Apply the head in `pretraining_tp` slices: the weight is split along dim 0 into chunks
+            # of `vocab_size // pretraining_tp` rows, and the per-slice logits are concatenated on the last dim.
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head(hidden_states)
+        # Logits are always returned in float32, whatever dtype the head computed in.
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        # Tuple form: (loss?, logits, *remaining backbone outputs); loss is included only when computed.
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        # Build the keyword arguments for one `forward` call during generation.
+        # With a non-empty cache only the newest token needs to be fed.
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            # Running count of attended tokens minus one; masked positions are overwritten with 1.
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        # Re-select dim 0 of every cached tensor with `beam_idx`, moving the index tensor to each
+        # tensor's device first. Returns a new tuple of per-layer tuples; the input is not modified.
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+@add_start_docstrings(
+    """
+    The LLaMa Model transformer with a sequence classification head on top (linear layer).
+    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    LLAMA_START_DOCSTRING,
+)
+class LlamaForSequenceClassification(LlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = LlamaModel(config)
+        # Bias-free linear head mapping each hidden state to `num_labels` scores.
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        # Scores are computed for every position; one position per row is selected below.
+        logits = self.score(hidden_states)
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+        # Without a pad token the rows cannot be told apart in length, so only batch size 1 is accepted.
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # Index of the first pad token in each row, minus one.
+                # NOTE(review): for a row containing no pad token the argmax over an all-zero row is
+                # expected to be 0, giving -1 (the last position) - confirm against the torch docs.
+                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
+                    logits.device
+                )
+            else:
+                # Padding cannot be detected from embeddings, so the last position is used.
+                sequence_lengths = -1
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            # When unset, the problem type is inferred from `num_labels` and the label dtype, and the
+            # result is written back to `self.config`, so it persists for later calls.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        # Tuple form: (loss?, pooled_logits, *remaining backbone outputs).
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )

models/CodeLlama-7B-Python-GPTQ/quantize_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bits": 4,
+  "group_size": 128,
+  "damp_percent": 0.1,
+  "desc_act": false,
+  "sym": true,
+  "true_sequential": true,
+  "model_name_or_path": null,
+  "model_file_base_name": "model"
+}