aelitta committed on
Commit
4bdb245
1 Parent(s): ecc410d

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. .gitattributes +20 -0
  3. BioMistral-7B-GGUF/.DS_Store +0 -0
  4. BioMistral-7B-GGUF/.gitattributes +45 -0
  5. BioMistral-7B-GGUF/README.md +238 -0
  6. BioMistral-7B-GGUF/config.json +3 -0
  7. BioMistral-7B.Q4_K_M.gguf +3 -0
  8. README.md +2 -8
  9. __pycache__/app.cpython-39.pyc +0 -0
  10. app.py +1 -0
  11. data/10.1177_1557988318780857.pdf +0 -0
  12. ingest.py +1 -0
  13. llama-cpp-python/.DS_Store +0 -0
  14. llama-cpp-python/.dockerignore +166 -0
  15. llama-cpp-python/.github/ISSUE_TEMPLATE/bug_report.md +96 -0
  16. llama-cpp-python/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  17. llama-cpp-python/.github/dependabot.yml +15 -0
  18. llama-cpp-python/.github/workflows/build-and-release.yaml +112 -0
  19. llama-cpp-python/.github/workflows/build-docker.yaml +50 -0
  20. llama-cpp-python/.github/workflows/build-wheels-cuda.yaml +131 -0
  21. llama-cpp-python/.github/workflows/build-wheels-metal.yaml +87 -0
  22. llama-cpp-python/.github/workflows/generate-index-from-release.yaml +50 -0
  23. llama-cpp-python/.github/workflows/publish-to-test.yaml +43 -0
  24. llama-cpp-python/.github/workflows/publish.yaml +32 -0
  25. llama-cpp-python/.github/workflows/test-pypi.yaml +64 -0
  26. llama-cpp-python/.github/workflows/test.yaml +126 -0
  27. llama-cpp-python/.gitignore +180 -0
  28. llama-cpp-python/.gitmodules +3 -0
  29. llama-cpp-python/.readthedocs.yaml +24 -0
  30. llama-cpp-python/CHANGELOG.md +630 -0
  31. llama-cpp-python/CMakeLists.txt +87 -0
  32. llama-cpp-python/LICENSE.md +9 -0
  33. llama-cpp-python/Makefile +82 -0
  34. llama-cpp-python/README.md +792 -0
  35. llama-cpp-python/docker/README.md +64 -0
  36. llama-cpp-python/docker/cuda_simple/Dockerfile +27 -0
  37. llama-cpp-python/docker/open_llama/Dockerfile +51 -0
  38. llama-cpp-python/docker/open_llama/build.sh +14 -0
  39. llama-cpp-python/docker/open_llama/hug_model.py +139 -0
  40. llama-cpp-python/docker/open_llama/start.sh +28 -0
  41. llama-cpp-python/docker/open_llama/start_server.sh +11 -0
  42. llama-cpp-python/docker/openblas_simple/Dockerfile +15 -0
  43. llama-cpp-python/docker/simple/Dockerfile +34 -0
  44. llama-cpp-python/docker/simple/run.sh +4 -0
  45. llama-cpp-python/docs/api-reference.md +88 -0
  46. llama-cpp-python/docs/changelog.md +1 -0
  47. llama-cpp-python/docs/index.md +5 -0
  48. llama-cpp-python/docs/install/macos.md +59 -0
  49. llama-cpp-python/docs/requirements.txt +3 -0
  50. llama-cpp-python/docs/server.md +222 -0
.DS_Store ADDED
Binary file (8.2 kB).
 
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ BioMistral-7B.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
37
+ llama-cpp-python/vendor/llama.cpp/kompute/docs/images/komputer-2.gif filter=lfs diff=lfs merge=lfs -text
38
+ llama-cpp-python/vendor/llama.cpp/kompute/docs/images/komputer-godot-4.gif filter=lfs diff=lfs merge=lfs -text
39
+ llama-cpp-python/vendor/llama.cpp/kompute/docs/images/komputer-logos.gif filter=lfs diff=lfs merge=lfs -text
40
+ llama-cpp-python/vendor/llama.cpp/kompute/examples/android/android-simple/app/src/main/assets/komputer-2.gif filter=lfs diff=lfs merge=lfs -text
41
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-aquila.gguf filter=lfs diff=lfs merge=lfs -text
42
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-baichuan.gguf filter=lfs diff=lfs merge=lfs -text
43
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-deepseek-coder.gguf filter=lfs diff=lfs merge=lfs -text
44
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-deepseek-llm.gguf filter=lfs diff=lfs merge=lfs -text
45
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-falcon.gguf filter=lfs diff=lfs merge=lfs -text
46
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-gpt-2.gguf filter=lfs diff=lfs merge=lfs -text
47
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-gpt-neox.gguf filter=lfs diff=lfs merge=lfs -text
48
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-gpt2.gguf filter=lfs diff=lfs merge=lfs -text
49
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-llama-bpe.gguf filter=lfs diff=lfs merge=lfs -text
50
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-mpt.gguf filter=lfs diff=lfs merge=lfs -text
51
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-refact.gguf filter=lfs diff=lfs merge=lfs -text
52
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-stablelm.gguf filter=lfs diff=lfs merge=lfs -text
53
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-starcoder.gguf filter=lfs diff=lfs merge=lfs -text
54
+ qdrant_storage/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
55
+ qdrant_storage/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
BioMistral-7B-GGUF/.DS_Store ADDED
Binary file (6.15 kB).
 
BioMistral-7B-GGUF/.gitattributes ADDED
@@ -0,0 +1,45 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ BioMistral-7B.Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
37
+ BioMistral-7B.Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
38
+ BioMistral-7B.Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
39
+ BioMistral-7B.Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
40
+ BioMistral-7B.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
41
+ BioMistral-7B.Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
42
+ BioMistral-7B.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
43
+ BioMistral-7B.Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
44
+ BioMistral-7B.Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
45
+ BioMistral-7B.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
BioMistral-7B-GGUF/README.md ADDED
@@ -0,0 +1,238 @@
1
+ ---
2
+ tags:
3
+ - quantized
4
+ - 2-bit
5
+ - 3-bit
6
+ - 4-bit
7
+ - 5-bit
8
+ - 6-bit
9
+ - 8-bit
10
+ - GGUF
11
+ - transformers
12
+ - pytorch
13
+ - tensorboard
14
+ - mistral
15
+ - text-generation
16
+ - medical
17
+ - biology
18
+ - conversational
19
+ - fr
20
+ - en
21
+ - de
22
+ - nl
23
+ - es
24
+ - pt
25
+ - pl
26
+ - ro
27
+ - it
28
+ - dataset:pubmed
29
+ - arxiv:2402.10373
30
+ - license:apache-2.0
31
+ - autotrain_compatible
32
+ - endpoints_compatible
33
+ - text-generation-inference
34
+ - region:us
35
+ - text-generation
36
+ model_name: BioMistral-7B-GGUF
37
+ base_model: BioMistral/BioMistral-7B
38
+ inference: false
39
+ model_creator: BioMistral
40
+ pipeline_tag: text-generation
41
+ quantized_by: MaziyarPanahi
42
+ ---
43
+ # [MaziyarPanahi/BioMistral-7B-GGUF](https://huggingface.co/MaziyarPanahi/BioMistral-7B-GGUF)
44
+ - Model creator: [BioMistral](https://huggingface.co/BioMistral)
45
+ - Original model: [BioMistral/BioMistral-7B](https://huggingface.co/BioMistral/BioMistral-7B)
46
+
47
+ ## Description
48
+ [MaziyarPanahi/BioMistral-7B-GGUF](https://huggingface.co/MaziyarPanahi/BioMistral-7B-GGUF) contains GGUF format model files for [BioMistral/BioMistral-7B](https://huggingface.co/BioMistral/BioMistral-7B).
49
+
50
+ ## How to use
51
+ Thanks to [TheBloke](https://huggingface.co/TheBloke) for preparing an amazing README on how to use GGUF models:
52
+
53
+ ### About GGUF
54
+
55
+ GGUF is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp.
56
+
57
+ Here is an incomplete list of clients and libraries that are known to support GGUF:
58
+
59
+ * [llama.cpp](https://github.com/ggerganov/llama.cpp). The source project for GGUF. Offers a CLI and a server option.
60
+ * [text-generation-webui](https://github.com/oobabooga/text-generation-webui), the most widely used web UI, with many features and powerful extensions. Supports GPU acceleration.
61
+ * [KoboldCpp](https://github.com/LostRuins/koboldcpp), a fully featured web UI, with GPU accel across all platforms and GPU architectures. Especially good for story telling.
62
+ * [GPT4All](https://gpt4all.io/index.html), a free and open source local running GUI, supporting Windows, Linux and macOS with full GPU accel.
63
+ * [LM Studio](https://lmstudio.ai/), an easy-to-use and powerful local GUI for Windows and macOS (Silicon), with GPU acceleration. Linux available, in beta as of 27/11/2023.
64
+ * [LoLLMS Web UI](https://github.com/ParisNeo/lollms-webui), a great web UI with many interesting and unique features, including a full model library for easy model selection.
65
+ * [Faraday.dev](https://faraday.dev/), an attractive and easy to use character-based chat GUI for Windows and macOS (both Silicon and Intel), with GPU acceleration.
66
+ * [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a Python library with GPU accel, LangChain support, and OpenAI-compatible API server.
67
+ * [candle](https://github.com/huggingface/candle), a Rust ML framework with a focus on performance, including GPU support, and ease of use.
68
+ * [ctransformers](https://github.com/marella/ctransformers), a Python library with GPU accel, LangChain support, and OpenAI-compatible AI server. Note, as of time of writing (November 27th 2023), ctransformers has not been updated in a long time and does not support many recent models.
69
+
70
+ ### Explanation of quantisation methods
71
+
72
+ <details>
73
+ <summary>Click to see details</summary>
74
+
75
+ The new methods available are listed here; a small worked bits-per-weight example follows the list:
76
+
77
+ * GGML_TYPE_Q2_K - "type-1" 2-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Block scales and mins are quantized with 4 bits. This ends up effectively using 2.5625 bits per weight (bpw).
78
+ * GGML_TYPE_Q3_K - "type-0" 3-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Scales are quantized with 6 bits. This ends up using 3.4375 bpw.
79
+ * GGML_TYPE_Q4_K - "type-1" 4-bit quantization in super-blocks containing 8 blocks, each block having 32 weights. Scales and mins are quantized with 6 bits. This ends up using 4.5 bpw.
80
+ * GGML_TYPE_Q5_K - "type-1" 5-bit quantization. Same super-block structure as GGML_TYPE_Q4_K, resulting in 5.5 bpw.
81
+ * GGML_TYPE_Q6_K - "type-0" 6-bit quantization. Super-blocks with 16 blocks, each block having 16 weights. Scales are quantized with 8 bits. This ends up using 6.5625 bpw.
82
+
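
A small illustrative sketch, assuming only the block layout described in the list above, that recomputes the effective bits per weight for GGML_TYPE_Q4_K (the constants are taken from that description, not from the llama.cpp source):

```python
# Rough bits-per-weight estimate for GGML_TYPE_Q4_K (illustrative; assumes the
# layout above: 8 blocks of 32 weights per super-block, a 6-bit scale and a
# 6-bit min per block, plus fp16 super-block scale and min).
weights = 8 * 32                       # weights per super-block
quant_bits = 4 * weights               # 4-bit quantized weights
scale_min_bits = 8 * (6 + 6)           # per-block 6-bit scale and 6-bit min
superblock_fp16 = 2 * 16               # fp16 super-block scale and min
bpw = (quant_bits + scale_min_bits + superblock_fp16) / weights
print(f"Q4_K ~ {bpw} bits per weight")  # prints 4.5, matching the figure above
```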
83
+ ## How to download GGUF files
84
+
85
+ **Note for manual downloaders:** You almost never want to clone the entire repo! Multiple different quantisation formats are provided, and most users only want to pick and download a single file.
86
+
87
+ The following clients/libraries will automatically download models for you, providing a list of available models to choose from:
88
+
89
+ * LM Studio
90
+ * LoLLMS Web UI
91
+ * Faraday.dev
92
+
93
+ ### In `text-generation-webui`
94
+
95
+ Under Download Model, you can enter the model repo: [MaziyarPanahi/BioMistral-7B-GGUF](https://huggingface.co/MaziyarPanahi/BioMistral-7B-GGUF) and below it, a specific filename to download, such as: BioMistral-7B.Q4_K_M.gguf.
96
+
97
+ Then click Download.
98
+
99
+ ### On the command line, including multiple files at once
100
+
101
+ I recommend using the `huggingface-hub` Python library:
102
+
103
+ ```shell
104
+ pip3 install huggingface-hub
105
+ ```
106
+
107
+ Then you can download any individual model file to the current directory, at high speed, with a command like this:
108
+
109
+ ```shell
110
+ huggingface-cli download MaziyarPanahi/BioMistral-7B-GGUF BioMistral-7B.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
111
+ ```
112
+ </details>
113
+ <details>
114
+ <summary>More advanced huggingface-cli download usage (click to read)</summary>
115
+
116
+ You can also download multiple files at once with a pattern:
117
+
118
+ ```shell
119
+ huggingface-cli download MaziyarPanahi/BioMistral-7B-GGUF --local-dir . --local-dir-use-symlinks False --include='*Q4_K*gguf'
120
+ ```
121
+
122
+ For more documentation on downloading with `huggingface-cli`, please see: [HF -> Hub Python Library -> Download files -> Download from the CLI](https://huggingface.co/docs/huggingface_hub/guides/download#download-from-the-cli).
123
+
124
+ To accelerate downloads on fast connections (1Gbit/s or higher), install `hf_transfer`:
125
+
126
+ ```shell
127
+ pip3 install hf_transfer
128
+ ```
129
+
130
+ And set environment variable `HF_HUB_ENABLE_HF_TRANSFER` to `1`:
131
+
132
+ ```shell
133
+ HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download MaziyarPanahi/BioMistral-7B-GGUF BioMistral-7B.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
134
+ ```
135
+
136
+ Windows Command Line users: You can set the environment variable by running `set HF_HUB_ENABLE_HF_TRANSFER=1` before the download command.
137
+ </details>
138
+
139
+ ## Example `llama.cpp` command
140
+
141
+ Make sure you are using `llama.cpp` from commit [d0cee0d](https://github.com/ggerganov/llama.cpp/commit/d0cee0d36d5be95a0d9088b674dbb27354107221) or later.
142
+
143
+ ```shell
144
+ ./main -ngl 35 -m BioMistral-7B.Q4_K_M.gguf --color -c 32768 --temp 0.7 --repeat_penalty 1.1 -n -1 -p "<|im_start|>system
145
+ {system_message}<|im_end|>
146
+ <|im_start|>user
147
+ {prompt}<|im_end|>
148
+ <|im_start|>assistant"
149
+ ```
150
+
151
+ Change `-ngl 35` to the number of layers to offload to GPU. Remove it if you don't have GPU acceleration.
152
+
153
+ Change `-c 32768` to the desired sequence length. For extended sequence models - eg 8K, 16K, 32K - the necessary RoPE scaling parameters are read from the GGUF file and set by llama.cpp automatically. Note that longer sequence lengths require much more resources, so you may need to reduce this value.
154
+
155
+ If you want to have a chat-style conversation, replace the `-p <PROMPT>` argument with `-i -ins`
156
+
157
+ For other parameters and how to use them, please refer to [the llama.cpp documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/main/README.md)
158
+
159
+ ## How to run in `text-generation-webui`
160
+
161
+ Further instructions can be found in the text-generation-webui documentation, here: [text-generation-webui/docs/04 ‐ Model Tab.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/04%20%E2%80%90%20Model%20Tab.md#llamacpp).
162
+
163
+ ## How to run from Python code
164
+
165
+ You can use GGUF models from Python using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) or [ctransformers](https://github.com/marella/ctransformers) libraries. Note that at the time of writing (Nov 27th 2023), ctransformers has not been updated for some time and is not compatible with some recent models. Therefore I recommend you use llama-cpp-python.
166
+
167
+ ### How to load this model in Python code, using llama-cpp-python
168
+
169
+ For full documentation, please see: [llama-cpp-python docs](https://abetlen.github.io/llama-cpp-python/).
170
+
171
+ #### First install the package
172
+
173
+ Run one of the following commands, according to your system:
174
+
175
+ ```shell
176
+ # Base llama-cpp-python with no GPU acceleration
177
+ pip install llama-cpp-python
178
+ # With NVidia CUDA acceleration
179
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
180
+ # Or with OpenBLAS acceleration
181
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
182
+ # Or with CLBLast acceleration
183
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
184
+ # Or with AMD ROCm GPU acceleration (Linux only)
185
+ CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
186
+ # Or with Metal GPU acceleration for macOS systems only
187
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
188
+
189
+ # On Windows, set the CMAKE_ARGS variable in PowerShell before installing; e.g. for NVidia CUDA:
190
+ $env:CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
191
+ pip install llama-cpp-python
192
+ ```
193
+
194
+ #### Simple llama-cpp-python example code
195
+
196
+ ```python
197
+ from llama_cpp import Llama
198
+
199
+ # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
200
+ llm = Llama(
201
+ model_path="./BioMistral-7B.Q4_K_M.gguf", # Download the model file first
202
+ n_ctx=32768, # The max sequence length to use - note that longer sequence lengths require much more resources
203
+ n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
204
+ n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
205
+ )
206
+
207
+ # Simple inference example
208
+ output = llm(
209
+ "<|im_start|>system
210
+ {system_message}<|im_end|>
211
+ <|im_start|>user
212
+ {prompt}<|im_end|>
213
+ <|im_start|>assistant", # Prompt
214
+ max_tokens=512, # Generate up to 512 tokens
215
+ stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
216
+ echo=True # Whether to echo the prompt
217
+ )
218
+
219
+ # Chat Completion API
220
+
221
+ llm = Llama(model_path="./BioMistral-7B-GGUF.Q4_K_M.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
222
+ llm.create_chat_completion(
223
+ messages = [
224
+ {"role": "system", "content": "You are a story writing assistant."},
225
+ {
226
+ "role": "user",
227
+ "content": "Write a story about llamas."
228
+ }
229
+ ]
230
+ )
231
+ ```
232
+
233
+ ## How to use with LangChain
234
+
235
+ Here are guides on using llama-cpp-python and ctransformers with LangChain; a short illustrative sketch follows the links below:
236
+
237
+ * [LangChain + llama-cpp-python](https://python.langchain.com/docs/integrations/llms/llamacpp)
238
+ * [LangChain + ctransformers](https://python.langchain.com/docs/integrations/providers/ctransformers)
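
A minimal, hedged sketch of the llama-cpp-python route via LangChain. It is not taken from either guide; the import path varies by LangChain version (older releases use `langchain.llms`), and the parameter values are illustrative assumptions.

```python
# Minimal LangChain + llama-cpp-python sketch (illustrative; values are assumptions).
from langchain_community.llms import LlamaCpp  # older LangChain: from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="./BioMistral-7B.Q4_K_M.gguf",  # local GGUF file downloaded as shown earlier
    n_ctx=4096,           # context window; raise it if you have the memory
    n_gpu_layers=35,      # set to 0 if no GPU acceleration is available
    temperature=0.7,
)
print(llm.invoke("List common risk factors for cardiovascular disease."))
```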
BioMistral-7B-GGUF/config.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "model_type": "mistral"
3
+ }
BioMistral-7B.Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a73107045dfe7e3f113b392b0a67e3e6ca9fa9dae2abe301424ce5abd1721a6
3
+ size 4368439424
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: BioMistral Gradio
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 4.29.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: BioMistral_gradio
3
+ app_file: app.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.29.0
 
 
6
  ---
 
 
__pycache__/app.cpython-39.pyc ADDED
Binary file (2.41 kB).
 
app.py ADDED
@@ -0,0 +1 @@
 
0
  chain = ConversationalRetrievalChain.from_llm(llm=llm,retriever=retriever)
1
  print("LLM or Vector Database not initialized")
2
  history_langchain_format = []
3
  prompt = PromptTemplate(template=prompt_template,
4
  input_variables=["chat_history", 'message'])
5
  response = chain({"question": message, "chat_history": chat_history})
6
 
7
  answer = response['answer']
8
 
9
  chat_history.append((message, answer))
10
 
11
  temp = []
12
  for input_question, bot_answer in history:
13
  temp.append(input_question)
14
  temp.append(bot_answer)
15
  history_langchain_format.append(temp)
16
  temp.clear()
17
  temp.append(message)
18
  temp.append(answer)
19
  history_langchain_format.append(temp)
20
 
21
  return answer
 
1
+
2
  chain = ConversationalRetrievalChain.from_llm(llm=llm,retriever=retriever)
3
  print("LLM or Vector Database not initialized")
4
  history_langchain_format = []
5
  prompt = PromptTemplate(template=prompt_template,
6
  input_variables=["chat_history", 'message'])
7
  response = chain({"question": message, "chat_history": chat_history})
8
 
9
  answer = response['answer']
10
 
11
  chat_history.append((message, answer))
12
 
13
  temp = []
14
  for input_question, bot_answer in history:
15
  temp.append(input_question)
16
  temp.append(bot_answer)
17
  history_langchain_format.append(temp)
18
  temp.clear()
19
  temp.append(message)
20
  temp.append(answer)
21
  history_langchain_format.append(temp)
22
 
23
  return answer
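
The app.py hunk above is only a fragment (the diff adds a single blank line, so most of the script is not shown). For orientation, here is a hedged sketch of how a chat function built around these lines might be wired up; the `llm` and `retriever` objects, the Gradio wiring, and all parameter choices are assumptions, not the author's actual code.

```python
# Hedged sketch of the context implied by the app.py fragment above.
# The llm and retriever objects are assumed to be constructed elsewhere in app.py.
from langchain.chains import ConversationalRetrievalChain

def make_chat_fn(llm, retriever):
    chat_history = []

    def chat(message, history):
        if llm is None or retriever is None:
            print("LLM or Vector Database not initialized")
            return "Model or vector store not ready."
        chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
        response = chain({"question": message, "chat_history": chat_history})
        answer = response["answer"]
        chat_history.append((message, answer))
        return answer

    return chat

# Example wiring (assumes llm and retriever already exist):
# import gradio as gr
# gr.ChatInterface(make_chat_fn(llm, retriever)).launch()
```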
data/10.1177_1557988318780857.pdf ADDED
The diff for this file is too large to render.
 
ingest.py ADDED
@@ -0,0 +1 @@
 
0
  texts,
1
  embeddings,
2
  url=url,
3
  prefer_grpc=False,
4
  collection_name="vector_db"
 
1
+ import os
2
  texts,
3
  embeddings,
4
  url=url,
5
  prefer_grpc=False,
6
  collection_name="vector_db"
llama-cpp-python/.DS_Store ADDED
Binary file (8.2 kB).
 
llama-cpp-python/.dockerignore ADDED
@@ -0,0 +1,166 @@
1
+ _skbuild/
2
+
3
+ .envrc
4
+
5
+ models/
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ .pybuilder/
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ # For a library or package, you might want to ignore these files since the code is
93
+ # intended to run in multiple environments; otherwise, check them in:
94
+ # .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/#use-with-ide
116
+ .pdm.toml
117
+
118
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119
+ __pypackages__/
120
+
121
+ # Celery stuff
122
+ celerybeat-schedule
123
+ celerybeat.pid
124
+
125
+ # SageMath parsed files
126
+ *.sage.py
127
+
128
+ # Environments
129
+ .env
130
+ .venv
131
+ env/
132
+ venv/
133
+ ENV/
134
+ env.bak/
135
+ venv.bak/
136
+
137
+ # Spyder project settings
138
+ .spyderproject
139
+ .spyproject
140
+
141
+ # Rope project settings
142
+ .ropeproject
143
+
144
+ # mkdocs documentation
145
+ /site
146
+
147
+ # mypy
148
+ .mypy_cache/
149
+ .dmypy.json
150
+ dmypy.json
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ .idea/
llama-cpp-python/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,96 @@
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: ''
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ # Prerequisites
11
+
12
+ Please answer the following questions for yourself before submitting an issue.
13
+
14
+ - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
15
+ - [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md).
16
+ - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
17
+ - [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share.
18
+
19
+ # Expected Behavior
20
+
21
+ Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do.
22
+
23
+ # Current Behavior
24
+
25
+ Please provide a detailed written description of what `llama-cpp-python` did, instead.
26
+
27
+ # Environment and Context
28
+
29
+ Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
30
+
31
+ * Physical (or virtual) hardware you are using, e.g. for Linux:
32
+
33
+ `$ lscpu`
34
+
35
+ * Operating System, e.g. for Linux:
36
+
37
+ `$ uname -a`
38
+
39
+ * SDK version, e.g. for Linux:
40
+
41
+ ```
42
+ $ python3 --version
43
+ $ make --version
44
+ $ g++ --version
45
+ ```
46
+
47
+ # Failure Information (for bugs)
48
+
49
+ Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
50
+
51
+ # Steps to Reproduce
52
+
53
+ Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
54
+
55
+ 1. step 1
56
+ 2. step 2
57
+ 3. step 3
58
+ 4. etc.
59
+
60
+ **Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.**
61
+
62
+ Try the following:
63
+
64
+ 1. `git clone https://github.com/abetlen/llama-cpp-python`
65
+ 2. `cd llama-cpp-python`
66
+ 3. `rm -rf _skbuild/` # delete any old builds
67
+ 4. `python -m pip install .`
68
+ 5. `cd ./vendor/llama.cpp`
69
+ 6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
70
+ 7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
71
+
72
+ # Failure Logs
73
+
74
+ Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
75
+
76
+ Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
77
+
78
+ Example environment info:
79
+ ```
80
+ llama-cpp-python$ git log | head -1
81
+ commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2
82
+
83
+ llama-cpp-python$ python3 --version
84
+ Python 3.10.10
85
+
86
+ llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy"
87
+ fastapi 0.95.0
88
+ numpy 1.24.3
89
+ sse-starlette 1.3.3
90
+ uvicorn 0.21.1
91
+
92
+ llama-cpp-python/vendor/llama.cpp$ git log | head -3
93
+ commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
94
+ Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
95
+ Date: Thu May 25 20:18:01 2023 -0600
96
+ ```
llama-cpp-python/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: ''
5
+ labels: ''
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12
+
13
+ **Describe the solution you'd like**
14
+ A clear and concise description of what you want to happen.
15
+
16
+ **Describe alternatives you've considered**
17
+ A clear and concise description of any alternative solutions or features you've considered.
18
+
19
+ **Additional context**
20
+ Add any other context or screenshots about the feature request here.
llama-cpp-python/.github/dependabot.yml ADDED
@@ -0,0 +1,15 @@
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "pip" # See documentation for possible values
9
+ directory: "/" # Location of package manifests
10
+ schedule:
11
+ interval: "weekly"
12
+ - package-ecosystem: "github-actions"
13
+ directory: "/"
14
+ schedule:
15
+ interval: "weekly"
llama-cpp-python/.github/workflows/build-and-release.yaml ADDED
@@ -0,0 +1,112 @@
1
+ name: Build Release
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ build_wheels:
10
+ name: Build wheels on ${{ matrix.os }}
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ matrix:
14
+ os: [ubuntu-20.04, windows-2019, macos-11]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ with:
19
+ submodules: "recursive"
20
+
21
+ # Used to host cibuildwheel
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version: "3.8"
25
+
26
+ - name: Install dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ python -m pip install -e .[all]
30
+
31
+ - name: Build wheels
32
+ uses: pypa/cibuildwheel@v2.17.0
33
+ env:
34
+ # disable repair
35
+ CIBW_REPAIR_WHEEL_COMMAND: ""
36
+ with:
37
+ package-dir: .
38
+ output-dir: wheelhouse
39
+
40
+ - uses: actions/upload-artifact@v4
41
+ with:
42
+ name: wheels-${{ matrix.os }}
43
+ path: ./wheelhouse/*.whl
44
+
45
+ build_wheels_arm64:
46
+ name: Build arm64 wheels
47
+ runs-on: ubuntu-latest
48
+ steps:
49
+ - uses: actions/checkout@v4
50
+ with:
51
+ submodules: "recursive"
52
+
53
+ - name: Set up QEMU
54
+ uses: docker/setup-qemu-action@v3
55
+ with:
56
+ platforms: linux/arm64
57
+
58
+ - name: Build wheels
59
+ uses: pypa/cibuildwheel@v2.17.0
60
+ env:
61
+ CIBW_SKIP: "*musllinux* pp*"
62
+ CIBW_REPAIR_WHEEL_COMMAND: ""
63
+ CIBW_ARCHS: "aarch64"
64
+ CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
65
+ with:
66
+ output-dir: wheelhouse
67
+
68
+ - name: Upload wheels as artifacts
69
+ uses: actions/upload-artifact@v4
70
+ with:
71
+ name: wheels_arm64
72
+ path: ./wheelhouse/*.whl
73
+
74
+ build_sdist:
75
+ name: Build source distribution
76
+ runs-on: ubuntu-latest
77
+
78
+ steps:
79
+ - uses: actions/checkout@v4
80
+ with:
81
+ submodules: "recursive"
82
+ - uses: actions/setup-python@v5
83
+ with:
84
+ python-version: "3.8"
85
+ - name: Install dependencies
86
+ run: |
87
+ python -m pip install --upgrade pip build
88
+ python -m pip install -e .[all]
89
+ - name: Build source distribution
90
+ run: |
91
+ python -m build --sdist
92
+ - uses: actions/upload-artifact@v4
93
+ with:
94
+ name: sdist
95
+ path: ./dist/*.tar.gz
96
+
97
+ release:
98
+ name: Release
99
+ needs: [build_wheels, build_wheels_arm64, build_sdist]
100
+ runs-on: ubuntu-latest
101
+
102
+ steps:
103
+ - uses: actions/download-artifact@v4
104
+ with:
105
+ merge-multiple: true
106
+ path: dist
107
+
108
+ - uses: softprops/action-gh-release@v2
109
+ with:
110
+ files: dist/*
111
+ env:
112
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
llama-cpp-python/.github/workflows/build-docker.yaml ADDED
@@ -0,0 +1,50 @@
1
+ name: Build Docker
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+ packages: write
8
+
9
+ jobs:
10
+ docker:
11
+ name: Build and push Docker image
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - name: Checkout
15
+ uses: actions/checkout@v4
16
+ with:
17
+ submodules: "recursive"
18
+
19
+ - name: Set up QEMU
20
+ uses: docker/setup-qemu-action@v3
21
+
22
+ - name: Set up Docker Buildx
23
+ uses: docker/setup-buildx-action@v3
24
+
25
+ - name: Login to GitHub Container Registry
26
+ uses: docker/login-action@v3
27
+ with:
28
+ registry: ghcr.io
29
+ username: ${{ github.repository_owner }}
30
+ password: ${{ secrets.GITHUB_TOKEN }}
31
+
32
+ - name: Build and push
33
+ id: docker_build
34
+ uses: docker/build-push-action@v5
35
+ with:
36
+ context: .
37
+ file: "docker/simple/Dockerfile"
38
+ push: ${{ startsWith(github.ref, 'refs/tags/') }}
39
+ pull: true
40
+ platforms: linux/amd64,linux/arm64
41
+ tags: |
42
+ ghcr.io/abetlen/llama-cpp-python:latest
43
+ ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }}
44
+ build-args: |
45
+ BUILDKIT_INLINE_CACHE=1
46
+
47
+ - name: Publish to GitHub Tag
48
+ if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/')
49
+ run: |
50
+ echo "Docker image published for tag: ${{ github.ref_name }}"
llama-cpp-python/.github/workflows/build-wheels-cuda.yaml ADDED
@@ -0,0 +1,131 @@
1
+ name: Build Wheels (CUDA)
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ define_matrix:
10
+ name: Define Build Matrix
11
+ runs-on: ubuntu-latest
12
+ outputs:
13
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
14
+ defaults:
15
+ run:
16
+ shell: pwsh
17
+
18
+ steps:
19
+ - name: Define Job Output
20
+ id: set-matrix
21
+ run: |
22
+ $matrix = @{
23
+ 'os' = @('ubuntu-20.04', 'windows-latest')
24
+ 'pyver' = @("3.10", "3.11", "3.12")
25
+ 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
26
+ 'releasetag' = @("basic")
27
+ }
28
+
29
+ $matrixOut = ConvertTo-Json $matrix -Compress
30
+ Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
31
+
32
+ build_wheels:
33
+ name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
34
+ needs: define_matrix
35
+ runs-on: ${{ matrix.os }}
36
+ strategy:
37
+ matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
38
+ defaults:
39
+ run:
40
+ shell: pwsh
41
+ env:
42
+ CUDAVER: ${{ matrix.cuda }}
43
+ AVXVER: ${{ matrix.releasetag }}
44
+
45
+ steps:
46
+ - uses: actions/checkout@v4
47
+ with:
48
+ submodules: "recursive"
49
+
50
+ - uses: actions/setup-python@v5
51
+ with:
52
+ python-version: ${{ matrix.pyver }}
53
+
54
+ - name: Setup Mamba
55
+ uses: conda-incubator/setup-miniconda@v3.0.4
56
+ with:
57
+ activate-environment: "build"
58
+ python-version: ${{ matrix.pyver }}
59
+ miniforge-variant: Mambaforge
60
+ miniforge-version: latest
61
+ use-mamba: true
62
+ add-pip-as-python-dependency: true
63
+ auto-activate-base: false
64
+
65
+ - name: VS Integration Cache
66
+ id: vs-integration-cache
67
+ if: runner.os == 'Windows'
68
+ uses: actions/cache@v4.0.2
69
+ with:
70
+ path: ./MSBuildExtensions
71
+ key: cuda-${{ matrix.cuda }}-vs-integration
72
+
73
+ - name: Get Visual Studio Integration
74
+ if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
75
+ run: |
76
+ if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
77
+ $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
78
+ for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
79
+ Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
80
+ & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
81
+ Remove-Item 'cudainstaller.zip'
82
+
83
+ - name: Install Visual Studio Integration
84
+ if: runner.os == 'Windows'
85
+ run: |
86
+ $y = (gi '.\MSBuildExtensions').fullname + '\*'
87
+ (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
88
+ $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
89
+ echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV
90
+
91
+ - name: Install Dependencies
92
+ env:
93
+ MAMBA_DOWNLOAD_FAILFAST: "0"
94
+ MAMBA_NO_LOW_SPEED_LIMIT: "1"
95
+ run: |
96
+ $cudaVersion = $env:CUDAVER
97
+ mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
98
+ python -m pip install build wheel
99
+
100
+ - name: Build Wheel
101
+ run: |
102
+ $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
103
+ $env:CUDA_PATH = $env:CONDA_PREFIX
104
+ $env:CUDA_HOME = $env:CONDA_PREFIX
105
+ $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
106
+ if ($IsLinux) {
107
+ $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
108
+ }
109
+ $env:VERBOSE = '1'
110
+ $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
111
+ $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
112
+ if ($env:AVXVER -eq 'AVX') {
113
+ $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
114
+ }
115
+ if ($env:AVXVER -eq 'AVX512') {
116
+ $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
117
+ }
118
+ if ($env:AVXVER -eq 'basic') {
119
+ $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
120
+ }
121
+ python -m build --wheel
122
+ # write the build tag to the output
123
+ Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
124
+
125
+ - uses: softprops/action-gh-release@v2
126
+ with:
127
+ files: dist/*
128
+ # Set tag_name to <tag>-cu<cuda_version>
129
+ tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
130
+ env:
131
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
llama-cpp-python/.github/workflows/build-wheels-metal.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build Wheels (Metal)
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ define_matrix:
10
+ name: Define Build Matrix
11
+ runs-on: ubuntu-latest
12
+ outputs:
13
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
14
+ defaults:
15
+ run:
16
+ shell: pwsh
17
+
18
+ steps:
19
+ - name: Define Job Output
20
+ id: set-matrix
21
+ run: |
22
+ $matrix = @{
23
+ 'os' = @('macos-11', 'macos-12', 'macos-13')
24
+ 'pyver' = @('3.10', '3.11', '3.12')
25
+ }
26
+
27
+ $matrixOut = ConvertTo-Json $matrix -Compress
28
+ Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
29
+
30
+ build_wheels:
31
+ name: ${{ matrix.os }} Python ${{ matrix.pyver }}
32
+ needs: define_matrix
33
+ runs-on: ${{ matrix.os }}
34
+ strategy:
35
+ matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
36
+ env:
37
+ OSVER: ${{ matrix.os }}
38
+
39
+ steps:
40
+ - uses: actions/checkout@v4
41
+ with:
42
+ submodules: "recursive"
43
+
44
+ - uses: actions/setup-python@v5
45
+ with:
46
+ python-version: ${{ matrix.pyver }}
47
+
48
+ - name: Install Dependencies
49
+ run: |
50
+ python -m pip install build wheel cmake
51
+
52
+ - name: Build Wheel
53
+ run: |
54
+ XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
55
+ XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
56
+ export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
57
+ [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
58
+ [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
59
+ [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
60
+
61
+ export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
62
+ VERBOSE=1 python -m build --wheel
63
+
64
+ if [[ "$OSVER" == "macos-13" ]]; then
65
+ export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
66
+ export MACOSX_DEPLOYMENT_TARGET="14.0"
67
+ VERBOSE=1 python -m build --wheel
68
+ fi
69
+
70
+ for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
71
+
72
+ export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
73
+ VERBOSE=1 python -m build --wheel
74
+
75
+ if [[ "$OSVER" == "macos-13" ]]; then
76
+ export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
77
+ export MACOSX_DEPLOYMENT_TARGET="14.0"
78
+ VERBOSE=1 python -m build --wheel
79
+ fi
80
+
81
+ - uses: softprops/action-gh-release@v2
82
+ with:
83
+ files: dist/*
84
+ # set release name to <tag>-metal
85
+ tag_name: ${{ github.ref_name }}-metal
86
+ env:
87
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
llama-cpp-python/.github/workflows/generate-index-from-release.yaml ADDED
@@ -0,0 +1,50 @@
1
+ name: Wheels Index
2
+
3
+ on:
4
+ # Trigger on any new release
5
+ release:
6
+ types: [published]
7
+
8
+ # Allows you to run this workflow manually from the Actions tab
9
+ workflow_dispatch:
10
+
11
+ # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
12
+ permissions:
13
+ contents: read
14
+ pages: write
15
+ id-token: write
16
+
17
+ # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
18
+ # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
19
+ concurrency:
20
+ group: "pages"
21
+ cancel-in-progress: false
22
+
23
+ jobs:
24
+ # Single deploy job since we're just deploying
25
+ deploy:
26
+ environment:
27
+ name: github-pages
28
+ url: ${{ steps.deployment.outputs.page_url }}
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - name: Checkout
32
+ uses: actions/checkout@v4
33
+ - name: Setup Pages
34
+ uses: actions/configure-pages@v5
35
+ - name: Build
36
+ run: |
37
+ ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
38
+ ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
39
+ ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
40
+ ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
41
+ ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
42
+ ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
43
+ - name: Upload artifact
44
+ uses: actions/upload-pages-artifact@v3
45
+ with:
46
+ # Upload entire repository
47
+ path: 'index'
48
+ - name: Deploy to GitHub Pages
49
+ id: deployment
50
+ uses: actions/deploy-pages@v4
llama-cpp-python/.github/workflows/publish-to-test.yaml ADDED
@@ -0,0 +1,43 @@
1
+ # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
2
+
3
+ name: Publish to TestPyPI
4
+
5
+ on:
6
+ workflow_dispatch:
7
+ inputs:
8
+ dev_version:
9
+ description: 'Dev version N'
10
+ required: true
11
+
12
+
13
+ jobs:
14
+ build-n-publish:
15
+ name: Build and publish
16
+ runs-on: ubuntu-latest
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ with:
21
+ submodules: "recursive"
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: "3.8"
26
+ - name: Append Dev Version to __version__
27
+ run: |
28
+ DEV_VERSION=${{ github.event.inputs.dev_version }}
29
+ CURRENT_VERSION=$(awk -F= '/__version__ =/ {print $2}' llama_cpp/__init__.py | tr -d ' "')
30
+ NEW_VERSION="${CURRENT_VERSION}.dev${DEV_VERSION}"
31
+ sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py
32
+ - name: Install dependencies
33
+ run: |
34
+ python3 -m pip install --upgrade pip build
35
+ python3 -m pip install -e .[all]
36
+ - name: Build source distribution
37
+ run: |
38
+ python3 -m build --sdist
39
+ - name: Publish to Test PyPI
40
+ uses: pypa/gh-action-pypi-publish@release/v1
41
+ with:
42
+ password: ${{ secrets.TEST_PYPI_API_TOKEN }}
43
+ repository-url: https://test.pypi.org/legacy/
llama-cpp-python/.github/workflows/publish.yaml ADDED
@@ -0,0 +1,32 @@
1
+ name: Publish to PyPI
2
+
3
+ # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
4
+
5
+ on: workflow_dispatch
6
+
7
+ jobs:
8
+ build-n-publish:
9
+ name: Build and publish
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ with:
15
+ submodules: "recursive"
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.8"
20
+ - name: Install dependencies
21
+ run: |
22
+ python3 -m pip install --upgrade pip build
23
+ python3 -m pip install -e .[all]
24
+ - name: Build source distribution
25
+ run: |
26
+ python3 -m build --sdist
27
+ - name: Publish distribution to PyPI
28
+ # TODO: move to tag based releases
29
+ # if: startsWith(github.ref, 'refs/tags')
30
+ uses: pypa/gh-action-pypi-publish@release/v1
31
+ with:
32
+ password: ${{ secrets.PYPI_API_TOKEN }}
llama-cpp-python/.github/workflows/test-pypi.yaml ADDED
@@ -0,0 +1,64 @@
1
+ name: Tests for PyPI package
2
+
3
+ on: workflow_dispatch
4
+
5
+ jobs:
6
+ build-linux:
7
+
8
+ runs-on: ubuntu-latest
9
+ strategy:
10
+ matrix:
11
+ python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
12
+
13
+ steps:
14
+ - name: Set up Python ${{ matrix.python-version }}
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: ${{ matrix.python-version }}
18
+ - name: Install dependencies
19
+ run: |
20
+ python3 -m pip install --upgrade pip
21
+ python3 -m pip install --verbose llama-cpp-python[all]
22
+ - name: Test with pytest
23
+ run: |
24
+ python3 -c "import llama_cpp"
25
+
26
+ build-windows:
27
+
28
+ runs-on: windows-latest
29
+ strategy:
30
+ matrix:
31
+ python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
32
+
33
+ steps:
34
+ - name: Set up Python ${{ matrix.python-version }}
35
+ uses: actions/setup-python@v5
36
+ with:
37
+ python-version: ${{ matrix.python-version }}
38
+ - name: Install dependencies
39
+ run: |
40
+ python3 -m pip install --upgrade pip
41
+ python3 -m pip install --verbose llama-cpp-python[all]
42
+ - name: Test with pytest
43
+ run: |
44
+ python3 -c "import llama_cpp"
45
+
46
+ build-macos:
47
+
48
+ runs-on: macos-latest
49
+ strategy:
50
+ matrix:
51
+ python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
52
+
53
+ steps:
54
+ - name: Set up Python ${{ matrix.python-version }}
55
+ uses: actions/setup-python@v5
56
+ with:
57
+ python-version: ${{ matrix.python-version }}
58
+ - name: Install dependencies
59
+ run: |
60
+ python3 -m pip install --upgrade pip
61
+ python3 -m pip install --verbose llama-cpp-python[all]
62
+ - name: Test with pytest
63
+ run: |
64
+ python3 -c "import llama_cpp"
llama-cpp-python/.github/workflows/test.yaml ADDED
@@ -0,0 +1,126 @@
1
+ name: Tests
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ push:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ build-linux:
13
+
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ matrix:
17
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
18
+
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ with:
22
+ submodules: "recursive"
23
+ - name: Set up Python ${{ matrix.python-version }}
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ - name: Install dependencies
28
+ run: |
29
+ python3 -m pip install --upgrade pip
30
+ python3 -m pip install .[all] -v
31
+ - name: Test with pytest
32
+ run: |
33
+ python3 -m pytest
34
+
35
+ build-windows:
36
+
37
+ runs-on: windows-latest
38
+ strategy:
39
+ matrix:
40
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
41
+
42
+ steps:
43
+ - uses: actions/checkout@v4
44
+ with:
45
+ submodules: "recursive"
46
+ - name: Set up Python ${{ matrix.python-version }}
47
+ uses: actions/setup-python@v5
48
+ with:
49
+ python-version: ${{ matrix.python-version }}
50
+ - name: Install dependencies
51
+ run: |
52
+ python3 -m pip install --upgrade pip
53
+ python3 -m pip install .[all] -v
54
+ - name: Test with pytest
55
+ run: |
56
+ python3 -m pytest
57
+
58
+ build-macos:
59
+
60
+ runs-on: macos-13
61
+ strategy:
62
+ matrix:
63
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
64
+
65
+ steps:
66
+ - uses: actions/checkout@v4
67
+ with:
68
+ submodules: "recursive"
69
+ - name: Set up Python ${{ matrix.python-version }}
70
+ uses: actions/setup-python@v5
71
+ with:
72
+ python-version: ${{ matrix.python-version }}
73
+ - name: Install dependencies
74
+ run: |
75
+ python3 -m pip install --upgrade pip
76
+ python3 -m pip install .[all] --verbose
77
+ - name: Test with pytest
78
+ run: |
79
+ python3 -m pytest
80
+
81
+ # build-linux-opencl:
82
+
83
+ # runs-on: ubuntu-latest
84
+
85
+ # steps:
86
+ # - uses: actions/checkout@v4
87
+ # with:
88
+ # submodules: "recursive"
89
+ # - name: Set up Python 3.8
90
+ # uses: actions/setup-python@v5
91
+ # with:
92
+ # python-version: "3.8"
93
+ # - name: Set up OpenCL & CLBlast
94
+ # run: |
95
+ # wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
96
+ # echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
97
+ # sudo apt-get update
98
+ # sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
99
+ # - name: Install dependencies
100
+ # run: |
101
+ # python3 -m pip install --upgrade pip
102
+ # CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose
103
+ # - name: Test with pytest
104
+ # run: |
105
+ # python3 -m pytest
106
+
107
+
108
+ build-macos-metal:
109
+
110
+ runs-on: macos-13
111
+
112
+ steps:
113
+ - uses: actions/checkout@v4
114
+ with:
115
+ submodules: "recursive"
116
+ - name: Set up Python 3.8
117
+ uses: actions/setup-python@v5
118
+ with:
119
+ python-version: "3.8"
120
+ - name: Install dependencies
121
+ run: |
122
+ python3 -m pip install --upgrade pip
123
+ CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
124
+ - name: Test with pytest
125
+ run: |
126
+ python3 -m pytest
llama-cpp-python/.gitignore ADDED
@@ -0,0 +1,180 @@
1
+ *.local
2
+
3
+ .python-version
4
+
5
+ .vscode/
6
+
7
+ _skbuild/
8
+
9
+ .envrc
10
+ .direnv
11
+
12
+ models/
13
+
14
+ # Byte-compiled / optimized / DLL files
15
+ __pycache__/
16
+ *.py[cod]
17
+ *$py.class
18
+
19
+ # C extensions
20
+ llama_cpp/*.so
21
+ llama_cpp/*.dylib
22
+ llama_cpp/*.metal
23
+ llama_cpp/*.dll
24
+ llama_cpp/*.lib
25
+
26
+ # Distribution / packaging
27
+ .Python
28
+ build/
29
+ develop-eggs/
30
+ dist/
31
+ downloads/
32
+ eggs/
33
+ .eggs/
34
+ lib/
35
+ lib64/
36
+ parts/
37
+ sdist/
38
+ var/
39
+ wheels/
40
+ share/python-wheels/
41
+ *.egg-info/
42
+ .installed.cfg
43
+ *.egg
44
+ MANIFEST
45
+
46
+ # PyInstaller
47
+ # Usually these files are written by a python script from a template
48
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
49
+ *.manifest
50
+ *.spec
51
+
52
+ # Installer logs
53
+ pip-log.txt
54
+ pip-delete-this-directory.txt
55
+
56
+ # Unit test / coverage reports
57
+ htmlcov/
58
+ .tox/
59
+ .nox/
60
+ .coverage
61
+ .coverage.*
62
+ .cache
63
+ nosetests.xml
64
+ coverage.xml
65
+ *.cover
66
+ *.py,cover
67
+ .hypothesis/
68
+ .pytest_cache/
69
+ cover/
70
+
71
+ # Translations
72
+ *.mo
73
+ *.pot
74
+
75
+ # Django stuff:
76
+ *.log
77
+ local_settings.py
78
+ db.sqlite3
79
+ db.sqlite3-journal
80
+
81
+ # Flask stuff:
82
+ instance/
83
+ .webassets-cache
84
+
85
+ # Scrapy stuff:
86
+ .scrapy
87
+
88
+ # Sphinx documentation
89
+ docs/_build/
90
+
91
+ # PyBuilder
92
+ .pybuilder/
93
+ target/
94
+
95
+ # Jupyter Notebook
96
+ .ipynb_checkpoints
97
+
98
+ # IPython
99
+ profile_default/
100
+ ipython_config.py
101
+
102
+ # pyenv
103
+ # For a library or package, you might want to ignore these files since the code is
104
+ # intended to run in multiple environments; otherwise, check them in:
105
+ # .python-version
106
+
107
+ # pipenv
108
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
109
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
110
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
111
+ # install all needed dependencies.
112
+ #Pipfile.lock
113
+
114
+ # poetry
115
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
116
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
117
+ # commonly ignored for libraries.
118
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
119
+ #poetry.lock
120
+
121
+ # pdm
122
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
123
+ #pdm.lock
124
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
125
+ # in version control.
126
+ # https://pdm.fming.dev/#use-with-ide
127
+ .pdm.toml
128
+
129
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
130
+ __pypackages__/
131
+
132
+ # Celery stuff
133
+ celerybeat-schedule
134
+ celerybeat.pid
135
+
136
+ # SageMath parsed files
137
+ *.sage.py
138
+
139
+ # Environments
140
+ .env
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ .idea/
178
+
179
+ # downloaded model .bin files
180
+ docker/open_llama/*.bin
llama-cpp-python/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "vendor/llama.cpp"]
2
+ path = vendor/llama.cpp
3
+ url = https://github.com/ggerganov/llama.cpp.git
llama-cpp-python/.readthedocs.yaml ADDED
@@ -0,0 +1,24 @@
1
+ # Read the Docs configuration file for MkDocs projects
2
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
+
4
+ # Required
5
+ version: 2
6
+
7
+ # Set the version of Python and other tools you might need
8
+ build:
9
+ os: ubuntu-22.04
10
+ tools:
11
+ python: "3.11"
12
+
13
+ mkdocs:
14
+ configuration: mkdocs.yml
15
+
16
+ python:
17
+ install:
18
+ - method: pip
19
+ path: .
20
+ - requirements: docs/requirements.txt
21
+
22
+ submodules:
23
+ include: all
24
+ recursive: true
llama-cpp-python/CHANGELOG.md ADDED
@@ -0,0 +1,630 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.2.69]
11
+
12
+ - feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
13
+ - feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
14
+ - fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
15
+ - fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
16
+ - fix: UTF-8 handling with grammars by @jsoma in #1415
17
+
18
+ ## [0.2.68]
19
+
20
+ - feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
21
+ - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
22
+ - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
23
+
24
+ ## [0.2.67]
25
+
26
+ - fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656
27
+ - fix(ci): Fix bug in use of upload-artifact failing to merge multiple artifacts into a single release by @abetlen in d03f15bb73a1d520970357b702a9e7d4cc2a7a62
28
+
29
+ ## [0.2.66]
30
+
31
+ - feat: Update llama.cpp to ggerganov/llama.cpp@8843a98c2ba97a25e93319a104f9ddfaf83ce4c4
32
+ - feat: Generic Chat Formats, Tool Calling, and Huggingface Pull Support for Multimodal Models (Obsidian, LLaVA1.6, Moondream) by @abetlen in #1147
33
+ - ci(fix): Workflow actions updates and fix arm64 wheels not included in release by @Smartappli in #1392
34
+ - ci: Add support for pre-built cuda 12.4.1 wheels by @Smartappli in #1388
35
+ - feat: Add support for str type kv_overrides by @abetlen in a411612b385cef100d76145da1fbd02a7b7cc894
36
+ - fix: Functionary bug fixes by @jeffrey-fong in #1385
37
+ - examples: fix quantize example by @iyubondyrev in #1387
38
+ - ci: Update dependabot.yml by @Smartappli in #1391
39
+
40
+ ## [0.2.65]
41
+
42
+ - feat: Update llama.cpp to ggerganov/llama.cpp@46e12c4692a37bdd31a0432fc5153d7d22bc7f72
43
+ - feat: Allow for possibly non-pooled embeddings by @iamlemec in #1380
44
+
45
+ ## [0.2.64]
46
+
47
+ - feat: Update llama.cpp to ggerganov/llama.cpp@4e96a812b3ce7322a29a3008db2ed73d9087b176
48
+ - feat: Add `llama-3` chat format by @andreabak in #1371
49
+ - feat: Use new llama_token_is_eog in create_completions by @abetlen in d40a250ef3cfaa8224d12c83776a2f1de96ae3d1
50
+ - feat(server): Provide ability to dynamically allocate all threads if desired using -1 by @sean-bailey in #1364
51
+ - ci: Build arm64 wheels by @gaby in 611781f5319719a3d05fefccbbf0cc321742a026
52
+ - fix: Update scikit-build-core build dependency avoid bug in 0.9.1 by @evelkey in #1370
53
+
54
+ ## [0.2.63]
55
+
56
+ - feat: Update llama.cpp to ggerganov/llama.cpp@0e4802b2ecbaab04b4f829fde4a3096ca19c84b5
57
+ - feat: Add stopping_criteria to ChatFormatter, allow stopping on arbitrary token ids, fixes llama3 instruct by @abetlen in cc81afebf04d26ca1ac3cf72f23f18da6ab58588
58
+
59
+ ## [0.2.62]
60
+
61
+ - feat: Update llama.cpp to ggerganov/llama.cpp@3b8f1ec4b18770531d0b1d792f3edf08254e4f0c
62
+ - feat: update grammar schema converter to match llama.cpp by @themrzmaster in #1353
63
+ - feat: add disable_ping_events flag by @khimaros in #1257
64
+ - feat: Make saved state more compact on-disk by @tc-wolf in #1296
65
+ - feat: Use all available CPUs for batch processing by @ddh0 in #1345
66
+
67
+ ## [0.2.61]
68
+
69
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba5e134e073ec6837078c874aba44a702944a676
70
+ - fix: pass correct type to chat handlers for chat completion logprobs by @abetlen in bb65b4d76411112c6fb0bf759efd746f99ef3c6b
71
+ - feat: Add support for yaml based server configs by @abetlen in 060bfa64d529ade2af9b1f4e207a3937bbc4138f
72
+ - feat: Add typechecking for ctypes structure attributes by @abetlen in 1347e1d050fc5a9a32ffe0bb3e22858da28003bd
73
+
74
+ ## [0.2.60]
75
+
76
+ - feat: Update llama.cpp to ggerganov/llama.cpp@75cd4c77292034ecec587ecb401366f57338f7c0
77
+ - fix: Always embed metal library by @abetlen in b3bfea6dbfb6ed9ce18f9a2723e0a9e4bd1da7ad
78
+ - fix: missing logprobs in response, incorrect response type for functionary by @abetlen in 1ae3abbcc3af7f4a25a3ffc40b246f18039565e8
79
+ - fix(docs): incorrect tool_choice example by @CISC in #1330
80
+
81
+ ## [0.2.59]
82
+
83
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
84
+ - feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
85
+ - fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
86
+ - fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
87
+
88
+ ## [0.2.58]
89
+
90
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
91
+ - feat: add support for KV cache quantization options by @Limour-dev in #1307
92
+ - feat: Add logprobs support to chat completions by @windspirit95 in #1311
93
+ - fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
94
+ - feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
95
+ - fix: Changed local API doc references to hosted by by @lawfordp2017 in #1317
96
+
97
+ ## [0.2.57]
98
+
99
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
100
+ - fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3
101
+ - fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282
102
+ - fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c
103
+
104
+ ## [0.2.56]
105
+
106
+ - feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e
107
+ - feat(server): Add endpoints for tokenize, detokenize and count tokens by @felipelo in #1136
108
+ - feat: Switch embed to llama_get_embeddings_seq by @iamlemec in #1263
109
+ - fix: Fixed json strings grammar by blacklisting character control set by @ExtReMLapin in d02a9cf16ff88ad011e2eb1ce29f4d9400f13cd1
110
+ - fix: Check for existence of clip model path by @kejcao in #1264
111
+
112
+ ## [0.2.55]
113
+
114
+ - feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
115
+ - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244
116
+
117
+ ## [0.2.54]
118
+
119
+ - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a
120
+ - docs: fix typo in README.md embeddings example by @iamlemec in #1232
121
+
122
+ ## [0.2.53]
123
+
124
+ - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a
125
+ - fix: eos/bos_token set correctly for Jinja2ChatFormatter and automatic chat formatter by @CISC in #1230
126
+
127
+ ## [0.2.52]
128
+
129
+ - feat: Update llama.cpp to ggerganov/llama.cpp@a33e6a0d2a66104ea9a906bdbf8a94d050189d91
130
+ - fix: Llava15ChatHandler (this function takes at least 4 arguments) by @abetlen in 8383a9e5620f5df5a88f62da16813eac200dd706
131
+
132
+ ## [0.2.51]
133
+
134
+ - feat: Update llama.cpp to ggerganov/llama.cpp@c39373398803c669056304090050fe3f44b41bf9
135
+ - fix: Restore type hints for low-level api by @abetlen in 19234aa0dbd0c3c87656e65dd2b064665371925b
136
+
137
+ ## [0.2.50]
138
+
139
+ - docs: Update Functionary OpenAI Server Readme by @jeffrey-fong in #1193
140
+ - fix: LlamaHFTokenizer now receives pre_tokens by @abetlen in 47bad30dd716443652275099fa3851811168ff4a
141
+
142
+ ## [0.2.49]
143
+
144
+ - fix: module 'llama_cpp.llama_cpp' has no attribute 'c_uint8' in Llama.save_state by @abetlen in db776a885cd4c20811f22f8bd1a27ecc71dba927
145
+ - feat: Auto detect Mixtral's slightly different format by @lukestanley in #1214
146
+
147
+ ## [0.2.48]
148
+
149
+ - feat: Update llama.cpp to ggerganov/llama.cpp@15499eb94227401bdc8875da6eb85c15d37068f7
150
+ - feat: Add Google's Gemma formatting via chat_format="gemma" by @alvarobartt in #1210
151
+ - feat: support minItems/maxItems in JSON grammar converter by @nopperl in 3921e10770996d95a9eb22c8248bacef39f69365
152
+ - fix: Update from_pretrained defaults to match hf_hub_download and pull to local cache folder by @abetlen in e6d6260a91b7831733f7d1f73c7af46a3e8185ed
153
+ - fix: Raise exceptions when llama model or context fails to load by @abetlen in dd22010e85265ae840c76ec835d67a29ed852722
154
+ - docs: Update README.md to fix pip install llama cpp server by @audip in #1187
155
+
156
+ ## [0.2.47]
157
+
158
+ - feat: Update llama.cpp to ggerganov/llama.cpp@973053d8b0d04809836b3339a50f68d9c842de90
159
+
160
+ ## [0.2.46]
161
+
162
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba2135ccae7462470b3865c6e41d2e1d734eac05
163
+ - feat: Pull models directly from huggingface by @abetlen in #1206
164
+ - feat(low-level-api): Improve API static type-safety and performance. Low level api functions are positional args only now. by @abetlen in #1205
165
+
166
+ ## [0.2.45]
167
+
168
+ - feat: Update llama.cpp to ggerganov/llama.cpp@89febfed9322c8849520dc63c93ee4f5fd72556e
169
+
170
+ ## [0.2.44]
171
+
172
+ - feat: Update llama.cpp to ggerganov/llama.cpp@4524290e87b8e107cc2b56e1251751546f4b9051
173
+ - fix: create_embedding broken response for input type str by @abetlen in 0ce66bc080fe537590b05b24bf442480bf2dd045
174
+ - fix: Use '\n' separator for EventSourceResponse by @khimaros in #1188
175
+ - fix: Incorporate embedding pooling layer fixes by @iamlemec in #1194
176
+
177
+ ## [0.2.43]
178
+
179
+ - feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
180
+ - feat: Support batch embeddings by @iamlemec in #1186
181
+ - fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460
182
+ - fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8
183
+
184
+ ## [0.2.42]
185
+
186
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40
187
+ - fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179
188
+ - fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1
189
+
190
+ ## [0.2.41]
191
+
192
+ - feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b
193
+ - fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa
194
+
195
+ ## [0.2.40]
196
+
197
+ - feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465
198
+ - feat: Generic chatml Function Calling using `chat_format="chatml-function-calling"` by @abetlen in #957
199
+ - fix: Circular dependency preventing early Llama object free by @notwa in #1176
200
+ - docs: Set the correct command for compiling with syscl support by @akarshanbiswas in #1172
201
+ - feat: use gpu backend for clip if available by @iamlemec in #1175
202
+
203
+ ## [0.2.39]
204
+
205
+ - feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8
206
+ - fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874
207
+
208
+ ## [0.2.38]
209
+
210
+ - feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
211
+ - feat: Add speculative decoding by @abetlen in #1120
212
+ - fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95
213
+
214
+ ## [0.2.37]
215
+
216
+ - feat: Update llama.cpp to ggerganov/llama.cpp@fea4fd4ba7f6b754ac795387b275e1a014a77bde
217
+ - feat: Automatically set chat format from gguf by @abetlen in #1110
218
+
219
+ ## [0.2.36]
220
+
221
+ - feat: Update llama.cpp to ggerganov/llama.cpp@2aed77eb06a329f0d82bb1c467f4244904d4073f
222
+ - feat: Add mistral instruct chat format as "mistral-instruct" by @Rafaelblsilva in #799
223
+
224
+ ## [0.2.35]
225
+
226
+ - feat: Update llama.cpp to ggerganov/llama.cpp@d2f650cb5b04ee2726663e79b47da5efe196ce00
227
+
228
+ ## [0.2.34]
229
+
230
+ - feat: Update llama.cpp to ggerganov/llama.cpp@6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855
231
+ - feat: Add json schema mode by @abetlen in #1122
232
+
233
+ ## [0.2.33]
234
+
235
+ - feat: Update llama.cpp to ggerganov/llama.cpp@faa3526a1eba458120987ed8269e5616385a76f4
236
+ - feat(server): include llama-cpp-python version in openapi spec by @abetlen in cde7514c3d28e6d52f272614e9957208c344dde5
237
+ - fix: use both eos and bos tokens as stop sequences for hf-tokenizer-config chat format. by @abetlen in 5b982d0f8c6f35242c8862ffdce00e17cea0b44f
238
+ - fix: GGUF metadata KV overrides, re #1011 by @phiharri in #1116
239
+ - fix: llama_log_set should be able to accept null pointer by @abetlen in c970d41a85381fd55235136f123422df0bf0c7e7
240
+
241
+ ## [0.2.32]
242
+
243
+ - feat: Update llama.cpp to ggerganov/llama.cpp@504dc37be8446fb09b1ede70300250ad41be32a2
244
+ - fix: from_json_schema oneof/anyof bug by @jndiogo in d3f5528ca8bcb9d69d4f27e21631e911f1fb9bfe
245
+ - fix: pass chat handler not chat formatter for huggingface autotokenizer and tokenizer_config formats by @abetlen in 24f39454e91cf5dddbc4b6041aead4accc7c7a2d
246
+ - feat: Add add_generation_prompt option for jinja2chatformatter by @abetlen in 7f3209b1eb4ad3260ba063801fab80a8c25a2f4c
247
+ - feat: Add Jinja2ChatFormatter by @abetlen in be09318c26add8674ce494ae7cc480cce72a4146
248
+ - feat: Expose gguf model metadata in metadata property by @abetlen in 5a34c57e5479e50c99aba9b38218cc48e6560b81
249
+
250
+ ## [0.2.31]
251
+
252
+ - feat: Update llama.cpp to ggerganov/llama.cpp@a5cacb22b2114fd9adf61c00cbb237384d86bced
253
+ - fix: Mirostat sampling now passes correct type to ctypes and tracks state during generation by @abetlen in 3babe3512cb95743108f2b595210c38ed6f1b904
254
+ - fix: Python3.8 support in server by @abetlen in 141293a75b564a8699e0acba1da24d9aa1cf0ab1
255
+
256
+ ## [0.2.30]
257
+
258
+ - feat: Update llama.cpp to ggerganov/llama.cpp@57e2a7a52a819883f40dada8a2edc24ecf48186b
259
+ - feat(server): Add ability to load chat format from huggingface autotokenizer or tokenizer_config.json files by @abetlen in b8fc1c7d83ad4a9207c707ba1d954fe580286a01
260
+ - feat: Integration of Jinja2 Templating for chat formats by @teleprint-me in #875
261
+ - fix: Offload KQV by default by @abetlen in 48c3b77e6f558a9899de0e1155c7dc0c7958d8e8
262
+ - fix: Support Accept text/event-stream in chat and completion endpoints, resolves #1083 by @aniljava in #1088
263
+ - fix(cli): allow passing n_ctx=0 to openAI API server args to use model n_ctx_train field per #1015 by @K-Mistele in #1093
264
+
265
+ ## [0.2.29]
266
+
267
+ - feat: Update llama.cpp to ggerganov/llama.cpp@4483396751c79dea540808b9cb9238245d06da2b
268
+ - feat: Add split_mode option by @abetlen in 84615adbc6855c8384807c42f0130f9a1763f99d
269
+ - feat: Implement GGUF metadata KV overrides by @phiharri in #1011
270
+ - fix: Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor by @yieldthought in #1012
271
+ - fix: Fix low_level_api_chat_cpp example to match current API by @aniljava in #1086
272
+ - fix: Fix Pydantic model parsing by @DeNeutoy in #1087
273
+
274
+ ## [0.2.28]
275
+
276
+ - feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
277
+ - feat: Add ability to pass in penalize_nl param by @shankinson in #1068
278
+ - fix: print_grammar to stderr by @turian in #1052
279
+
280
+ ## [0.2.27]
281
+
282
+ - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a
283
+ - feat: Add `saiga` chat format by @femoiseev in #1050
284
+ - feat: Added `chatglm3` chat format by @xaviviro in #1059
285
+ - fix: Correct typo in README.md by @qeleb in (#1058)
286
+
287
+ ## [0.2.26]
288
+
289
+ - feat: Update llama.cpp to ggerganov/llama.cpp@f6793491b5af6da75edad34d6f503ef86d31b09f
290
+
291
+ ## [0.2.25]
292
+
293
+ - feat(server): Multi model support by @D4ve-R in #931
294
+ - feat(server): Support none defaulting to infinity for completions by @swg in #111
295
+ - feat(server): Implement openai api compatible authentication by @docmeth2 in #1010
296
+ - fix: text_offset of multi-token characters by @twaka in #1037
297
+ - fix: ctypes bindings for kv override by @phiharri in #1011
298
+ - fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
299
+
300
+ ## [0.2.24]
301
+
302
+ - feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a
303
+ - feat: Add offload_kqv option to llama and server by @abetlen in 095c65000642a3cf73055d7428232fb18b73c6f3
304
+ - feat: n_ctx=0 now uses the n_ctx_train of the model by @DanieleMorotti in #1015
305
+ - feat: logits_to_logprobs supports both 2-D and 3-D logits arrays by @kddubey in #1002
306
+ - fix: Remove f16_kv, add offload_kqv fields in low level and llama apis by @brandonrobertz in #1019
307
+ - perf: Don't convert logprobs arrays to lists by @kddubey in #1021
308
+ - docs: Fix README.md functionary demo typo by @evelynmitchell in #996
309
+ - examples: Update low_level_api_llama_cpp.py to match current API by @jsoma in #1023
310
+
311
+ ## [0.2.23]
312
+
313
+ - Update llama.cpp to ggerganov/llama.cpp@948ff137ec37f1ec74c02905917fa0afc9b97514
314
+ - Add qwen chat format by @yhfgyyf in #1005
315
+ - Add support for running the server with SSL by @rgerganov in #994
316
+ - Replace logits_to_logprobs implementation with numpy equivalent to llama.cpp by @player1537 in #991
317
+ - Fix UnsupportedOperation: fileno in suppress_stdout_stderr by @zocainViken in #961
318
+ - Add Pygmalion chat format by @chiensen in #986
319
+ - README.md multimodal params fix by @zocainViken in #967
320
+ - Fix minor typo in README by @aniketmaurya in #958
321
+
322
+ ## [0.2.22]
323
+
324
+ - Update llama.cpp to ggerganov/llama.cpp@8a7b2fa528f130631a5f43648481596ab320ed5a
325
+ - Fix conflict with transformers library by kddubey in #952
326
+
327
+ ## [0.2.21]
328
+
329
+ - Update llama.cpp to ggerganov/llama.cpp@64e64aa2557d97490b2fe1262b313e2f4a1607e3
330
+ - Make building llava optional by setting `CMAKE_ARGS="-DLLAVA_BUILD=OFF"` and using `LLAVA_CPP_LIB` to specify alternative path to shared library by @abetlen in e3941d9c674dbd9891dc3ceda390daeb21f05fd1
331
+
332
+ ## [0.2.20]
333
+
334
+ - Update llama.cpp to ggerganov/llama.cpp@b38a16dfcff88d547f78f52d1bea31b84a05aff7
335
+ - Add `zephyr` chat format by @fakerybakery in #938
336
+ - Add `baichuan` chat format by @caiyesd in #938
337
+ - Add `baichuan-2` chat format by @caiyesd in #936
338
+ - Improve documentation for server chat formats by @jooray in #934
339
+ - Fix typo in README by @antonvice in 940
340
+ - Fix typo in the Open Orca chat format by @gardner in #947
341
+
342
+ ## [0.2.19]
343
+
344
+ - Update llama.cpp to ggerganov/llama.cpp@0b871f1a04ef60e114bbe43004fd9c21114e802d
345
+ - Fix #569: stop parameter in chat completion api should accept str by @abetlen in 128dc4731fa846ead7e684a137ca57d8931b8899
346
+ - Document server host and port parameters by @jamesbraza in #768
347
+ - Do not set grammar to None when initializing LlamaGrammar by @mthuurne in #834
348
+ - Add mistrallite, intel, and openchat formats by @fakerybakery in #927
349
+ - Add support for min_p parameter by @tk-master in #921
350
+ - Fix #929: tokenizer adding leading space when generating from empty prompt by @abetlen in a34d48014192771d2e308a76c22f33bc0318d983
351
+ - Fix low level api example by @zocainViken in #925
352
+ - Fix missing package in openblas docker image by @ZisisTsatsas in #920
353
+
354
+ ## [0.2.18]
355
+
356
+ - Update llama.cpp to ggerganov/llama.cpp@6bb4908a17150b49373b5f977685b2e180a04f6f
357
+
358
+ ## [0.2.17]
359
+
360
+ - Update llama.cpp to ggerganov/llama.cpp@df9d1293defe783f42bc83af732d3c670552c541
361
+ - Hotfix: Set `CUDA_ARCHITECTURES=OFF` for `llava_shared` target on Windows by @abetlen in 4388f3341413110217b98c4f097ac5c590bdf40b
362
+
363
+ ## [0.2.16]
364
+
365
+ - Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
366
+ - Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
367
+ - Fix server doc arguments by @kjunggithub in #892
368
+ - Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
369
+ - Fix default max_tokens, chat completion is now unlimited (to context length) and completion is 16 tokens to match OpenAI defaults by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
370
+ - Fix json_schema_to_gbnf helper so that it takes a json schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
371
+ - Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
372
+ - Update functionary chat handler for new OpenAI api by abetlen in 1b376c62b775b401653facf25a519d116aafe99a
373
+ - Fix add default stop sequence to chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
374
+ - Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
375
+
376
+ ## [0.2.15]
377
+
378
+ - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4
379
+ - Add support for Llava1.5 multimodal models by @damian0815 and @abetlen in #821
380
+ - Update OpenAI API compatibility to match dev day update by @abetlen in #821
381
+ - Add seed parameter to completion and chat_completion functions of Llama class by @abetlen in 86aeb9f3a14808575d2bb0076e6acb4a30907e6a
382
+ - Add JSON mode support to constrain chat completion to JSON objects by @abetlen in b30b9c338bf9af316d497ea501d39f5c246900db
383
+
384
+ ## [0.2.14]
385
+
386
+ - Update llama.cpp to ggerganov/llama.cpp@f0b30ef7dc1360922ccbea0a8cd3918ecf15eaa7
387
+ - Add support for Huggingface Autotokenizer Chat Formats by @bioshazard and @abetlen in #790 and bbffdaebaa7bb04b543dbf683a07276087251f86
388
+ - Fix llama-2 chat format by @earonesty in #869
389
+ - Add support for functionary chat format by @abetlen in #784
390
+ - Migrate inference from deprecated `llama_eval` API to `llama_batch` and `llama_decode` by @abetlen in #795
391
+
392
+ ## [0.2.13]
393
+
394
+ - Update llama.cpp to ggerganov/llama.cpp@51b2fc11f7f605fff49725a4540e9a6ef7b51b70
395
+ - Fix name 'open' is not defined exception when deleting model by @abetlen in 011b95d7f34cbfc528af75a892757bd9a20838ab
396
+ - Fix tokenization of special characters by @antoine-lizee in #850
397
+
398
+ ## [0.2.12]
399
+
400
+ - Update llama.cpp to ggerganov/llama.cpp@50337961a678fce4081554b24e56e86b67660163
401
+ - Fix missing `n_seq_id` in `llama_batch` by @NickAlgra in #842
402
+ - Fix for shared libraries on Windows that start with `lib` prefix by @sujeendran in #848
403
+ - Fix exception raised in `__del__` when freeing models by @cebtenzzre in #846
404
+ - Performance improvement for logit bias by @zolastro in #851
405
+ - Fix suffix check arbitrary code execution bug by @mtasic85 in #854
406
+ - Fix typo in `function_call` parameter in `llama_types.py` by @akatora28 in #849
407
+ - Fix streaming not returning `finish_reason` by @gmcgoldr in #798
408
+ - Fix `n_gpu_layers` check to allow values less than 1 for server by @hxy9243 in #826
409
+ - Suppress stdout and stderr when freeing model by @paschembri in #803
410
+ - Fix `llama2` chat format by @delock in #808
411
+ - Add validation for tensor_split size by @eric1932 #820
412
+ - Print stack trace on server error by @abetlen in d6a130a052db3a50975a719088a9226abfebb266
413
+ - Update docs for gguf by @johnccshen in #783
414
+ - Add `chatml` chat format by @abetlen in 305482bd4156c70802fc054044119054806f4126
415
+
416
+ ## [0.2.11]
417
+
418
+ - Fix bug in `llama_model_params` object has no attribute `logits_all` by @abetlen in d696251fbe40015e8616ea7a7d7ad5257fd1b896
419
+
420
+ ## [0.2.10]
421
+
422
+ - Fix bug 'llama_model_params' object has no attribute 'embedding' by @abetlen in 42bb721d64d744242f9f980f2b89d5a6e335b5e4
423
+
424
+ ## [0.2.9]
425
+
426
+ - Fix critical bug in pip installation of v0.2.8 due to `.git` directory in ac853e01e1a217a578080a4e1b851d2d08450adf
427
+
428
+ ## [0.2.8]
429
+
430
+ - Update llama.cpp to ggerganov/llama.cpp@40e07a60f9ce06e79f3ccd4c903eba300fb31b5e
431
+ - Add configurable chat formats by @abetlen in #711
432
+ - Fix rope scaling bug by @Josh-XT in #767
433
+ - Fix missing numa parameter in server by @abetlen in d9bce17794d0dd6f7962d10aad768fedecf3ab89
434
+
435
+ ## [0.2.7]
436
+
437
+ - Update llama.cpp to ggerganov/llama.cpp@a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
438
+ - Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
439
+ - Add openai-processing-ms to server response header by @Tradunsky in #748
440
+ - Bump minimum version of scikit-build-core to 0.5.1 to fix msvc cmake issue by @abetlen in 1ed0f3ebe16993a0f961155aa4b2c85f1c68f668
441
+ - Update `llama_types.py` to better match the openai api, old names are aliased to new ones by @abetlen in dbca136feaaf7f8b1182c4c3c90c32918b1d0bb3
442
+
443
+ ## [0.2.6]
444
+
445
+ - Update llama.cpp to 80291a1d02a07f7f66666fb576c5b1e75aa48b46
446
+
447
+ ## [0.2.5]
448
+
449
+ - Fix docker images missing starlette-context dependency by @abetlen in 22917989003c5e67623d54ab45affa1e0e475410
450
+ - Fix loading dll in Windows Isolation Containers by @abetlen in 847466562573191efa655753d9252f308c4fbdb0
451
+ - Fix build issue on m1 macs by @abetlen in dbd3a6d1ed8416a8fd800127251e730153afa305
452
+ - Update docs to gguf and add hw acceleration docs for server by @jasonacox in #688
453
+
454
+ ## [0.2.4]
455
+
456
+ - Add NUMA support. **NOTE** low level api users must call llama_backend_init at the start of their programs by abetlen in f4090a0bb2a2a25acfe28d31c82cc1aa273bedee
457
+ - Fix tensor_split server cli argument by @abetlen in c4c440ba2dc86d9de728a751311fdd1c8e3756fa
458
+ - Made all `Llama` init parameters into keyword-only parameters by @abetlen in c8f9b8a734b5b040379bbd93995ba177affab1fe
459
+ - Added server params for `low_vram`, `main_gpu`, `lora_base`, and `lora_path` by @abetlen in 2920c4bf7ee1412d6bba7846e0e1b7ef6d34043b
460
+ - Removed server params for `rms_norm_eps` and `n_gqa` by @abetlen in 2920c4bf7ee1412d6bba7846e0e1b7ef6d34043b
461
+ - Fix boolean cli options by @abetlen in c999325e8e4507f6c6249dd2fb8de7f8bf57f71e and 0449d29b9f940e437231a07b9d56550226558bac
462
+ - Silence Pydantic Settings warnings about `model_alias` setting by @earonesty in #705
463
+
464
+ ## [0.2.3]
465
+
466
+ - Update llama.cpp to ggerganov/llama.cpp@71ca2fad7d6c0ef95ef9944fb3a1a843e481f314
467
+ - Add X-Request-ID request header for mirroring custom IDs by @devrimcavusoglu in #703
468
+ - Add pyproject extra for scikit-build-core to ensure compatible pathspec version by @abetlen in 6cfc54284b99ef1bff8193e2d5e483dbd89ada02
469
+ - Fix issue with Literal and Optional cli arguments not working by @abetlen in #702
470
+
471
+ ## [0.2.2]
472
+
473
+ - Fix bug in pip install of v0.2.1 due to scikit-build-core removing all `.metal` files in the source distribution (see #701)
474
+
475
+ ## [0.2.1]
476
+
477
+ - Fix bug in pip install of v0.2.0 due to .git folder being included in the source distribution (see #701)
478
+
479
+ ## [0.2.0]
480
+
481
+ - Migrated to scikit-build-core build system by @abetlen in #499
482
+ - Use `numpy` views for `LogitsProcessor` and `StoppingCriteria` instead of python lists by @abetlen in #499
483
+ - Drop support for end-of-life Python3.7 by @abetlen in #499
484
+ - Convert low level `llama.cpp` constants to use basic python types instead of `ctypes` types by @abetlen in #499
485
+
486
+ ## [0.1.85]
487
+
488
+ - Add `llama_cpp.__version__` attribute by @janvdp in #684
489
+ - Fix low level api examples by @jbochi in #680
490
+
491
+ ## [0.1.84]
492
+
493
+ - Update llama.cpp
494
+
495
+ ## [0.1.83]
496
+
497
+ - Update llama.cpp
498
+
499
+ ## [0.1.82]
500
+
501
+ - Update llama.cpp
502
+
503
+ ## [0.1.81]
504
+
505
+ - Update llama.cpp
506
+
507
+ ## [0.1.80]
508
+
509
+ - Update llama.cpp
510
+
511
+ ## [0.1.79]
512
+
513
+ - GGUF Support (breaking change requiring new model format)
514
+
515
+ ## [0.1.78]
516
+
517
+ - Grammar based sampling via LlamaGrammar which can be passed to completions
518
+ - Make n_gpu_layers == -1 offload all layers
519
+
520
+ ## [0.1.77]
521
+
522
+ - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
523
+ - (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
524
+
525
+ ## [0.1.76]
526
+
527
+ - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
528
+
529
+ ## [0.1.75]
530
+
531
+ - Update llama.cpp
532
+
533
+ ## [0.1.74]
534
+
535
+ - (server) OpenAI style error responses
536
+
537
+ ## [0.1.73]
538
+
539
+ - (server) Add rope parameters to server settings
540
+
541
+ ## [0.1.72]
542
+
543
+ - (llama.cpp) Update llama.cpp added custom_rope for extended context lengths
544
+
545
+ ## [0.1.71]
546
+
547
+ - (llama.cpp) Update llama.cpp
548
+
549
+ - (server) Fix several pydantic v2 migration bugs
550
+
551
+ ## [0.1.70]
552
+
553
+ - (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion`
554
+ - (server) Fixed changed settings field names from pydantic v2 migration
555
+
556
+ ## [0.1.69]
557
+
558
+ - (server) Streaming requests can now be interrupted prematurely when a concurrent request is made. This can be controlled with the `interrupt_requests` setting.
559
+ - (server) Moved to fastapi v0.100.0 and pydantic v2
560
+ - (docker) Added a new "simple" image that builds llama.cpp from source when started.
561
+ - (server) performance improvements by avoiding unnecessary memory allocations during sampling
562
+
563
+ ## [0.1.68]
564
+
565
+ - (llama.cpp) Update llama.cpp
566
+
567
+ ## [0.1.67]
568
+
569
+ - Fix performance bug in Llama model by pre-allocating memory tokens and logits.
570
+ - Fix bug in Llama model where the model was not free'd after use.
571
+
572
+ ## [0.1.66]
573
+
574
+ - (llama.cpp) New model API
575
+
576
+ - Performance issue during eval caused by looped np.concatenate call
577
+ - State pickling issue when saving cache to disk
578
+
579
+ ## [0.1.65]
580
+
581
+ - (llama.cpp) Fix struct misalignment bug
582
+
583
+ ## [0.1.64]
584
+
585
+ - (llama.cpp) Update llama.cpp
586
+ - Fix docs for seed. Set -1 for random.
587
+
588
+ ## [0.1.63]
589
+
590
+ - (llama.cpp) Add full gpu utilisation in CUDA
591
+ - (llama.cpp) Add get_vocab
592
+ - (llama.cpp) Add low_vram parameter
593
+ - (server) Add logit_bias parameter
594
+
595
+ ## [0.1.62]
596
+
597
+ - Metal support working
598
+ - Cache re-enabled
599
+
600
+ ## [0.1.61]
601
+
602
+ - Fix broken pip installation
603
+
604
+ ## [0.1.60]
605
+
606
+ NOTE: This release was deleted due to a bug with the packaging system that caused pip installations to fail.
607
+
608
+ - Truncate max_tokens in create_completion so the requested tokens don't exceed the context size.
609
+ - Temporarily disable cache for completion requests
610
+
611
+ ## [v0.1.59]
612
+
613
+ - (llama.cpp) k-quants support
614
+ - (server) mirostat sampling parameters to server
615
+ - Support both `.so` and `.dylib` for `libllama` on MacOS
616
+
617
+ ## [v0.1.58]
618
+
619
+ - (llama.cpp) Metal Silicon support
620
+
621
+ ## [v0.1.57]
622
+
623
+ - (llama.cpp) OpenLlama 3B support
624
+
625
+ ## [v0.1.56]
626
+
627
+ - (misc) Added first version of the changelog
628
+ - (server) Use async routes
629
+ - (python-api) Use numpy for internal buffers to reduce memory usage and improve performance.
630
+ - (python-api) Performance bug in stop sequence check slowing down streaming.
llama-cpp-python/CMakeLists.txt ADDED
@@ -0,0 +1,87 @@
1
+ cmake_minimum_required(VERSION 3.21)
2
+
3
+ project(llama_cpp)
4
+
5
+ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
6
+ option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
7
+
8
+ if (LLAMA_BUILD)
9
+ set(BUILD_SHARED_LIBS "On")
10
+
11
+ # Building llama
12
+ if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
13
+ # Need to disable these llama.cpp flags on Apple x86_64,
14
+ # otherwise users may encounter invalid instruction errors
15
+ set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
16
+ set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
17
+ set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
18
+ set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
19
+ endif()
20
+
21
+ if (APPLE)
22
+ set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
23
+ endif()
24
+
25
+ add_subdirectory(vendor/llama.cpp)
26
+ install(
27
+ TARGETS llama
28
+ LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
29
+ RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
30
+ ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
31
+ FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
32
+ RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
33
+ )
34
+ # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
35
+ install(
36
+ TARGETS llama
37
+ LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
38
+ RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
39
+ ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
40
+ FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
41
+ RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
42
+ )
43
+ # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
44
+ install(
45
+ FILES $<TARGET_RUNTIME_DLLS:llama>
46
+ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
47
+ )
48
+ install(
49
+ FILES $<TARGET_RUNTIME_DLLS:llama>
50
+ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
51
+ )
52
+
53
+ if (LLAVA_BUILD)
54
+ if (LLAMA_CUBLAS)
55
+ add_compile_definitions(GGML_USE_CUBLAS)
56
+ endif()
57
+
58
+ if (LLAMA_METAL)
59
+ add_compile_definitions(GGML_USE_METAL)
60
+ endif()
61
+
62
+ # Building llava
63
+ add_subdirectory(vendor/llama.cpp/examples/llava)
64
+ set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
65
+ # Set CUDA_ARCHITECTURES to OFF on windows
66
+ if (WIN32)
67
+ set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
68
+ endif()
69
+ install(
70
+ TARGETS llava_shared
71
+ LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
72
+ RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
73
+ ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
74
+ FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
75
+ RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
76
+ )
77
+ # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
78
+ install(
79
+ TARGETS llava_shared
80
+ LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
81
+ RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
82
+ ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
83
+ FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
84
+ RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
85
+ )
86
+ endif()
87
+ endif()
llama-cpp-python/LICENSE.md ADDED
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Andrei Betlen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
llama-cpp-python/Makefile ADDED
@@ -0,0 +1,82 @@
1
+ update:
2
+ poetry install
3
+ git submodule update --init --recursive
4
+
5
+ update.vendor:
6
+ cd vendor/llama.cpp && git pull origin master
7
+
8
+ deps:
9
+ python3 -m pip install --upgrade pip
10
+ python3 -m pip install -e ".[all]"
11
+
12
+ build:
13
+ python3 -m pip install --verbose -e .
14
+
15
+ build.debug:
16
+ CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
17
+
18
+ build.cuda:
19
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
20
+
21
+ build.opencl:
22
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
23
+
24
+ build.openblas:
25
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
26
+
27
+ build.blis:
28
+ CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e .
29
+
30
+ build.metal:
31
+ CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e .
32
+
33
+ build.vulkan:
34
+ CMAKE_ARGS="-DLLAMA_VULKAN=on" python3 -m pip install --verbose -e .
35
+
36
+ build.kompute:
37
+ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" python3 -m pip install --verbose -e .
38
+
39
+ build.sycl:
40
+ CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
41
+
42
+ build.sdist:
43
+ python3 -m build --sdist
44
+
45
+ deploy.pypi:
46
+ python3 -m twine upload dist/*
47
+
48
+ deploy.gh-docs:
49
+ mkdocs build
50
+ mkdocs gh-deploy
51
+
52
+ test:
53
+ python3 -m pytest
54
+
55
+ docker:
56
+ docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile .
57
+
58
+ run-server:
59
+ uvicorn --factory llama.server:app --host ${HOST} --port ${PORT}
60
+
61
+ clean:
62
+ - cd vendor/llama.cpp && make clean
63
+ - cd vendor/llama.cpp && rm libllama.so
64
+ - rm -rf _skbuild
65
+ - rm llama_cpp/*.so
66
+ - rm llama_cpp/*.dylib
67
+ - rm llama_cpp/*.metal
68
+ - rm llama_cpp/*.dll
69
+ - rm llama_cpp/*.lib
70
+
71
+ .PHONY: \
72
+ update \
73
+ update.vendor \
74
+ build \
75
+ build.cuda \
76
+ build.opencl \
77
+ build.openblas \
78
+ build.sdist \
79
+ deploy.pypi \
80
+ deploy.gh-docs \
81
+ docker \
82
+ clean
llama-cpp-python/README.md ADDED
@@ -0,0 +1,792 @@
1
+ # 🦙 Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
2
+
3
+ [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
4
+ [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
5
+ [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
6
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
7
+ [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
8
+ [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
9
+ [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
10
+
11
+ Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
12
+ This package provides:
13
+
14
+ - Low-level access to C API via `ctypes` interface.
15
+ - High-level Python API for text completion
16
+ - OpenAI-like API
17
+ - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
18
+ - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
19
+ - OpenAI compatible web server
20
+ - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
21
+ - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
22
+ - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
23
+ - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
24
+
25
+ Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
26
+
27
+ ## Installation
28
+
29
+ Requirements:
30
+
31
+ - Python 3.8+
32
+ - C compiler
33
+ - Linux: gcc or clang
34
+ - Windows: Visual Studio or MinGW
35
+ - MacOS: Xcode
36
+
37
+ To install the package, run:
38
+
39
+ ```bash
40
+ pip install llama-cpp-python
41
+ ```
42
+
43
+ This will also build `llama.cpp` from source and install it alongside this python package.
44
+
45
+ If this fails, add `--verbose` to the `pip install` command to see the full cmake build log.
46
+
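+ For example, one way to re-run the install with the full build output visible:
+
+ ```bash
+ # Re-run the install with verbose output to inspect the cmake build log
+ pip install llama-cpp-python --verbose
+ ```
+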
47
+ **Pre-built Wheel (New)**
48
+
49
+ It is also possible to install a pre-built wheel with basic CPU support.
50
+
51
+ ```bash
52
+ pip install llama-cpp-python \
53
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
54
+ ```
55
+
56
+ ### Installation Configuration
57
+
58
+ `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
59
+
60
+ All `llama.cpp` cmake build options can be set via the `CMAKE_ARGS` environment variable or via the `--config-settings / -C` cli flag during installation.
61
+
62
+ <details open>
63
+ <summary>Environment Variables</summary>
64
+
65
+ ```bash
66
+ # Linux and Mac
67
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
68
+ pip install llama-cpp-python
69
+ ```
70
+
71
+ ```powershell
72
+ # Windows
73
+ $env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
74
+ pip install llama-cpp-python
75
+ ```
76
+ </details>
77
+
78
+ <details>
79
+ <summary>CLI / requirements.txt</summary>
80
+
81
+ They can also be set via the `pip install -C / --config-settings` command and saved to a `requirements.txt` file:
82
+
83
+ ```bash
84
+ pip install --upgrade pip # ensure pip is up to date
85
+ pip install llama-cpp-python \
86
+ -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
87
+ ```
88
+
89
+ ```txt
90
+ # requirements.txt
91
+
92
+ llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
93
+ ```
94
+
95
+ </details>
96
+
97
+ ### Supported Backends
98
+
99
+ Below are some common backends, their build commands and any additional environment variables required.
100
+
101
+ <details open>
102
+ <summary>OpenBLAS (CPU)</summary>
103
+
104
+ To install with OpenBLAS, set the `LLAMA_BLAS` and `LLAMA_BLAS_VENDOR` environment variables before installing:
105
+
106
+ ```bash
107
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
108
+ ```
109
+ </details>
110
+
111
+ <details>
112
+ <summary>CUDA</summary>
113
+
114
+ To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:
115
+
116
+ ```bash
117
+ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
118
+ ```
119
+
120
+ **Pre-built Wheel (New)**
121
+
122
+ It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
123
+
124
+ - CUDA Version is 12.1, 12.2, 12.3, or 12.4
125
+ - Python Version is 3.10, 3.11 or 3.12
126
+
127
+ ```bash
128
+ pip install llama-cpp-python \
129
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
130
+ ```
131
+
132
+ Where `<cuda-version>` is one of the following:
133
+ - `cu121`: CUDA 12.1
134
+ - `cu122`: CUDA 12.2
135
+ - `cu123`: CUDA 12.3
136
+ - `cu124`: CUDA 12.4
137
+
138
+ For example, to install the CUDA 12.1 wheel:
139
+
140
+ ```bash
141
+ pip install llama-cpp-python \
142
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
143
+ ```
144
+
145
+ </details>
146
+
147
+ <details>
148
+ <summary>Metal</summary>
149
+
150
+ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:
151
+
152
+ ```bash
153
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
154
+ ```
155
+
156
+ **Pre-built Wheel (New)**
157
+
158
+ It is also possible to install a pre-built wheel with Metal support, as long as your system meets the following requirements:
159
+
160
+ - MacOS Version is 11.0 or later
161
+ - Python Version is 3.10, 3.11 or 3.12
162
+
163
+ ```bash
164
+ pip install llama-cpp-python \
165
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
166
+ ```
167
+
168
+ </details>
169
+ <details>
170
+
171
+ <summary>CLBlast (OpenCL)</summary>
172
+
173
+ To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
174
+
175
+ ```bash
176
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
177
+ ```
178
+
179
+ </details>
180
+
181
+ <details>
182
+ <summary>hipBLAS (ROCm)</summary>
183
+
184
+ To install with hipBLAS / ROCm support for AMD cards, set the `LLAMA_HIPBLAS=on` environment variable before installing:
185
+
186
+ ```bash
187
+ CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
188
+ ```
189
+
190
+ </details>
191
+
192
+ <details>
193
+ <summary>Vulkan</summary>
194
+
195
+ To install with Vulkan support, set the `LLAMA_VULKAN=on` environment variable before installing:
196
+
197
+ ```bash
198
+ CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python
199
+ ```
200
+
201
+ </details>
202
+
203
+ <details>
204
+ <summary>Kompute</summary>
205
+
206
+ To install with Kompute support, set the `LLAMA_KOMPUTE=on` environment variable before installing:
207
+
208
+ ```bash
209
+ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
210
+ ```
211
+ </details>
212
+
213
+ <details>
214
+ <summary>SYCL</summary>
215
+
216
+ To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
217
+
218
+ ```bash
219
+ source /opt/intel/oneapi/setvars.sh
220
+ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python
221
+ ```
222
+ </details>
223
+
224
+
225
+ ### Windows Notes
226
+
227
+ <details>
228
+ <summary>Error: Can't find 'nmake' or 'CMAKE_C_COMPILER'</summary>
229
+
230
+ If you run into issues where the build complains that it can't find `'nmake'` or `CMAKE_C_COMPILER`, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those paths manually to `CMAKE_ARGS` before running `pip install`:
231
+
232
+ ```ps
233
+ $env:CMAKE_GENERATOR = "MinGW Makefiles"
234
+ $env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
235
+ ```
236
+
237
+ See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
238
+ </details>
239
+
240
+ ### MacOS Notes
241
+
242
+ Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/)
243
+
244
+ <details>
245
+ <summary>M1 Mac Performance Issue</summary>
246
+
247
+ Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
248
+
249
+ ```bash
250
+ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
251
+ bash Miniforge3-MacOSX-arm64.sh
252
+ ```
253
+
254
+ Otherwise, the installation will build the x86 version of llama.cpp, which will be 10x slower on an Apple Silicon (M1) Mac.
255
+ </details>
256
+
257
+ <details>
258
+ <summary>M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))`</summary>
259
+
260
+ Try installing with
261
+
262
+ ```bash
263
+ CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python
264
+ ```
265
+ </details>
266
+
267
+ ### Upgrading and Reinstalling
268
+
269
+ To upgrade and rebuild `llama-cpp-python` add `--upgrade --force-reinstall --no-cache-dir` flags to the `pip install` command to ensure the package is rebuilt from source.
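+
+ For example (add your usual `CMAKE_ARGS` to this command if you built with a GPU or BLAS backend):
+
+ ```bash
+ pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir
+ ```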
270
+
271
+ ## High-level API
272
+
273
+ [API Reference](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#high-level-api)
274
+
275
+ The high-level API provides a simple managed interface through the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
276
+
277
+ Below is a short example demonstrating how to use the high-level API for basic text completion:
278
+
279
+ ```python
280
+ from llama_cpp import Llama
281
+
282
+ llm = Llama(
283
+ model_path="./models/7B/llama-model.gguf",
284
+ # n_gpu_layers=-1, # Uncomment to use GPU acceleration
285
+ # seed=1337, # Uncomment to set a specific seed
286
+ # n_ctx=2048, # Uncomment to increase the context window
287
+ )
288
+ output = llm(
289
+ "Q: Name the planets in the solar system? A: ", # Prompt
290
+ max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
291
+ stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
292
+ echo=True # Echo the prompt back in the output
293
+ ) # Generate a completion, can also call create_completion
294
+ print(output)
295
+ ```
296
+
297
+ By default `llama-cpp-python` generates completions in an OpenAI compatible format:
298
+
299
+ ```python
300
+ {
301
+ "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
302
+ "object": "text_completion",
303
+ "created": 1679561337,
304
+ "model": "./models/7B/llama-model.gguf",
305
+ "choices": [
306
+ {
307
+ "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
308
+ "index": 0,
309
+ "logprobs": None,
310
+ "finish_reason": "stop"
311
+ }
312
+ ],
313
+ "usage": {
314
+ "prompt_tokens": 14,
315
+ "completion_tokens": 28,
316
+ "total_tokens": 42
317
+ }
318
+ }
319
+ ```
320
+
321
+ Text completion is available through the [`__call__`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__) and [`create_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_completion) methods of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
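+
+ Both methods also accept `stream=True` to yield partial results as they are generated instead of a single response. A minimal sketch (the model path is a placeholder):
+
+ ```python
+ from llama_cpp import Llama
+
+ llm = Llama(model_path="./models/7B/llama-model.gguf")
+
+ # Stream the completion token by token; each chunk follows the OpenAI streaming format
+ stream = llm.create_completion(
+     "Q: Name the planets in the solar system? A: ",
+     max_tokens=32,
+     stop=["Q:", "\n"],
+     stream=True,
+ )
+ for chunk in stream:
+     print(chunk["choices"][0]["text"], end="", flush=True)
+ ```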
322
+
323
+ ### Pulling models from Hugging Face Hub
324
+
325
+ You can download `Llama` models in `gguf` format directly from Hugging Face using the [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) method.
326
+ You'll need to install the `huggingface-hub` package to use this feature (`pip install huggingface-hub`).
327
+
328
+ ```python
329
+ llm = Llama.from_pretrained(
330
+ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
331
+ filename="*q8_0.gguf",
332
+ verbose=False
333
+ )
334
+ ```
335
+
336
+ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory; you can then manage installed model files with the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool.
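+
+ For example, you can inspect and clean up the cache from the command line (a short sketch; see the linked `huggingface-cli` guide for the full set of commands):
+
+ ```bash
+ # List models currently stored in the Hugging Face cache
+ huggingface-cli scan-cache
+
+ # Interactively select cached revisions to delete
+ huggingface-cli delete-cache
+ ```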
337
+
338
+ ### Chat Completion
339
+
340
+ The high-level API also provides a simple interface for chat completion.
341
+
342
+ Chat completion requires that the model knows how to format the messages into a single prompt.
343
+ The `Llama` class does this using pre-registered chat formats (e.g. `chatml`, `llama-2`, `gemma`, etc.) or by providing a custom chat handler object.
344
+
345
+ The model will format the messages into a single prompt using the following order of precedence:
346
+ - Use the `chat_handler` if provided
347
+ - Use the `chat_format` if provided
348
+ - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this)
349
+ - else, fall back to the `llama-2` chat format
350
+
351
+ Set `verbose=True` to see the selected chat format.
352
+
353
+ ```python
354
+ from llama_cpp import Llama
355
+ llm = Llama(
356
+ model_path="path/to/llama-2/llama-model.gguf",
357
+ chat_format="llama-2"
358
+ )
359
+ llm.create_chat_completion(
360
+ messages = [
361
+ {"role": "system", "content": "You are an assistant who perfectly describes images."},
362
+ {
363
+ "role": "user",
364
+ "content": "Describe this image in detail please."
365
+ }
366
+ ]
367
+ )
368
+ ```
369
+
370
+ Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
371
+
372
+ For OpenAI API v1 compatibility, you can use the [`create_chat_completion_openai_v1`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion_openai_v1) method, which will return pydantic models instead of dicts.
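+
+ A minimal sketch of the pydantic-style access (this assumes the `openai` package is installed, since it provides the response types, and the model path is a placeholder):
+
+ ```python
+ from llama_cpp import Llama
+
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
+ response = llm.create_chat_completion_openai_v1(
+     messages=[{"role": "user", "content": "Say hello in one word."}]
+ )
+ # Pydantic models are accessed via attributes rather than dict keys
+ print(response.choices[0].message.content)
+ ```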
373
+
374
+
375
+ ### JSON and JSON Schema Mode
376
+
377
+ To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
378
+
379
+ #### JSON Mode
380
+
381
+ The following example will constrain the response to valid JSON strings only.
382
+
383
+ ```python
384
+ from llama_cpp import Llama
385
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
386
+ llm.create_chat_completion(
387
+ messages=[
388
+ {
389
+ "role": "system",
390
+ "content": "You are a helpful assistant that outputs in JSON.",
391
+ },
392
+ {"role": "user", "content": "Who won the world series in 2020"},
393
+ ],
394
+ response_format={
395
+ "type": "json_object",
396
+ },
397
+ temperature=0.7,
398
+ )
399
+ ```
400
+
401
+ #### JSON Schema Mode
402
+
403
+ To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument.
404
+
405
+ ```python
406
+ from llama_cpp import Llama
407
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
408
+ llm.create_chat_completion(
409
+ messages=[
410
+ {
411
+ "role": "system",
412
+ "content": "You are a helpful assistant that outputs in JSON.",
413
+ },
414
+ {"role": "user", "content": "Who won the world series in 2020"},
415
+ ],
416
+ response_format={
417
+ "type": "json_object",
418
+ "schema": {
419
+ "type": "object",
420
+ "properties": {"team_name": {"type": "string"}},
421
+ "required": ["team_name"],
422
+ },
423
+ },
424
+ temperature=0.7,
425
+ )
426
+ ```
427
+
428
+ ### Function Calling
429
+
430
+ The high-level API supports OpenAI compatible function and tool calling. This is possible through the chat format of the `functionary` pre-trained models or through the generic `chatml-function-calling` chat format.
431
+
432
+ ```python
433
+ from llama_cpp import Llama
434
+ llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
435
+ llm.create_chat_completion(
436
+ messages = [
437
+ {
438
+ "role": "system",
439
+ "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
440
+
441
+ },
442
+ {
443
+ "role": "user",
444
+ "content": "Extract Jason is 25 years old"
445
+ }
446
+ ],
447
+ tools=[{
448
+ "type": "function",
449
+ "function": {
450
+ "name": "UserDetail",
451
+ "parameters": {
452
+ "type": "object",
453
+ "title": "UserDetail",
454
+ "properties": {
455
+ "name": {
456
+ "title": "Name",
457
+ "type": "string"
458
+ },
459
+ "age": {
460
+ "title": "Age",
461
+ "type": "integer"
462
+ }
463
+ },
464
+ "required": [ "name", "age" ]
465
+ }
466
+ }
467
+ }],
468
+ tool_choice={
469
+ "type": "function",
470
+ "function": {
471
+ "name": "UserDetail"
472
+ }
473
+ }
474
+ )
475
+ ```
476
+
477
+ <details>
478
+ <summary>Functionary v2</summary>
479
+
480
+ The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 functionary models support **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the `Llama` class.
481
+
482
+ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, you must provide an HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the `Llama` class; this will override the default llama.cpp tokenizer used by the `Llama` class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
483
+
484
+ ```python
485
+ from llama_cpp import Llama
486
+ from llama_cpp.llama_tokenizer import LlamaHFTokenizer
487
+ llm = Llama.from_pretrained(
488
+ repo_id="meetkai/functionary-small-v2.2-GGUF",
489
+ filename="functionary-small-v2.2.q4_0.gguf",
490
+ chat_format="functionary-v2",
491
+ tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
492
+ )
493
+ ```
494
+
495
+ **NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
496
+ </details>
497
+
498
+ ### Multi-modal Models
499
+
500
+ `llama-cpp-python` supports multi-modal models such as llava1.5, which allow the language model to read information from both text and images.
501
+
502
+ You'll first need to download one of the available multi-modal models in GGUF format:
503
+
504
+ - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
505
+ - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
506
+ - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
507
+ - [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
508
+ - [moondream2](https://huggingface.co/vikhyatk/moondream2)
509
+
510
+ Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
511
+
512
+ ```python
513
+ from llama_cpp import Llama
514
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
515
+ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
516
+ llm = Llama(
517
+ model_path="./path/to/llava/llama-model.gguf",
518
+ chat_handler=chat_handler,
519
+ n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
520
+ )
521
+ llm.create_chat_completion(
522
+ messages = [
523
+ {"role": "system", "content": "You are an assistant who perfectly describes images."},
524
+ {
525
+ "role": "user",
526
+ "content": [
527
+ {"type" : "text", "text": "What's in this image?"},
528
+ {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
529
+ ]
530
+ }
531
+ ]
532
+ )
533
+ ```
534
+
535
+ You can also pull the model from the Hugging Face Hub using the `from_pretrained` method.
536
+
537
+ ```python
538
+ from llama_cpp import Llama
539
+ from llama_cpp.llama_chat_format import MoondreamChatHandler
540
+
541
+ chat_handler = MoondreamChatHandler.from_pretrained(
542
+ repo_id="vikhyatk/moondream2",
543
+ filename="*mmproj*",
544
+ )
545
+
546
+ llm = Llama.from_pretrained(
547
+ repo_id="vikhyatk/moondream2",
548
+ filename="*text-model*",
549
+ chat_handler=chat_handler,
550
+ n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
551
+ )
552
+
553
+ response = llm.create_chat_completion(
554
+ messages = [
555
+ {
556
+ "role": "user",
557
+ "content": [
558
+ {"type" : "text", "text": "What's in this image?"},
559
+ {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
560
+
561
+ ]
562
+ }
563
+ ]
564
+ )
565
+ print(response["choices"][0]["message"]["content"])
566
+ ```
567
+
568
+ **Note**: Multi-modal models also support tool calling and JSON mode.
569
+
570
+ <details>
571
+ <summary>Loading a Local Image</summary>
572
+
573
+ Images can be passed as base64 encoded data URIs. The following example demonstrates how to do this.
574
+
575
+ ```python
576
+ import base64
577
+
578
+ def image_to_base64_data_uri(file_path):
579
+ with open(file_path, "rb") as img_file:
580
+ base64_data = base64.b64encode(img_file.read()).decode('utf-8')
581
+ return f"data:image/png;base64,{base64_data}"
582
+
583
+ # Replace 'file_path.png' with the actual path to your PNG file
584
+ file_path = 'file_path.png'
585
+ data_uri = image_to_base64_data_uri(file_path)
586
+
587
+ messages = [
588
+ {"role": "system", "content": "You are an assistant who perfectly describes images."},
589
+ {
590
+ "role": "user",
591
+ "content": [
592
+ {"type": "image_url", "image_url": {"url": data_uri }},
593
+ {"type" : "text", "text": "Describe this image in detail please."}
594
+ ]
595
+ }
596
+ ]
597
+
598
+ ```
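+
+ The constructed `messages` can then be passed to the model exactly like the URL-based example above (assuming `llm` was created with a `Llava15ChatHandler` as shown earlier):
+
+ ```python
+ response = llm.create_chat_completion(messages=messages)
+ print(response["choices"][0]["message"]["content"])
+ ```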
599
+
600
+ </details>
601
+
602
+ ### Speculative Decoding
603
+
604
+ `llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
605
+
606
+ The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
607
+
608
+ Just pass this as a draft model to the `Llama` class during initialization.
609
+
610
+ ```python
611
+ from llama_cpp import Llama
612
+ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
613
+
614
+ llama = Llama(
615
+ model_path="path/to/model.gguf",
616
+ draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict; 10 is the default and generally good for GPU, 2 performs better for CPU-only machines.
617
+ )
618
+ ```
619
+
620
+ ### Embeddings
621
+
622
+ To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly.
623
+
624
+ ```python
625
+ import llama_cpp
626
+
627
+ llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
628
+
629
+ embeddings = llm.create_embedding("Hello, world!")
630
+
631
+ # or create multiple embeddings at once
632
+
633
+ embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
634
+ ```
635
+
636
+ There are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by "pooling" token level embeddings together, usually by averaging them or using the first token.
637
+
638
+ Models that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. Thus the dimensionality of the return type will be one higher for token level embeddings.
639
+
640
+ It is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation-oriented model to yield sequence level embeddings, is currently not possible, but you can always do the pooling manually, as sketched below.
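+
+ A minimal sketch of manual mean pooling with `numpy`, assuming the model returns token level embeddings (one vector per token) as described above:
+
+ ```python
+ import numpy as np
+ import llama_cpp
+
+ llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
+
+ # For a token level model this is a list of per-token vectors
+ token_embeddings = llm.create_embedding("Hello, world!")["data"][0]["embedding"]
+
+ # Average the token vectors into a single sequence level embedding
+ sequence_embedding = np.asarray(token_embeddings).mean(axis=0)
+ ```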
641
+
642
+ ### Adjusting the Context Window
643
+
644
+ The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
645
+
646
+ For instance, if you want to work with larger contexts, you can expand the context window by setting the `n_ctx` parameter when initializing the `Llama` object:
647
+
648
+ ```python
649
+ llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
650
+ ```
651
+
652
+ ## OpenAI Compatible Web Server
653
+
654
+ `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
655
+ This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc).
656
+
657
+ To install the server package and get started:
658
+
659
+ ```bash
660
+ pip install 'llama-cpp-python[server]'
661
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
662
+ ```
663
+
664
+ Similar to the Hardware Acceleration section above, you can also install with GPU (CUDA) support like this:
665
+
666
+ ```bash
667
+ CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
668
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
669
+ ```
670
+
671
+ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
672
+
673
+ To bind to `0.0.0.0` to enable remote connections, use `python3 -m llama_cpp.server --host 0.0.0.0`.
674
+ Similarly, to change the port (default is 8000), use `--port`.
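+
+ For example (the model path is a placeholder):
+
+ ```bash
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --host 0.0.0.0 --port 8080
+ ```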
675
+
676
+ You probably also want to set the prompt format. For chatml, use
677
+
678
+ ```bash
679
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format chatml
680
+ ```
681
+
682
+ That will format the prompt according to how the model expects it. You can find the prompt format in the model card.
683
+ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
684
+
685
+ If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
686
+
687
+ ```bash
688
+ python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
689
+ ```
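+
+ Once the server is running, any OpenAI compatible client can be pointed at it. A minimal sketch using the official `openai` Python client (the API key is a placeholder; the `model` value only matters when multiple model aliases are configured):
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-no-key-required")
+ response = client.chat.completions.create(
+     model="gpt-3.5-turbo",
+     messages=[{"role": "user", "content": "Name the planets in the solar system."}],
+ )
+ print(response.choices[0].message.content)
+ ```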
690
+
691
+ ### Web Server Features
692
+
693
+ - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
694
+ - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
695
+ - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
696
+ - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
697
+
698
+ ## Docker image
699
+
700
+ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
701
+
702
+ ```bash
703
+ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest
704
+ ```
705
+
706
+ [Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones; see the [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389).
707
+
708
+ ## Low-level API
709
+
710
+ [API Reference](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#low-level-api)
711
+
712
+ The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
713
+ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
714
+
715
+ Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
716
+
717
+ ```python
718
+ import llama_cpp
719
+ import ctypes
720
+ llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
721
+ params = llama_cpp.llama_context_default_params()
722
+ # use bytes for char * params
723
+ model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
724
+ ctx = llama_cpp.llama_new_context_with_model(model, params)
725
+ max_tokens = params.n_ctx
726
+ # use ctypes arrays for array params
727
+ tokens = (llama_cpp.llama_token * int(max_tokens))()
728
+ n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
729
+ llama_cpp.llama_free(ctx)
730
+ ```
731
+
732
+ Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
733
+
734
+ ## Documentation
735
+
736
+ Documentation is available via [https://llama-cpp-python.readthedocs.io/](https://llama-cpp-python.readthedocs.io/).
737
+ If you find any issues with the documentation, please open an issue or submit a PR.
738
+
739
+ ## Development
740
+
741
+ This package is under active development and I welcome any contributions.
742
+
743
+ To get started, clone the repository and install the package in editable / development mode:
744
+
745
+ ```bash
746
+ git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
747
+ cd llama-cpp-python
748
+
749
+ # Upgrade pip (required for editable mode)
750
+ pip install --upgrade pip
751
+
752
+ # Install with pip
753
+ pip install -e .
754
+
755
+ # if you want to use the fastapi / openapi server
756
+ pip install -e .[server]
757
+
758
+ # to install all optional dependencies
759
+ pip install -e .[all]
760
+
761
+ # to clear the local build cache
762
+ make clean
763
+ ```
764
+
765
+ You can also test out specific commits of `llama.cpp` by checking out the desired commit in the `vendor/llama.cpp` submodule and then running `make clean` and `pip install -e .` again. Any changes in the `llama.h` API will require
766
+ changes to the `llama_cpp/llama_cpp.py` file to match the new API (additional changes may be required elsewhere).
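+
+ For example (the commit hash is a placeholder):
+
+ ```bash
+ cd vendor/llama.cpp
+ git checkout <commit-hash>
+ cd ../..
+ make clean
+ pip install -e .
+ ```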
767
+
768
+ ## FAQ
769
+
770
+ ### Are there pre-built binaries / binary wheels available?
771
+
772
+ The recommended installation method is to install from source as described above.
773
+ The reason for this is that `llama.cpp` is built with compiler optimizations that are specific to your system.
774
+ Using pre-built binaries would require disabling these optimizations or supporting a large number of pre-built binaries for each platform.
775
+
776
+ That being said, there are some pre-built binaries available through the Releases page, as well as some community-provided wheels.
777
+
778
+ In the future, I would like to provide pre-built binaries and wheels for common platforms and I'm happy to accept any useful contributions in this area.
779
+ This is currently being tracked in [#741](https://github.com/abetlen/llama-cpp-python/issues/741).
780
+
781
+ ### How does this compare to other Python bindings of `llama.cpp`?
782
+
783
+ I originally wrote this package for my own use with two goals in mind:
784
+
785
+ - Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python
786
+ - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp`
787
+
788
+ Any contributions and changes to this package will be made with these goals in mind.
789
+
790
+ ## License
791
+
792
+ This project is licensed under the terms of the MIT license.
llama-cpp-python/docker/README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Install Docker Server
2
+ > [!IMPORTANT]
3
+ > This was tested with Docker running on Linux. <br>If you can get it working on Windows or MacOS, please update this `README.md` with a PR!<br>
4
+
5
+ [Install Docker Engine](https://docs.docker.com/engine/install)
6
+
7
+
8
+ ## Simple Dockerfiles for building the llama-cpp-python server with external model bin files
9
+ ### openblas_simple
10
+ A simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image:
11
+ ```
12
+ cd ./openblas_simple
13
+ docker build -t openblas_simple .
14
+ docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
15
+ ```
16
+ where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
17
+
18
+ ### cuda_simple
19
+ > [!WARNING]
20
+ > Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) <br>
21
+
22
+ A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image:
23
+
24
+ ```
25
+ cd ./cuda_simple
26
+ docker build -t cuda_simple .
27
+ docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
28
+ ```
29
+ where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
30
+
31
+ --------------------------------------------------------------------------
32
+
33
+ ### "Open-Llama-in-a-box"
34
+ Download an Apache V2.0 licensed 3B params Open LLaMA model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server:
35
+ ```
36
+ $ cd ./open_llama
37
+ ./build.sh
38
+ ./start.sh
39
+ ```
40
+
41
+ ### Manually choose your own Llama model from Hugging Face
42
+ `python3 ./hug_model.py -a TheBloke -t llama`
43
+ You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step, e.g.:
44
+ ```
45
+ docker $ ls -lh *.bin
46
+ -rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
47
+ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
48
+ ```
49
+
50
+ > [!NOTE]
51
+ > Make sure you have enough disk space to download the model. As the model is then copied into the image, you will need at least
52
+ **TWICE** as much disk space as the size of the model:<br>
53
+
54
+ | Model | Quantized size |
55
+ |------:|----------------:|
56
+ | 3B | 3 GB |
57
+ | 7B | 5 GB |
58
+ | 13B | 10 GB |
59
+ | 33B | 25 GB |
60
+ | 65B | 50 GB |
61
+
62
+
63
+ > [!NOTE]
64
+ > If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
llama-cpp-python/docker/cuda_simple/Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
2
+ FROM nvidia/cuda:${CUDA_IMAGE}
3
+
4
+ # We need to set the host to 0.0.0.0 to allow outside access
5
+ ENV HOST 0.0.0.0
6
+
7
+ RUN apt-get update && apt-get upgrade -y \
8
+ && apt-get install -y git build-essential \
9
+ python3 python3-pip gcc wget \
10
+ ocl-icd-opencl-dev opencl-headers clinfo \
11
+ libclblast-dev libopenblas-dev \
12
+ && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
13
+
14
+ COPY . .
15
+
16
+ # setting build related env vars
17
+ ENV CUDA_DOCKER_ARCH=all
18
+ ENV LLAMA_CUBLAS=1
19
+
20
+ # Install dependencies
21
+ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
22
+
23
+ # Install llama-cpp-python (build with cuda)
24
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
25
+
26
+ # Run the server
27
+ CMD python3 -m llama_cpp.server
llama-cpp-python/docker/open_llama/Dockerfile ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define the image argument and provide a default value
2
+ ARG IMAGE=python:3-slim-bullseye
3
+
4
+ # Use the image as specified
5
+ FROM ${IMAGE}
6
+
7
+ # Re-declare the ARG after FROM
8
+ ARG IMAGE
9
+
10
+ # Update and upgrade the existing packages
11
+ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
12
+ python3 \
13
+ python3-pip \
14
+ ninja-build \
15
+ build-essential
16
+
17
+ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
18
+
19
+ # Perform the conditional installations based on the image
20
+ RUN echo "Image: ${IMAGE}" && \
21
+ if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
22
+ echo "OpenBLAS install:" && \
23
+ apt-get install -y --no-install-recommends libopenblas-dev && \
24
+ LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
25
+ else \
26
+ echo "CuBLAS install:" && \
27
+ LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
28
+ fi
29
+
30
+ # Clean up apt cache
31
+ RUN rm -rf /var/lib/apt/lists/*
32
+
33
+ # Set a working directory for better clarity
34
+ WORKDIR /app
35
+
36
+ # Copy files to the app directory
37
+ RUN echo "Installing model...this can take some time..."
38
+ COPY ./model.bin /app/model.bin
39
+ COPY ./start_server.sh /app/start_server.sh
40
+
41
+ # Make the server start script executable
42
+ RUN chmod +x /app/start_server.sh
43
+
44
+ # Set environment variable for the host
45
+ ENV HOST=0.0.0.0
46
+
47
+ # Expose a port for the server
48
+ EXPOSE 8000
49
+
50
+ # Run the server start script
51
+ CMD ["/bin/sh", "/app/start_server.sh"]
llama-cpp-python/docker/open_llama/build.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ MODEL="open_llama_3b"
4
+ # Get open_llama_3b_ggml q5_1 quantization
5
+ python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
6
+ ls -lh *.bin
7
+
8
+ # Build the default OpenBLAS image
9
+ docker build -t $MODEL .
10
+ docker images | egrep "^(REPOSITORY|$MODEL)"
11
+
12
+ echo
13
+ echo "To start the docker container run:"
14
+ echo "docker run -t -p 8000:8000 $MODEL"
llama-cpp-python/docker/open_llama/hug_model.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import os
4
+ import struct
5
+ import argparse
6
+
7
+ def make_request(url, params=None):
8
+ print(f"Making request to {url}...")
9
+ response = requests.get(url, params=params)
10
+ if response.status_code == 200:
11
+ return json.loads(response.text)
12
+ else:
13
+ print(f"Request failed with status code {response.status_code}")
14
+ return None
15
+
16
+ def check_magic_and_version(filename):
17
+ with open(filename, 'rb') as f:
18
+ # Read the first 6 bytes from the file
19
+ data = f.read(6)
20
+
21
+ # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
22
+ # and the next 2 bytes as a little-endian unsigned short
23
+ magic, version = struct.unpack('<I H', data)
24
+
25
+ print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
26
+
27
+ return magic, version
28
+
29
+ def download_file(url, destination):
30
+ print(f"Downloading {url} to {destination}...")
31
+ response = requests.get(url, stream=True)
32
+ if response.status_code == 200:
33
+ with open(destination, 'wb') as f:
34
+ total_downloaded = 0
35
+ for chunk in response.iter_content(chunk_size=1024):
36
+ if chunk: # filter out keep-alive new chunks
37
+ f.write(chunk)
38
+ total_downloaded += len(chunk)
39
+ if total_downloaded >= 10485760: # 10 MB
40
+ print('.', end='', flush=True)
41
+ total_downloaded = 0
42
+ print("\nDownload complete.")
43
+
44
+ # Creating a symbolic link from destination to "model.bin"
45
+ if os.path.isfile("model.bin"):
46
+ os.remove("model.bin") # remove the existing link if any
47
+ os.symlink(destination, "model.bin")
48
+ else:
49
+ print(f"Download failed with status code {response.status_code}")
50
+
51
+ def get_user_choice(model_list):
52
+ # Print the enumerated list
53
+ print("\n")
54
+ for i, (model_id, rfilename) in enumerate(model_list):
55
+ print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
56
+
57
+ # Get user's choice
58
+ choice = input("Choose a model to download by entering the corresponding number: ")
59
+ try:
60
+ index = int(choice) - 1
61
+ if 0 <= index < len(model_list):
62
+ # Return the chosen model
63
+ return model_list[index]
64
+ else:
65
+ print("Invalid choice.")
66
+ except ValueError:
67
+ print("Invalid input. Please enter a number corresponding to a model.")
68
+ except IndexError:
69
+ print("Invalid choice. Index out of range.")
70
+
71
+ return None
72
+
73
+ def main():
74
+ # Create an argument parser
75
+ parser = argparse.ArgumentParser(description='Process some parameters.')
76
+
77
+ # Arguments
78
+ parser.add_argument('-v', '--version', type=int, default=0x0003,
79
+ help='hexadecimal version number of ggml file')
80
+ parser.add_argument('-a', '--author', type=str, default='TheBloke',
81
+ help='HuggingFace author filter')
82
+ parser.add_argument('-t', '--tag', type=str, default='llama',
83
+ help='HuggingFace tag filter')
84
+ parser.add_argument('-s', '--search', type=str, default='',
85
+ help='HuggingFace search filter')
86
+ parser.add_argument('-f', '--filename', type=str, default='q5_1',
87
+ help='HuggingFace model repository filename substring match')
88
+
89
+ # Parse the arguments
90
+ args = parser.parse_args()
91
+
92
+ # Define the parameters
93
+ params = {
94
+ "author": args.author,
95
+ "tags": args.tag,
96
+ "search": args.search
97
+ }
98
+
99
+ models = make_request('https://huggingface.co/api/models', params=params)
100
+ if models is None:
101
+ return
102
+
103
+ model_list = []
104
+ # Iterate over the models
105
+ for model in models:
106
+ model_id = model['id']
107
+ model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
108
+ if model_info is None:
109
+ continue
110
+
111
+ for sibling in model_info.get('siblings', []):
112
+ rfilename = sibling.get('rfilename')
113
+ if rfilename and args.filename in rfilename:
114
+ model_list.append((model_id, rfilename))
115
+
116
+ # Choose the model
117
+ model_list.sort(key=lambda x: x[0])
118
+ if len(model_list) == 0:
119
+ print("No models found")
120
+ exit(1)
121
+ elif len(model_list) == 1:
122
+ model_choice = model_list[0]
123
+ else:
124
+ model_choice = get_user_choice(model_list)
125
+
126
+ if model_choice is not None:
127
+ model_id, rfilename = model_choice
128
+ url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
129
+ dest = f"{model_id.replace('/', '_')}_{rfilename}"
130
+ download_file(url, dest)
131
+ _, version = check_magic_and_version(dest)
132
+ if version != args.version:
133
+ print(f"Warning: Expected version {args.version}, but found different version in the file.")
134
+ else:
135
+ print("Error - model choice was None")
136
+ exit(2)
137
+
138
+ if __name__ == '__main__':
139
+ main()
llama-cpp-python/docker/open_llama/start.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ MODEL="open_llama_3b"
4
+
5
+ # Start Docker container
6
+ docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
7
+ sleep 10
8
+ echo
9
+ docker ps | egrep "(^CONTAINER|$MODEL)"
10
+
11
+ # Test the model works
12
+ echo
13
+ curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
14
+ "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
15
+ "stop": [
16
+ "\n",
17
+ "###"
18
+ ]
19
+ }' | grep Paris
20
+ if [ $? -eq 0 ]
21
+ then
22
+ echo
23
+ echo "$MODEL is working!!"
24
+ else
25
+ echo
26
+ echo "ERROR: $MODEL not replying."
27
+ exit 1
28
+ fi
llama-cpp-python/docker/open_llama/start_server.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ # For mlock support
4
+ ulimit -l unlimited
5
+
6
+ if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
7
+ python3 -B -m llama_cpp.server --model /app/model.bin
8
+ else
9
+ # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
10
+ python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
11
+ fi
llama-cpp-python/docker/openblas_simple/Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3-slim-bullseye
2
+
3
+ # We need to set the host to 0.0.0.0 to allow outside access
4
+ ENV HOST 0.0.0.0
5
+
6
+ COPY . .
7
+
8
+ # Install the package
9
+ RUN apt update && apt install -y libopenblas-dev ninja-build build-essential pkg-config
10
+ RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
11
+
12
+ RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
13
+
14
+ # Run the server
15
+ CMD python3 -m llama_cpp.server
llama-cpp-python/docker/simple/Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define the image argument and provide a default value
2
+ ARG IMAGE=python:3-slim-bullseye
3
+
4
+ # Use the image as specified
5
+ FROM ${IMAGE}
6
+
7
+ # Re-declare the ARG after FROM
8
+ ARG IMAGE
9
+
10
+ # Update and upgrade the existing packages
11
+ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
12
+ python3 \
13
+ python3-pip \
14
+ ninja-build \
15
+ libopenblas-dev \
16
+ build-essential
17
+
18
+ RUN mkdir /app
19
+ WORKDIR /app
20
+ COPY . /app
21
+
22
+ RUN python3 -m pip install --upgrade pip
23
+
24
+ RUN make deps && make build && make clean
25
+
26
+ # Set environment variable for the host
27
+ ENV HOST=0.0.0.0
28
+ ENV PORT=8000
29
+
30
+ # Expose a port for the server
31
+ EXPOSE 8000
32
+
33
+ # Run the server start script
34
+ CMD ["/bin/sh", "/app/docker/simple/run.sh"]
llama-cpp-python/docker/simple/run.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ make build
4
+ uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
llama-cpp-python/docs/api-reference.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: API Reference
3
+ ---
4
+
5
+ ## High Level API
6
+
7
+ High-level Python bindings for llama.cpp.
8
+
9
+ ::: llama_cpp.Llama
10
+ options:
11
+ members:
12
+ - __init__
13
+ - tokenize
14
+ - detokenize
15
+ - reset
16
+ - eval
17
+ - sample
18
+ - generate
19
+ - create_embedding
20
+ - embed
21
+ - create_completion
22
+ - __call__
23
+ - create_chat_completion
24
+ - create_chat_completion_openai_v1
25
+ - set_cache
26
+ - save_state
27
+ - load_state
28
+ - token_bos
29
+ - token_eos
30
+ - from_pretrained
31
+ show_root_heading: true
32
+
33
+ ::: llama_cpp.LlamaGrammar
34
+ options:
35
+ members:
36
+ - from_string
37
+ - from_json_schema
38
+
39
+ ::: llama_cpp.LlamaCache
40
+ options:
41
+ show_root_heading: true
42
+
43
+ ::: llama_cpp.LlamaState
44
+ options:
45
+ show_root_heading: true
46
+
47
+ ::: llama_cpp.LogitsProcessor
48
+ options:
49
+ show_root_heading: true
50
+
51
+ ::: llama_cpp.LogitsProcessorList
52
+ options:
53
+ show_root_heading: true
54
+
55
+ ::: llama_cpp.StoppingCriteria
56
+ options:
57
+ show_root_heading: true
58
+
59
+ ::: llama_cpp.StoppingCriteriaList
60
+ options:
61
+ show_root_heading: true
62
+
63
+ ## Low Level API
64
+
65
+ Low-level Python bindings for llama.cpp using Python's ctypes library.
66
+
67
+ ::: llama_cpp.llama_cpp
68
+ options:
69
+ show_if_no_docstring: true
70
+ # filter only members starting with `llama_`
71
+ filters:
72
+ - "^llama_"
73
+
74
+ ::: llama_cpp.llama_cpp
75
+ options:
76
+ show_if_no_docstring: true
77
+ show_root_heading: false
78
+ show_root_toc_entry: false
79
+ heading_level: 4
80
+ # filter only members starting with `LLAMA_`
81
+ filters:
82
+ - "^LLAMA_"
83
+
84
+ ## Misc
85
+
86
+ ::: llama_cpp.llama_types
87
+ options:
88
+ show_if_no_docstring: true
llama-cpp-python/docs/changelog.md ADDED
@@ -0,0 +1 @@
 
 
1
+ -8<- "CHANGELOG.md"
llama-cpp-python/docs/index.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ---
2
+ title: Getting Started
3
+ ---
4
+
5
+ -8<- "README.md"
llama-cpp-python/docs/install/macos.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MacOS Install with Metal GPU
3
+ ---
4
+
5
+ **(1) Make sure you have xcode installed... at least the command line parts**
6
+ ```
7
+ # check the path of your xcode install
8
+ xcode-select -p
9
+
10
+ # xcode installed returns
11
+ # /Applications/Xcode-beta.app/Contents/Developer
12
+
13
+ # if xcode is missing then install it... it takes ages;
14
+ xcode-select --install
15
+ ```
16
+
17
+ **(2) Install the conda version for MacOS that supports Metal GPU**
18
+ ```
19
+ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
20
+ bash Miniforge3-MacOSX-arm64.sh
21
+ ```
22
+
23
+ **(3) Make a conda environment**
24
+ ```
25
+ conda create -n llama python=3.9.16
26
+ conda activate llama
27
+ ```
28
+
29
+ **(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62**
30
+ *(you need xcode installed in order for pip to build/compile the C++ code)*
31
+ ```
32
+ pip uninstall llama-cpp-python -y
33
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install -U llama-cpp-python --no-cache-dir
34
+ pip install 'llama-cpp-python[server]'
35
+
36
+ # you should now have llama-cpp-python v0.1.62 or higher installed
37
+ llama-cpp-python         0.1.68
38
+
39
+ ```
40
+
41
+ **(5) Download a gguf v2 model**
42
+ - **ggufv2**
43
+ - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0
44
+
45
+ https://huggingface.co/TheBloke/CodeLlama-7B-GGUF
46
+
47
+
48
+ **(6) run the llama-cpp-python API server with MacOS Metal GPU support**
49
+ ```
50
+ # config your ggml model path
51
+ # make sure it is gguf v2
52
+ # make sure it is q4_0
53
+ export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]Q4_0.gguf
54
+ python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
55
+ ```
56
+
57
+ ***Note:** If you omit `--n_gpu_layers 1` then the CPU will be used*
58
+
59
+
llama-cpp-python/docs/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ mkdocs
2
+ mkdocs-material
3
+ mkdocstrings[python]
llama-cpp-python/docs/server.md ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenAI Compatible Server
2
+
3
+ `llama-cpp-python` offers an OpenAI API compatible web server.
4
+
5
+ This web server can be used to serve local models and easily connect them to existing clients.
6
+
7
+ ## Setup
8
+
9
+ ### Installation
10
+
11
+ The server can be installed by running the following command:
12
+
13
+ ```bash
14
+ pip install llama-cpp-python[server]
15
+ ```
16
+
17
+ ### Running the server
18
+
19
+ The server can then be started by running the following command:
20
+
21
+ ```bash
22
+ python3 -m llama_cpp.server --model <model_path>
23
+ ```
24
+
25
+ ### Server options
26
+
27
+ For a full list of options, run:
28
+
29
+ ```bash
30
+ python3 -m llama_cpp.server --help
31
+ ```
32
+
33
+ NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
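+
+ For example, the following two invocations are equivalent (the model path is a placeholder):
+
+ ```bash
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
+ MODEL=models/7B/llama-model.gguf python3 -m llama_cpp.server
+ ```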
34
+
35
+ Check out the server config reference below for more information on the available options.
36
+ CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings).
37
+
38
+ Additionally, the server supports configuration via a config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
39
+
40
+
41
+ ## Guides
42
+
43
+ ### Code Completion
44
+
45
+ `llama-cpp-python` supports code completion via GitHub Copilot.
46
+
47
+ *NOTE*: Without GPU acceleration this is unlikely to be fast enough to be usable.
48
+
49
+ You'll first need to download one of the available code completion models in GGUF format:
50
+
51
+ - [replit-code-v1_5-GGUF](https://huggingface.co/abetlen/replit-code-v1_5-3b-GGUF)
52
+
53
+ Then you'll need to run the OpenAI compatible web server with a substantially increased context size for GitHub Copilot requests:
54
+
55
+ ```bash
56
+ python3 -m llama_cpp.server --model <model_path> --n_ctx 16192
57
+ ```
58
+
59
+ Then just update your settings in `.vscode/settings.json` to point to your code completion server:
60
+
61
+ ```json
62
+ {
63
+ // ...
64
+ "github.copilot.advanced": {
65
+ "debug.testOverrideProxyUrl": "http://<host>:<port>",
66
+ "debug.overrideProxyUrl": "http://<host>:<port>"
67
+ }
68
+ // ...
69
+ }
70
+ ```
71
+
72
+ ### Function Calling
73
+
74
+ `llama-cpp-python` supports structured function calling based on a JSON schema.
75
+ Function calling is completely compatible with the OpenAI function calling API and can be used by connecting with the official OpenAI Python client.
76
+
77
+ You'll first need to download one of the available function calling models in GGUF format:
78
+
79
+ - [functionary](https://huggingface.co/meetkai)
80
+
81
+ Then when you run the server you'll need to also specify either `functionary-v1` or `functionary-v2` chat_format.
82
+
83
+ Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
84
+
85
+ ```bash
86
+ python3 -m llama_cpp.server --model <model_path_to_functionary_v2_model> --chat_format functionary-v2 --hf_pretrained_model_name_or_path <model_path_to_functionary_v2_tokenizer>
87
+ ```
88
+
89
+ Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling.
90
+
91
+ ### Multimodal Models
92
+
93
+ `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
94
+ read information from both text and images.
95
+
96
+ You'll first need to download one of the available multi-modal models in GGUF format:
97
+
98
+ - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
99
+ - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
100
+ - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
101
+ - [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
102
+ - [moondream2](https://huggingface.co/vikhyatk/moondream2)
103
+
104
+ Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format:
105
+
106
+ ```bash
107
+ python3 -m llama_cpp.server --model <model_path> --clip_model_path <clip_model_path> --chat_format llava-1-5
108
+ ```
109
+
110
+ Then you can just use the OpenAI API as normal
111
+
112
+ ```python3
113
+ from openai import OpenAI
114
+
115
+ client = OpenAI(base_url="http://<host>:<port>/v1", api_key="sk-xxx")
116
+ response = client.chat.completions.create(
117
+ model="gpt-4-vision-preview",
118
+ messages=[
119
+ {
120
+ "role": "user",
121
+ "content": [
122
+ {
123
+ "type": "image_url",
124
+ "image_url": {
125
+ "url": "<image_url>"
126
+ },
127
+ },
128
+ {"type": "text", "text": "What does the image say"},
129
+ ],
130
+ }
131
+ ],
132
+ )
133
+ print(response)
134
+ ```
135
+
136
+ ## Configuration and Multi-Model Support
137
+
138
+ The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
139
+
140
+ ```bash
141
+ python3 -m llama_cpp.server --config_file <config_file>
142
+ ```
143
+
144
+ Config files support all of the server and model options supported by the CLI and environment variables; however, instead of only a single model, the config file can specify multiple models.
145
+
146
+ The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file.
147
+
148
+ At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.
149
+
150
+ ```json
151
+ {
152
+ "host": "0.0.0.0",
153
+ "port": 8080,
154
+ "models": [
155
+ {
156
+ "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
157
+ "model_alias": "gpt-3.5-turbo",
158
+ "chat_format": "chatml",
159
+ "n_gpu_layers": -1,
160
+ "offload_kqv": true,
161
+ "n_threads": 12,
162
+ "n_batch": 512,
163
+ "n_ctx": 2048
164
+ },
165
+ {
166
+ "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
167
+ "model_alias": "gpt-4",
168
+ "chat_format": "chatml",
169
+ "n_gpu_layers": -1,
170
+ "offload_kqv": true,
171
+ "n_threads": 12,
172
+ "n_batch": 512,
173
+ "n_ctx": 2048
174
+ },
175
+ {
176
+ "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
177
+ "model_alias": "gpt-4-vision-preview",
178
+ "chat_format": "llava-1-5",
179
+ "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
180
+ "n_gpu_layers": -1,
181
+ "offload_kqv": true,
182
+ "n_threads": 12,
183
+ "n_batch": 512,
184
+ "n_ctx": 2048
185
+ },
186
+ {
187
+ "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
188
+ "model_alias": "text-davinci-003",
189
+ "n_gpu_layers": -1,
190
+ "offload_kqv": true,
191
+ "n_threads": 12,
192
+ "n_batch": 512,
193
+ "n_ctx": 2048
194
+ },
195
+ {
196
+ "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
197
+ "model_alias": "copilot-codex",
198
+ "n_gpu_layers": -1,
199
+ "offload_kqv": true,
200
+ "n_threads": 12,
201
+ "n_batch": 1024,
202
+ "n_ctx": 9216
203
+ }
204
+ ]
205
+ }
206
+ ```
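+
+ With a config file like the one above loaded, clients select a model by its alias through the standard `model` parameter. A minimal sketch using the official `openai` Python client (the API key is a placeholder):
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-xxx")
+
+ # Routed to the "gpt-4" entry (openhermes-2.5-mistral-7b) from the config above
+ response = client.chat.completions.create(
+     model="gpt-4",
+     messages=[{"role": "user", "content": "Hello!"}],
+ )
+ print(response.choices[0].message.content)
+ ```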
207
+
208
+ The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
209
+
210
+ ## Server Options Reference
211
+
212
+ ::: llama_cpp.server.settings.ConfigFileSettings
213
+ options:
214
+ show_if_no_docstring: true
215
+
216
+ ::: llama_cpp.server.settings.ServerSettings
217
+ options:
218
+ show_if_no_docstring: true
219
+
220
+ ::: llama_cpp.server.settings.ModelSettings
221
+ options:
222
+ show_if_no_docstring: true