aelitta committed on
Commit
4bdb245
1 Parent(s): ecc410d

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. .gitattributes +20 -0
  3. BioMistral-7B-GGUF/.DS_Store +0 -0
  4. BioMistral-7B-GGUF/.gitattributes +45 -0
  5. BioMistral-7B-GGUF/README.md +238 -0
  6. BioMistral-7B-GGUF/config.json +3 -0
  7. BioMistral-7B.Q4_K_M.gguf +3 -0
  8. README.md +2 -8
  9. __pycache__/app.cpython-39.pyc +0 -0
  10. app.py +1 -0
  11. data/10.1177_1557988318780857.pdf +0 -0
  12. ingest.py +1 -0
  13. llama-cpp-python/.DS_Store +0 -0
  14. llama-cpp-python/.dockerignore +166 -0
  15. llama-cpp-python/.github/ISSUE_TEMPLATE/bug_report.md +96 -0
  16. llama-cpp-python/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  17. llama-cpp-python/.github/dependabot.yml +15 -0
  18. llama-cpp-python/.github/workflows/build-and-release.yaml +112 -0
  19. llama-cpp-python/.github/workflows/build-docker.yaml +50 -0
  20. llama-cpp-python/.github/workflows/build-wheels-cuda.yaml +131 -0
  21. llama-cpp-python/.github/workflows/build-wheels-metal.yaml +87 -0
  22. llama-cpp-python/.github/workflows/generate-index-from-release.yaml +50 -0
  23. llama-cpp-python/.github/workflows/publish-to-test.yaml +43 -0
  24. llama-cpp-python/.github/workflows/publish.yaml +32 -0
  25. llama-cpp-python/.github/workflows/test-pypi.yaml +64 -0
  26. llama-cpp-python/.github/workflows/test.yaml +126 -0
  27. llama-cpp-python/.gitignore +180 -0
  28. llama-cpp-python/.gitmodules +3 -0
  29. llama-cpp-python/.readthedocs.yaml +24 -0
  30. llama-cpp-python/CHANGELOG.md +630 -0
  31. llama-cpp-python/CMakeLists.txt +87 -0
  32. llama-cpp-python/LICENSE.md +9 -0
  33. llama-cpp-python/Makefile +82 -0
  34. llama-cpp-python/README.md +792 -0
  35. llama-cpp-python/docker/README.md +64 -0
  36. llama-cpp-python/docker/cuda_simple/Dockerfile +27 -0
  37. llama-cpp-python/docker/open_llama/Dockerfile +51 -0
  38. llama-cpp-python/docker/open_llama/build.sh +14 -0
  39. llama-cpp-python/docker/open_llama/hug_model.py +139 -0
  40. llama-cpp-python/docker/open_llama/start.sh +28 -0
  41. llama-cpp-python/docker/open_llama/start_server.sh +11 -0
  42. llama-cpp-python/docker/openblas_simple/Dockerfile +15 -0
  43. llama-cpp-python/docker/simple/Dockerfile +34 -0
  44. llama-cpp-python/docker/simple/run.sh +4 -0
  45. llama-cpp-python/docs/api-reference.md +88 -0
  46. llama-cpp-python/docs/changelog.md +1 -0
  47. llama-cpp-python/docs/index.md +5 -0
  48. llama-cpp-python/docs/install/macos.md +59 -0
  49. llama-cpp-python/docs/requirements.txt +3 -0
  50. llama-cpp-python/docs/server.md +222 -0
.DS_Store ADDED
Binary file (8.2 kB).
 
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ BioMistral-7B.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
37
+ llama-cpp-python/vendor/llama.cpp/kompute/docs/images/komputer-2.gif filter=lfs diff=lfs merge=lfs -text
38
+ llama-cpp-python/vendor/llama.cpp/kompute/docs/images/komputer-godot-4.gif filter=lfs diff=lfs merge=lfs -text
39
+ llama-cpp-python/vendor/llama.cpp/kompute/docs/images/komputer-logos.gif filter=lfs diff=lfs merge=lfs -text
40
+ llama-cpp-python/vendor/llama.cpp/kompute/examples/android/android-simple/app/src/main/assets/komputer-2.gif filter=lfs diff=lfs merge=lfs -text
41
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-aquila.gguf filter=lfs diff=lfs merge=lfs -text
42
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-baichuan.gguf filter=lfs diff=lfs merge=lfs -text
43
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-deepseek-coder.gguf filter=lfs diff=lfs merge=lfs -text
44
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-deepseek-llm.gguf filter=lfs diff=lfs merge=lfs -text
45
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-falcon.gguf filter=lfs diff=lfs merge=lfs -text
46
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-gpt-2.gguf filter=lfs diff=lfs merge=lfs -text
47
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-gpt-neox.gguf filter=lfs diff=lfs merge=lfs -text
48
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-gpt2.gguf filter=lfs diff=lfs merge=lfs -text
49
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-llama-bpe.gguf filter=lfs diff=lfs merge=lfs -text
50
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-mpt.gguf filter=lfs diff=lfs merge=lfs -text
51
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-refact.gguf filter=lfs diff=lfs merge=lfs -text
52
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-stablelm.gguf filter=lfs diff=lfs merge=lfs -text
53
+ llama-cpp-python/vendor/llama.cpp/models/ggml-vocab-starcoder.gguf filter=lfs diff=lfs merge=lfs -text
54
+ qdrant_storage/collections/vector_db/0/wal/open-1 filter=lfs diff=lfs merge=lfs -text
55
+ qdrant_storage/collections/vector_db/0/wal/open-2 filter=lfs diff=lfs merge=lfs -text
BioMistral-7B-GGUF/.DS_Store ADDED
Binary file (6.15 kB).
 
BioMistral-7B-GGUF/.gitattributes ADDED
@@ -0,0 +1,45 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ BioMistral-7B.Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
37
+ BioMistral-7B.Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
38
+ BioMistral-7B.Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
39
+ BioMistral-7B.Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
40
+ BioMistral-7B.Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
41
+ BioMistral-7B.Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
42
+ BioMistral-7B.Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
43
+ BioMistral-7B.Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
44
+ BioMistral-7B.Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
45
+ BioMistral-7B.Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
BioMistral-7B-GGUF/README.md ADDED
@@ -0,0 +1,238 @@
1
+ ---
2
+ tags:
3
+ - quantized
4
+ - 2-bit
5
+ - 3-bit
6
+ - 4-bit
7
+ - 5-bit
8
+ - 6-bit
9
+ - 8-bit
10
+ - GGUF
11
+ - transformers
12
+ - pytorch
13
+ - tensorboard
14
+ - mistral
15
+ - text-generation
16
+ - medical
17
+ - biology
18
+ - conversational
19
+ - fr
20
+ - en
21
+ - de
22
+ - nl
23
+ - es
24
+ - pt
25
+ - pl
26
+ - ro
27
+ - it
28
+ - dataset:pubmed
29
+ - arxiv:2402.10373
30
+ - license:apache-2.0
31
+ - autotrain_compatible
32
+ - endpoints_compatible
33
+ - text-generation-inference
34
+ - region:us
35
+ - text-generation
36
+ model_name: BioMistral-7B-GGUF
37
+ base_model: BioMistral/BioMistral-7B
38
+ inference: false
39
+ model_creator: BioMistral
40
+ pipeline_tag: text-generation
41
+ quantized_by: MaziyarPanahi
42
+ ---
43
+ # [MaziyarPanahi/BioMistral-7B-GGUF](https://huggingface.co/MaziyarPanahi/BioMistral-7B-GGUF)
44
+ - Model creator: [BioMistral](https://huggingface.co/BioMistral)
45
+ - Original model: [BioMistral/BioMistral-7B](https://huggingface.co/BioMistral/BioMistral-7B)
46
+
47
+ ## Description
48
+ [MaziyarPanahi/BioMistral-7B-GGUF](https://huggingface.co/MaziyarPanahi/BioMistral-7B-GGUF) contains GGUF format model files for [BioMistral/BioMistral-7B](https://huggingface.co/BioMistral/BioMistral-7B).
49
+
50
+ ## How to use
51
+ Thanks to [TheBloke](https://huggingface.co/TheBloke) for preparing an amazing README on how to use GGUF models:
52
+
53
+ ### About GGUF
54
+
55
+ GGUF is a new format introduced by the llama.cpp team on August 21st 2023. It is a replacement for GGML, which is no longer supported by llama.cpp.
56
+
57
+ Here is an incomplete list of clients and libraries that are known to support GGUF:
58
+
59
+ * [llama.cpp](https://github.com/ggerganov/llama.cpp). The source project for GGUF. Offers a CLI and a server option.
60
+ * [text-generation-webui](https://github.com/oobabooga/text-generation-webui), the most widely used web UI, with many features and powerful extensions. Supports GPU acceleration.
61
+ * [KoboldCpp](https://github.com/LostRuins/koboldcpp), a fully featured web UI, with GPU accel across all platforms and GPU architectures. Especially good for story telling.
62
+ * [GPT4All](https://gpt4all.io/index.html), a free and open source local running GUI, supporting Windows, Linux and macOS with full GPU accel.
63
+ * [LM Studio](https://lmstudio.ai/), an easy-to-use and powerful local GUI for Windows and macOS (Silicon), with GPU acceleration. Linux available, in beta as of 27/11/2023.
64
+ * [LoLLMS Web UI](https://github.com/ParisNeo/lollms-webui), a great web UI with many interesting and unique features, including a full model library for easy model selection.
65
+ * [Faraday.dev](https://faraday.dev/), an attractive and easy to use character-based chat GUI for Windows and macOS (both Silicon and Intel), with GPU acceleration.
66
+ * [llama-cpp-python](https://github.com/abetlen/llama-cpp-python), a Python library with GPU accel, LangChain support, and OpenAI-compatible API server.
67
+ * [candle](https://github.com/huggingface/candle), a Rust ML framework with a focus on performance, including GPU support, and ease of use.
68
+ * [ctransformers](https://github.com/marella/ctransformers), a Python library with GPU accel, LangChain support, and OpenAI-compatible AI server. Note, as of time of writing (November 27th 2023), ctransformers has not been updated in a long time and does not support many recent models.
69
+
70
+ ### Explanation of quantisation methods
71
+
72
+ <details>
73
+ <summary>Click to see details</summary>
74
+
75
+ The new methods available are listed here; a small worked bits-per-weight example follows the list:
76
+
77
+ * GGML_TYPE_Q2_K - "type-1" 2-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Block scales and mins are quantized with 4 bits. This ends up effectively using 2.5625 bits per weight (bpw).
78
+ * GGML_TYPE_Q3_K - "type-0" 3-bit quantization in super-blocks containing 16 blocks, each block having 16 weights. Scales are quantized with 6 bits. This ends up using 3.4375 bpw.
79
+ * GGML_TYPE_Q4_K - "type-1" 4-bit quantization in super-blocks containing 8 blocks, each block having 32 weights. Scales and mins are quantized with 6 bits. This ends up using 4.5 bpw.
80
+ * GGML_TYPE_Q5_K - "type-1" 5-bit quantization. Same super-block structure as GGML_TYPE_Q4_K, resulting in 5.5 bpw.
81
+ * GGML_TYPE_Q6_K - "type-0" 6-bit quantization. Super-blocks with 16 blocks, each block having 16 weights. Scales are quantized with 8 bits. This ends up using 6.5625 bpw.
82
+
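
A small illustrative sketch, assuming only the block layout described in the list above, that recomputes the effective bits per weight for GGML_TYPE_Q4_K (the constants are taken from that description, not from the llama.cpp source):

```python
# Rough bits-per-weight estimate for GGML_TYPE_Q4_K (illustrative; assumes the
# layout above: 8 blocks of 32 weights per super-block, a 6-bit scale and a
# 6-bit min per block, plus fp16 super-block scale and min).
weights = 8 * 32                       # weights per super-block
quant_bits = 4 * weights               # 4-bit quantized weights
scale_min_bits = 8 * (6 + 6)           # per-block 6-bit scale and 6-bit min
superblock_fp16 = 2 * 16               # fp16 super-block scale and min
bpw = (quant_bits + scale_min_bits + superblock_fp16) / weights
print(f"Q4_K ~ {bpw} bits per weight")  # prints 4.5, matching the figure above
```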
83
+ ## How to download GGUF files
84
+
85
+ **Note for manual downloaders:** You almost never want to clone the entire repo! Multiple different quantisation formats are provided, and most users only want to pick and download a single file.
86
+
87
+ The following clients/libraries will automatically download models for you, providing a list of available models to choose from:
88
+
89
+ * LM Studio
90
+ * LoLLMS Web UI
91
+ * Faraday.dev
92
+
93
+ ### In `text-generation-webui`
94
+
95
+ Under Download Model, you can enter the model repo: [MaziyarPanahi/BioMistral-7B-GGUF](https://huggingface.co/MaziyarPanahi/BioMistral-7B-GGUF) and below it, a specific filename to download, such as: BioMistral-7B.Q4_K_M.gguf.
96
+
97
+ Then click Download.
98
+
99
+ ### On the command line, including multiple files at once
100
+
101
+ I recommend using the `huggingface-hub` Python library:
102
+
103
+ ```shell
104
+ pip3 install huggingface-hub
105
+ ```
106
+
107
+ Then you can download any individual model file to the current directory, at high speed, with a command like this:
108
+
109
+ ```shell
110
+ huggingface-cli download MaziyarPanahi/BioMistral-7B-GGUF BioMistral-7B.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
111
+ ```
112
+ </details>
113
+ <details>
114
+ <summary>More advanced huggingface-cli download usage (click to read)</summary>
115
+
116
+ You can also download multiple files at once with a pattern:
117
+
118
+ ```shell
119
+ huggingface-cli download MaziyarPanahi/BioMistral-7B-GGUF --local-dir . --local-dir-use-symlinks False --include='*Q4_K*gguf'
120
+ ```
121
+
122
+ For more documentation on downloading with `huggingface-cli`, please see: [HF -> Hub Python Library -> Download files -> Download from the CLI](https://huggingface.co/docs/huggingface_hub/guides/download#download-from-the-cli).
123
+
124
+ To accelerate downloads on fast connections (1Gbit/s or higher), install `hf_transfer`:
125
+
126
+ ```shell
127
+ pip3 install hf_transfer
128
+ ```
129
+
130
+ And set environment variable `HF_HUB_ENABLE_HF_TRANSFER` to `1`:
131
+
132
+ ```shell
133
+ HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download MaziyarPanahi/BioMistral-7B-GGUF BioMistral-7B.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False
134
+ ```
135
+
136
+ Windows Command Line users: You can set the environment variable by running `set HF_HUB_ENABLE_HF_TRANSFER=1` before the download command.
137
+ </details>
138
+
139
+ ## Example `llama.cpp` command
140
+
141
+ Make sure you are using `llama.cpp` from commit [d0cee0d](https://github.com/ggerganov/llama.cpp/commit/d0cee0d36d5be95a0d9088b674dbb27354107221) or later.
142
+
143
+ ```shell
144
+ ./main -ngl 35 -m BioMistral-7B.Q4_K_M.gguf --color -c 32768 --temp 0.7 --repeat_penalty 1.1 -n -1 -p "<|im_start|>system
145
+ {system_message}<|im_end|>
146
+ <|im_start|>user
147
+ {prompt}<|im_end|>
148
+ <|im_start|>assistant"
149
+ ```
150
+
151
+ Change `-ngl 35` to the number of layers to offload to GPU. Remove it if you don't have GPU acceleration.
152
+
153
+ Change `-c 32768` to the desired sequence length. For extended sequence models - eg 8K, 16K, 32K - the necessary RoPE scaling parameters are read from the GGUF file and set by llama.cpp automatically. Note that longer sequence lengths require much more resources, so you may need to reduce this value.
154
+
155
+ If you want to have a chat-style conversation, replace the `-p <PROMPT>` argument with `-i -ins`
156
+
157
+ For other parameters and how to use them, please refer to [the llama.cpp documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/main/README.md)
158
+
159
+ ## How to run in `text-generation-webui`
160
+
161
+ Further instructions can be found in the text-generation-webui documentation, here: [text-generation-webui/docs/04 ‐ Model Tab.md](https://github.com/oobabooga/text-generation-webui/blob/main/docs/04%20%E2%80%90%20Model%20Tab.md#llamacpp).
162
+
163
+ ## How to run from Python code
164
+
165
+ You can use GGUF models from Python using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) or [ctransformers](https://github.com/marella/ctransformers) libraries. Note that at the time of writing (Nov 27th 2023), ctransformers has not been updated for some time and is not compatible with some recent models. Therefore I recommend you use llama-cpp-python.
166
+
167
+ ### How to load this model in Python code, using llama-cpp-python
168
+
169
+ For full documentation, please see: [llama-cpp-python docs](https://abetlen.github.io/llama-cpp-python/).
170
+
171
+ #### First install the package
172
+
173
+ Run one of the following commands, according to your system:
174
+
175
+ ```shell
176
+ # Base llama-cpp-python with no GPU acceleration
177
+ pip install llama-cpp-python
178
+ # With NVidia CUDA acceleration
179
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
180
+ # Or with OpenBLAS acceleration
181
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
182
+ # Or with CLBLast acceleration
183
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
184
+ # Or with AMD ROCm GPU acceleration (Linux only)
185
+ CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
186
+ # Or with Metal GPU acceleration for macOS systems only
187
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
188
+
189
+ # On Windows, set the CMAKE_ARGS variable in PowerShell before installing; e.g. for NVidia CUDA:
190
+ $env:CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
191
+ pip install llama-cpp-python
192
+ ```
193
+
194
+ #### Simple llama-cpp-python example code
195
+
196
+ ```python
197
+ from llama_cpp import Llama
198
+
199
+ # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
200
+ llm = Llama(
201
+ model_path="./BioMistral-7B.Q4_K_M.gguf", # Download the model file first
202
+ n_ctx=32768, # The max sequence length to use - note that longer sequence lengths require much more resources
203
+ n_threads=8, # The number of CPU threads to use, tailor to your system and the resulting performance
204
+ n_gpu_layers=35 # The number of layers to offload to GPU, if you have GPU acceleration available
205
+ )
206
+
207
+ # Simple inference example
208
+ output = llm(
209
+ "<|im_start|>system
210
+ {system_message}<|im_end|>
211
+ <|im_start|>user
212
+ {prompt}<|im_end|>
213
+ <|im_start|>assistant", # Prompt
214
+ max_tokens=512, # Generate up to 512 tokens
215
+ stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
216
+ echo=True # Whether to echo the prompt
217
+ )
218
+
219
+ # Chat Completion API
220
+
221
+ llm = Llama(model_path="./BioMistral-7B-GGUF.Q4_K_M.gguf", chat_format="llama-2") # Set chat_format according to the model you are using
222
+ llm.create_chat_completion(
223
+ messages = [
224
+ {"role": "system", "content": "You are a story writing assistant."},
225
+ {
226
+ "role": "user",
227
+ "content": "Write a story about llamas."
228
+ }
229
+ ]
230
+ )
231
+ ```
232
+
233
+ ## How to use with LangChain
234
+
235
+ Here are guides on using llama-cpp-python and ctransformers with LangChain; a short illustrative sketch follows the links below:
236
+
237
+ * [LangChain + llama-cpp-python](https://python.langchain.com/docs/integrations/llms/llamacpp)
238
+ * [LangChain + ctransformers](https://python.langchain.com/docs/integrations/providers/ctransformers)
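
A minimal, hedged sketch of the llama-cpp-python route via LangChain. It is not taken from either guide; the import path varies by LangChain version (older releases use `langchain.llms`), and the parameter values are illustrative assumptions.

```python
# Minimal LangChain + llama-cpp-python sketch (illustrative; values are assumptions).
from langchain_community.llms import LlamaCpp  # older LangChain: from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="./BioMistral-7B.Q4_K_M.gguf",  # local GGUF file downloaded as shown earlier
    n_ctx=4096,           # context window; raise it if you have the memory
    n_gpu_layers=35,      # set to 0 if no GPU acceleration is available
    temperature=0.7,
)
print(llm.invoke("List common risk factors for cardiovascular disease."))
```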
BioMistral-7B-GGUF/config.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "model_type": "mistral"
3
+ }
BioMistral-7B.Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a73107045dfe7e3f113b392b0a67e3e6ca9fa9dae2abe301424ce5abd1721a6
3
+ size 4368439424
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: BioMistral Gradio
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 4.29.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: BioMistral_gradio
3
+ app_file: app.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.29.0
 
 
6
  ---
 
 
__pycache__/app.cpython-39.pyc ADDED
Binary file (2.41 kB).
 
app.py ADDED
@@ -0,0 +1 @@
 
0
  chain = ConversationalRetrievalChain.from_llm(llm=llm,retriever=retriever)
1
  print("LLM or Vector Database not initialized")
2
  history_langchain_format = []
3
  prompt = PromptTemplate(template=prompt_template,
4
  input_variables=["chat_history", 'message'])
5
  response = chain({"question": message, "chat_history": chat_history})
6
 
7
  answer = response['answer']
8
 
9
  chat_history.append((message, answer))
10
 
11
  temp = []
12
  for input_question, bot_answer in history:
13
  temp.append(input_question)
14
  temp.append(bot_answer)
15
  history_langchain_format.append(temp)
16
  temp.clear()
17
  temp.append(message)
18
  temp.append(answer)
19
  history_langchain_format.append(temp)
20
 
21
  return answer
 
1
+
2
  chain = ConversationalRetrievalChain.from_llm(llm=llm,retriever=retriever)
3
  print("LLM or Vector Database not initialized")
4
  history_langchain_format = []
5
  prompt = PromptTemplate(template=prompt_template,
6
  input_variables=["chat_history", 'message'])
7
  response = chain({"question": message, "chat_history": chat_history})
8
 
9
  answer = response['answer']
10
 
11
  chat_history.append((message, answer))
12
 
13
  temp = []
14
  for input_question, bot_answer in history:
15
  temp.append(input_question)
16
  temp.append(bot_answer)
17
  history_langchain_format.append(temp)
18
  temp.clear()
19
  temp.append(message)
20
  temp.append(answer)
21
  history_langchain_format.append(temp)
22
 
23
  return answer
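
The app.py hunk above is only a fragment (the diff adds a single blank line, so most of the script is not shown). For orientation, here is a hedged sketch of how a chat function built around these lines might be wired up; the `llm` and `retriever` objects, the Gradio wiring, and all parameter choices are assumptions, not the author's actual code.

```python
# Hedged sketch of the context implied by the app.py fragment above.
# The llm and retriever objects are assumed to be constructed elsewhere in app.py.
from langchain.chains import ConversationalRetrievalChain

def make_chat_fn(llm, retriever):
    chat_history = []

    def chat(message, history):
        if llm is None or retriever is None:
            print("LLM or Vector Database not initialized")
            return "Model or vector store not ready."
        chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)
        response = chain({"question": message, "chat_history": chat_history})
        answer = response["answer"]
        chat_history.append((message, answer))
        return answer

    return chat

# Example wiring (assumes llm and retriever already exist):
# import gradio as gr
# gr.ChatInterface(make_chat_fn(llm, retriever)).launch()
```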
data/10.1177_1557988318780857.pdf ADDED
The diff for this file is too large to render.
 
ingest.py ADDED
@@ -0,0 +1 @@
 
0
  texts,
1
  embeddings,
2
  url=url,
3
  prefer_grpc=False,
4
  collection_name="vector_db"
 
1
+ import os
2
  texts,
3
  embeddings,
4
  url=url,
5
  prefer_grpc=False,
6
  collection_name="vector_db"
llama-cpp-python/.DS_Store ADDED
Binary file (8.2 kB).
 
llama-cpp-python/.dockerignore ADDED
@@ -0,0 +1,166 @@
1
+ _skbuild/
2
+
3
+ .envrc
4
+
5
+ models/
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ .pybuilder/
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ # For a library or package, you might want to ignore these files since the code is
93
+ # intended to run in multiple environments; otherwise, check them in:
94
+ # .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/#use-with-ide
116
+ .pdm.toml
117
+
118
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
119
+ __pypackages__/
120
+
121
+ # Celery stuff
122
+ celerybeat-schedule
123
+ celerybeat.pid
124
+
125
+ # SageMath parsed files
126
+ *.sage.py
127
+
128
+ # Environments
129
+ .env
130
+ .venv
131
+ env/
132
+ venv/
133
+ ENV/
134
+ env.bak/
135
+ venv.bak/
136
+
137
+ # Spyder project settings
138
+ .spyderproject
139
+ .spyproject
140
+
141
+ # Rope project settings
142
+ .ropeproject
143
+
144
+ # mkdocs documentation
145
+ /site
146
+
147
+ # mypy
148
+ .mypy_cache/
149
+ .dmypy.json
150
+ dmypy.json
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ .idea/
llama-cpp-python/.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,96 @@
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: ''
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ # Prerequisites
11
+
12
+ Please answer the following questions for yourself before submitting an issue.
13
+
14
+ - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
15
+ - [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md).
16
+ - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
17
+ - [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share.
18
+
19
+ # Expected Behavior
20
+
21
+ Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do.
22
+
23
+ # Current Behavior
24
+
25
+ Please provide a detailed written description of what `llama-cpp-python` did, instead.
26
+
27
+ # Environment and Context
28
+
29
+ Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
30
+
31
+ * Physical (or virtual) hardware you are using, e.g. for Linux:
32
+
33
+ `$ lscpu`
34
+
35
+ * Operating System, e.g. for Linux:
36
+
37
+ `$ uname -a`
38
+
39
+ * SDK version, e.g. for Linux:
40
+
41
+ ```
42
+ $ python3 --version
43
+ $ make --version
44
+ $ g++ --version
45
+ ```
46
+
47
+ # Failure Information (for bugs)
48
+
49
+ Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
50
+
51
+ # Steps to Reproduce
52
+
53
+ Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
54
+
55
+ 1. step 1
56
+ 2. step 2
57
+ 3. step 3
58
+ 4. etc.
59
+
60
+ **Note: Many issues seem to be regarding functional or performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.**
61
+
62
+ Try the following:
63
+
64
+ 1. `git clone https://github.com/abetlen/llama-cpp-python`
65
+ 2. `cd llama-cpp-python`
66
+ 3. `rm -rf _skbuild/` # delete any old builds
67
+ 4. `python -m pip install .`
68
+ 5. `cd ./vendor/llama.cpp`
69
+ 6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
70
+ 7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)
71
+
72
+ # Failure Logs
73
+
74
+ Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
75
+
76
+ Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
77
+
78
+ Example environment info:
79
+ ```
80
+ llama-cpp-python$ git log | head -1
81
+ commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2
82
+
83
+ llama-cpp-python$ python3 --version
84
+ Python 3.10.10
85
+
86
+ llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy"
87
+ fastapi 0.95.0
88
+ numpy 1.24.3
89
+ sse-starlette 1.3.3
90
+ uvicorn 0.21.1
91
+
92
+ llama-cpp-python/vendor/llama.cpp$ git log | head -3
93
+ commit 66874d4fbcc7866377246efbcee938e8cc9c7d76
94
+ Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
95
+ Date: Thu May 25 20:18:01 2023 -0600
96
+ ```
llama-cpp-python/.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: ''
5
+ labels: ''
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12
+
13
+ **Describe the solution you'd like**
14
+ A clear and concise description of what you want to happen.
15
+
16
+ **Describe alternatives you've considered**
17
+ A clear and concise description of any alternative solutions or features you've considered.
18
+
19
+ **Additional context**
20
+ Add any other context or screenshots about the feature request here.
llama-cpp-python/.github/dependabot.yml ADDED
@@ -0,0 +1,15 @@
1
+ # To get started with Dependabot version updates, you'll need to specify which
2
+ # package ecosystems to update and where the package manifests are located.
3
+ # Please see the documentation for all configuration options:
4
+ # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5
+
6
+ version: 2
7
+ updates:
8
+ - package-ecosystem: "pip" # See documentation for possible values
9
+ directory: "/" # Location of package manifests
10
+ schedule:
11
+ interval: "weekly"
12
+ - package-ecosystem: "github-actions"
13
+ directory: "/"
14
+ schedule:
15
+ interval: "weekly"
llama-cpp-python/.github/workflows/build-and-release.yaml ADDED
@@ -0,0 +1,112 @@
1
+ name: Build Release
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ build_wheels:
10
+ name: Build wheels on ${{ matrix.os }}
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ matrix:
14
+ os: [ubuntu-20.04, windows-2019, macos-11]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ with:
19
+ submodules: "recursive"
20
+
21
+ # Used to host cibuildwheel
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version: "3.8"
25
+
26
+ - name: Install dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ python -m pip install -e .[all]
30
+
31
+ - name: Build wheels
32
+ uses: pypa/cibuildwheel@v2.17.0
33
+ env:
34
+ # disable repair
35
+ CIBW_REPAIR_WHEEL_COMMAND: ""
36
+ with:
37
+ package-dir: .
38
+ output-dir: wheelhouse
39
+
40
+ - uses: actions/upload-artifact@v4
41
+ with:
42
+ name: wheels-${{ matrix.os }}
43
+ path: ./wheelhouse/*.whl
44
+
45
+ build_wheels_arm64:
46
+ name: Build arm64 wheels
47
+ runs-on: ubuntu-latest
48
+ steps:
49
+ - uses: actions/checkout@v4
50
+ with:
51
+ submodules: "recursive"
52
+
53
+ - name: Set up QEMU
54
+ uses: docker/setup-qemu-action@v3
55
+ with:
56
+ platforms: linux/arm64
57
+
58
+ - name: Build wheels
59
+ uses: pypa/cibuildwheel@v2.17.0
60
+ env:
61
+ CIBW_SKIP: "*musllinux* pp*"
62
+ CIBW_REPAIR_WHEEL_COMMAND: ""
63
+ CIBW_ARCHS: "aarch64"
64
+ CIBW_BUILD: "cp38-* cp39-* cp310-* cp311-* cp312-*"
65
+ with:
66
+ output-dir: wheelhouse
67
+
68
+ - name: Upload wheels as artifacts
69
+ uses: actions/upload-artifact@v4
70
+ with:
71
+ name: wheels_arm64
72
+ path: ./wheelhouse/*.whl
73
+
74
+ build_sdist:
75
+ name: Build source distribution
76
+ runs-on: ubuntu-latest
77
+
78
+ steps:
79
+ - uses: actions/checkout@v4
80
+ with:
81
+ submodules: "recursive"
82
+ - uses: actions/setup-python@v5
83
+ with:
84
+ python-version: "3.8"
85
+ - name: Install dependencies
86
+ run: |
87
+ python -m pip install --upgrade pip build
88
+ python -m pip install -e .[all]
89
+ - name: Build source distribution
90
+ run: |
91
+ python -m build --sdist
92
+ - uses: actions/upload-artifact@v4
93
+ with:
94
+ name: sdist
95
+ path: ./dist/*.tar.gz
96
+
97
+ release:
98
+ name: Release
99
+ needs: [build_wheels, build_wheels_arm64, build_sdist]
100
+ runs-on: ubuntu-latest
101
+
102
+ steps:
103
+ - uses: actions/download-artifact@v4
104
+ with:
105
+ merge-multiple: true
106
+ path: dist
107
+
108
+ - uses: softprops/action-gh-release@v2
109
+ with:
110
+ files: dist/*
111
+ env:
112
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
llama-cpp-python/.github/workflows/build-docker.yaml ADDED
@@ -0,0 +1,50 @@
1
+ name: Build Docker
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+ packages: write
8
+
9
+ jobs:
10
+ docker:
11
+ name: Build and push Docker image
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - name: Checkout
15
+ uses: actions/checkout@v4
16
+ with:
17
+ submodules: "recursive"
18
+
19
+ - name: Set up QEMU
20
+ uses: docker/setup-qemu-action@v3
21
+
22
+ - name: Set up Docker Buildx
23
+ uses: docker/setup-buildx-action@v3
24
+
25
+ - name: Login to GitHub Container Registry
26
+ uses: docker/login-action@v3
27
+ with:
28
+ registry: ghcr.io
29
+ username: ${{ github.repository_owner }}
30
+ password: ${{ secrets.GITHUB_TOKEN }}
31
+
32
+ - name: Build and push
33
+ id: docker_build
34
+ uses: docker/build-push-action@v5
35
+ with:
36
+ context: .
37
+ file: "docker/simple/Dockerfile"
38
+ push: ${{ startsWith(github.ref, 'refs/tags/') }}
39
+ pull: true
40
+ platforms: linux/amd64,linux/arm64
41
+ tags: |
42
+ ghcr.io/abetlen/llama-cpp-python:latest
43
+ ghcr.io/abetlen/llama-cpp-python:${{ github.ref_name }}
44
+ build-args: |
45
+ BUILDKIT_INLINE_CACHE=1
46
+
47
+ - name: Publish to GitHub Tag
48
+ if: steps.docker_build.outputs.digest && startsWith(github.ref, 'refs/tags/')
49
+ run: |
50
+ echo "Docker image published for tag: ${{ github.ref_name }}"
llama-cpp-python/.github/workflows/build-wheels-cuda.yaml ADDED
@@ -0,0 +1,131 @@
1
+ name: Build Wheels (CUDA)
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ define_matrix:
10
+ name: Define Build Matrix
11
+ runs-on: ubuntu-latest
12
+ outputs:
13
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
14
+ defaults:
15
+ run:
16
+ shell: pwsh
17
+
18
+ steps:
19
+ - name: Define Job Output
20
+ id: set-matrix
21
+ run: |
22
+ $matrix = @{
23
+ 'os' = @('ubuntu-20.04', 'windows-latest')
24
+ 'pyver' = @("3.10", "3.11", "3.12")
25
+ 'cuda' = @("12.1.1", "12.2.2", "12.3.2", "12.4.1")
26
+ 'releasetag' = @("basic")
27
+ }
28
+
29
+ $matrixOut = ConvertTo-Json $matrix -Compress
30
+ Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
31
+
32
+ build_wheels:
33
+ name: Build Wheel ${{ matrix.os }} ${{ matrix.pyver }} ${{ matrix.cuda }} ${{ matrix.releasetag == 'wheels' && 'AVX2' || matrix.releasetag }}
34
+ needs: define_matrix
35
+ runs-on: ${{ matrix.os }}
36
+ strategy:
37
+ matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
38
+ defaults:
39
+ run:
40
+ shell: pwsh
41
+ env:
42
+ CUDAVER: ${{ matrix.cuda }}
43
+ AVXVER: ${{ matrix.releasetag }}
44
+
45
+ steps:
46
+ - uses: actions/checkout@v4
47
+ with:
48
+ submodules: "recursive"
49
+
50
+ - uses: actions/setup-python@v5
51
+ with:
52
+ python-version: ${{ matrix.pyver }}
53
+
54
+ - name: Setup Mamba
55
+ uses: conda-incubator/setup-miniconda@v3.0.4
56
+ with:
57
+ activate-environment: "build"
58
+ python-version: ${{ matrix.pyver }}
59
+ miniforge-variant: Mambaforge
60
+ miniforge-version: latest
61
+ use-mamba: true
62
+ add-pip-as-python-dependency: true
63
+ auto-activate-base: false
64
+
65
+ - name: VS Integration Cache
66
+ id: vs-integration-cache
67
+ if: runner.os == 'Windows'
68
+ uses: actions/cache@v4.0.2
69
+ with:
70
+ path: ./MSBuildExtensions
71
+ key: cuda-${{ matrix.cuda }}-vs-integration
72
+
73
+ - name: Get Visual Studio Integration
74
+ if: runner.os == 'Windows' && steps.vs-integration-cache.outputs.cache-hit != 'true'
75
+ run: |
76
+ if ($env:CUDAVER -eq '12.1.1') {$x = '12.1.0'} else {$x = $env:CUDAVER}
77
+ $links = (Invoke-RestMethod 'https://raw.githubusercontent.com/Jimver/cuda-toolkit/master/src/links/windows-links.ts').Trim().split().where({$_ -ne ''})
78
+ for ($i=$q=0;$i -lt $links.count -and $q -lt 2;$i++) {if ($links[$i] -eq "'$x',") {$q++}}
79
+ Invoke-RestMethod $links[$i].Trim("'") -OutFile 'cudainstaller.zip'
80
+ & 'C:\Program Files\7-Zip\7z.exe' e cudainstaller.zip -oMSBuildExtensions -r *\MSBuildExtensions\* > $null
81
+ Remove-Item 'cudainstaller.zip'
82
+
83
+ - name: Install Visual Studio Integration
84
+ if: runner.os == 'Windows'
85
+ run: |
86
+ $y = (gi '.\MSBuildExtensions').fullname + '\*'
87
+ (gi 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\*\BuildCustomizations').fullname.foreach({cp $y $_})
88
+ $cupath = 'CUDA_PATH_V' + $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','_')
89
+ echo "$cupath=$env:CONDA_PREFIX" >> $env:GITHUB_ENV
90
+
91
+ - name: Install Dependencies
92
+ env:
93
+ MAMBA_DOWNLOAD_FAILFAST: "0"
94
+ MAMBA_NO_LOW_SPEED_LIMIT: "1"
95
+ run: |
96
+ $cudaVersion = $env:CUDAVER
97
+ mamba install -y 'cuda' -c nvidia/label/cuda-$cudaVersion
98
+ python -m pip install build wheel
99
+
100
+ - name: Build Wheel
101
+ run: |
102
+ $cudaVersion = $env:CUDAVER.Remove($env:CUDAVER.LastIndexOf('.')).Replace('.','')
103
+ $env:CUDA_PATH = $env:CONDA_PREFIX
104
+ $env:CUDA_HOME = $env:CONDA_PREFIX
105
+ $env:CUDA_TOOLKIT_ROOT_DIR = $env:CONDA_PREFIX
106
+ if ($IsLinux) {
107
+ $env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH
108
+ }
109
+ $env:VERBOSE = '1'
110
+ $env:CMAKE_ARGS = '-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all'
111
+ $env:CMAKE_ARGS = "-DLLAMA_CUDA_FORCE_MMQ=ON $env:CMAKE_ARGS"
112
+ if ($env:AVXVER -eq 'AVX') {
113
+ $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
114
+ }
115
+ if ($env:AVXVER -eq 'AVX512') {
116
+ $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX512=on'
117
+ }
118
+ if ($env:AVXVER -eq 'basic') {
119
+ $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off'
120
+ }
121
+ python -m build --wheel
122
+ # write the build tag to the output
123
+ Write-Output "CUDA_VERSION=$cudaVersion" >> $env:GITHUB_ENV
124
+
125
+ - uses: softprops/action-gh-release@v2
126
+ with:
127
+ files: dist/*
128
+ # Set tag_name to <tag>-cu<cuda_version>
129
+ tag_name: ${{ github.ref_name }}-cu${{ env.CUDA_VERSION }}
130
+ env:
131
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
llama-cpp-python/.github/workflows/build-wheels-metal.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build Wheels (Metal)
2
+
3
+ on: workflow_dispatch
4
+
5
+ permissions:
6
+ contents: write
7
+
8
+ jobs:
9
+ define_matrix:
10
+ name: Define Build Matrix
11
+ runs-on: ubuntu-latest
12
+ outputs:
13
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
14
+ defaults:
15
+ run:
16
+ shell: pwsh
17
+
18
+ steps:
19
+ - name: Define Job Output
20
+ id: set-matrix
21
+ run: |
22
+ $matrix = @{
23
+ 'os' = @('macos-11', 'macos-12', 'macos-13')
24
+ 'pyver' = @('3.10', '3.11', '3.12')
25
+ }
26
+
27
+ $matrixOut = ConvertTo-Json $matrix -Compress
28
+ Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
29
+
30
+ build_wheels:
31
+ name: ${{ matrix.os }} Python ${{ matrix.pyver }}
32
+ needs: define_matrix
33
+ runs-on: ${{ matrix.os }}
34
+ strategy:
35
+ matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
36
+ env:
37
+ OSVER: ${{ matrix.os }}
38
+
39
+ steps:
40
+ - uses: actions/checkout@v4
41
+ with:
42
+ submodules: "recursive"
43
+
44
+ - uses: actions/setup-python@v5
45
+ with:
46
+ python-version: ${{ matrix.pyver }}
47
+
48
+ - name: Install Dependencies
49
+ run: |
50
+ python -m pip install build wheel cmake
51
+
52
+ - name: Build Wheel
53
+ run: |
54
+ XCODE15PATH="/Applications/Xcode_15.0.app/Contents/Developer"
55
+ XCODE15BINPATH="${XCODE15PATH}/Toolchains/XcodeDefault.xctoolchain/usr/bin"
56
+ export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_METAL=on"
57
+ [[ "$OSVER" == "macos-13" ]] && export CC="${XCODE15BINPATH}/cc" && export CXX="${XCODE15BINPATH}/c++" && export MACOSX_DEPLOYMENT_TARGET="13.0"
58
+ [[ "$OSVER" == "macos-12" ]] && export MACOSX_DEPLOYMENT_TARGET="12.0"
59
+ [[ "$OSVER" == "macos-11" ]] && export MACOSX_DEPLOYMENT_TARGET="11.0"
60
+
61
+ export CMAKE_OSX_ARCHITECTURES="arm64" && export ARCHFLAGS="-arch arm64"
62
+ VERBOSE=1 python -m build --wheel
63
+
64
+ if [[ "$OSVER" == "macos-13" ]]; then
65
+ export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
66
+ export MACOSX_DEPLOYMENT_TARGET="14.0"
67
+ VERBOSE=1 python -m build --wheel
68
+ fi
69
+
70
+ for file in ./dist/*.whl; do cp "$file" "${file/arm64.whl/aarch64.whl}"; done
71
+
72
+ export CMAKE_OSX_ARCHITECTURES="x86_64" && export CMAKE_ARGS="-DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_METAL=on" && export ARCHFLAGS="-arch x86_64"
73
+ VERBOSE=1 python -m build --wheel
74
+
75
+ if [[ "$OSVER" == "macos-13" ]]; then
76
+ export SDKROOT="${XCODE15PATH}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.0.sdk"
77
+ export MACOSX_DEPLOYMENT_TARGET="14.0"
78
+ VERBOSE=1 python -m build --wheel
79
+ fi
80
+
81
+ - uses: softprops/action-gh-release@v2
82
+ with:
83
+ files: dist/*
84
+ # set release name to <tag>-metal
85
+ tag_name: ${{ github.ref_name }}-metal
86
+ env:
87
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
llama-cpp-python/.github/workflows/generate-index-from-release.yaml ADDED
@@ -0,0 +1,50 @@
1
+ name: Wheels Index
2
+
3
+ on:
4
+ # Trigger on any new release
5
+ release:
6
+ types: [published]
7
+
8
+ # Allows you to run this workflow manually from the Actions tab
9
+ workflow_dispatch:
10
+
11
+ # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
12
+ permissions:
13
+ contents: read
14
+ pages: write
15
+ id-token: write
16
+
17
+ # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
18
+ # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
19
+ concurrency:
20
+ group: "pages"
21
+ cancel-in-progress: false
22
+
23
+ jobs:
24
+ # Single deploy job since we're just deploying
25
+ deploy:
26
+ environment:
27
+ name: github-pages
28
+ url: ${{ steps.deployment.outputs.page_url }}
29
+ runs-on: ubuntu-latest
30
+ steps:
31
+ - name: Checkout
32
+ uses: actions/checkout@v4
33
+ - name: Setup Pages
34
+ uses: actions/configure-pages@v5
35
+ - name: Build
36
+ run: |
37
+ ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
38
+ ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
39
+ ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
40
+ ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
41
+ ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
42
+ ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
43
+ - name: Upload artifact
44
+ uses: actions/upload-pages-artifact@v3
45
+ with:
46
+ # Upload entire repository
47
+ path: 'index'
48
+ - name: Deploy to GitHub Pages
49
+ id: deployment
50
+ uses: actions/deploy-pages@v4
llama-cpp-python/.github/workflows/publish-to-test.yaml ADDED
@@ -0,0 +1,43 @@
1
+ # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
2
+
3
+ name: Publish to TestPyPI
4
+
5
+ on:
6
+ workflow_dispatch:
7
+ inputs:
8
+ dev_version:
9
+ description: 'Dev version N'
10
+ required: true
11
+
12
+
13
+ jobs:
14
+ build-n-publish:
15
+ name: Build and publish
16
+ runs-on: ubuntu-latest
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ with:
21
+ submodules: "recursive"
22
+ - name: Set up Python
23
+ uses: actions/setup-python@v5
24
+ with:
25
+ python-version: "3.8"
26
+ - name: Append Dev Version to __version__
27
+ run: |
28
+ DEV_VERSION=${{ github.event.inputs.dev_version }}
29
+ CURRENT_VERSION=$(awk -F= '/__version__ =/ {print $2}' llama_cpp/__init__.py | tr -d ' "')
30
+ NEW_VERSION="${CURRENT_VERSION}.dev${DEV_VERSION}"
31
+ sed -i 's/__version__ = \".*\"/__version__ = \"'"${NEW_VERSION}"'\"/' llama_cpp/__init__.py
32
+ - name: Install dependencies
33
+ run: |
34
+ python3 -m pip install --upgrade pip build
35
+ python3 -m pip install -e .[all]
36
+ - name: Build source distribution
37
+ run: |
38
+ python3 -m build --sdist
39
+ - name: Publish to Test PyPI
40
+ uses: pypa/gh-action-pypi-publish@release/v1
41
+ with:
42
+ password: ${{ secrets.TEST_PYPI_API_TOKEN }}
43
+ repository-url: https://test.pypi.org/legacy/
llama-cpp-python/.github/workflows/publish.yaml ADDED
@@ -0,0 +1,32 @@
1
+ name: Publish to PyPI
2
+
3
+ # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
4
+
5
+ on: workflow_dispatch
6
+
7
+ jobs:
8
+ build-n-publish:
9
+ name: Build and publish
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ with:
15
+ submodules: "recursive"
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.8"
20
+ - name: Install dependencies
21
+ run: |
22
+ python3 -m pip install --upgrade pip build
23
+ python3 -m pip install -e .[all]
24
+ - name: Build source distribution
25
+ run: |
26
+ python3 -m build --sdist
27
+ - name: Publish distribution to PyPI
28
+ # TODO: move to tag based releases
29
+ # if: startsWith(github.ref, 'refs/tags')
30
+ uses: pypa/gh-action-pypi-publish@release/v1
31
+ with:
32
+ password: ${{ secrets.PYPI_API_TOKEN }}
llama-cpp-python/.github/workflows/test-pypi.yaml ADDED
@@ -0,0 +1,64 @@
1
+ name: Tests for PyPI package
2
+
3
+ on: workflow_dispatch
4
+
5
+ jobs:
6
+ build-linux:
7
+
8
+ runs-on: ubuntu-latest
9
+ strategy:
10
+ matrix:
11
+ python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
12
+
13
+ steps:
14
+ - name: Set up Python ${{ matrix.python-version }}
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: ${{ matrix.python-version }}
18
+ - name: Install dependencies
19
+ run: |
20
+ python3 -m pip install --upgrade pip
21
+ python3 -m pip install --verbose llama-cpp-python[all]
22
+ - name: Test with pytest
23
+ run: |
24
+ python3 -c "import llama_cpp"
25
+
26
+ build-windows:
27
+
28
+ runs-on: windows-latest
29
+ strategy:
30
+ matrix:
31
+ python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
32
+
33
+ steps:
34
+ - name: Set up Python ${{ matrix.python-version }}
35
+ uses: actions/setup-python@v5
36
+ with:
37
+ python-version: ${{ matrix.python-version }}
38
+ - name: Install dependencies
39
+ run: |
40
+ python3 -m pip install --upgrade pip
41
+ python3 -m pip install --verbose llama-cpp-python[all]
42
+ - name: Test with pytest
43
+ run: |
44
+ python3 -c "import llama_cpp"
45
+
46
+ build-macos:
47
+
48
+ runs-on: macos-latest
49
+ strategy:
50
+ matrix:
51
+ python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
52
+
53
+ steps:
54
+ - name: Set up Python ${{ matrix.python-version }}
55
+ uses: actions/setup-python@v5
56
+ with:
57
+ python-version: ${{ matrix.python-version }}
58
+ - name: Install dependencies
59
+ run: |
60
+ python3 -m pip install --upgrade pip
61
+ python3 -m pip install --verbose llama-cpp-python[all]
62
+ - name: Test with pytest
63
+ run: |
64
+ python3 -c "import llama_cpp"
llama-cpp-python/.github/workflows/test.yaml ADDED
@@ -0,0 +1,126 @@
1
+ name: Tests
2
+
3
+ on:
4
+ pull_request:
5
+ branches:
6
+ - main
7
+ push:
8
+ branches:
9
+ - main
10
+
11
+ jobs:
12
+ build-linux:
13
+
14
+ runs-on: ubuntu-latest
15
+ strategy:
16
+ matrix:
17
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
18
+
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ with:
22
+ submodules: "recursive"
23
+ - name: Set up Python ${{ matrix.python-version }}
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ - name: Install dependencies
28
+ run: |
29
+ python3 -m pip install --upgrade pip
30
+ python3 -m pip install .[all] -v
31
+ - name: Test with pytest
32
+ run: |
33
+ python3 -m pytest
34
+
35
+ build-windows:
36
+
37
+ runs-on: windows-latest
38
+ strategy:
39
+ matrix:
40
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
41
+
42
+ steps:
43
+ - uses: actions/checkout@v4
44
+ with:
45
+ submodules: "recursive"
46
+ - name: Set up Python ${{ matrix.python-version }}
47
+ uses: actions/setup-python@v5
48
+ with:
49
+ python-version: ${{ matrix.python-version }}
50
+ - name: Install dependencies
51
+ run: |
52
+ python3 -m pip install --upgrade pip
53
+ python3 -m pip install .[all] -v
54
+ - name: Test with pytest
55
+ run: |
56
+ python3 -m pytest
57
+
58
+ build-macos:
59
+
60
+ runs-on: macos-13
61
+ strategy:
62
+ matrix:
63
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
64
+
65
+ steps:
66
+ - uses: actions/checkout@v4
67
+ with:
68
+ submodules: "recursive"
69
+ - name: Set up Python ${{ matrix.python-version }}
70
+ uses: actions/setup-python@v5
71
+ with:
72
+ python-version: ${{ matrix.python-version }}
73
+ - name: Install dependencies
74
+ run: |
75
+ python3 -m pip install --upgrade pip
76
+ python3 -m pip install .[all] --verbose
77
+ - name: Test with pytest
78
+ run: |
79
+ python3 -m pytest
80
+
81
+ # build-linux-opencl:
82
+
83
+ # runs-on: ubuntu-latest
84
+
85
+ # steps:
86
+ # - uses: actions/checkout@v4
87
+ # with:
88
+ # submodules: "recursive"
89
+ # - name: Set up Python 3.8
90
+ # uses: actions/setup-python@v5
91
+ # with:
92
+ # python-version: "3.8"
93
+ # - name: Set up OpenCL & CLBlast
94
+ # run: |
95
+ # wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
96
+ # echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
97
+ # sudo apt-get update
98
+ # sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
99
+ # - name: Install dependencies
100
+ # run: |
101
+ # python3 -m pip install --upgrade pip
102
+ # CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install .[all] --verbose
103
+ # - name: Test with pytest
104
+ # run: |
105
+ # python3 -m pytest
106
+
107
+
108
+ build-macos-metal:
109
+
110
+ runs-on: macos-13
111
+
112
+ steps:
113
+ - uses: actions/checkout@v4
114
+ with:
115
+ submodules: "recursive"
116
+ - name: Set up Python 3.8
117
+ uses: actions/setup-python@v5
118
+ with:
119
+ python-version: "3.8"
120
+ - name: Install dependencies
121
+ run: |
122
+ python3 -m pip install --upgrade pip
123
+ CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
124
+ - name: Test with pytest
125
+ run: |
126
+ python3 -m pytest
llama-cpp-python/.gitignore ADDED
@@ -0,0 +1,180 @@
1
+ *.local
2
+
3
+ .python-version
4
+
5
+ .vscode/
6
+
7
+ _skbuild/
8
+
9
+ .envrc
10
+ .direnv
11
+
12
+ models/
13
+
14
+ # Byte-compiled / optimized / DLL files
15
+ __pycache__/
16
+ *.py[cod]
17
+ *$py.class
18
+
19
+ # C extensions
20
+ llama_cpp/*.so
21
+ llama_cpp/*.dylib
22
+ llama_cpp/*.metal
23
+ llama_cpp/*.dll
24
+ llama_cpp/*.lib
25
+
26
+ # Distribution / packaging
27
+ .Python
28
+ build/
29
+ develop-eggs/
30
+ dist/
31
+ downloads/
32
+ eggs/
33
+ .eggs/
34
+ lib/
35
+ lib64/
36
+ parts/
37
+ sdist/
38
+ var/
39
+ wheels/
40
+ share/python-wheels/
41
+ *.egg-info/
42
+ .installed.cfg
43
+ *.egg
44
+ MANIFEST
45
+
46
+ # PyInstaller
47
+ # Usually these files are written by a python script from a template
48
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
49
+ *.manifest
50
+ *.spec
51
+
52
+ # Installer logs
53
+ pip-log.txt
54
+ pip-delete-this-directory.txt
55
+
56
+ # Unit test / coverage reports
57
+ htmlcov/
58
+ .tox/
59
+ .nox/
60
+ .coverage
61
+ .coverage.*
62
+ .cache
63
+ nosetests.xml
64
+ coverage.xml
65
+ *.cover
66
+ *.py,cover
67
+ .hypothesis/
68
+ .pytest_cache/
69
+ cover/
70
+
71
+ # Translations
72
+ *.mo
73
+ *.pot
74
+
75
+ # Django stuff:
76
+ *.log
77
+ local_settings.py
78
+ db.sqlite3
79
+ db.sqlite3-journal
80
+
81
+ # Flask stuff:
82
+ instance/
83
+ .webassets-cache
84
+
85
+ # Scrapy stuff:
86
+ .scrapy
87
+
88
+ # Sphinx documentation
89
+ docs/_build/
90
+
91
+ # PyBuilder
92
+ .pybuilder/
93
+ target/
94
+
95
+ # Jupyter Notebook
96
+ .ipynb_checkpoints
97
+
98
+ # IPython
99
+ profile_default/
100
+ ipython_config.py
101
+
102
+ # pyenv
103
+ # For a library or package, you might want to ignore these files since the code is
104
+ # intended to run in multiple environments; otherwise, check them in:
105
+ # .python-version
106
+
107
+ # pipenv
108
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
109
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
110
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
111
+ # install all needed dependencies.
112
+ #Pipfile.lock
113
+
114
+ # poetry
115
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
116
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
117
+ # commonly ignored for libraries.
118
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
119
+ #poetry.lock
120
+
121
+ # pdm
122
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
123
+ #pdm.lock
124
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
125
+ # in version control.
126
+ # https://pdm.fming.dev/#use-with-ide
127
+ .pdm.toml
128
+
129
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
130
+ __pypackages__/
131
+
132
+ # Celery stuff
133
+ celerybeat-schedule
134
+ celerybeat.pid
135
+
136
+ # SageMath parsed files
137
+ *.sage.py
138
+
139
+ # Environments
140
+ .env
141
+ .venv
142
+ env/
143
+ venv/
144
+ ENV/
145
+ env.bak/
146
+ venv.bak/
147
+
148
+ # Spyder project settings
149
+ .spyderproject
150
+ .spyproject
151
+
152
+ # Rope project settings
153
+ .ropeproject
154
+
155
+ # mkdocs documentation
156
+ /site
157
+
158
+ # mypy
159
+ .mypy_cache/
160
+ .dmypy.json
161
+ dmypy.json
162
+
163
+ # Pyre type checker
164
+ .pyre/
165
+
166
+ # pytype static type analyzer
167
+ .pytype/
168
+
169
+ # Cython debug symbols
170
+ cython_debug/
171
+
172
+ # PyCharm
173
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
174
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
175
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
176
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
+ .idea/
178
+
179
+ # downloaded model .bin files
180
+ docker/open_llama/*.bin
llama-cpp-python/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "vendor/llama.cpp"]
2
+ path = vendor/llama.cpp
3
+ url = https://github.com/ggerganov/llama.cpp.git
llama-cpp-python/.readthedocs.yaml ADDED
@@ -0,0 +1,24 @@
1
+ # Read the Docs configuration file for MkDocs projects
2
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
+
4
+ # Required
5
+ version: 2
6
+
7
+ # Set the version of Python and other tools you might need
8
+ build:
9
+ os: ubuntu-22.04
10
+ tools:
11
+ python: "3.11"
12
+
13
+ mkdocs:
14
+ configuration: mkdocs.yml
15
+
16
+ python:
17
+ install:
18
+ - method: pip
19
+ path: .
20
+ - requirements: docs/requirements.txt
21
+
22
+ submodules:
23
+ include: all
24
+ recursive: true
llama-cpp-python/CHANGELOG.md ADDED
@@ -0,0 +1,630 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.2.69]
11
+
12
+ - feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
13
+ - feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
14
+ - fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
15
+ - fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
16
+ - fix: UTF-8 handling with grammars by @jsoma in #1415
17
+
18
+ ## [0.2.68]
19
+
20
+ - feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
21
+ - feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
22
+ - fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
23
+
24
+ ## [0.2.67]
25
+
26
+ - fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656
27
+ - fix(ci): Fix bug in use of upload-artifact failing to merge multiple artifacts into a single release by @abetlen in d03f15bb73a1d520970357b702a9e7d4cc2a7a62
28
+
29
+ ## [0.2.66]
30
+
31
+ - feat: Update llama.cpp to ggerganov/llama.cpp@8843a98c2ba97a25e93319a104f9ddfaf83ce4c4
32
+ - feat: Generic Chat Formats, Tool Calling, and Huggingface Pull Support for Multimodal Models (Obsidian, LLaVA1.6, Moondream) by @abetlen in #1147
33
+ - ci(fix): Workflow actions updates and fix arm64 wheels not included in release by @Smartappli in #1392
34
+ - ci: Add support for pre-built cuda 12.4.1 wheels by @Smartappli in #1388
35
+ - feat: Add support for str type kv_overrides by @abetlen in a411612b385cef100d76145da1fbd02a7b7cc894
36
+ - fix: Functionary bug fixes by @jeffrey-fong in #1385
37
+ - examples: fix quantize example by @iyubondyrev in #1387
38
+ - ci: Update dependabot.yml by @Smartappli in #1391
39
+
40
+ ## [0.2.65]
41
+
42
+ - feat: Update llama.cpp to ggerganov/llama.cpp@46e12c4692a37bdd31a0432fc5153d7d22bc7f72
43
+ - feat: Allow for possibly non-pooled embeddings by @iamlemec in #1380
44
+
45
+ ## [0.2.64]
46
+
47
+ - feat: Update llama.cpp to ggerganov/llama.cpp@4e96a812b3ce7322a29a3008db2ed73d9087b176
48
+ - feat: Add `llama-3` chat format by @andreabak in #1371
49
+ - feat: Use new llama_token_is_eog in create_completions by @abetlen in d40a250ef3cfaa8224d12c83776a2f1de96ae3d1
50
+ - feat(server): Provide ability to dynamically allocate all threads if desired using -1 by @sean-bailey in #1364
51
+ - ci: Build arm64 wheels by @gaby in 611781f5319719a3d05fefccbbf0cc321742a026
52
+ - fix: Update scikit-build-core build dependency avoid bug in 0.9.1 by @evelkey in #1370
53
+
54
+ ## [0.2.63]
55
+
56
+ - feat: Update llama.cpp to ggerganov/llama.cpp@0e4802b2ecbaab04b4f829fde4a3096ca19c84b5
57
+ - feat: Add stopping_criteria to ChatFormatter, allow stopping on arbitrary token ids, fixes llama3 instruct by @abetlen in cc81afebf04d26ca1ac3cf72f23f18da6ab58588
58
+
59
+ ## [0.2.62]
60
+
61
+ - feat: Update llama.cpp to ggerganov/llama.cpp@3b8f1ec4b18770531d0b1d792f3edf08254e4f0c
62
+ - feat: update grammar schema converter to match llama.cpp by @themrzmaster in #1353
63
+ - feat: add disable_ping_events flag by @khimaros in #1257
64
+ - feat: Make saved state more compact on-disk by @tc-wolf in #1296
65
+ - feat: Use all available CPUs for batch processing by @ddh0 in #1345
66
+
67
+ ## [0.2.61]
68
+
69
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba5e134e073ec6837078c874aba44a702944a676
70
+ - fix: pass correct type to chat handlers for chat completion logprobs by @abetlen in bb65b4d76411112c6fb0bf759efd746f99ef3c6b
71
+ - feat: Add support for yaml based server configs by @abetlen in 060bfa64d529ade2af9b1f4e207a3937bbc4138f
72
+ - feat: Add typechecking for ctypes structure attributes by @abetlen in 1347e1d050fc5a9a32ffe0bb3e22858da28003bd
73
+
74
+ ## [0.2.60]
75
+
76
+ - feat: Update llama.cpp to ggerganov/llama.cpp@75cd4c77292034ecec587ecb401366f57338f7c0
77
+ - fix: Always embed metal library by @abetlen in b3bfea6dbfb6ed9ce18f9a2723e0a9e4bd1da7ad
78
+ - fix: missing logprobs in response, incorrect response type for functionary by @abetlen in 1ae3abbcc3af7f4a25a3ffc40b246f18039565e8
79
+ - fix(docs): incorrect tool_choice example by @CISC in #1330
80
+
81
+ ## [0.2.59]
82
+
83
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
84
+ - feat: Binary wheels for CPU, CUDA (12.1 - 12.3), Metal by @abetlen, @jllllll, and @oobabooga in #1247
85
+ - fix: segfault when logits_all=False by @abetlen in 8649d7671bd1a7c0d9cc6a5ad91c6ca286512ab3
86
+ - fix: last tokens passing to sample_repetition_penalties function by @ymikhailov in #1295
87
+
88
+ ## [0.2.58]
89
+
90
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba0c7c70ab5b15f1f2be7fb0dfbe0366dda30d6c
91
+ - feat: add support for KV cache quantization options by @Limour-dev in #1307
92
+ - feat: Add logprobs support to chat completions by @windspirit95 in #1311
93
+ - fix: set LLAMA_METAL_EMBED_LIBRARY=on on MacOS arm64 by @bretello in #1289
94
+ - feat: Add tools/functions variables to Jinja2ChatFormatter, add function response formatting for all simple chat formats by @CISC in #1273
95
+ - fix: Changed local API doc references to hosted by by @lawfordp2017 in #1317
96
+
97
+ ## [0.2.57]
98
+
99
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1
100
+ - fix: set default embedding pooling type to unspecified by @abetlen in 4084aabe867b8ec2aba1b22659e59c9318b0d1f3
101
+ - fix: Fix and optimize functionary chat handler by @jeffrey-fong in #1282
102
+ - fix: json mode for basic chat formats by @abetlen in 20e6815252d0efd9f015f7adbf108faaf36e3f3c
103
+
104
+ ## [0.2.56]
105
+
106
+ - feat: Update llama.cpp to ggerganov/llama.cpp@c2101a2e909ac7c08976d414e64e96c90ee5fa9e
107
+ - feat(server): Add endpoints for tokenize, detokenize and count tokens by @felipelo in #1136
108
+ - feat: Switch embed to llama_get_embeddings_seq by @iamlemec in #1263
109
+ - fix: Fixed json strings grammar by blacklisting character control set by @ExtReMLapin in d02a9cf16ff88ad011e2eb1ce29f4d9400f13cd1
110
+ - fix: Check for existence of clip model path by @kejcao in #1264
111
+
112
+ ## [0.2.55]
113
+
114
+ - feat: Update llama.cpp to ggerganov/llama.cpp@9731134296af3a6839cd682e51d9c2109a871de5
115
+ - docs: fix small typo in README: 'model know how' -> 'model knows how' by @boegel in #1244
116
+
117
+ ## [0.2.54]
118
+
119
+ - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a
120
+ - docs: fix typo in README.md embeddings example by @iamlemec in #1232
121
+
122
+ ## [0.2.53]
123
+
124
+ - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a
125
+ - fix: eos/bos_token set correctly for Jinja2ChatFormatter and automatic chat formatter by @CISC in #1230
126
+
127
+ ## [0.2.52]
128
+
129
+ - feat: Update llama.cpp to ggerganov/llama.cpp@a33e6a0d2a66104ea9a906bdbf8a94d050189d91
130
+ - fix: Llava15ChatHandler (this function takes at least 4 arguments) by @abetlen in 8383a9e5620f5df5a88f62da16813eac200dd706
131
+
132
+ ## [0.2.51]
133
+
134
+ - feat: Update llama.cpp to ggerganov/llama.cpp@c39373398803c669056304090050fe3f44b41bf9
135
+ - fix: Restore type hints for low-level api by @abetlen in 19234aa0dbd0c3c87656e65dd2b064665371925b
136
+
137
+ ## [0.2.50]
138
+
139
+ - docs: Update Functionary OpenAI Server Readme by @jeffrey-fong in #1193
140
+ - fix: LlamaHFTokenizer now receives pre_tokens by @abetlen in 47bad30dd716443652275099fa3851811168ff4a
141
+
142
+ ## [0.2.49]
143
+
144
+ - fix: module 'llama_cpp.llama_cpp' has no attribute 'c_uint8' in Llama.save_state by @abetlen in db776a885cd4c20811f22f8bd1a27ecc71dba927
145
+ - feat: Auto detect Mixtral's slightly different format by @lukestanley in #1214
146
+
147
+ ## [0.2.48]
148
+
149
+ - feat: Update llama.cpp to ggerganov/llama.cpp@15499eb94227401bdc8875da6eb85c15d37068f7
150
+ - feat: Add Google's Gemma formatting via chat_format="gemma" by @alvarobartt in #1210
151
+ - feat: support minItems/maxItems in JSON grammar converter by @nopperl in 3921e10770996d95a9eb22c8248bacef39f69365
152
+ - fix: Update from_pretrained defaults to match hf_hub_download and pull to local cache folder by @abetlen in e6d6260a91b7831733f7d1f73c7af46a3e8185ed
153
+ - fix: Raise exceptions when llama model or context fails to load by @abetlen in dd22010e85265ae840c76ec835d67a29ed852722
154
+ - docs: Update README.md to fix pip install llama cpp server by @audip in #1187
155
+
156
+ ## [0.2.47]
157
+
158
+ - feat: Update llama.cpp to ggerganov/llama.cpp@973053d8b0d04809836b3339a50f68d9c842de90
159
+
160
+ ## [0.2.46]
161
+
162
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ba2135ccae7462470b3865c6e41d2e1d734eac05
163
+ - feat: Pull models directly from huggingface by @abetlen in #1206
164
+ - feat(low-level-api): Improve API static type-safety and performance. Low level api functions are positional args only now. by @abetlen in #1205
165
+
166
+ ## [0.2.45]
167
+
168
+ - feat: Update llama.cpp to ggerganov/llama.cpp@89febfed9322c8849520dc63c93ee4f5fd72556e
169
+
170
+ ## [0.2.44]
171
+
172
+ - feat: Update llama.cpp to ggerganov/llama.cpp@4524290e87b8e107cc2b56e1251751546f4b9051
173
+ - fix: create_embedding broken response for input type str by @abetlen in 0ce66bc080fe537590b05b24bf442480bf2dd045
174
+ - fix: Use '\n' separator for EventSourceResponse by @khimaros in #1188
175
+ - fix: Incorporate embedding pooling layer fixes by @iamlemec in #1194
176
+
177
+ ## [0.2.43]
178
+
179
+ - feat: Update llama.cpp to ggerganov/llama.cpp@8084d554406b767d36b3250b3b787462d5dd626f
180
+ - feat: Support batch embeddings by @iamlemec in #1186
181
+ - fix: submodule kompute is not included in sdist by @abetlen in 7dbbfdecadebe7750be650d9409959640ff9a460
182
+ - fix: Update openbuddy prompt format by @abetlen in 07a783779a62a4aac0b11161c7e0eb983ff215f8
183
+
184
+ ## [0.2.42]
185
+
186
+ - feat: Update llama.cpp to ggerganov/llama.cpp@ea9c8e11436ad50719987fa23a289c74b7b40d40
187
+ - fix: sample idx off-by-one error for logit_processors by @lapp0 in #1179
188
+ - fix: chat formatting bugs in `chatml-function-calling` by @abetlen in 4b0e3320bd8c2c209e29978d0b21e2e471cc9ee3 and 68fb71b6a26a1e57331868f959b47ab4b87851e1
189
+
190
+ ## [0.2.41]
191
+
192
+ - feat: Update llama.cpp to ggerganov/llama.cpp@895407f31b358e3d9335e847d13f033491ec8a5b
193
+ - fix: Don't change order of json schema object properties in generated grammar unless prop_order is passed by @abetlen in d1822fed6b706f38bd1ff0de4dec5baaa3cf84fa
194
+
195
+ ## [0.2.40]
196
+
197
+ - feat: Update llama.cpp to ggerganov/llama.cpp@3bdc4cd0f595a6096cca4a64aa75ffa8a3503465
198
+ - feat: Generic chatml Function Calling using `chat_format="chatml-function-calling"` by @abetlen in #957
199
+ - fix: Circular dependency preventing early Llama object free by @notwa in #1176
200
+ - docs: Set the correct command for compiling with syscl support by @akarshanbiswas in #1172
201
+ - feat: use gpu backend for clip if available by @iamlemec in #1175
202
+
203
+ ## [0.2.39]
204
+
205
+ - feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8
206
+ - fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874
207
+
208
+ ## [0.2.38]
209
+
210
+ - feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
211
+ - feat: Add speculative decoding by @abetlen in #1120
212
+ - fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95
213
+
214
+ ## [0.2.37]
215
+
216
+ - feat: Update llama.cpp to ggerganov/llama.cpp@fea4fd4ba7f6b754ac795387b275e1a014a77bde
217
+ - feat: Automatically set chat format from gguf by @abetlen in #1110
218
+
219
+ ## [0.2.36]
220
+
221
+ - feat: Update llama.cpp to ggerganov/llama.cpp@2aed77eb06a329f0d82bb1c467f4244904d4073f
222
+ - feat: Add mistral instruct chat format as "mistral-instruct" by @Rafaelblsilva in #799
223
+
224
+ ## [0.2.35]
225
+
226
+ - feat: Update llama.cpp to ggerganov/llama.cpp@d2f650cb5b04ee2726663e79b47da5efe196ce00
227
+
228
+ ## [0.2.34]
229
+
230
+ - feat: Update llama.cpp to ggerganov/llama.cpp@6db2b41a76ee78d5efdd5c3cddd5d7ad3f646855
231
+ - feat: Add json schema mode by @abetlen in #1122
232
+
233
+ ## [0.2.33]
234
+
235
+ - feat: Update llama.cpp to ggerganov/llama.cpp@faa3526a1eba458120987ed8269e5616385a76f4
236
+ - feat(server): include llama-cpp-python version in openapi spec by @abetlen in cde7514c3d28e6d52f272614e9957208c344dde5
237
+ - fix: use both eos and bos tokens as stop sequences for hf-tokenizer-config chat format. by @abetlen in 5b982d0f8c6f35242c8862ffdce00e17cea0b44f
238
+ - fix: GGUF metadata KV overrides, re #1011 by @phiharri in #1116
239
+ - fix: llama_log_set should be able to accept null pointer by @abetlen in c970d41a85381fd55235136f123422df0bf0c7e7
240
+
241
+ ## [0.2.32]
242
+
243
+ - feat: Update llama.cpp to ggerganov/llama.cpp@504dc37be8446fb09b1ede70300250ad41be32a2
244
+ - fix: from_json_schema oneof/anyof bug by @jndiogo in d3f5528ca8bcb9d69d4f27e21631e911f1fb9bfe
245
+ - fix: pass chat handler not chat formatter for huggingface autotokenizer and tokenizer_config formats by @abetlen in 24f39454e91cf5dddbc4b6041aead4accc7c7a2d
246
+ - feat: Add add_generation_prompt option for jinja2chatformatter by @abetlen in 7f3209b1eb4ad3260ba063801fab80a8c25a2f4c
247
+ - feat: Add Jinja2ChatFormatter by @abetlen in be09318c26add8674ce494ae7cc480cce72a4146
248
+ - feat: Expose gguf model metadata in metadata property by @abetlen in 5a34c57e5479e50c99aba9b38218cc48e6560b81
249
+
250
+ ## [0.2.31]
251
+
252
+ - feat: Update llama.cpp to ggerganov/llama.cpp@a5cacb22b2114fd9adf61c00cbb237384d86bced
253
+ - fix: Mirostat sampling now passes correct type to ctypes and tracks state during generation by @abetlen in 3babe3512cb95743108f2b595210c38ed6f1b904
254
+ - fix: Python3.8 support in server by @abetlen in 141293a75b564a8699e0acba1da24d9aa1cf0ab1
255
+
256
+ ## [0.2.30]
257
+
258
+ - feat: Update llama.cpp to ggerganov/llama.cpp@57e2a7a52a819883f40dada8a2edc24ecf48186b
259
+ - feat(server): Add ability to load chat format from huggingface autotokenizer or tokenizer_config.json files by @abetlen in b8fc1c7d83ad4a9207c707ba1d954fe580286a01
260
+ - feat: Integration of Jinja2 Templating for chat formats by @teleprint-me in #875
261
+ - fix: Offload KQV by default by @abetlen in 48c3b77e6f558a9899de0e1155c7dc0c7958d8e8
262
+ - fix: Support Accept text/event-stream in chat and completion endpoints, resolves #1083 by @aniljava in #1088
263
+ - fix(cli): allow passing n_ctx=0 to openAI API server args to use model n_ctx_train field per #1015 by @K-Mistele in #1093
264
+
265
+ ## [0.2.29]
266
+
267
+ - feat: Update llama.cpp to ggerganov/llama.cpp@4483396751c79dea540808b9cb9238245d06da2b
268
+ - feat: Add split_mode option by @abetlen in 84615adbc6855c8384807c42f0130f9a1763f99d
269
+ - feat: Implement GGUF metadata KV overrides by @phiharri in #1011
270
+ - fix: Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor by @yieldthought in #1012
271
+ - fix: Fix low_level_api_chat_cpp example to match current API by @aniljava in #1086
272
+ - fix: Fix Pydantic model parsing by @DeNeutoy in #1087
273
+
274
+ ## [0.2.28]
275
+
276
+ - feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6
277
+ - feat: Add ability to pass in penalize_nl param by @shankinson in #1068
278
+ - fix: print_grammar to stderr by @turian in #1052
279
+
280
+ ## [0.2.27]
281
+
282
+ - feat: Update llama.cpp to ggerganov/llama.cpp@b3a7c20b5c035250257d2b62851c379b159c899a
283
+ - feat: Add `saiga` chat format by @femoiseev in #1050
284
+ - feat: Added `chatglm3` chat format by @xaviviro in #1059
285
+ - fix: Correct typo in README.md by @qeleb in (#1058)
286
+
287
+ ## [0.2.26]
288
+
289
+ - feat: Update llama.cpp to ggerganov/llama.cpp@f6793491b5af6da75edad34d6f503ef86d31b09f
290
+
291
+ ## [0.2.25]
292
+
293
+ - feat(server): Multi model support by @D4ve-R in #931
294
+ - feat(server): Support none defaulting to infinity for completions by @swg in #111
295
+ - feat(server): Implement openai api compatible authentication by @docmeth2 in #1010
296
+ - fix: text_offset of multi-token characters by @twaka in #1037
297
+ - fix: ctypes bindings for kv override by @phiharri in #1011
298
+ - fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
299
+
300
+ ## [0.2.24]
301
+
302
+ - feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a
303
+ - feat: Add offload_kqv option to llama and server by @abetlen in 095c65000642a3cf73055d7428232fb18b73c6f3
304
+ - feat: n_ctx=0 now uses the n_ctx_train of the model by @DanieleMorotti in #1015
305
+ - feat: logits_to_logprobs supports both 2-D and 3-D logits arrays by @kddubey in #1002
306
+ - fix: Remove f16_kv, add offload_kqv fields in low level and llama apis by @brandonrobertz in #1019
307
+ - perf: Don't convert logprobs arrays to lists by @kddubey in #1021
308
+ - docs: Fix README.md functionary demo typo by @evelynmitchell in #996
309
+ - examples: Update low_level_api_llama_cpp.py to match current API by @jsoma in #1023
310
+
311
+ ## [0.2.23]
312
+
313
+ - Update llama.cpp to ggerganov/llama.cpp@948ff137ec37f1ec74c02905917fa0afc9b97514
314
+ - Add qwen chat format by @yhfgyyf in #1005
315
+ - Add support for running the server with SSL by @rgerganov in #994
316
+ - Replace logits_to_logprobs implementation with numpy equivalent to llama.cpp by @player1537 in #991
317
+ - Fix UnsupportedOperation: fileno in suppress_stdout_stderr by @zocainViken in #961
318
+ - Add Pygmalion chat format by @chiensen in #986
319
+ - README.md multimodal params fix by @zocainViken in #967
320
+ - Fix minor typo in README by @aniketmaurya in #958
321
+
322
+ ## [0.2.22]
323
+
324
+ - Update llama.cpp to ggerganov/llama.cpp@8a7b2fa528f130631a5f43648481596ab320ed5a
325
+ - Fix conflict with transformers library by kddubey in #952
326
+
327
+ ## [0.2.21]
328
+
329
+ - Update llama.cpp to ggerganov/llama.cpp@64e64aa2557d97490b2fe1262b313e2f4a1607e3
330
+ - Make building llava optional by setting `CMAKE_ARGS="-DLLAVA_BUILD=OFF"` and using `LLAVA_CPP_LIB` to specify alternative path to shared library by @abetlen in e3941d9c674dbd9891dc3ceda390daeb21f05fd1
331
+
332
+ ## [0.2.20]
333
+
334
+ - Update llama.cpp to ggerganov/llama.cpp@b38a16dfcff88d547f78f52d1bea31b84a05aff7
335
+ - Add `zephyr` chat format by @fakerybakery in #938
336
+ - Add `baichuan` chat format by @caiyesd in #938
337
+ - Add `baichuan-2` chat format by @caiyesd in #936
338
+ - Improve documentation for server chat formats by @jooray in #934
339
+ - Fix typo in README by @antonvice in 940
340
+ - Fix typo in the Open Orca chat format by @gardner in #947
341
+
342
+ ## [0.2.19]
343
+
344
+ - Update llama.cpp to ggerganov/llama.cpp@0b871f1a04ef60e114bbe43004fd9c21114e802d
345
+ - Fix #569: stop parameter in chat completion api should accept str by @abetlen in 128dc4731fa846ead7e684a137ca57d8931b8899
346
+ - Document server host and port parameters by @jamesbraza in #768
347
+ - Do not set grammar to None when initializing LlamaGrammar by @mthuurne in #834
348
+ - Add mistrallite, intel, and openchat formats by @fakerybakery in #927
349
+ - Add support for min_p parameter by @tk-master in #921
350
+ - Fix #929: tokenizer adding leading space when generating from empty prompt by @abetlen in a34d48014192771d2e308a76c22f33bc0318d983
351
+ - Fix low level api example by @zocainViken in #925
352
+ - Fix missing package in openblas docker image by @ZisisTsatsas in #920
353
+
354
+ ## [0.2.18]
355
+
356
+ - Update llama.cpp to ggerganov/llama.cpp@6bb4908a17150b49373b5f977685b2e180a04f6f
357
+
358
+ ## [0.2.17]
359
+
360
+ - Update llama.cpp to ggerganov/llama.cpp@df9d1293defe783f42bc83af732d3c670552c541
361
+ - Hotfix: Set `CUDA_ARCHITECTURES=OFF` for `llava_shared` target on Windows by @abetlen in 4388f3341413110217b98c4f097ac5c590bdf40b
362
+
363
+ ## [0.2.16]
364
+
365
+ - Update llama.cpp to ggerganov/llama.cpp@a75fa576abba9d37f463580c379e4bbf1e1ad03c
366
+ - Add `set_seed` to `Llama` class by @abetlen in fd41ed3a908761d286102a019a34c2938a15118d
367
+ - Fix server doc arguments by @kjunggithub in #892
368
+ - Fix response_format handler in llava chat handler by @abetlen in b62c44983921197ed10a7d29dc4ba920e9979380
369
+ - Fix default max_tokens, chat completion is now unlimited (to context length) and completion is 16 tokens to match OpenAI defaults by @abetlen in e7962d2c733cbbeec5a37392c81f64185a9a39e8
370
+ - Fix json_schema_to_gbnf helper so that it takes a json schema string as input instead by @abetlen in faeae181b1e868643c0dc28fcf039f077baf0829
371
+ - Add support for $ref and $def in json_schema_to_gbnf to handle more complex function schemas by @abetlen in 770df344369c0630df1be14be9f9e301e7c56d24
372
+ - Update functionary chat handler for new OpenAI api by abetlen in 1b376c62b775b401653facf25a519d116aafe99a
373
+ - Fix add default stop sequence to chatml chat format by @abetlen in b84d76a844149216d511cfd8cdb9827148a1853c
374
+ - Fix sampling bug when logits_all=False by @abetlen in 6f0b0b1b840af846938ed74d0e8170a91c40e617
375
+
376
+ ## [0.2.15]
377
+
378
+ - Update llama.cpp to ggerganov/llama.cpp@0a7c980b6f94a049cb804573df2d8092a34df8e4
379
+ - Add support for Llava1.5 multimodal models by @damian0815 and @abetlen in #821
380
+ - Update OpenAI API compatibility to match dev day update by @abetlen in #821
381
+ - Add seed parameter to completion and chat_completion functions of Llama class by @abetlen in 86aeb9f3a14808575d2bb0076e6acb4a30907e6a
382
+ - Add JSON mode support to constrain chat completion to JSON objects by @abetlen in b30b9c338bf9af316d497ea501d39f5c246900db
383
+
384
+ ## [0.2.14]
385
+
386
+ - Update llama.cpp to ggerganov/llama.cpp@f0b30ef7dc1360922ccbea0a8cd3918ecf15eaa7
387
+ - Add support for Huggingface Autotokenizer Chat Formats by @bioshazard and @abetlen in #790 and bbffdaebaa7bb04b543dbf683a07276087251f86
388
+ - Fix llama-2 chat format by @earonesty in #869
389
+ - Add support for functionary chat format by @abetlen in #784
390
+ - Migrate inference from deprecated `llama_eval` API to `llama_batch` and `llama_decode` by @abetlen in #795
391
+
392
+ ## [0.2.13]
393
+
394
+ - Update llama.cpp to ggerganov/llama.cpp@51b2fc11f7f605fff49725a4540e9a6ef7b51b70
395
+ - Fix name 'open' is not defined exception when deleting model by @abetlen in 011b95d7f34cbfc528af75a892757bd9a20838ab
396
+ - Fix tokenization of special characters by @antoine-lizee in #850
397
+
398
+ ## [0.2.12]
399
+
400
+ - Update llama.cpp to ggerganov/llama.cpp@50337961a678fce4081554b24e56e86b67660163
401
+ - Fix missing `n_seq_id` in `llama_batch` by @NickAlgra in #842
402
+ - Fix for shared libraries on Windows that start with `lib` prefix by @sujeendran in #848
403
+ - Fix exception raised in `__del__` when freeing models by @cebtenzzre in #846
404
+ - Performance improvement for logit bias by @zolastro in #851
405
+ - Fix suffix check arbitrary code execution bug by @mtasic85 in #854
406
+ - Fix typo in `function_call` parameter in `llama_types.py` by @akatora28 in #849
407
+ - Fix streaming not returning `finish_reason` by @gmcgoldr in #798
408
+ - Fix `n_gpu_layers` check to allow values less than 1 for server by @hxy9243 in #826
409
+ - Suppress stdout and stderr when freeing model by @paschembri in #803
410
+ - Fix `llama2` chat format by @delock in #808
411
+ - Add validation for tensor_split size by @eric1932 #820
412
+ - Print stack trace on server error by @abetlen in d6a130a052db3a50975a719088a9226abfebb266
413
+ - Update docs for gguf by @johnccshen in #783
414
+ - Add `chatml` chat format by @abetlen in 305482bd4156c70802fc054044119054806f4126
415
+
416
+ ## [0.2.11]
417
+
418
+ - Fix bug in `llama_model_params` object has no attribute `logits_all` by @abetlen in d696251fbe40015e8616ea7a7d7ad5257fd1b896
419
+
420
+ ## [0.2.10]
421
+
422
+ - Fix bug 'llama_model_params' object has no attribute 'embedding' by @abetlen in 42bb721d64d744242f9f980f2b89d5a6e335b5e4
423
+
424
+ ## [0.2.9]
425
+
426
+ - Fix critical bug in pip installation of v0.2.8 due to `.git` directory in ac853e01e1a217a578080a4e1b851d2d08450adf
427
+
428
+ ## [0.2.8]
429
+
430
+ - Update llama.cpp to ggerganov/llama.cpp@40e07a60f9ce06e79f3ccd4c903eba300fb31b5e
431
+ - Add configurable chat formats by @abetlen in #711
432
+ - Fix rope scaling bug by @Josh-XT in #767
433
+ - Fix missing numa parameter in server by @abetlen in d9bce17794d0dd6f7962d10aad768fedecf3ab89
434
+
435
+ ## [0.2.7]
436
+
437
+ - Update llama.cpp to ggerganov/llama.cpp@a98b1633d5a94d0aa84c7c16e1f8df5ac21fc850
438
+ - Install required runtime dlls to package directory on windows by @abetlen in 8d75016549e2ff62a511b1119d966ffc0df5c77b
439
+ - Add openai-processing-ms to server response header by @Tradunsky in #748
440
+ - Bump minimum version of scikit-build-core to 0.5.1 to fix msvc cmake issue by @abetlen in 1ed0f3ebe16993a0f961155aa4b2c85f1c68f668
441
+ - Update `llama_types.py` to better match the openai api, old names are aliased to new ones by @abetlen in dbca136feaaf7f8b1182c4c3c90c32918b1d0bb3
442
+
443
+ ## [0.2.6]
444
+
445
+ - Update llama.cpp to 80291a1d02a07f7f66666fb576c5b1e75aa48b46
446
+
447
+ ## [0.2.5]
448
+
449
+ - Fix docker images missing starlette-context dependency by @abetlen in 22917989003c5e67623d54ab45affa1e0e475410
450
+ - Fix loading dll in Windows Isolation Containers by @abetlen in 847466562573191efa655753d9252f308c4fbdb0
451
+ - Fix build issue on m1 macs by @abetlen in dbd3a6d1ed8416a8fd800127251e730153afa305
452
+ - Update docs to gguf and add hw acceleration docs for server by @jasonacox in #688
453
+
454
+ ## [0.2.4]
455
+
456
+ - Add NUMA support. **NOTE** low level api users must call llama_backend_init at the start of their programs by abetlen in f4090a0bb2a2a25acfe28d31c82cc1aa273bedee
457
+ - Fix tensor_split server cli argument by @abetlen in c4c440ba2dc86d9de728a751311fdd1c8e3756fa
458
+ - Made all `Llama` init parameters into keyword-only parameters by @abetlen in c8f9b8a734b5b040379bbd93995ba177affab1fe
459
+ - Added server params for `low_vram`, `main_gpu`, `lora_base`, and `lora_path` by @abetlen in 2920c4bf7ee1412d6bba7846e0e1b7ef6d34043b
460
+ - Removed server params for `rms_norm_eps` and `n_gqa` by @abetlen in 2920c4bf7ee1412d6bba7846e0e1b7ef6d34043b
461
+ - Fix boolean cli options by @abetlen in c999325e8e4507f6c6249dd2fb8de7f8bf57f71e and 0449d29b9f940e437231a07b9d56550226558bac
462
+ - Silence Pydantic Settings warnings about `model_alias` setting by @earonesty in #705
463
+
464
+ ## [0.2.3]
465
+
466
+ - Update llama.cpp to ggerganov/llama.cpp@71ca2fad7d6c0ef95ef9944fb3a1a843e481f314
467
+ - Add X-Request-ID request header for mirroring custom IDs by @devrimcavusoglu in #703
468
+ - Add pyproject extra for scikit-build-core to ensure compatible pathspec version by @abetlen in 6cfc54284b99ef1bff8193e2d5e483dbd89ada02
469
+ - Fix issue with Literal and Optional cli arguments not working by @abetlen in #702
470
+
471
+ ## [0.2.2]
472
+
473
+ - Fix bug in pip install of v0.2.1 due to scikit-build-core removing all `.metal` files in the source distribution (see #701)
474
+
475
+ ## [0.2.1]
476
+
477
+ - Fix bug in pip install of v0.2.0 due to .git folder being included in the source distribution (see #701)
478
+
479
+ ## [0.2.0]
480
+
481
+ - Migrated to scikit-build-core build system by @abetlen in #499
482
+ - Use `numpy` views for `LogitsProcessor` and `StoppingCriteria` instead of python lists by @abetlen in #499
483
+ - Drop support for end-of-life Python3.7 by @abetlen in #499
484
+ - Convert low level `llama.cpp` constants to use basic python types instead of `ctypes` types by @abetlen in #499
485
+
486
+ ## [0.1.85]
487
+
488
+ - Add `llama_cpp.__version__` attribute by @janvdp in #684
489
+ - Fix low level api examples by @jbochi in #680
490
+
491
+ ## [0.1.84]
492
+
493
+ - Update llama.cpp
494
+
495
+ ## [0.1.83]
496
+
497
+ - Update llama.cpp
498
+
499
+ ## [0.1.82]
500
+
501
+ - Update llama.cpp
502
+
503
+ ## [0.1.81]
504
+
505
+ - Update llama.cpp
506
+
507
+ ## [0.1.80]
508
+
509
+ - Update llama.cpp
510
+
511
+ ## [0.1.79]
512
+
513
+ - GGUF Support (breaking change requiring new model format)
514
+
515
+ ## [0.1.78]
516
+
517
+ - Grammar based sampling via LlamaGrammar which can be passed to completions
518
+ - Make n_gpu_layers == -1 offload all layers
519
+
520
+ ## [0.1.77]
521
+
522
+ - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
523
+ - (server) Add temporary n_gqa and rms_norm_eps parameters required for LLaMa 2 70B
524
+
525
+ ## [0.1.76]
526
+
527
+ - (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
528
+
529
+ ## [0.1.75]
530
+
531
+ - Update llama.cpp
532
+
533
+ ## [0.1.74]
534
+
535
+ - (server) OpenAI style error responses
536
+
537
+ ## [0.1.73]
538
+
539
+ - (server) Add rope parameters to server settings
540
+
541
+ ## [0.1.72]
542
+
543
+ - (llama.cpp) Update llama.cpp added custom_rope for extended context lengths
544
+
545
+ ## [0.1.71]
546
+
547
+ - (llama.cpp) Update llama.cpp
548
+
549
+ - (server) Fix several pydantic v2 migration bugs
550
+
551
+ ## [0.1.70]
552
+
553
+ - (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion`
554
+ - (server) Fixed changed settings field names from pydantic v2 migration
555
+
556
+ ## [0.1.69]
557
+
558
+ - (server) Streaming requests can now be interrupted prematurely when a concurrent request is made. This can be controlled with the `interrupt_requests` setting.
559
+ - (server) Moved to fastapi v0.100.0 and pydantic v2
560
+ - (docker) Added a new "simple" image that builds llama.cpp from source when started.
561
+ - (server) performance improvements by avoiding unnecessary memory allocations during sampling
562
+
563
+ ## [0.1.68]
564
+
565
+ - (llama.cpp) Update llama.cpp
566
+
567
+ ## [0.1.67]
568
+
569
+ - Fix performance bug in Llama model by pre-allocating memory tokens and logits.
570
+ - Fix bug in Llama model where the model was not free'd after use.
571
+
572
+ ## [0.1.66]
573
+
574
+ - (llama.cpp) New model API
575
+
576
+ - Performance issue during eval caused by looped np.concatenate call
577
+ - State pickling issue when saving cache to disk
578
+
579
+ ## [0.1.65]
580
+
581
+ - (llama.cpp) Fix struct misalignment bug
582
+
583
+ ## [0.1.64]
584
+
585
+ - (llama.cpp) Update llama.cpp
586
+ - Fix docs for seed. Set -1 for random.
587
+
588
+ ## [0.1.63]
589
+
590
+ - (llama.cpp) Add full gpu utilisation in CUDA
591
+ - (llama.cpp) Add get_vocab
592
+ - (llama.cpp) Add low_vram parameter
593
+ - (server) Add logit_bias parameter
594
+
595
+ ## [0.1.62]
596
+
597
+ - Metal support working
598
+ - Cache re-enabled
599
+
600
+ ## [0.1.61]
601
+
602
+ - Fix broken pip installation
603
+
604
+ ## [0.1.60]
605
+
606
+ NOTE: This release was deleted due to a bug with the packaging system that caused pip installations to fail.
607
+
608
+ - Truncate max_tokens in create_completion so the requested tokens don't exceed the context size.
609
+ - Temporarily disable cache for completion requests
610
+
611
+ ## [v0.1.59]
612
+
613
+ - (llama.cpp) k-quants support
614
+ - (server) mirostat sampling parameters to server
615
+ - Support both `.so` and `.dylib` for `libllama` on MacOS
616
+
617
+ ## [v0.1.58]
618
+
619
+ - (llama.cpp) Metal Silicon support
620
+
621
+ ## [v0.1.57]
622
+
623
+ - (llama.cpp) OpenLlama 3B support
624
+
625
+ ## [v0.1.56]
626
+
627
+ - (misc) Added first version of the changelog
628
+ - (server) Use async routes
629
+ - (python-api) Use numpy for internal buffers to reduce memory usage and improve performance.
630
+ - (python-api) Performance bug in stop sequence check slowing down streaming.
llama-cpp-python/CMakeLists.txt ADDED
@@ -0,0 +1,87 @@
1
+ cmake_minimum_required(VERSION 3.21)
2
+
3
+ project(llama_cpp)
4
+
5
+ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
6
+ option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
7
+
8
+ if (LLAMA_BUILD)
9
+ set(BUILD_SHARED_LIBS "On")
10
+
11
+ # Building llama
12
+ if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
13
+ # Need to disable these llama.cpp flags on Apple x86_64,
14
+ # otherwise users may encounter invalid instruction errors
15
+ set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
16
+ set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
17
+ set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
18
+ set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
19
+ endif()
20
+
21
+ if (APPLE)
22
+ set(LLAMA_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
23
+ endif()
24
+
25
+ add_subdirectory(vendor/llama.cpp)
26
+ install(
27
+ TARGETS llama
28
+ LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
29
+ RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
30
+ ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
31
+ FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
32
+ RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
33
+ )
34
+ # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
35
+ install(
36
+ TARGETS llama
37
+ LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
38
+ RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
39
+ ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
40
+ FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
41
+ RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
42
+ )
43
+ # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
44
+ install(
45
+ FILES $<TARGET_RUNTIME_DLLS:llama>
46
+ DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
47
+ )
48
+ install(
49
+ FILES $<TARGET_RUNTIME_DLLS:llama>
50
+ DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
51
+ )
52
+
53
+ if (LLAVA_BUILD)
54
+ if (LLAMA_CUBLAS)
55
+ add_compile_definitions(GGML_USE_CUBLAS)
56
+ endif()
57
+
58
+ if (LLAMA_METAL)
59
+ add_compile_definitions(GGML_USE_METAL)
60
+ endif()
61
+
62
+ # Building llava
63
+ add_subdirectory(vendor/llama.cpp/examples/llava)
64
+ set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
65
+ # Set CUDA_ARCHITECTURES to OFF on windows
66
+ if (WIN32)
67
+ set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
68
+ endif()
69
+ install(
70
+ TARGETS llava_shared
71
+ LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
72
+ RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
73
+ ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
74
+ FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
75
+ RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
76
+ )
77
+ # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
78
+ install(
79
+ TARGETS llava_shared
80
+ LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
81
+ RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
82
+ ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
83
+ FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
84
+ RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
85
+ )
86
+ endif()
87
+ endif()
llama-cpp-python/LICENSE.md ADDED
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Andrei Betlen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
llama-cpp-python/Makefile ADDED
@@ -0,0 +1,82 @@
1
+ update:
2
+ poetry install
3
+ git submodule update --init --recursive
4
+
5
+ update.vendor:
6
+ cd vendor/llama.cpp && git pull origin master
7
+
8
+ deps:
9
+ python3 -m pip install --upgrade pip
10
+ python3 -m pip install -e ".[all]"
11
+
12
+ build:
13
+ python3 -m pip install --verbose -e .
14
+
15
+ build.debug:
16
+ CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
17
+
18
+ build.cuda:
19
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
20
+
21
+ build.opencl:
22
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .
23
+
24
+ build.openblas:
25
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" python3 -m pip install --verbose -e .
26
+
27
+ build.blis:
28
+ CMAKE_ARGS="-DLLAMA_BLAS=on -DLLAMA_BLAS_VENDOR=FLAME" python3 -m pip install --verbose -e .
29
+
30
+ build.metal:
31
+ CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install --verbose -e .
32
+
33
+ build.vulkan:
34
+ CMAKE_ARGS="-DLLAMA_VULKAN=on" python3 -m pip install --verbose -e .
35
+
36
+ build.kompute:
37
+ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" python3 -m pip install --verbose -e .
38
+
39
+ build.sycl:
40
+ CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
41
+
42
+ build.sdist:
43
+ python3 -m build --sdist
44
+
45
+ deploy.pypi:
46
+ python3 -m twine upload dist/*
47
+
48
+ deploy.gh-docs:
49
+ mkdocs build
50
+ mkdocs gh-deploy
51
+
52
+ test:
53
+ python3 -m pytest
54
+
55
+ docker:
56
+ docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile .
57
+
58
+ run-server:
59
+ uvicorn --factory llama.server:app --host ${HOST} --port ${PORT}
60
+
61
+ clean:
62
+ - cd vendor/llama.cpp && make clean
63
+ - cd vendor/llama.cpp && rm libllama.so
64
+ - rm -rf _skbuild
65
+ - rm llama_cpp/*.so
66
+ - rm llama_cpp/*.dylib
67
+ - rm llama_cpp/*.metal
68
+ - rm llama_cpp/*.dll
69
+ - rm llama_cpp/*.lib
70
+
71
+ .PHONY: \
72
+ update \
73
+ update.vendor \
74
+ build \
75
+ build.cuda \
76
+ build.opencl \
77
+ build.openblas \
78
+ build.sdist \
79
+ deploy.pypi \
80
+ deploy.gh-docs \
81
+ docker \
82
+ clean
llama-cpp-python/README.md ADDED
@@ -0,0 +1,792 @@
1
+ # 🦙 Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
2
+
3
+ [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
4
+ [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
5
+ [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
6
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
7
+ [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
8
+ [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
9
+ [![Github All Releases](https://img.shields.io/github/downloads/abetlen/llama-cpp-python/total.svg?label=Github%20Downloads)]()
10
+
11
+ Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library.
12
+ This package provides:
13
+
14
+ - Low-level access to C API via `ctypes` interface.
15
+ - High-level Python API for text completion
16
+ - OpenAI-like API
17
+ - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
18
+ - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
19
+ - OpenAI compatible web server
20
+ - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
21
+ - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
22
+ - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
23
+ - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
24
+
25
+ Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
26
+
27
+ ## Installation
28
+
29
+ Requirements:
30
+
31
+ - Python 3.8+
32
+ - C compiler
33
+ - Linux: gcc or clang
34
+ - Windows: Visual Studio or MinGW
35
+ - MacOS: Xcode
36
+
37
+ To install the package, run:
38
+
39
+ ```bash
40
+ pip install llama-cpp-python
41
+ ```
42
+
43
+ This will also build `llama.cpp` from source and install it alongside this python package.
44
+
45
+ If this fails, add `--verbose` to the `pip install` command to see the full cmake build log.
46
+
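+ For example, one way to re-run the install with the full build output visible:
+
+ ```bash
+ # Re-run the install with verbose output to inspect the cmake build log
+ pip install llama-cpp-python --verbose
+ ```
+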
47
+ **Pre-built Wheel (New)**
48
+
49
+ It is also possible to install a pre-built wheel with basic CPU support.
50
+
51
+ ```bash
52
+ pip install llama-cpp-python \
53
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
54
+ ```
55
+
56
+ ### Installation Configuration
57
+
58
+ `llama.cpp` supports a number of hardware acceleration backends to speed up inference as well as backend specific options. See the [llama.cpp README](https://github.com/ggerganov/llama.cpp#build) for a full list.
59
+
60
+ All `llama.cpp` cmake build options can be set via the `CMAKE_ARGS` environment variable or via the `--config-settings / -C` cli flag during installation.
61
+
62
+ <details open>
63
+ <summary>Environment Variables</summary>
64
+
65
+ ```bash
66
+ # Linux and Mac
67
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" \
68
+ pip install llama-cpp-python
69
+ ```
70
+
71
+ ```powershell
72
+ # Windows
73
+ $env:CMAKE_ARGS = "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
74
+ pip install llama-cpp-python
75
+ ```
76
+ </details>
77
+
78
+ <details>
79
+ <summary>CLI / requirements.txt</summary>
80
+
81
+ They can also be set via the `pip install -C / --config-settings` command and saved to a `requirements.txt` file:
82
+
83
+ ```bash
84
+ pip install --upgrade pip # ensure pip is up to date
85
+ pip install llama-cpp-python \
86
+ -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
87
+ ```
88
+
89
+ ```txt
90
+ # requirements.txt
91
+
92
+ llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS"
93
+ ```
94
+
95
+ </details>
96
+
97
+ ### Supported Backends
98
+
99
+ Below are some common backends, their build commands and any additional environment variables required.
100
+
101
+ <details open>
102
+ <summary>OpenBLAS (CPU)</summary>
103
+
104
+ To install with OpenBLAS, set the `LLAMA_BLAS` and `LLAMA_BLAS_VENDOR` environment variables before installing:
105
+
106
+ ```bash
107
+ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
108
+ ```
109
+ </details>
110
+
111
+ <details>
112
+ <summary>CUDA</summary>
113
+
114
+ To install with CUDA support, set the `LLAMA_CUDA=on` environment variable before installing:
115
+
116
+ ```bash
117
+ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
118
+ ```
119
+
120
+ **Pre-built Wheel (New)**
121
+
122
+ It is also possible to install a pre-built wheel with CUDA support, as long as your system meets the following requirements:
123
+
124
+ - CUDA Version is 12.1, 12.2, 12.3, or 12.4
125
+ - Python Version is 3.10, 3.11 or 3.12
126
+
127
+ ```bash
128
+ pip install llama-cpp-python \
129
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/<cuda-version>
130
+ ```
131
+
132
+ Where `<cuda-version>` is one of the following:
133
+ - `cu121`: CUDA 12.1
134
+ - `cu122`: CUDA 12.2
135
+ - `cu123`: CUDA 12.3
136
+ - `cu124`: CUDA 12.4
137
+
138
+ For example, to install the CUDA 12.1 wheel:
139
+
140
+ ```bash
141
+ pip install llama-cpp-python \
142
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
143
+ ```
144
+
145
+ </details>
146
+
147
+ <details>
148
+ <summary>Metal</summary>
149
+
150
+ To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing:
151
+
152
+ ```bash
153
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
154
+ ```
155
+
156
+ **Pre-built Wheel (New)**
157
+
158
+ It is also possible to install a pre-built wheel with Metal support, as long as your system meets the following requirements:
159
+
160
+ - MacOS Version is 11.0 or later
161
+ - Python Version is 3.10, 3.11 or 3.12
162
+
163
+ ```bash
164
+ pip install llama-cpp-python \
165
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
166
+ ```
167
+
168
+ </details>
169
+ <details>
170
+
171
+ <summary>CLBlast (OpenCL)</summary>
172
+
173
+ To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
174
+
175
+ ```bash
176
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
177
+ ```
178
+
179
+ </details>
180
+
181
+ <details>
182
+ <summary>hipBLAS (ROCm)</summary>
183
+
184
+ To install with hipBLAS / ROCm support for AMD cards, set the `LLAMA_HIPBLAS=on` environment variable before installing:
185
+
186
+ ```bash
187
+ CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
188
+ ```
189
+
190
+ </details>
191
+
192
+ <details>
193
+ <summary>Vulkan</summary>
194
+
195
+ To install with Vulkan support, set the `LLAMA_VULKAN=on` environment variable before installing:
196
+
197
+ ```bash
198
+ CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python
199
+ ```
200
+
201
+ </details>
202
+
203
+ <details>
204
+ <summary>Kompute</summary>
205
+
206
+ To install with Kompute support, set the `LLAMA_KOMPUTE=on` environment variable before installing:
207
+
208
+ ```bash
209
+ CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
210
+ ```
211
+ </details>
212
+
213
+ <details>
214
+ <summary>SYCL</summary>
215
+
216
+ To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
217
+
218
+ ```bash
219
+ source /opt/intel/oneapi/setvars.sh
220
+ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python
221
+ ```
222
+ </details>
223
+
224
+
225
+ ### Windows Notes
226
+
227
+ <details>
228
+ <summary>Error: Can't find 'nmake' or 'CMAKE_C_COMPILER'</summary>
229
+
230
+ If you run into issues where the build complains that it can't find `'nmake'` or `CMAKE_C_COMPILER`, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those paths manually to `CMAKE_ARGS` before running `pip install`:
231
+
232
+ ```ps
233
+ $env:CMAKE_GENERATOR = "MinGW Makefiles"
234
+ $env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
235
+ ```
236
+
237
+ See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
238
+ </details>
239
+
240
+ ### MacOS Notes
241
+
242
+ Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](https://llama-cpp-python.readthedocs.io/en/latest/install/macos/)
243
+
244
+ <details>
245
+ <summary>M1 Mac Performance Issue</summary>
246
+
247
+ Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
248
+
249
+ ```bash
250
+ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
251
+ bash Miniforge3-MacOSX-arm64.sh
252
+ ```
253
+
254
+ Otherwise, the installation will build the x86 version of llama.cpp, which will be 10x slower on an Apple Silicon (M1) Mac.
255
+ </details>
256
+
257
+ <details>
258
+ <summary>M Series Mac Error: `(mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))`</summary>
259
+
260
+ Try installing with
261
+
262
+ ```bash
263
+ CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DLLAMA_METAL=on" pip install --upgrade --verbose --force-reinstall --no-cache-dir llama-cpp-python
264
+ ```
265
+ </details>
266
+
267
+ ### Upgrading and Reinstalling
268
+
269
+ To upgrade and rebuild `llama-cpp-python` add `--upgrade --force-reinstall --no-cache-dir` flags to the `pip install` command to ensure the package is rebuilt from source.
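+
+ For example (add your usual `CMAKE_ARGS` to this command if you built with a GPU or BLAS backend):
+
+ ```bash
+ pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir
+ ```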
270
+
271
+ ## High-level API
272
+
273
+ [API Reference](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#high-level-api)
274
+
275
+ The high-level API provides a simple managed interface through the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
276
+
277
+ Below is a short example demonstrating how to use the high-level API for basic text completion:
278
+
279
+ ```python
280
+ from llama_cpp import Llama
281
+
282
+ llm = Llama(
283
+ model_path="./models/7B/llama-model.gguf",
284
+ # n_gpu_layers=-1, # Uncomment to use GPU acceleration
285
+ # seed=1337, # Uncomment to set a specific seed
286
+ # n_ctx=2048, # Uncomment to increase the context window
287
+ )
288
+ output = llm(
289
+ "Q: Name the planets in the solar system? A: ", # Prompt
290
+ max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
291
+ stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
292
+ echo=True # Echo the prompt back in the output
293
+ ) # Generate a completion, can also call create_completion
294
+ print(output)
295
+ ```
296
+
297
+ By default `llama-cpp-python` generates completions in an OpenAI compatible format:
298
+
299
+ ```python
300
+ {
301
+ "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
302
+ "object": "text_completion",
303
+ "created": 1679561337,
304
+ "model": "./models/7B/llama-model.gguf",
305
+ "choices": [
306
+ {
307
+ "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
308
+ "index": 0,
309
+ "logprobs": None,
310
+ "finish_reason": "stop"
311
+ }
312
+ ],
313
+ "usage": {
314
+ "prompt_tokens": 14,
315
+ "completion_tokens": 28,
316
+ "total_tokens": 42
317
+ }
318
+ }
319
+ ```
320
+
321
+ Text completion is available through the [`__call__`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__) and [`create_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_completion) methods of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
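+
+ Both methods also accept `stream=True` to yield partial results as they are generated instead of a single response. A minimal sketch (the model path is a placeholder):
+
+ ```python
+ from llama_cpp import Llama
+
+ llm = Llama(model_path="./models/7B/llama-model.gguf")
+
+ # Stream the completion token by token; each chunk follows the OpenAI streaming format
+ stream = llm.create_completion(
+     "Q: Name the planets in the solar system? A: ",
+     max_tokens=32,
+     stop=["Q:", "\n"],
+     stream=True,
+ )
+ for chunk in stream:
+     print(chunk["choices"][0]["text"], end="", flush=True)
+ ```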
322
+
323
+ ### Pulling models from Hugging Face Hub
324
+
325
+ You can download `Llama` models in `gguf` format directly from Hugging Face using the [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) method.
326
+ You'll need to install the `huggingface-hub` package to use this feature (`pip install huggingface-hub`).
327
+
328
+ ```python
329
+ llm = Llama.from_pretrained(
330
+ repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
331
+ filename="*q8_0.gguf",
332
+ verbose=False
333
+ )
334
+ ```
335
+
336
+ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.from_pretrained) will download the model to the huggingface cache directory; you can then manage installed model files with the [`huggingface-cli`](https://huggingface.co/docs/huggingface_hub/en/guides/cli) tool.
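+
+ For example, you can inspect and clean up the cache from the command line (a short sketch; see the linked `huggingface-cli` guide for the full set of commands):
+
+ ```bash
+ # List models currently stored in the Hugging Face cache
+ huggingface-cli scan-cache
+
+ # Interactively select cached revisions to delete
+ huggingface-cli delete-cache
+ ```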
337
+
338
+ ### Chat Completion
339
+
340
+ The high-level API also provides a simple interface for chat completion.
341
+
342
+ Chat completion requires that the model knows how to format the messages into a single prompt.
343
+ The `Llama` class does this using pre-registered chat formats (e.g. `chatml`, `llama-2`, `gemma`, etc.) or by providing a custom chat handler object.
344
+
345
+ The model will format the messages into a single prompt using the following order of precedence:
346
+ - Use the `chat_handler` if provided
347
+ - Use the `chat_format` if provided
348
+ - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this)
349
+ - else, fall back to the `llama-2` chat format
350
+
351
+ Set `verbose=True` to see the selected chat format.
352
+
353
+ ```python
354
+ from llama_cpp import Llama
355
+ llm = Llama(
356
+ model_path="path/to/llama-2/llama-model.gguf",
357
+ chat_format="llama-2"
358
+ )
359
+ llm.create_chat_completion(
360
+ messages = [
361
+ {"role": "system", "content": "You are an assistant who perfectly describes images."},
362
+ {
363
+ "role": "user",
364
+ "content": "Describe this image in detail please."
365
+ }
366
+ ]
367
+ )
368
+ ```
369
+
370
+ Chat completion is available through the [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion) method of the [`Llama`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) class.
371
+
372
+ For OpenAI API v1 compatibility, you can use the [`create_chat_completion_openai_v1`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion_openai_v1) method, which will return pydantic models instead of dicts.
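+
+ A minimal sketch of the pydantic-style access (this assumes the `openai` package is installed, since it provides the response types, and the model path is a placeholder):
+
+ ```python
+ from llama_cpp import Llama
+
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
+ response = llm.create_chat_completion_openai_v1(
+     messages=[{"role": "user", "content": "Say hello in one word."}]
+ )
+ # Pydantic models are accessed via attributes rather than dict keys
+ print(response.choices[0].message.content)
+ ```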
373
+
374
+
375
+ ### JSON and JSON Schema Mode
376
+
377
+ To constrain chat responses to only valid JSON or a specific JSON Schema use the `response_format` argument in [`create_chat_completion`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_chat_completion).
378
+
379
+ #### JSON Mode
380
+
381
+ The following example will constrain the response to valid JSON strings only.
382
+
383
+ ```python
384
+ from llama_cpp import Llama
385
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
386
+ llm.create_chat_completion(
387
+ messages=[
388
+ {
389
+ "role": "system",
390
+ "content": "You are a helpful assistant that outputs in JSON.",
391
+ },
392
+ {"role": "user", "content": "Who won the world series in 2020"},
393
+ ],
394
+ response_format={
395
+ "type": "json_object",
396
+ },
397
+ temperature=0.7,
398
+ )
399
+ ```
400
+
401
+ #### JSON Schema Mode
402
+
403
+ To constrain the response further to a specific JSON Schema add the schema to the `schema` property of the `response_format` argument.
404
+
405
+ ```python
406
+ from llama_cpp import Llama
407
+ llm = Llama(model_path="path/to/model.gguf", chat_format="chatml")
408
+ llm.create_chat_completion(
409
+ messages=[
410
+ {
411
+ "role": "system",
412
+ "content": "You are a helpful assistant that outputs in JSON.",
413
+ },
414
+ {"role": "user", "content": "Who won the world series in 2020"},
415
+ ],
416
+ response_format={
417
+ "type": "json_object",
418
+ "schema": {
419
+ "type": "object",
420
+ "properties": {"team_name": {"type": "string"}},
421
+ "required": ["team_name"],
422
+ },
423
+ },
424
+ temperature=0.7,
425
+ )
426
+ ```
427
+
428
+ ### Function Calling
429
+
430
+ The high-level API supports OpenAI compatible function and tool calling. This is possible through the chat format of the `functionary` pre-trained models or through the generic `chatml-function-calling` chat format.
431
+
432
+ ```python
433
+ from llama_cpp import Llama
434
+ llm = Llama(model_path="path/to/chatml/llama-model.gguf", chat_format="chatml-function-calling")
435
+ llm.create_chat_completion(
436
+ messages = [
437
+ {
438
+ "role": "system",
439
+ "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
440
+
441
+ },
442
+ {
443
+ "role": "user",
444
+ "content": "Extract Jason is 25 years old"
445
+ }
446
+ ],
447
+ tools=[{
448
+ "type": "function",
449
+ "function": {
450
+ "name": "UserDetail",
451
+ "parameters": {
452
+ "type": "object",
453
+ "title": "UserDetail",
454
+ "properties": {
455
+ "name": {
456
+ "title": "Name",
457
+ "type": "string"
458
+ },
459
+ "age": {
460
+ "title": "Age",
461
+ "type": "integer"
462
+ }
463
+ },
464
+ "required": [ "name", "age" ]
465
+ }
466
+ }
467
+ }],
468
+ tool_choice={
469
+ "type": "function",
470
+ "function": {
471
+ "name": "UserDetail"
472
+ }
473
+ }
474
+ )
475
+ ```
476
+
477
+ <details>
478
+ <summary>Functionary v2</summary>
479
+
480
+ The various gguf-converted files for this set of models can be found [here](https://huggingface.co/meetkai). Functionary is able to intelligently call functions and also analyze any provided function outputs to generate coherent responses. All v2 functionary models support **parallel function calling**. You can provide either `functionary-v1` or `functionary-v2` for the `chat_format` when initializing the `Llama` class.
481
+
482
+ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, you must provide an HF tokenizer for functionary. The `LlamaHFTokenizer` class can be initialized and passed into the `Llama` class; this will override the default llama.cpp tokenizer used by the `Llama` class. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
483
+
484
+ ```python
485
+ from llama_cpp import Llama
486
+ from llama_cpp.llama_tokenizer import LlamaHFTokenizer
487
+ llm = Llama.from_pretrained(
488
+ repo_id="meetkai/functionary-small-v2.2-GGUF",
489
+ filename="functionary-small-v2.2.q4_0.gguf",
490
+ chat_format="functionary-v2",
491
+ tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
492
+ )
493
+ ```
494
+
495
+ **NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
496
+ </details>
497
+
498
+ ### Multi-modal Models
499
+
500
+ `llama-cpp-python` supports multi-modal models such as llava1.5, which allow the language model to read information from both text and images.
501
+
502
+ You'll first need to download one of the available multi-modal models in GGUF format:
503
+
504
+ - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
505
+ - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
506
+ - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
507
+ - [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
508
+ - [moondream2](https://huggingface.co/vikhyatk/moondream2)
509
+
510
+ Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
511
+
512
+ ```python
513
+ from llama_cpp import Llama
514
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
515
+ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
516
+ llm = Llama(
517
+ model_path="./path/to/llava/llama-model.gguf",
518
+ chat_handler=chat_handler,
519
+ n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
520
+ )
521
+ llm.create_chat_completion(
522
+ messages = [
523
+ {"role": "system", "content": "You are an assistant who perfectly describes images."},
524
+ {
525
+ "role": "user",
526
+ "content": [
527
+ {"type" : "text", "text": "What's in this image?"},
528
+ {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
529
+ ]
530
+ }
531
+ ]
532
+ )
533
+ ```
534
+
535
+ You can also pull the model from the Hugging Face Hub using the `from_pretrained` method.
536
+
537
+ ```python
538
+ from llama_cpp import Llama
539
+ from llama_cpp.llama_chat_format import MoondreamChatHandler
540
+
541
+ chat_handler = MoondreamChatHandler.from_pretrained(
542
+ repo_id="vikhyatk/moondream2",
543
+ filename="*mmproj*",
544
+ )
545
+
546
+ llm = Llama.from_pretrained(
547
+ repo_id="vikhyatk/moondream2",
548
+ filename="*text-model*",
549
+ chat_handler=chat_handler,
550
+ n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
551
+ )
552
+
553
+ response = llm.create_chat_completion(
554
+ messages = [
555
+ {
556
+ "role": "user",
557
+ "content": [
558
+ {"type" : "text", "text": "What's in this image?"},
559
+ {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" } }
560
+
561
+ ]
562
+ }
563
+ ]
564
+ )
565
+ print(response["choices"][0]["message"]["content"])
566
+ ```
567
+
568
+ **Note**: Multi-modal models also support tool calling and JSON mode.
569
+
570
+ <details>
571
+ <summary>Loading a Local Image</summary>
572
+
573
+ Images can be passed as base64 encoded data URIs. The following example demonstrates how to do this.
574
+
575
+ ```python
576
+ import base64
577
+
578
+ def image_to_base64_data_uri(file_path):
579
+ with open(file_path, "rb") as img_file:
580
+ base64_data = base64.b64encode(img_file.read()).decode('utf-8')
581
+ return f"data:image/png;base64,{base64_data}"
582
+
583
+ # Replace 'file_path.png' with the actual path to your PNG file
584
+ file_path = 'file_path.png'
585
+ data_uri = image_to_base64_data_uri(file_path)
586
+
587
+ messages = [
588
+ {"role": "system", "content": "You are an assistant who perfectly describes images."},
589
+ {
590
+ "role": "user",
591
+ "content": [
592
+ {"type": "image_url", "image_url": {"url": data_uri }},
593
+ {"type" : "text", "text": "Describe this image in detail please."}
594
+ ]
595
+ }
596
+ ]
597
+
598
+ ```
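+
+ The constructed `messages` can then be passed to the model exactly like the URL-based example above (assuming `llm` was created with a `Llava15ChatHandler` as shown earlier):
+
+ ```python
+ response = llm.create_chat_completion(messages=messages)
+ print(response["choices"][0]["message"]["content"])
+ ```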
599
+
600
+ </details>
601
+
602
+ ### Speculative Decoding
603
+
604
+ `llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
605
+
606
+ The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
607
+
608
+ Just pass this as a draft model to the `Llama` class during initialization.
609
+
610
+ ```python
611
+ from llama_cpp import Llama
612
+ from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
613
+
614
+ llama = Llama(
615
+ model_path="path/to/model.gguf",
616
+ draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict; 10 is the default and generally good for GPU, 2 performs better for CPU-only machines.
617
+ )
618
+ ```
619
+
620
+ ### Embeddings
621
+
622
+ To generate text embeddings use [`create_embedding`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_embedding) or [`embed`](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.embed). Note that you must pass `embedding=True` to the constructor upon model creation for these to work properly.
623
+
624
+ ```python
625
+ import llama_cpp
626
+
627
+ llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
628
+
629
+ embeddings = llm.create_embedding("Hello, world!")
630
+
631
+ # or create multiple embeddings at once
632
+
633
+ embeddings = llm.create_embedding(["Hello, world!", "Goodbye, world!"])
634
+ ```
635
+
636
+ There are two primary notions of embeddings in a Transformer-style model: *token level* and *sequence level*. Sequence level embeddings are produced by "pooling" token level embeddings together, usually by averaging them or using the first token.
637
+
638
+ Models that are explicitly geared towards embeddings will usually return sequence level embeddings by default, one for each input string. Non-embedding models such as those designed for text generation will typically return only token level embeddings, one for each token in each sequence. Thus the dimensionality of the return type will be one higher for token level embeddings.
639
+
640
+ It is possible to control pooling behavior in some cases using the `pooling_type` flag on model creation. You can ensure token level embeddings from any model using `LLAMA_POOLING_TYPE_NONE`. The reverse, getting a generation-oriented model to yield sequence level embeddings, is currently not possible, but you can always do the pooling manually, as sketched below.
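+
+ A minimal sketch of manual mean pooling with `numpy`, assuming the model returns token level embeddings (one vector per token) as described above:
+
+ ```python
+ import numpy as np
+ import llama_cpp
+
+ llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
+
+ # For a token level model this is a list of per-token vectors
+ token_embeddings = llm.create_embedding("Hello, world!")["data"][0]["embedding"]
+
+ # Average the token vectors into a single sequence level embedding
+ sequence_embedding = np.asarray(token_embeddings).mean(axis=0)
+ ```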
641
+
642
+ ### Adjusting the Context Window
643
+
644
+ The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
645
+
646
+ For instance, if you want to work with larger contexts, you can expand the context window by setting the `n_ctx` parameter when initializing the `Llama` object:
647
+
648
+ ```python
649
+ llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
650
+ ```
651
+
652
+ ## OpenAI Compatible Web Server
653
+
654
+ `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
655
+ This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc).
656
+
657
+ To install the server package and get started:
658
+
659
+ ```bash
660
+ pip install 'llama-cpp-python[server]'
661
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
662
+ ```
663
+
664
+ Similar to the Hardware Acceleration section above, you can also install with GPU (CUDA) support like this:
665
+
666
+ ```bash
667
+ CMAKE_ARGS="-DLLAMA_CUDA=on" FORCE_CMAKE=1 pip install 'llama-cpp-python[server]'
668
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --n_gpu_layers 35
669
+ ```
670
+
671
+ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation.
672
+
673
+ To bind to `0.0.0.0` to enable remote connections, use `python3 -m llama_cpp.server --host 0.0.0.0`.
674
+ Similarly, to change the port (default is 8000), use `--port`.
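+
+ For example (the model path is a placeholder):
+
+ ```bash
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --host 0.0.0.0 --port 8080
+ ```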
675
+
676
+ You probably also want to set the prompt format. For chatml, use
677
+
678
+ ```bash
679
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf --chat_format chatml
680
+ ```
681
+
682
+ That will format the prompt according to how the model expects it. You can find the prompt format in the model card.
683
+ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_format.py) and look for lines starting with "@register_chat_format".
684
+
685
+ If you have `huggingface-hub` installed, you can also use the `--hf_model_repo_id` flag to load a model from the Hugging Face Hub.
686
+
687
+ ```bash
688
+ python3 -m llama_cpp.server --hf_model_repo_id Qwen/Qwen1.5-0.5B-Chat-GGUF --model '*q8_0.gguf'
689
+ ```
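+
+ Once the server is running, any OpenAI compatible client can be pointed at it. A minimal sketch using the official `openai` Python client (the API key is a placeholder; the `model` value only matters when multiple model aliases are configured):
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="sk-no-key-required")
+ response = client.chat.completions.create(
+     model="gpt-3.5-turbo",
+     messages=[{"role": "user", "content": "Name the planets in the solar system."}],
+ )
+ print(response.choices[0].message.content)
+ ```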
690
+
691
+ ### Web Server Features
692
+
693
+ - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
694
+ - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
695
+ - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
696
+ - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
697
+
698
+ ## Docker image
699
+
700
+ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
701
+
702
+ ```bash
703
+ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest
704
+ ```
705
+
706
+ [Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones; see the [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389).
707
+
708
+ ## Low-level API
709
+
710
+ [API Reference](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#low-level-api)
711
+
712
+ The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
713
+ The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
714
+
715
+ Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
716
+
717
+ ```python
718
+ import llama_cpp
719
+ import ctypes
720
+ llama_cpp.llama_backend_init(False) # Must be called once at the start of each program
721
+ params = llama_cpp.llama_context_default_params()
722
+ # use bytes for char * params
723
+ model = llama_cpp.llama_load_model_from_file(b"./models/7b/llama-model.gguf", params)
724
+ ctx = llama_cpp.llama_new_context_with_model(model, params)
725
+ max_tokens = params.n_ctx
726
+ # use ctypes arrays for array params
727
+ tokens = (llama_cpp.llama_token * int(max_tokens))()
728
+ n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, llama_cpp.c_bool(True))
729
+ llama_cpp.llama_free(ctx)
730
+ ```
731
+
732
+ Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
733
+
734
+ ## Documentation
735
+
736
+ Documentation is available via [https://llama-cpp-python.readthedocs.io/](https://llama-cpp-python.readthedocs.io/).
737
+ If you find any issues with the documentation, please open an issue or submit a PR.
738
+
739
+ ## Development
740
+
741
+ This package is under active development and I welcome any contributions.
742
+
743
+ To get started, clone the repository and install the package in editable / development mode:
744
+
745
+ ```bash
746
+ git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
747
+ cd llama-cpp-python
748
+
749
+ # Upgrade pip (required for editable mode)
750
+ pip install --upgrade pip
751
+
752
+ # Install with pip
753
+ pip install -e .
754
+
755
+ # if you want to use the fastapi / openapi server
756
+ pip install -e .[server]
757
+
758
+ # to install all optional dependencies
759
+ pip install -e .[all]
760
+
761
+ # to clear the local build cache
762
+ make clean
763
+ ```
764
+
765
+ You can also test out specific commits of `llama.cpp` by checking out the desired commit in the `vendor/llama.cpp` submodule and then running `make clean` and `pip install -e .` again. Any changes in the `llama.h` API will require
766
+ changes to the `llama_cpp/llama_cpp.py` file to match the new API (additional changes may be required elsewhere).
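+
+ For example (the commit hash is a placeholder):
+
+ ```bash
+ cd vendor/llama.cpp
+ git checkout <commit-hash>
+ cd ../..
+ make clean
+ pip install -e .
+ ```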
767
+
768
+ ## FAQ
769
+
770
+ ### Are there pre-built binaries / binary wheels available?
771
+
772
+ The recommended installation method is to install from source as described above.
773
+ The reason for this is that `llama.cpp` is built with compiler optimizations that are specific to your system.
774
+ Using pre-built binaries would require disabling these optimizations or supporting a large number of pre-built binaries for each platform.
775
+
776
+ That being said, there are some pre-built binaries available through the Releases page, as well as some community-provided wheels.
777
+
778
+ In the future, I would like to provide pre-built binaries and wheels for common platforms and I'm happy to accept any useful contributions in this area.
779
+ This is currently being tracked in [#741](https://github.com/abetlen/llama-cpp-python/issues/741).
780
+
781
+ ### How does this compare to other Python bindings of `llama.cpp`?
782
+
783
+ I originally wrote this package for my own use with two goals in mind:
784
+
785
+ - Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python
786
+ - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp`
787
+
788
+ Any contributions and changes to this package will be made with these goals in mind.
789
+
790
+ ## License
791
+
792
+ This project is licensed under the terms of the MIT license.
llama-cpp-python/docker/README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Install Docker Server
2
+ > [!IMPORTANT]
3
+ > This was tested with Docker running on Linux. <br>If you can get it working on Windows or MacOS, please update this `README.md` with a PR!<br>
4
+
5
+ [Install Docker Engine](https://docs.docker.com/engine/install)
6
+
7
+
8
+ ## Simple Dockerfiles for building the llama-cpp-python server with external model bin files
9
+ ### openblas_simple
10
+ A simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image:
11
+ ```
12
+ cd ./openblas_simple
13
+ docker build -t openblas_simple .
14
+ docker run --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
15
+ ```
16
+ where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
17
+
18
+ ### cuda_simple
19
+ > [!WARNING]
20
+ > Nvidia GPU CuBLAS support requires an Nvidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker Nvidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) <br>
21
+
22
+ A simple Dockerfile for CUDA-accelerated CuBLAS, where the model is located outside the Docker image:
23
+
24
+ ```
25
+ cd ./cuda_simple
26
+ docker build -t cuda_simple .
27
+ docker run --gpus=all --cap-add SYS_RESOURCE -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
28
+ ```
29
+ where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
30
+
31
+ --------------------------------------------------------------------------
32
+
33
+ ### "Open-Llama-in-a-box"
34
+ Download an Apache V2.0 licensed 3B params Open LLaMA model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server:
35
+ ```
36
+ $ cd ./open_llama
37
+ ./build.sh
38
+ ./start.sh
39
+ ```
40
+
41
+ ### Manually choose your own Llama model from Hugging Face
42
+ `python3 ./hug_model.py -a TheBloke -t llama`
43
+ You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step, e.g.:
44
+ ```
45
+ docker $ ls -lh *.bin
46
+ -rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
47
+ lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
48
+ ```
49
+
50
+ > [!NOTE]
51
+ > Make sure you have enough disk space to download the model. As the model is then copied into the image, you will need at least
52
+ **TWICE** as much disk space as the size of the model:<br>
53
+
54
+ | Model | Quantized size |
55
+ |------:|----------------:|
56
+ | 3B | 3 GB |
57
+ | 7B | 5 GB |
58
+ | 13B | 10 GB |
59
+ | 33B | 25 GB |
60
+ | 65B | 50 GB |
61
+
62
+
63
+ > [!NOTE]
64
+ > If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
llama-cpp-python/docker/cuda_simple/Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
2
+ FROM nvidia/cuda:${CUDA_IMAGE}
3
+
4
+ # We need to set the host to 0.0.0.0 to allow outside access
5
+ ENV HOST 0.0.0.0
6
+
7
+ RUN apt-get update && apt-get upgrade -y \
8
+ && apt-get install -y git build-essential \
9
+ python3 python3-pip gcc wget \
10
+ ocl-icd-opencl-dev opencl-headers clinfo \
11
+ libclblast-dev libopenblas-dev \
12
+ && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
13
+
14
+ COPY . .
15
+
16
+ # setting build related env vars
17
+ ENV CUDA_DOCKER_ARCH=all
18
+ ENV LLAMA_CUBLAS=1
19
+
20
+ # Install dependencies
21
+ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
22
+
23
+ # Install llama-cpp-python (build with cuda)
24
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
25
+
26
+ # Run the server
27
+ CMD python3 -m llama_cpp.server
llama-cpp-python/docker/open_llama/Dockerfile ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define the image argument and provide a default value
2
+ ARG IMAGE=python:3-slim-bullseye
3
+
4
+ # Use the image as specified
5
+ FROM ${IMAGE}
6
+
7
+ # Re-declare the ARG after FROM
8
+ ARG IMAGE
9
+
10
+ # Update and upgrade the existing packages
11
+ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
12
+ python3 \
13
+ python3-pip \
14
+ ninja-build \
15
+ build-essential
16
+
17
+ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
18
+
19
+ # Perform the conditional installations based on the image
20
+ RUN echo "Image: ${IMAGE}" && \
21
+ if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
22
+ echo "OpenBLAS install:" && \
23
+ apt-get install -y --no-install-recommends libopenblas-dev && \
24
+ LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
25
+ else \
26
+ echo "CuBLAS install:" && \
27
+ LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
28
+ fi
29
+
30
+ # Clean up apt cache
31
+ RUN rm -rf /var/lib/apt/lists/*
32
+
33
+ # Set a working directory for better clarity
34
+ WORKDIR /app
35
+
36
+ # Copy files to the app directory
37
+ RUN echo "Installing model...this can take some time..."
38
+ COPY ./model.bin /app/model.bin
39
+ COPY ./start_server.sh /app/start_server.sh
40
+
41
+ # Make the server start script executable
42
+ RUN chmod +x /app/start_server.sh
43
+
44
+ # Set environment variable for the host
45
+ ENV HOST=0.0.0.0
46
+
47
+ # Expose a port for the server
48
+ EXPOSE 8000
49
+
50
+ # Run the server start script
51
+ CMD ["/bin/sh", "/app/start_server.sh"]
llama-cpp-python/docker/open_llama/build.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ MODEL="open_llama_3b"
4
+ # Get open_llama_3b_ggml q5_1 quantization
5
+ python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
6
+ ls -lh *.bin
7
+
8
+ # Build the default OpenBLAS image
9
+ docker build -t $MODEL .
10
+ docker images | egrep "^(REPOSITORY|$MODEL)"
11
+
12
+ echo
13
+ echo "To start the docker container run:"
14
+ echo "docker run -t -p 8000:8000 $MODEL"
llama-cpp-python/docker/open_llama/hug_model.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import os
4
+ import struct
5
+ import argparse
6
+
7
+ def make_request(url, params=None):
8
+ print(f"Making request to {url}...")
9
+ response = requests.get(url, params=params)
10
+ if response.status_code == 200:
11
+ return json.loads(response.text)
12
+ else:
13
+ print(f"Request failed with status code {response.status_code}")
14
+ return None
15
+
16
+ def check_magic_and_version(filename):
17
+ with open(filename, 'rb') as f:
18
+ # Read the first 6 bytes from the file
19
+ data = f.read(6)
20
+
21
+ # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
22
+ # and the next 2 bytes as a little-endian unsigned short
23
+ magic, version = struct.unpack('<I H', data)
24
+
25
+ print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
26
+
27
+ return magic, version
28
+
29
+ def download_file(url, destination):
30
+ print(f"Downloading {url} to {destination}...")
31
+ response = requests.get(url, stream=True)
32
+ if response.status_code == 200:
33
+ with open(destination, 'wb') as f:
34
+ total_downloaded = 0
35
+ for chunk in response.iter_content(chunk_size=1024):
36
+ if chunk: # filter out keep-alive new chunks
37
+ f.write(chunk)
38
+ total_downloaded += len(chunk)
39
+ if total_downloaded >= 10485760: # 10 MB
40
+ print('.', end='', flush=True)
41
+ total_downloaded = 0
42
+ print("\nDownload complete.")
43
+
44
+ # Creating a symbolic link from destination to "model.bin"
45
+ if os.path.isfile("model.bin"):
46
+ os.remove("model.bin") # remove the existing link if any
47
+ os.symlink(destination, "model.bin")
48
+ else:
49
+ print(f"Download failed with status code {response.status_code}")
50
+
51
+ def get_user_choice(model_list):
52
+ # Print the enumerated list
53
+ print("\n")
54
+ for i, (model_id, rfilename) in enumerate(model_list):
55
+ print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
56
+
57
+ # Get user's choice
58
+ choice = input("Choose a model to download by entering the corresponding number: ")
59
+ try:
60
+ index = int(choice) - 1
61
+ if 0 <= index < len(model_list):
62
+ # Return the chosen model
63
+ return model_list[index]
64
+ else:
65
+ print("Invalid choice.")
66
+ except ValueError:
67
+ print("Invalid input. Please enter a number corresponding to a model.")
68
+ except IndexError:
69
+ print("Invalid choice. Index out of range.")
70
+
71
+ return None
72
+
73
+ def main():
74
+ # Create an argument parser
75
+ parser = argparse.ArgumentParser(description='Process some parameters.')
76
+
77
+ # Arguments
78
+ parser.add_argument('-v', '--version', type=int, default=0x0003,
79
+ help='hexadecimal version number of ggml file')
80
+ parser.add_argument('-a', '--author', type=str, default='TheBloke',
81
+ help='HuggingFace author filter')
82
+ parser.add_argument('-t', '--tag', type=str, default='llama',
83
+ help='HuggingFace tag filter')
84
+ parser.add_argument('-s', '--search', type=str, default='',
85
+ help='HuggingFace search filter')
86
+ parser.add_argument('-f', '--filename', type=str, default='q5_1',
87
+ help='HuggingFace model repository filename substring match')
88
+
89
+ # Parse the arguments
90
+ args = parser.parse_args()
91
+
92
+ # Define the parameters
93
+ params = {
94
+ "author": args.author,
95
+ "tags": args.tag,
96
+ "search": args.search
97
+ }
98
+
99
+ models = make_request('https://huggingface.co/api/models', params=params)
100
+ if models is None:
101
+ return
102
+
103
+ model_list = []
104
+ # Iterate over the models
105
+ for model in models:
106
+ model_id = model['id']
107
+ model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
108
+ if model_info is None:
109
+ continue
110
+
111
+ for sibling in model_info.get('siblings', []):
112
+ rfilename = sibling.get('rfilename')
113
+ if rfilename and args.filename in rfilename:
114
+ model_list.append((model_id, rfilename))
115
+
116
+ # Choose the model
117
+ model_list.sort(key=lambda x: x[0])
118
+ if len(model_list) == 0:
119
+ print("No models found")
120
+ exit(1)
121
+ elif len(model_list) == 1:
122
+ model_choice = model_list[0]
123
+ else:
124
+ model_choice = get_user_choice(model_list)
125
+
126
+ if model_choice is not None:
127
+ model_id, rfilename = model_choice
128
+ url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
129
+ dest = f"{model_id.replace('/', '_')}_{rfilename}"
130
+ download_file(url, dest)
131
+ _, version = check_magic_and_version(dest)
132
+ if version != args.version:
133
+ print(f"Warning: Expected version {args.version}, but found different version in the file.")
134
+ else:
135
+ print("Error - model choice was None")
136
+ exit(2)
137
+
138
+ if __name__ == '__main__':
139
+ main()
llama-cpp-python/docker/open_llama/start.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ MODEL="open_llama_3b"
4
+
5
+ # Start Docker container
6
+ docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
7
+ sleep 10
8
+ echo
9
+ docker ps | egrep "(^CONTAINER|$MODEL)"
10
+
11
+ # Test the model works
12
+ echo
13
+ curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
14
+ "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
15
+ "stop": [
16
+ "\n",
17
+ "###"
18
+ ]
19
+ }' | grep Paris
20
+ if [ $? -eq 0 ]
21
+ then
22
+ echo
23
+ echo "$MODEL is working!!"
24
+ else
25
+ echo
26
+ echo "ERROR: $MODEL not replying."
27
+ exit 1
28
+ fi
llama-cpp-python/docker/open_llama/start_server.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+
3
+ # For mlock support
4
+ ulimit -l unlimited
5
+
6
+ if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
7
+ python3 -B -m llama_cpp.server --model /app/model.bin
8
+ else
9
+ # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
10
+ python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
11
+ fi
llama-cpp-python/docker/openblas_simple/Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3-slim-bullseye
2
+
3
+ # We need to set the host to 0.0.0.0 to allow outside access
4
+ ENV HOST 0.0.0.0
5
+
6
+ COPY . .
7
+
8
+ # Install the package
9
+ RUN apt update && apt install -y libopenblas-dev ninja-build build-essential pkg-config
10
+ RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings starlette-context
11
+
12
+ RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
13
+
14
+ # Run the server
15
+ CMD python3 -m llama_cpp.server
llama-cpp-python/docker/simple/Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define the image argument and provide a default value
2
+ ARG IMAGE=python:3-slim-bullseye
3
+
4
+ # Use the image as specified
5
+ FROM ${IMAGE}
6
+
7
+ # Re-declare the ARG after FROM
8
+ ARG IMAGE
9
+
10
+ # Update and upgrade the existing packages
11
+ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
12
+ python3 \
13
+ python3-pip \
14
+ ninja-build \
15
+ libopenblas-dev \
16
+ build-essential
17
+
18
+ RUN mkdir /app
19
+ WORKDIR /app
20
+ COPY . /app
21
+
22
+ RUN python3 -m pip install --upgrade pip
23
+
24
+ RUN make deps && make build && make clean
25
+
26
+ # Set environment variable for the host
27
+ ENV HOST=0.0.0.0
28
+ ENV PORT=8000
29
+
30
+ # Expose a port for the server
31
+ EXPOSE 8000
32
+
33
+ # Run the server start script
34
+ CMD ["/bin/sh", "/app/docker/simple/run.sh"]
llama-cpp-python/docker/simple/run.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ make build
4
+ uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT
llama-cpp-python/docs/api-reference.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: API Reference
3
+ ---
4
+
5
+ ## High Level API
6
+
7
+ High-level Python bindings for llama.cpp.
8
+
9
+ ::: llama_cpp.Llama
10
+ options:
11
+ members:
12
+ - __init__
13
+ - tokenize
14
+ - detokenize
15
+ - reset
16
+ - eval
17
+ - sample
18
+ - generate
19
+ - create_embedding
20
+ - embed
21
+ - create_completion
22
+ - __call__
23
+ - create_chat_completion
24
+ - create_chat_completion_openai_v1
25
+ - set_cache
26
+ - save_state
27
+ - load_state
28
+ - token_bos
29
+ - token_eos
30
+ - from_pretrained
31
+ show_root_heading: true
32
+
33
+ ::: llama_cpp.LlamaGrammar
34
+ options:
35
+ members:
36
+ - from_string
37
+ - from_json_schema
38
+
39
+ ::: llama_cpp.LlamaCache
40
+ options:
41
+ show_root_heading: true
42
+
43
+ ::: llama_cpp.LlamaState
44
+ options:
45
+ show_root_heading: true
46
+
47
+ ::: llama_cpp.LogitsProcessor
48
+ options:
49
+ show_root_heading: true
50
+
51
+ ::: llama_cpp.LogitsProcessorList
52
+ options:
53
+ show_root_heading: true
54
+
55
+ ::: llama_cpp.StoppingCriteria
56
+ options:
57
+ show_root_heading: true
58
+
59
+ ::: llama_cpp.StoppingCriteriaList
60
+ options:
61
+ show_root_heading: true
62
+
63
+ ## Low Level API
64
+
65
+ Low-level Python bindings for llama.cpp using Python's ctypes library.
66
+
67
+ ::: llama_cpp.llama_cpp
68
+ options:
69
+ show_if_no_docstring: true
70
+ # filter only members starting with `llama_`
71
+ filters:
72
+ - "^llama_"
73
+
74
+ ::: llama_cpp.llama_cpp
75
+ options:
76
+ show_if_no_docstring: true
77
+ show_root_heading: false
78
+ show_root_toc_entry: false
79
+ heading_level: 4
80
+ # filter only members starting with `LLAMA_`
81
+ filters:
82
+ - "^LLAMA_"
83
+
84
+ ## Misc
85
+
86
+ ::: llama_cpp.llama_types
87
+ options:
88
+ show_if_no_docstring: true
llama-cpp-python/docs/changelog.md ADDED
@@ -0,0 +1 @@
 
 
1
+ -8<- "CHANGELOG.md"
llama-cpp-python/docs/index.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ---
2
+ title: Getting Started
3
+ ---
4
+
5
+ -8<- "README.md"
llama-cpp-python/docs/install/macos.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MacOS Install with Metal GPU
3
+ ---
4
+
5
+ **(1) Make sure you have xcode installed... at least the command line parts**
6
+ ```
7
+ # check the path of your xcode install
8
+ xcode-select -p
9
+
10
+ # xcode installed returns
11
+ # /Applications/Xcode-beta.app/Contents/Developer
12
+
13
+ # if xcode is missing then install it... it takes ages;
14
+ xcode-select --install
15
+ ```
16
+
17
+ **(2) Install the conda version for MacOS that supports Metal GPU**
18
+ ```
19
+ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
20
+ bash Miniforge3-MacOSX-arm64.sh
21
+ ```
22
+
23
+ **(3) Make a conda environment**
24
+ ```
25
+ conda create -n llama python=3.9.16
26
+ conda activate llama
27
+ ```
28
+
29
+ **(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62**
30
+ *(you need xcode installed in order for pip to build/compile the C++ code)*
31
+ ```
32
+ pip uninstall llama-cpp-python -y
33
+ CMAKE_ARGS="-DLLAMA_METAL=on" pip install -U llama-cpp-python --no-cache-dir
34
+ pip install 'llama-cpp-python[server]'
35
+
36
+ # you should now have llama-cpp-python v0.1.62 or higher installed
37
+ llama-cpp-python         0.1.68
38
+
39
+ ```
40
+
41
+ **(5) Download a gguf v2 model**
42
+ - **ggufv2**
43
+ - file name ends with **Q4_0.gguf** - indicating it is 4bit quantized, with quantisation method 0
44
+
45
+ https://huggingface.co/TheBloke/CodeLlama-7B-GGUF
46
+
47
+
48
+ **(6) run the llama-cpp-python API server with MacOS Metal GPU support**
49
+ ```
50
+ # config your ggml model path
51
+ # make sure it is gguf v2
52
+ # make sure it is q4_0
53
+ export MODEL=[path to your llama.cpp ggml models]/[ggml-model-name]Q4_0.gguf
54
+ python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1
55
+ ```
56
+
57
+ ***Note:** If you omit `--n_gpu_layers 1` then the CPU will be used*
58
+
59
+
llama-cpp-python/docs/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ mkdocs
2
+ mkdocs-material
3
+ mkdocstrings[python]
llama-cpp-python/docs/server.md ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OpenAI Compatible Server
2
+
3
+ `llama-cpp-python` offers an OpenAI API compatible web server.
4
+
5
+ This web server can be used to serve local models and easily connect them to existing clients.
6
+
7
+ ## Setup
8
+
9
+ ### Installation
10
+
11
+ The server can be installed by running the following command:
12
+
13
+ ```bash
14
+ pip install llama-cpp-python[server]
15
+ ```
16
+
17
+ ### Running the server
18
+
19
+ The server can then be started by running the following command:
20
+
21
+ ```bash
22
+ python3 -m llama_cpp.server --model <model_path>
23
+ ```
24
+
25
+ ### Server options
26
+
27
+ For a full list of options, run:
28
+
29
+ ```bash
30
+ python3 -m llama_cpp.server --help
31
+ ```
32
+
33
+ NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
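+
+ For example, the following two invocations are equivalent (the model path is a placeholder):
+
+ ```bash
+ python3 -m llama_cpp.server --model models/7B/llama-model.gguf
+ MODEL=models/7B/llama-model.gguf python3 -m llama_cpp.server
+ ```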
34
+
35
+ Check out the server config reference below for more information on the available options.
36
+ CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings).
37
+
38
+ Additionally, the server supports configuration via a config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
39
+
40
+
41
+ ## Guides
42
+
43
+ ### Code Completion
44
+
45
+ `llama-cpp-python` supports code completion via GitHub Copilot.
46
+
47
+ *NOTE*: Without GPU acceleration this is unlikely to be fast enough to be usable.
48
+
49
+ You'll first need to download one of the available code completion models in GGUF format:
50
+
51
+ - [replit-code-v1_5-GGUF](https://huggingface.co/abetlen/replit-code-v1_5-3b-GGUF)
52
+
53
+ Then you'll need to run the OpenAI compatible web server with a substantially increased context size for GitHub Copilot requests:
54
+
55
+ ```bash
56
+ python3 -m llama_cpp.server --model <model_path> --n_ctx 16192
57
+ ```
58
+
59
+ Then just update your settings in `.vscode/settings.json` to point to your code completion server:
60
+
61
+ ```json
62
+ {
63
+ // ...
64
+ "github.copilot.advanced": {
65
+ "debug.testOverrideProxyUrl": "http://<host>:<port>",
66
+ "debug.overrideProxyUrl": "http://<host>:<port>"
67
+ }
68
+ // ...
69
+ }
70
+ ```
71
+
72
+ ### Function Calling
73
+
74
+ `llama-cpp-python` supports structured function calling based on a JSON schema.
75
+ Function calling is completely compatible with the OpenAI function calling API and can be used by connecting with the official OpenAI Python client.
76
+
77
+ You'll first need to download one of the available function calling models in GGUF format:
78
+
79
+ - [functionary](https://huggingface.co/meetkai)
80
+
81
+ Then when you run the server you'll need to also specify either `functionary-v1` or `functionary-v2` chat_format.
82
+
83
+ Note that since functionary requires a HF Tokenizer due to discrepancies between llama.cpp and HuggingFace's tokenizers as mentioned [here](https://github.com/abetlen/llama-cpp-python/blob/main?tab=readme-ov-file#function-calling), you will need to pass in the path to the tokenizer too. The tokenizer files are already included in the respective HF repositories hosting the gguf files.
84
+
85
+ ```bash
86
+ python3 -m llama_cpp.server --model <model_path_to_functionary_v2_model> --chat_format functionary-v2 --hf_pretrained_model_name_or_path <model_path_to_functionary_v2_tokenizer>
87
+ ```
88
+
89
+ Check out this [example notebook](https://github.com/abetlen/llama-cpp-python/blob/main/examples/notebooks/Functions.ipynb) for a walkthrough of some interesting use cases for function calling.
90
+
91
+ ### Multimodal Models
92
+
93
+ `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
94
+ read information from both text and images.
95
+
96
+ You'll first need to download one of the available multi-modal models in GGUF format:
97
+
98
+ - [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
99
+ - [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
100
+ - [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
101
+ - [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
102
+ - [moondream2](https://huggingface.co/vikhyatk/moondream2)
103
+
104
+ Then when you run the server you'll need to also specify the path to the clip model used for image embedding and the `llava-1-5` chat_format:
105
+
106
+ ```bash
107
+ python3 -m llama_cpp.server --model <model_path> --clip_model_path <clip_model_path> --chat_format llava-1-5
108
+ ```
109
+
110
+ Then you can just use the OpenAI API as normal
111
+
112
+ ```python3
113
+ from openai import OpenAI
114
+
115
+ client = OpenAI(base_url="http://<host>:<port>/v1", api_key="sk-xxx")
116
+ response = client.chat.completions.create(
117
+ model="gpt-4-vision-preview",
118
+ messages=[
119
+ {
120
+ "role": "user",
121
+ "content": [
122
+ {
123
+ "type": "image_url",
124
+ "image_url": {
125
+ "url": "<image_url>"
126
+ },
127
+ },
128
+ {"type": "text", "text": "What does the image say"},
129
+ ],
130
+ }
131
+ ],
132
+ )
133
+ print(response)
134
+ ```
135
+
136
+ ## Configuration and Multi-Model Support
137
+
138
+ The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
139
+
140
+ ```bash
141
+ python3 -m llama_cpp.server --config_file <config_file>
142
+ ```
143
+
144
+ Config files support all of the server and model options supported by the CLI and environment variables; however, instead of only a single model, the config file can specify multiple models.
145
+
146
+ The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file.
147
+
148
+ At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.
149
+
150
+ ```json
151
+ {
152
+ "host": "0.0.0.0",
153
+ "port": 8080,
154
+ "models": [
155
+ {
156
+ "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
157
+ "model_alias": "gpt-3.5-turbo",
158
+ "chat_format": "chatml",
159
+ "n_gpu_layers": -1,
160
+ "offload_kqv": true,
161
+ "n_threads": 12,
162
+ "n_batch": 512,
163
+ "n_ctx": 2048
164
+ },
165
+ {
166
+ "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
167
+ "model_alias": "gpt-4",
168
+ "chat_format": "chatml",
169
+ "n_gpu_layers": -1,
170
+ "offload_kqv": true,
171
+ "n_threads": 12,
172
+ "n_batch": 512,
173
+ "n_ctx": 2048
174
+ },
175
+ {
176
+ "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
177
+ "model_alias": "gpt-4-vision-preview",
178
+ "chat_format": "llava-1-5",
179
+ "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
180
+ "n_gpu_layers": -1,
181
+ "offload_kqv": true,
182
+ "n_threads": 12,
183
+ "n_batch": 512,
184
+ "n_ctx": 2048
185
+ },
186
+ {
187
+ "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
188
+ "model_alias": "text-davinci-003",
189
+ "n_gpu_layers": -1,
190
+ "offload_kqv": true,
191
+ "n_threads": 12,
192
+ "n_batch": 512,
193
+ "n_ctx": 2048
194
+ },
195
+ {
196
+ "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
197
+ "model_alias": "copilot-codex",
198
+ "n_gpu_layers": -1,
199
+ "offload_kqv": true,
200
+ "n_threads": 12,
201
+ "n_batch": 1024,
202
+ "n_ctx": 9216
203
+ }
204
+ ]
205
+ }
206
+ ```
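+
+ With a config file like the one above loaded, clients select a model by its alias through the standard `model` parameter. A minimal sketch using the official `openai` Python client (the API key is a placeholder):
+
+ ```python
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-xxx")
+
+ # Routed to the "gpt-4" entry (openhermes-2.5-mistral-7b) from the config above
+ response = client.chat.completions.create(
+     model="gpt-4",
+     messages=[{"role": "user", "content": "Hello!"}],
+ )
+ print(response.choices[0].message.content)
+ ```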
207
+
208
+ The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
209
+
210
+ ## Server Options Reference
211
+
212
+ ::: llama_cpp.server.settings.ConfigFileSettings
213
+ options:
214
+ show_if_no_docstring: true
215
+
216
+ ::: llama_cpp.server.settings.ServerSettings
217
+ options:
218
+ show_if_no_docstring: true
219
+
220
+ ::: llama_cpp.server.settings.ModelSettings
221
+ options:
222
+ show_if_no_docstring: true