Upload 14 files

Browse files

Files changed (15) hide show

.gitattributes +13 -0
Mistral-7B-Instruct-v0.3.IQ1_M.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ1_S.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ2_M.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ2_S.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ2_XS.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ2_XXS.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ3_M.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ3_S.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ3_XS.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ3_XXS.gguf +3 -0
Mistral-7B-Instruct-v0.3.IQ4_XS.gguf +3 -0
Mistral-7B-Instruct-v0.3.fp16.gguf +3 -0
Mistral-7B-Instruct-v0.3.imatrix.dat +3 -0
README.md +238 -3

.gitattributes CHANGED Viewed

@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.fp16.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.imatrix.dat filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ1_M.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ1_S.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ2_M.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ2_S.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ2_XS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ2_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ3_M.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ3_S.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ3_XS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ3_XXS.gguf filter=lfs diff=lfs merge=lfs -text
+Mistral-7B-Instruct-v0.3.IQ4_XS.gguf filter=lfs diff=lfs merge=lfs -text

Mistral-7B-Instruct-v0.3.IQ1_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a4c2e132197ce0f48c2f4f305ed7c73b576484f1c4274cd5dcc7cb3d1464157
+size 1757663808

Mistral-7B-Instruct-v0.3.IQ1_S.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:010412320006c0cfbec504bf49b521cd082fa3e7baa703c2521664b92d76bff4
+size 1615319616

Mistral-7B-Instruct-v0.3.IQ2_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d772726da0d89582af1879e3bce1f206eecee4dabbf43373d20c3e3c463e033a
+size 2504249920

Mistral-7B-Instruct-v0.3.IQ2_S.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c74ffe534be1845b3de096c4bee6258eabee8f4590662a04e7cf11325dcf4df
+size 2314457664

Mistral-7B-Instruct-v0.3.IQ2_XS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:714090e00e6d3774a09e5fcb97f208d8c9caa82182001e510b9ba6748f85f9ad
+size 2201473600

Mistral-7B-Instruct-v0.3.IQ2_XXS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e261e54d7e79d3a2829d946f012a07463435b0ea2674a3ad9ef11065eddac34
+size 1994904128

Mistral-7B-Instruct-v0.3.IQ3_M.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7dcc6670467d7a5bfb40428843a6c874b9e919bfa156061eeee5372815a177ab
+size 3288846912

Mistral-7B-Instruct-v0.3.IQ3_S.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:33bd7373ab791bed06cc5ee9f6e4993e0a4edc308a78a6440732060056dd705b
+size 3186348608

Mistral-7B-Instruct-v0.3.IQ3_XS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c38302b457361cb0e4daa4a5629d36ba7c232ad57928cc7b867c31ca9647c79b
+size 3022770752

Mistral-7B-Instruct-v0.3.IQ3_XXS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61212d266155c5525b68791db73fcbbab7ac4f43e8357cf3b99e62f832c56c5f
+size 2830881344

Mistral-7B-Instruct-v0.3.IQ4_XS.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b217dadd62f9799bc8dcb2a06d24b8bc6c77e38027d93185dd57f392cfc2fb4c
+size 3911963200

Mistral-7B-Instruct-v0.3.fp16.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7128d0c16ca917c0e5ffcb23c257d7f51bc44412900cbdeb136981bc9fb237f1
+size 14497337696

Mistral-7B-Instruct-v0.3.imatrix.dat ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e44d933bda5232dd9081c20e0ae2ef2df5aab5c4388fc308b237d1454e694e81
+size 4988162

README.md CHANGED Viewed

@@ -1,3 +1,238 @@
----
-license: apache-2.0
----

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+language:
+- en
+pipeline_tag: text-generation
+license: apache-2.0
+model_creator: Mistral AI
+model_name: Mistral-7B-Instruct-v0.3
+model_type: mistral
+quantized_by: CISC
+---
+# Mistral-7B-Instruct-v0.3 - SOTA GGUF
+- Model creator: [Mistral AI](https://huggingface.co/mistralai)
+- Original model: [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
+<!-- description start -->
+## Description
+This repo contains State Of The Art quantized GGUF format model files for [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3).
+Quantization was done with an importance matrix that was trained for ~1M tokens (256 batches of 4096 tokens) of [groups_merged.txt](https://github.com/ggerganov/llama.cpp/discussions/5263#discussioncomment-8395384) and [wiki.train.raw](https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/train.txt) concatenated.
+The embedded chat template has been extended to support function calling via the OpenAI-compatible `tools` parameter, see [example](#simple-llama-cpp-python-example-function-calling-code).
+<!-- description end -->
+<!-- prompt-template start -->
+## Prompt template: Mistral v3
+```
+[AVAILABLE_TOOLS][{"name": "function_name", "description": "Description", "parameters": {...}}, ...][/AVAILABLE_TOOLS][INST] {prompt} [/INST]
+```
+<!-- prompt-template end -->
+<!-- compatibility_gguf start -->
+## Compatibility
+These quantised GGUFv3 files are compatible with llama.cpp from February 27th 2024 onwards, as of commit [0becb22](https://github.com/ggerganov/llama.cpp/commit/0becb22ac05b6542bd9d5f2235691aa1d3d4d307).
+They are also compatible with many third party UIs and libraries provided they are built using a recent llama.cpp.
+## Explanation of quantisation methods
+<details>
+  <summary>Click to see details</summary>
+The new methods available are:
+* GGML_TYPE_IQ1_S - 1-bit quantization in super-blocks with an importance matrix applied, effectively using 1.56 bits per weight (bpw)
+* GGML_TYPE_IQ1_M - 1-bit quantization in super-blocks with an importance matrix applied, effectively using 1.75 bpw
+* GGML_TYPE_IQ2_XXS - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.06 bpw
+* GGML_TYPE_IQ2_XS - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.31 bpw
+* GGML_TYPE_IQ2_S - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.5 bpw
+* GGML_TYPE_IQ2_M - 2-bit quantization in super-blocks with an importance matrix applied, effectively using 2.7 bpw
+* GGML_TYPE_IQ3_XXS - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.06 bpw
+* GGML_TYPE_IQ3_XS - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.3 bpw
+* GGML_TYPE_IQ3_S - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.44 bpw
+* GGML_TYPE_IQ3_M - 3-bit quantization in super-blocks with an importance matrix applied, effectively using 3.66 bpw
+* GGML_TYPE_IQ4_XS - 4-bit quantization in super-blocks with an importance matrix applied, effectively using 4.25 bpw
+* GGML_TYPE_IQ4_NL - 4-bit non-linearly mapped quantization with an importance matrix applied, effectively using 4.5 bpw
+Refer to the Provided Files table below to see which files use which methods, and how.
+</details>
+<!-- compatibility_gguf end -->
+<!-- README_GGUF.md-provided-files start -->
+## Provided files
+| Name | Quant method | Bits | Size | Max RAM required | Use case |
+| ---- | ---- | ---- | ---- | ---- | ----- |
+| [Mistral-7B-Instruct-v0.3.IQ1_S.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ1_S.gguf) | IQ1_S | 1 | 1.5 GB| 2.5 GB | smallest, significant quality loss - **TBD**: Waiting for [this issue](https://github.com/ggerganov/llama.cpp/issues/5996) to be resolved |
+| [Mistral-7B-Instruct-v0.3.IQ1_M.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ1_M.gguf) | IQ1_M | 1 | 1.6 GB| 2.6 GB | very small, significant quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ2_XXS.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ2_XXS.gguf) | IQ2_XXS | 2 | 1.8 GB| 2.8 GB | very small, high quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ2_XS.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ2_XS.gguf) | IQ2_XS | 2 | 1.9 GB| 2.9 GB | very small, high quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ2_S.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ2_S.gguf) | IQ2_S | 2 | 2.1 GB| 3.1 GB | small, substantial quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ2_M.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ2_M.gguf) | IQ2_M | 2 | 2.2 GB| 3.2 GB | small, greater quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ3_XXS.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ3_XXS.gguf) | IQ3_XXS | 3 | 2.5 GB| 3.5 GB | very small, high quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ3_XS.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ3_XS.gguf) | IQ3_XS | 3 | 2.7 GB| 3.7 GB | small, substantial quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ3_S.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ3_S.gguf) | IQ3_S | 3 | 2.8 GB| 3.8 GB | small, greater quality loss |
+| [Mistral-7B-Instruct-v0.3.IQ3_M.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ3_M.gguf) | IQ3_M | 3 | 3.0 GB| 4.0 GB | medium, balanced quality - recommended |
+| [Mistral-7B-Instruct-v0.3.IQ4_XS.gguf](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.IQ4_XS.gguf) | IQ4_XS | 4 | 3.4 GB| 4.4 GB | small, substantial quality loss |
+Generated importance matrix file: [Mistral-7B-Instruct-v0.3.imatrix.dat](https://huggingface.co/CISCai/Mistral-7B-Instruct-v0.3-SOTA-GGUF/blob/main/Mistral-7B-Instruct-v0.3.imatrix.dat)
+**Note**: the above RAM figures assume no GPU offloading with 4K context. If layers are offloaded to the GPU, this will reduce RAM usage and use VRAM instead.
+<!-- README_GGUF.md-provided-files end -->
+<!-- README_GGUF.md-how-to-run start -->
+## Example `llama.cpp` command
+Make sure you are using `llama.cpp` from commit [0becb22](https://github.com/ggerganov/llama.cpp/commit/0becb22ac05b6542bd9d5f2235691aa1d3d4d307) or later.
+```shell
+./main -ngl 33 -m Mistral-7B-Instruct-v0.3.IQ4_XS.gguf --color -c 32768 --temp 0 --repeat-penalty 1.1 -p "[AVAILABLE_TOOLS]{tools}[/AVAILABLE_TOOLS][INST] {prompt} [/INST]"
+```
+Change `-ngl 33` to the number of layers to offload to GPU. Remove it if you don't have GPU acceleration.
+Change `-c 32768` to the desired sequence length.
+If you want to have a chat-style conversation, replace the `-p <PROMPT>` argument with `-i -ins`.
+If you are low on V/RAM try quantizing the K-cache with `-ctk q8_0` or even `-ctk q4_0` for big memory savings (depending on context size).
+There is a similar option for V-cache (`-ctv`), however that is [not working yet](https://github.com/ggerganov/llama.cpp/issues/4425).
+For other parameters and how to use them, please refer to [the llama.cpp documentation](https://github.com/ggerganov/llama.cpp/blob/master/examples/main/README.md).
+## How to run from Python code
+You can use GGUF models from Python using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) module.
+### How to load this model in Python code, using llama-cpp-python
+For full documentation, please see: [llama-cpp-python docs](https://llama-cpp-python.readthedocs.io/en/latest/).
+#### First install the package
+Run one of the following commands, according to your system:
+```shell
+# Prebuilt wheel with basic CPU support
+pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+# Prebuilt wheel with NVidia CUDA acceleration (replace cu121 with cu122 etc. to match your CUDA version)
+pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Prebuilt wheel with Metal GPU acceleration
+pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal
+# Build base version with no GPU acceleration
+pip install llama-cpp-python
+# With NVidia CUDA acceleration
+CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
+# Or with OpenBLAS acceleration
+CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python
+# Or with CLBLast acceleration
+CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
+# Or with AMD ROCm GPU acceleration (Linux only)
+CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
+# Or with Metal GPU acceleration for macOS systems only
+CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
+# Or with Vulkan acceleration
+CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python
+# Or with Kompute acceleration
+CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
+# Or with SYCL acceleration
+CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pip install llama-cpp-python
+# On Windows, to set the CMAKE_ARGS variable in PowerShell, follow this format; e.g. for NVidia CUDA:
+$env:CMAKE_ARGS = "-DLLAMA_CUDA=on"
+pip install llama-cpp-python
+```
+#### Simple llama-cpp-python example code
+```python
+from llama_cpp import Llama
+# Chat Completion API
+llm = Llama(model_path="./Mistral-7B-Instruct-v0.3.IQ4_XS.gguf", n_gpu_layers=33, n_ctx=32768)
+print(llm.create_chat_completion(
+    messages = [
+        {
+            "role": "user",
+            "content": "Pick a LeetCode challenge and solve it in Python."
+        }
+    ]
+))
+```
+#### Simple llama-cpp-python example function calling code
+```python
+from llama_cpp import Llama
+# Chat Completion API
+llm = Llama(model_path="./Mistral-7B-Instruct-v0.3.IQ4_XS.gguf", n_gpu_layers=33, n_ctx=32768, temperature=0.0, repeat_penalty=1.1)
+print(llm.create_chat_completion(
+      messages = [
+        {
+          "role": "user",
+          "content": "What's the weather like in Oslo?"
+        },
+        { # The tool_calls is from the response to the above with tool_choice active
+          "role": "assistant",
+          "content": None,
+          "tool_calls": [
+            {
+              "id": "call__0_get_current_weather_cmpl-...",
+              "type": "function",
+              "function": {
+                "name": "get_current_weather",
+                "arguments": '{ "location": "Oslo, NO" ,"unit": "celsius"} '
+              }
+            }
+          ]
+        },
+        { # The tool_call_id is from tool_calls and content is the result from the function call you made
+          "role": "tool",
+          "content": 20,
+          "tool_call_id": "call__0_get_current_weather_cmpl-..."
+        }
+      ],
+      tools=[{
+        "type": "function",
+        "function": {
+          "name": "get_current_weather",
+          "description": "Get the current weather in a given location",
+          "parameters": {
+            "type": "object",
+            "properties": {
+              "location": {
+                "type": "string",
+                "description": "The city and state, e.g. San Francisco, CA"
+              },
+              "unit": {
+                "type": "string",
+                "enum": [ "celsius", "fahrenheit" ]
+              }
+            },
+            "required": [ "location" ]
+          }
+        }
+      }],
+      #tool_choice={
+      #  "type": "function",
+      #  "function": {
+      #    "name": "get_current_weather"
+      #  }
+      #}
+))
+```
+<!-- README_GGUF.md-how-to-run end -->